diff --git a/components/nmdc_runtime/workflow/spec.py b/components/nmdc_runtime/workflow/spec.py index 3368b0b1..3ae88861 100644 --- a/components/nmdc_runtime/workflow/spec.py +++ b/components/nmdc_runtime/workflow/spec.py @@ -25,9 +25,9 @@ class Sequencing(Workflow): enabled: bool = True git_repo: str = "" version: str = "1.0.0" - activity: Literal[ + activity: Literal["metagenome_sequencing_activity_set"] = ( "metagenome_sequencing_activity_set" - ] = "metagenome_sequencing_activity_set" + ) predecessor: str = "" id_type: str = "" input_prefix: str = "" @@ -69,9 +69,9 @@ class MetagenomeAnnotation(Workflow): git_repo: str = "https://github.com/microbiomedata/mg_annotation" version: str = "1.0.0" wdl: str = "annotation_full.wdl" - activity: Literal[ + activity: Literal["metagenome_annotation_activity_set"] = ( "metagenome_annotation_activity_set" - ] = "metagenome_annotation_activity_set" + ) predecessor: str = "MetagenomeAssembly" input_prefix: str = "annotation" id_type: str = "mgann" @@ -152,9 +152,9 @@ class ReadBasedAnalysis(Workflow): git_repo: str = "https://github.com/microbiomedata/ReadbasedAnalysis" version: str = "1.0.2" wdl: str = "ReadbasedAnalysis.wdl" - activity: Literal[ + activity: Literal["read_based_taxonomy_analysis_activity_set"] = ( "read_based_taxonomy_analysis_activity_set" - ] = "read_based_taxonomy_analysis_activity_set" + ) predecessor: str = "Read QC Analysis" input_prefix: str = "nmdc_rba" id_type: str = "mgrba" @@ -162,9 +162,14 @@ class ReadBasedAnalysis(Workflow): class WorkflowModel(BaseModel): - workflow: ReadQcAnalysis | MetagenomeAssembly | MAGs | ReadBasedAnalysis | Sequencing | MetagenomeAnnotation = Field( - ..., discriminator="activity" - ) + workflow: ( + ReadQcAnalysis + | MetagenomeAssembly + | MAGs + | ReadBasedAnalysis + | Sequencing + | MetagenomeAnnotation + ) = Field(..., discriminator="activity") def get_all_workflows() -> list[Workflow]: diff --git a/components/nmdc_runtime/workflow_execution_activity/__init__.py b/components/nmdc_runtime/workflow_execution_activity/__init__.py index f91be83e..394b635b 100644 --- a/components/nmdc_runtime/workflow_execution_activity/__init__.py +++ b/components/nmdc_runtime/workflow_execution_activity/__init__.py @@ -3,5 +3,6 @@ Workflow Execution Activies are a map of relevant data a user would like to have with regards to job execution or instantiation within their local system. """ + from .core import ActivityService, init_activity_service from .spec import Database, WorkflowExecutionActivity diff --git a/components/nmdc_runtime/workflow_execution_activity/core.py b/components/nmdc_runtime/workflow_execution_activity/core.py index 43236234..e9dbad37 100644 --- a/components/nmdc_runtime/workflow_execution_activity/core.py +++ b/components/nmdc_runtime/workflow_execution_activity/core.py @@ -1,4 +1,5 @@ """Core functionality of the activity service module.""" + from dataclasses import fields from typing import Any, TypedDict from uuid import uuid1 diff --git a/demo/metadata_discovery/indexing.ipynb b/demo/metadata_discovery/indexing.ipynb index 2bb06cfb..9f2e7f21 100644 --- a/demo/metadata_discovery/indexing.ipynb +++ b/demo/metadata_discovery/indexing.ipynb @@ -42,7 +42,7 @@ " \"https://localhost:9200\",\n", " # docker cp nmdc-elasticsearch_es01_1:/usr/share/elasticsearch/config/certs/ca/ca.crt .\n", " ca_certs=\"ca.crt\",\n", - " basic_auth=(\"elastic\", ELASTIC_PASSWORD)\n", + " basic_auth=(\"elastic\", ELASTIC_PASSWORD),\n", ")\n", "\n", "client.info().body" @@ -69,13 +69,13 @@ "\n", "docs = []\n", "rv = requests.get(\"https://api.microbiomedata.org/biosamples?page=1&per_page=200\")\n", - "docs.extend(rv.json()['results'])\n", + "docs.extend(rv.json()[\"results\"])\n", "rv = requests.get(\"https://api.microbiomedata.org/biosamples?page=2&per_page=200\")\n", - "docs.extend(rv.json()['results'])\n", + "docs.extend(rv.json()[\"results\"])\n", "rv = requests.get(\"https://api.microbiomedata.org/biosamples?page=3&per_page=200\")\n", - "docs.extend(rv.json()['results'])\n", + "docs.extend(rv.json()[\"results\"])\n", "rv = requests.get(\"https://api.microbiomedata.org/biosamples?page=4&per_page=200\")\n", - "docs.extend(rv.json()['results'])\n", + "docs.extend(rv.json()[\"results\"])\n", "len(docs)" ] }, @@ -133,12 +133,12 @@ "source": [ "from typing import Dict\n", "\n", - "ecosystem_paths : Dict[str, int] = {}\n", + "ecosystem_paths: Dict[str, int] = {}\n", "for row in df_ecosystem_paths.itertuples():\n", " _path = \" > \".join([str(e) for e in row[2:]])\n", " _id = row[1]\n", " ecosystem_paths[_path] = _id\n", - " \n", + "\n", "assert len(df_ecosystem_paths) == len(ecosystem_paths)" ] }, @@ -286,7 +286,7 @@ "outputs": [], "source": [ "def curie_purl(curie):\n", - " prefix, n = curie.split(':', maxsplit=1)\n", + " prefix, n = curie.split(\":\", maxsplit=1)\n", " return f\"http://purl.obolibrary.org/obo/{prefix}_{n}\"" ] }, @@ -314,15 +314,9 @@ "\n", "for c, purl in curie_purl.items():\n", " if c.startswith(\"ENVO:\"):\n", - " curie_label[c] = str(g_envo.value(\n", - " subject=URIRef(purl),\n", - " predicate=RDFS.label\n", - " ))\n", + " curie_label[c] = str(g_envo.value(subject=URIRef(purl), predicate=RDFS.label))\n", " elif c.startswith(\"PO:\"):\n", - " curie_label[c] = str(g_po.value(\n", - " subject=URIRef(purl),\n", - " predicate=RDFS.label\n", - " ))\n", + " curie_label[c] = str(g_po.value(subject=URIRef(purl), predicate=RDFS.label))\n", " else:\n", " raise ValueError(\"Unknown CURIE prefix\")" ] @@ -377,12 +371,13 @@ "metadata": {}, "outputs": [], "source": [ - "index_name = 'biosamples'\n", + "index_name = \"biosamples\"\n", "\n", "if client.indices.exists(index=index_name):\n", " client.indices.delete(index=index_name)\n", "\n", "import json\n", + "\n", "with open(f\"{index_name}.json\") as f:\n", " index_body = json.load(f)\n", "\n", @@ -398,7 +393,7 @@ "source": [ "from elasticsearch.helpers import bulk\n", "\n", - "bulk(client, [dict(_id=d[\"id\"], _index=index_name, **d)for d in docs])" + "bulk(client, [dict(_id=d[\"id\"], _index=index_name, **d) for d in docs])" ] }, { @@ -419,7 +414,7 @@ "outputs": [], "source": [ "for doc in docs:\n", - " print(doc['ecosystem_path'])" + " print(doc[\"ecosystem_path\"])" ] }, { @@ -430,7 +425,7 @@ "outputs": [], "source": [ "for doc in docs:\n", - " print(doc['mixs_triad'])" + " print(doc[\"mixs_triad\"])" ] }, { diff --git a/demo/metadata_migration/main.ipynb b/demo/metadata_migration/main.ipynb index 9b2cb043..1bc4cdfc 100644 --- a/demo/metadata_migration/main.ipynb +++ b/demo/metadata_migration/main.ipynb @@ -34,13 +34,13 @@ " host=os.getenv(\"MONGO_HOST\"),\n", " username=os.getenv(\"MONGO_USERNAME\"),\n", " password=os.getenv(\"MONGO_PASSWORD\"),\n", - " directConnection=True, # connect to host as a standalone, rather than to entire replicaset\n", + " directConnection=True, # connect to host as a standalone, rather than to entire replicaset\n", ")\n", "reader_client = MongoClient(\n", " host=os.getenv(\"MONGO_HOST\"),\n", " username=os.getenv(\"MONGO_READONLY_USERNAME\"),\n", " password=os.getenv(\"MONGO_READONLY_PASSWORD\"),\n", - " directConnection=True, # connect to host as a standalone, rather than to entire replicaset\n", + " directConnection=True, # connect to host as a standalone, rather than to entire replicaset\n", ")\n", "\n", "mdb_src = reader_client[os.getenv(\"MONGO_DBNAME\")]\n", @@ -84,18 +84,25 @@ "import fastjsonschema\n", "from toolz import dissoc\n", "\n", + "\n", "def strip_oid(doc):\n", " return dissoc(doc, \"_id\")\n", "\n", + "\n", "def nmdc_schema_collection_names() -> set:\n", " return {\n", - " k for k, v in get_nmdc_jsonschema_dict()[\"$defs\"][\"Database\"][\"properties\"].items()\n", - " if v.get(\"items\",{}).get(\"$ref\")\n", + " k\n", + " for k, v in get_nmdc_jsonschema_dict()[\"$defs\"][\"Database\"][\n", + " \"properties\"\n", + " ].items()\n", + " if v.get(\"items\", {}).get(\"$ref\")\n", " }\n", "\n", + "\n", "def present_src_collections(mdb) -> list:\n", " return sorted(\n", - " n for n in (nmdc_schema_collection_names() & set(mdb_src.list_collection_names()))\n", + " n\n", + " for n in (nmdc_schema_collection_names() & set(mdb_src.list_collection_names()))\n", " if mdb_src[n].estimated_document_count()\n", " )" ] @@ -121,6 +128,7 @@ "\n", "from nmdc_runtime.util import get_nmdc_jsonschema_dict\n", "\n", + "\n", "def without_id_patterns(nmdc_jsonschema):\n", " rv = deepcopy(nmdc_jsonschema)\n", " for cls_, spec in rv[\"$defs\"].items():\n", @@ -197,7 +205,7 @@ " raise Exception(f\"needs `term` or `has_raw_value`\")\n", " if not (m := re.search(id_pattern, v[\"has_raw_value\"])):\n", " raise Exception(f'{v[\"has_raw_value\"]} does not match a known ID pattern')\n", - " \n", + "\n", " return assoc(v, \"term\", {\"id\": v[\"has_raw_value\"]})\n", "\n", "\n", @@ -206,7 +214,7 @@ " raise Exception(\"list expected\")\n", " if not all(\":\" in elt for elt in v):\n", " raise Exception(\"CURIEs expected\")\n", - " \n", + "\n", " rv = []\n", " for elt in v:\n", " prefix, localpart = elt.split(\":\", maxsplit=1)\n", @@ -219,7 +227,7 @@ " raise Exception(\"list expected\")\n", " if not all(\":\" in elt for elt in v):\n", " raise Exception(\"CURIEs expected\")\n", - " \n", + "\n", " rv = []\n", " for elt in v:\n", " prefix, localpart = elt.split(\":\", maxsplit=1)\n", @@ -243,7 +251,7 @@ "def ensure_depth_via_depth2(v, d):\n", " if \"depth\" not in d:\n", " raise Exception(\"no `depth` field\")\n", - " \n", + "\n", " depth = d[\"depth\"]\n", " return {\"depth\": depth, \"depth2\": None}\n", "\n", @@ -281,8 +289,7 @@ "\n", "def rename_num_tRNA(v):\n", " return [\n", - " change_fieldname(elt, \"num_tRNA\", \"num_t_rna\")\n", - " for elt in v if \"num_tRNA\" in elt\n", + " change_fieldname(elt, \"num_tRNA\", \"num_t_rna\") for elt in v if \"num_tRNA\" in elt\n", " ]\n", "\n", "\n", @@ -340,7 +347,7 @@ " \"emsl\": \"emsl_biosample_identifiers\",\n", " \"gold\": \"gold_biosample_identifiers\",\n", " \"igsn\": \"igsn_biosample_identifiers\",\n", - " \"img.taxon\": \"img_identifiers\"\n", + " \"img.taxon\": \"img_identifiers\",\n", "}\n", "\n", "\n", @@ -350,10 +357,14 @@ "\n", "def replace_fields(d, context):\n", " assert \"collection_name\" in context\n", - " for fieldname, replacement in fieldname_replacements[context[\"collection_name\"]].items():\n", + " for fieldname, replacement in fieldname_replacements[\n", + " context[\"collection_name\"]\n", + " ].items():\n", " if fieldname in d:\n", " if isinstance(replacement, list):\n", - " assert all(callable(r) for r in replacement), \"replacement-list must be all functions\"\n", + " assert all(\n", + " callable(r) for r in replacement\n", + " ), \"replacement-list must be all functions\"\n", " for rfun in replacement:\n", " n_params = len(signature(rfun).parameters)\n", " if n_params == 1:\n", @@ -381,8 +392,8 @@ " except fastjsonschema.JsonSchemaException as e:\n", " print(d[\"id\"])\n", " print(e)\n", - " #pprint(d)\n", - " #raise e\n", + " # pprint(d)\n", + " # raise e\n", " return None\n", " validated.append(d)\n", " return validated" diff --git a/demo/metadata_migration/notebooks/migrate_10_0_0_to_10_1_4.ipynb b/demo/metadata_migration/notebooks/migrate_10_0_0_to_10_1_4.ipynb index c57b7f18..145ae1a5 100644 --- a/demo/metadata_migration/notebooks/migrate_10_0_0_to_10_1_4.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_0_0_to_10_1_4.ipynb @@ -46,9 +46,7 @@ }, "outputs": [], "source": [ - "COLLECTION_NAMES: list[str] = [\n", - " \"data_object_set\"\n", - "]" + "COLLECTION_NAMES: list[str] = [\"data_object_set\"]" ] }, { @@ -157,7 +155,9 @@ "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict\n", "from nmdc_schema.migrators.adapters.mongo_adapter import MongoAdapter\n", "\n", - "from nmdc_schema.migrators.migrator_from_10_0_0_to_10_1_2 import Migrator # note: the migrator to 10.1.2 was introduced in schema version 10.1.4\n", + "from nmdc_schema.migrators.migrator_from_10_0_0_to_10_1_2 import (\n", + " Migrator,\n", + ") # note: the migrator to 10.1.2 was introduced in schema version 10.1.4\n", "\n", "# First-party packages:\n", "from helpers import Config\n", @@ -210,7 +210,9 @@ "outputs": [], "source": [ "# Mongo client for \"origin\" MongoDB server.\n", - "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_server_uri, directConnection=True)\n", + "origin_mongo_client = pymongo.MongoClient(\n", + " host=cfg.origin_mongo_server_uri, directConnection=True\n", + ")\n", "\n", "# Mongo client for \"transformer\" MongoDB server.\n", "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_server_uri)\n", @@ -218,16 +220,26 @@ "# Perform sanity tests of those MongoDB clients' abilities to access their respective MongoDB servers.\n", "with pymongo.timeout(3):\n", " # Display the MongoDB server version (running on the \"origin\" Mongo server).\n", - " print(\"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"])\n", + " print(\n", + " \"Origin Mongo server version: \"\n", + " + origin_mongo_client.server_info()[\"version\"]\n", + " )\n", "\n", " # Sanity test: Ensure the origin database exists.\n", - " assert \"nmdc\" in origin_mongo_client.list_database_names(), \"Origin database does not exist.\"\n", + " assert (\n", + " \"nmdc\" in origin_mongo_client.list_database_names()\n", + " ), \"Origin database does not exist.\"\n", "\n", " # Display the MongoDB server version (running on the \"transformer\" Mongo server).\n", - " print(\"Transformer Mongo server version: \" + transformer_mongo_client.server_info()[\"version\"])\n", + " print(\n", + " \"Transformer Mongo server version: \"\n", + " + transformer_mongo_client.server_info()[\"version\"]\n", + " )\n", "\n", " # Sanity test: Ensure the transformation database does not exist.\n", - " assert \"nmdc\" not in transformer_mongo_client.list_database_names(), \"Transformation database already exists.\"" + " assert (\n", + " \"nmdc\" not in transformer_mongo_client.list_database_names()\n", + " ), \"Transformation database already exists.\"" ] }, { @@ -279,7 +291,11 @@ " \"\"\"\n", " custom_schema = deepcopy(nmdc_schema)\n", " for spec in custom_schema[\"$defs\"].values():\n", - " if \"properties\" in spec and \"id\" in spec[\"properties\"] and \"pattern\" in spec[\"properties\"][\"id\"]:\n", + " if (\n", + " \"properties\" in spec\n", + " and \"id\" in spec[\"properties\"]\n", + " and \"pattern\" in spec[\"properties\"][\"id\"]\n", + " ):\n", " del spec[\"properties\"][\"id\"][\"pattern\"]\n", " return custom_schema\n", "\n", @@ -293,7 +309,9 @@ "print(\"NMDC Schema title: \" + nmdc_jsonschema[\"title\"])\n", "print(\"NMDC Schema version: \" + nmdc_jsonschema[\"version\"])\n", "\n", - "nmdc_jsonschema_validator.check_schema(nmdc_jsonschema) # raises exception if schema is invalid" + "nmdc_jsonschema_validator.check_schema(\n", + " nmdc_jsonschema\n", + ") # raises exception if schema is invalid" ] }, { @@ -436,10 +454,16 @@ " # Note: `root_to_validate` is a dictionary having the shape: { \"some_collection_name\": [ some_document ] }\n", " # Reference: https://docs.python.org/3/library/stdtypes.html#dict (see the \"type constructor\" section)\n", " #\n", - " document_without_underscore_id_key = {key: value for key, value in document.items() if key != \"_id\"}\n", - " root_to_validate = dict([(collection_name, [document_without_underscore_id_key])])\n", + " document_without_underscore_id_key = {\n", + " key: value for key, value in document.items() if key != \"_id\"\n", + " }\n", + " root_to_validate = dict(\n", + " [(collection_name, [document_without_underscore_id_key])]\n", + " )\n", " try:\n", - " nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid\n", + " nmdc_jsonschema_validator.validate(\n", + " root_to_validate\n", + " ) # raises exception if invalid\n", " except ValidationError as err:\n", " # Print the offending document (to facilitate debug) before propagating the exception.\n", " print(document)\n", @@ -467,7 +491,9 @@ }, "outputs": [], "source": [ - "bookkeeper.record_migration_event(migrator=migrator, event=MigrationEvent.MIGRATION_STARTED)" + "bookkeeper.record_migration_event(\n", + " migrator=migrator, event=MigrationEvent.MIGRATION_STARTED\n", + ")" ] }, { @@ -547,7 +573,9 @@ }, "outputs": [], "source": [ - "bookkeeper.record_migration_event(migrator=migrator, event=MigrationEvent.MIGRATION_COMPLETED)" + "bookkeeper.record_migration_event(\n", + " migrator=migrator, event=MigrationEvent.MIGRATION_COMPLETED\n", + ")" ] } ], diff --git a/demo/metadata_migration/notebooks/migrate_7_7_2_to_7_8_0.ipynb b/demo/metadata_migration/notebooks/migrate_7_7_2_to_7_8_0.ipynb index 247fce4c..eea49fb8 100644 --- a/demo/metadata_migration/notebooks/migrate_7_7_2_to_7_8_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_7_7_2_to_7_8_0.ipynb @@ -110,14 +110,20 @@ "outputs": [], "source": [ "# Create temporary file in the notebook's folder, containing the origin MongoDB password.\n", - "origin_mongo_config_file = NamedTemporaryFile(delete=False, dir=str(Path.cwd()), prefix=\"tmp.origin_mongo_config.\")\n", + "origin_mongo_config_file = NamedTemporaryFile(\n", + " delete=False, dir=str(Path.cwd()), prefix=\"tmp.origin_mongo_config.\"\n", + ")\n", "origin_mongo_config_file.write(bytes(f\"password: {origin_mongo_password}\", \"utf-8\"))\n", "origin_mongo_config_file.close()\n", "origin_mongo_config_file_path: str = origin_mongo_config_file.name\n", "\n", "# Create temporary file in the notebook's folder, containing the transformer MongoDB password.\n", - "transformer_mongo_config_file = NamedTemporaryFile(delete=False, dir=str(Path.cwd()), prefix=\"tmp.transformer_mongo_config.\")\n", - "transformer_mongo_config_file.write(bytes(f\"password: {transformer_mongo_password}\", \"utf-8\"))\n", + "transformer_mongo_config_file = NamedTemporaryFile(\n", + " delete=False, dir=str(Path.cwd()), prefix=\"tmp.transformer_mongo_config.\"\n", + ")\n", + "transformer_mongo_config_file.write(\n", + " bytes(f\"password: {transformer_mongo_password}\", \"utf-8\")\n", + ")\n", "transformer_mongo_config_file.close()\n", "transformer_mongo_config_file_path: str = transformer_mongo_config_file.name" ] @@ -176,7 +182,9 @@ "users_initial = result[\"users\"]\n", "\n", "# Create temporary file in the notebook's folder, containing the initial users.\n", - "users_file = NamedTemporaryFile(delete=False, dir=str(Path.cwd()), prefix=\"tmp.origin_users_initial.\")\n", + "users_file = NamedTemporaryFile(\n", + " delete=False, dir=str(Path.cwd()), prefix=\"tmp.origin_users_initial.\"\n", + ")\n", "users_file.write(bytes(pformat(users_initial), \"utf-8\"))\n", "users_file.close()" ] @@ -199,8 +207,14 @@ " break # Abort! TODO: Remove me when I'm ready to run this notebook for real.\n", "\n", " if any((role[\"db\"] == \"nmdc\") for role in user[\"roles\"]):\n", - " origin_mongo_client[\"admin\"].command(\"grantRolesToUser\", user[\"user\"], roles=[{ \"role\": \"read\", \"db\": \"nmdc\" }])\n", - " origin_mongo_client[\"admin\"].command(\"revokeRolesFromUser\", user[\"user\"], roles=[{ \"role\": \"readWrite\", \"db\": \"nmdc\" }])" + " origin_mongo_client[\"admin\"].command(\n", + " \"grantRolesToUser\", user[\"user\"], roles=[{\"role\": \"read\", \"db\": \"nmdc\"}]\n", + " )\n", + " origin_mongo_client[\"admin\"].command(\n", + " \"revokeRolesFromUser\",\n", + " user[\"user\"],\n", + " roles=[{\"role\": \"readWrite\", \"db\": \"nmdc\"}],\n", + " )" ] }, { @@ -291,22 +305,23 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "# \n", - "doi_url_pattern = r'^https?:\\/\\/[a-zA-Z\\.]+\\/10\\.'\n", + "doi_url_pattern = r\"^https?:\\/\\/[a-zA-Z\\.]+\\/10\\.\"\n", + "\n", "\n", "def migrate_studies_7_7_2_to_7_8(retrieved_study):\n", " print(f\"Starting migration of {retrieved_study['id']}\")\n", " if \"doi\" in retrieved_study:\n", - " match = re.search(doi_url_pattern, retrieved_study[\"doi\"]['has_raw_value'])\n", + " match = re.search(doi_url_pattern, retrieved_study[\"doi\"][\"has_raw_value\"])\n", " if match:\n", " start_index = match.end()\n", " as_curie = f\"doi:10.{retrieved_study['doi']['has_raw_value'][start_index:]}\"\n", " retrieved_study[\"award_dois\"] = [as_curie]\n", " del retrieved_study[\"doi\"]\n", " return retrieved_study\n", - "# \n", - "\n" + "\n", + "\n", + "# " ] }, { @@ -325,7 +340,9 @@ "\n", "# Replace the original versions with the transformed versions of themselves (in the transformer database).\n", "for transformed_study in transformed_studies:\n", - " transformer_mongo_client[\"nmdc\"][\"study_set\"].replace_one({\"id\": {\"$eq\": transformed_study[\"id\"]}}, transformed_study)\n" + " transformer_mongo_client[\"nmdc\"][\"study_set\"].replace_one(\n", + " {\"id\": {\"$eq\": transformed_study[\"id\"]}}, transformed_study\n", + " )" ] }, { @@ -421,9 +438,17 @@ "\n", " break # Abort! TODO: Remove me when I'm ready to run this notebook for real.\n", "\n", - " if any((role[\"db\"] == \"nmdc\" and role[\"role\"] == \"readWrite\") for role in user[\"roles\"]):\n", - " origin_mongo_client[\"admin\"].command(\"grantRolesToUser\", user[\"user\"], roles=[{ \"role\": \"readWrite\", \"db\": \"nmdc\" }])\n", - " origin_mongo_client[\"admin\"].command(\"revokeRolesFromUser\", user[\"user\"], roles=[{ \"role\": \"read\", \"db\": \"nmdc\" }])" + " if any(\n", + " (role[\"db\"] == \"nmdc\" and role[\"role\"] == \"readWrite\") for role in user[\"roles\"]\n", + " ):\n", + " origin_mongo_client[\"admin\"].command(\n", + " \"grantRolesToUser\",\n", + " user[\"user\"],\n", + " roles=[{\"role\": \"readWrite\", \"db\": \"nmdc\"}],\n", + " )\n", + " origin_mongo_client[\"admin\"].command(\n", + " \"revokeRolesFromUser\", user[\"user\"], roles=[{\"role\": \"read\", \"db\": \"nmdc\"}]\n", + " )" ] }, { diff --git a/demo/metadata_migration/notebooks/migrate_7_8_0_to_8_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_7_8_0_to_8_0_0.ipynb index 7d222140..60d9ac57 100644 --- a/demo/metadata_migration/notebooks/migrate_7_8_0_to_8_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_7_8_0_to_8_0_0.ipynb @@ -180,7 +180,9 @@ "outputs": [], "source": [ "# MongoDB client for origin MongoDB server.\n", - "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_server_uri, directConnection=True)\n", + "origin_mongo_client = pymongo.MongoClient(\n", + " host=cfg.origin_mongo_server_uri, directConnection=True\n", + ")\n", "\n", "# MongoDB client for transformer MongoDB server.\n", "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_server_uri)" @@ -207,7 +209,9 @@ "users_initial = result[\"users\"]\n", "\n", "# Create temporary file in the notebook's folder, containing the initial users.\n", - "users_file = NamedTemporaryFile(delete=False, dir=str(Path.cwd()), prefix=\"tmp.origin_users_initial.\")\n", + "users_file = NamedTemporaryFile(\n", + " delete=False, dir=str(Path.cwd()), prefix=\"tmp.origin_users_initial.\"\n", + ")\n", "users_file.write(bytes(pformat(users_initial), \"utf-8\"))\n", "users_file.close()" ] @@ -232,8 +236,14 @@ " break # Abort! TODO: Remove me when I'm ready to run this notebook for real.\n", "\n", " if any((role[\"db\"] == \"nmdc\") for role in user[\"roles\"]):\n", - " origin_mongo_client[\"admin\"].command(\"grantRolesToUser\", user[\"user\"], roles=[{ \"role\": \"read\", \"db\": \"nmdc\" }])\n", - " origin_mongo_client[\"admin\"].command(\"revokeRolesFromUser\", user[\"user\"], roles=[{ \"role\": \"readWrite\", \"db\": \"nmdc\" }])" + " origin_mongo_client[\"admin\"].command(\n", + " \"grantRolesToUser\", user[\"user\"], roles=[{\"role\": \"read\", \"db\": \"nmdc\"}]\n", + " )\n", + " origin_mongo_client[\"admin\"].command(\n", + " \"revokeRolesFromUser\",\n", + " user[\"user\"],\n", + " roles=[{\"role\": \"readWrite\", \"db\": \"nmdc\"}],\n", + " )" ] }, { @@ -402,7 +412,9 @@ "# TODO: Consider defining this mapping in the `nmdc-schema` repository/package instead.\n", "transformation_pipelines = dict(\n", " extraction_set=[migrator.migrate_extractions_7_8_0_to_8_0_0],\n", - " omics_processing_set=[migrator.migrate_uc_gold_sequencing_project_identifiers_7_8_0_to_8_0_0],\n", + " omics_processing_set=[\n", + " migrator.migrate_uc_gold_sequencing_project_identifiers_7_8_0_to_8_0_0\n", + " ],\n", " biosample_set=[migrator.migrate_uc_gold_biosample_identifiers_7_8_0_to_8_0_0],\n", " study_set=[migrator.migrate_uc_gold_study_identifiers_7_8_0_to_8_0_0],\n", ")\n", @@ -415,7 +427,7 @@ " # Get each document from this collection.\n", " collection = transformer_mongo_client[\"nmdc\"][collection_name]\n", " for original_document in collection.find():\n", - " \n", + "\n", " # Put the document through the transformation pipeline associated with this collection.\n", " print(original_document)\n", " transformed_document = original_document # initializes the variable\n", @@ -429,7 +441,9 @@ "\n", " # Replace the original documents with the transformed versions of themselves (in the transformer database).\n", " for transformed_document in transformed_documents:\n", - " collection.replace_one({\"id\": {\"$eq\": transformed_document[\"id\"]}}, transformed_document)\n" + " collection.replace_one(\n", + " {\"id\": {\"$eq\": transformed_document[\"id\"]}}, transformed_document\n", + " )" ] }, { @@ -520,9 +534,17 @@ "\n", " break # Abort! TODO: Remove me when I'm ready to run this notebook for real.\n", "\n", - " if any((role[\"db\"] == \"nmdc\" and role[\"role\"] == \"readWrite\") for role in user[\"roles\"]):\n", - " origin_mongo_client[\"admin\"].command(\"grantRolesToUser\", user[\"user\"], roles=[{ \"role\": \"readWrite\", \"db\": \"nmdc\" }])\n", - " origin_mongo_client[\"admin\"].command(\"revokeRolesFromUser\", user[\"user\"], roles=[{ \"role\": \"read\", \"db\": \"nmdc\" }])" + " if any(\n", + " (role[\"db\"] == \"nmdc\" and role[\"role\"] == \"readWrite\") for role in user[\"roles\"]\n", + " ):\n", + " origin_mongo_client[\"admin\"].command(\n", + " \"grantRolesToUser\",\n", + " user[\"user\"],\n", + " roles=[{\"role\": \"readWrite\", \"db\": \"nmdc\"}],\n", + " )\n", + " origin_mongo_client[\"admin\"].command(\n", + " \"revokeRolesFromUser\", user[\"user\"], roles=[{\"role\": \"read\", \"db\": \"nmdc\"}]\n", + " )" ] }, { diff --git a/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb b/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb index 67ee6038..ae66e2fc 100644 --- a/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb +++ b/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb @@ -220,7 +220,9 @@ "outputs": [], "source": [ "# Mongo client for origin Mongo server.\n", - "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_server_uri, directConnection=True)\n", + "origin_mongo_client = pymongo.MongoClient(\n", + " host=cfg.origin_mongo_server_uri, directConnection=True\n", + ")\n", "\n", "# Mongo client for transformer Mongo server.\n", "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_server_uri)" @@ -240,16 +242,25 @@ "outputs": [], "source": [ "# Display the Mongo server version (running on the \"origin\" Mongo server).\n", - "print(\"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"])\n", + "print(\n", + " \"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"]\n", + ")\n", "\n", "# Sanity test: Ensure the origin database exists.\n", - "assert \"nmdc\" in origin_mongo_client.list_database_names(), \"Origin database does not exist.\"\n", + "assert (\n", + " \"nmdc\" in origin_mongo_client.list_database_names()\n", + "), \"Origin database does not exist.\"\n", "\n", "# Display the Mongo server version (running on the \"transformer\" Mongo server).\n", - "print(\"Transformer Mongo server version: \" + transformer_mongo_client.server_info()[\"version\"])\n", + "print(\n", + " \"Transformer Mongo server version: \"\n", + " + transformer_mongo_client.server_info()[\"version\"]\n", + ")\n", "\n", "# Sanity test: Ensure the transformation database does not exist.\n", - "assert \"nmdc\" not in transformer_mongo_client.list_database_names(), \"Transformation database already exists.\"" + "assert (\n", + " \"nmdc\" not in transformer_mongo_client.list_database_names()\n", + "), \"Transformation database already exists.\"" ] }, { @@ -289,7 +300,9 @@ "print(\"NMDC Schema title: \" + nmdc_jsonschema[\"title\"])\n", "print(\"NMDC Schema version: \" + nmdc_jsonschema[\"version\"])\n", "\n", - "nmdc_jsonschema_validator.check_schema(nmdc_jsonschema) # raises exception if schema is invalid" + "nmdc_jsonschema_validator.check_schema(\n", + " nmdc_jsonschema\n", + ") # raises exception if schema is invalid" ] }, { @@ -318,9 +331,15 @@ "source": [ "# Build a string containing zero or more `--excludeCollection=\"...\"` options, which can be included in a `mongodump` command.\n", "all_collection_names: list[str] = origin_mongo_client[\"nmdc\"].list_collection_names()\n", - "non_agenda_collection_names = [name for name in all_collection_names if name not in agenda_collection_names]\n", - "exclusion_options = [f\"--excludeCollection='{name}'\" for name in non_agenda_collection_names]\n", - "exclusion_options_str = \" \".join(exclusion_options) # separates each option with a space\n", + "non_agenda_collection_names = [\n", + " name for name in all_collection_names if name not in agenda_collection_names\n", + "]\n", + "exclusion_options = [\n", + " f\"--excludeCollection='{name}'\" for name in non_agenda_collection_names\n", + "]\n", + "exclusion_options_str = \" \".join(\n", + " exclusion_options\n", + ") # separates each option with a space\n", "\n", "print(exclusion_options_str)" ] @@ -367,7 +386,9 @@ "outputs": [], "source": [ "inclusion_options = [f\"--nsInclude='nmdc.{name}'\" for name in agenda_collection_names]\n", - "inclusion_options_str = \" \".join(inclusion_options) # separates each option with a space\n", + "inclusion_options_str = \" \".join(\n", + " inclusion_options\n", + ") # separates each option with a space\n", "\n", "print(inclusion_options_str)" ] @@ -432,13 +453,13 @@ " # Make a deep copy of the original document, to enable before-and-after comparison.\n", " print(original_document)\n", " copy_of_original_document = deepcopy(original_document)\n", - " \n", + "\n", " # Put the document through the transformation pipeline associated with this collection.\n", " transformed_document = original_document # initializes the variable\n", " for transformation_function in transformation_pipeline:\n", " transformed_document = transformation_function(transformed_document)\n", " print(transformed_document)\n", - " \n", + "\n", " # Compare the transformed document with a copy of the original document;\n", " # and, if there are any differences, print those differences.\n", " difference = diff(copy_of_original_document, transformed_document)\n", @@ -458,17 +479,25 @@ " # Note: `root_to_validate` is a dictionary having the shape: { \"some_collection_name\": [ some_document ] }\n", " # Reference: https://docs.python.org/3/library/stdtypes.html#dict (see the \"type constructor\" section)\n", " #\n", - " transformed_document_without_underscore_id_key = {key: value for key, value in transformed_document.items() if key != \"_id\"}\n", - " root_to_validate = dict([(collection_name, [transformed_document_without_underscore_id_key])])\n", - " nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid\n", + " transformed_document_without_underscore_id_key = {\n", + " key: value for key, value in transformed_document.items() if key != \"_id\"\n", + " }\n", + " root_to_validate = dict(\n", + " [(collection_name, [transformed_document_without_underscore_id_key])]\n", + " )\n", + " nmdc_jsonschema_validator.validate(\n", + " root_to_validate\n", + " ) # raises exception if invalid\n", "\n", " # Store the transformed document.\n", - " transformed_documents.append(transformed_document) \n", - " print(\"\") \n", + " transformed_documents.append(transformed_document)\n", + " print(\"\")\n", "\n", " # Replace the original documents with the transformed versions of themselves (in the transformer database).\n", " for transformed_document in transformed_documents:\n", - " collection.replace_one({\"id\": {\"$eq\": transformed_document[\"id\"]}}, transformed_document)\n" + " collection.replace_one(\n", + " {\"id\": {\"$eq\": transformed_document[\"id\"]}}, transformed_document\n", + " )" ] }, { diff --git a/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb b/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb index b2e91df3..719279c0 100644 --- a/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb +++ b/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb @@ -227,7 +227,9 @@ "outputs": [], "source": [ "# Mongo client for origin Mongo server.\n", - "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_server_uri, directConnection=True)\n", + "origin_mongo_client = pymongo.MongoClient(\n", + " host=cfg.origin_mongo_server_uri, directConnection=True\n", + ")\n", "\n", "# Mongo client for transformer Mongo server.\n", "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_server_uri)" @@ -247,16 +249,25 @@ "outputs": [], "source": [ "# Display the Mongo server version (running on the \"origin\" Mongo server).\n", - "print(\"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"])\n", + "print(\n", + " \"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"]\n", + ")\n", "\n", "# Sanity test: Ensure the origin database exists.\n", - "assert \"nmdc\" in origin_mongo_client.list_database_names(), \"Origin database does not exist.\"\n", + "assert (\n", + " \"nmdc\" in origin_mongo_client.list_database_names()\n", + "), \"Origin database does not exist.\"\n", "\n", "# Display the Mongo server version (running on the \"transformer\" Mongo server).\n", - "print(\"Transformer Mongo server version: \" + transformer_mongo_client.server_info()[\"version\"])\n", + "print(\n", + " \"Transformer Mongo server version: \"\n", + " + transformer_mongo_client.server_info()[\"version\"]\n", + ")\n", "\n", "# Sanity test: Ensure the transformation database does not exist.\n", - "assert \"nmdc\" not in transformer_mongo_client.list_database_names(), \"Transformation database already exists.\"" + "assert (\n", + " \"nmdc\" not in transformer_mongo_client.list_database_names()\n", + "), \"Transformation database already exists.\"" ] }, { @@ -296,7 +307,9 @@ "print(\"NMDC Schema title: \" + nmdc_jsonschema[\"title\"])\n", "print(\"NMDC Schema version: \" + nmdc_jsonschema[\"version\"])\n", "\n", - "nmdc_jsonschema_validator.check_schema(nmdc_jsonschema) # raises exception if schema is invalid" + "nmdc_jsonschema_validator.check_schema(\n", + " nmdc_jsonschema\n", + ") # raises exception if schema is invalid" ] }, { @@ -325,9 +338,15 @@ "source": [ "# Build a string containing zero or more `--excludeCollection=\"...\"` options, which can be included in a `mongodump` command.\n", "all_collection_names: list[str] = origin_mongo_client[\"nmdc\"].list_collection_names()\n", - "non_agenda_collection_names = [name for name in all_collection_names if name not in agenda_collection_names]\n", - "exclusion_options = [f\"--excludeCollection='{name}'\" for name in non_agenda_collection_names]\n", - "exclusion_options_str = \" \".join(exclusion_options) # separates each option with a space\n", + "non_agenda_collection_names = [\n", + " name for name in all_collection_names if name not in agenda_collection_names\n", + "]\n", + "exclusion_options = [\n", + " f\"--excludeCollection='{name}'\" for name in non_agenda_collection_names\n", + "]\n", + "exclusion_options_str = \" \".join(\n", + " exclusion_options\n", + ") # separates each option with a space\n", "\n", "print(exclusion_options_str)" ] @@ -374,7 +393,9 @@ "outputs": [], "source": [ "inclusion_options = [f\"--nsInclude='nmdc.{name}'\" for name in agenda_collection_names]\n", - "inclusion_options_str = \" \".join(inclusion_options) # separates each option with a space\n", + "inclusion_options_str = \" \".join(\n", + " inclusion_options\n", + ") # separates each option with a space\n", "\n", "print(inclusion_options_str)" ] @@ -439,7 +460,7 @@ " # Make a deep copy of the original document, to enable before-and-after comparison.\n", " print(original_document)\n", " copy_of_original_document = deepcopy(original_document)\n", - " \n", + "\n", " # Put the document through the transformation pipeline associated with this collection.\n", " transformed_document = original_document # initializes the variable\n", " for transformation_function in transformation_pipeline:\n", @@ -449,7 +470,7 @@ " # Note: Some of the transformation functions in the migration class specific to this migration\n", " # do not return the transformed dictionary. As a result, transformation functions\n", " # \"further down the pipeline\" do not receive a dictionary as input.\n", - " # \n", + " #\n", " # The workaround I have employed here is:\n", " # 1. Manually read the transformation functions and verify they all do, indeed,\n", " # modify the dictionary \"in place\" (as opposed to returning a copy).\n", @@ -460,7 +481,7 @@ " # transformed_document = transformation_function(transformed_document)\n", " transformation_function(transformed_document)\n", " print(transformed_document)\n", - " \n", + "\n", " # Compare the transformed document with a copy of the original document;\n", " # and, if there are any differences, print those differences.\n", " difference = diff(copy_of_original_document, transformed_document)\n", @@ -480,17 +501,25 @@ " # Note: `root_to_validate` is a dictionary having the shape: { \"some_collection_name\": [ some_document ] }\n", " # Reference: https://docs.python.org/3/library/stdtypes.html#dict (see the \"type constructor\" section)\n", " #\n", - " transformed_document_without_underscore_id_key = {key: value for key, value in transformed_document.items() if key != \"_id\"}\n", - " root_to_validate = dict([(collection_name, [transformed_document_without_underscore_id_key])])\n", - " nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid\n", + " transformed_document_without_underscore_id_key = {\n", + " key: value for key, value in transformed_document.items() if key != \"_id\"\n", + " }\n", + " root_to_validate = dict(\n", + " [(collection_name, [transformed_document_without_underscore_id_key])]\n", + " )\n", + " nmdc_jsonschema_validator.validate(\n", + " root_to_validate\n", + " ) # raises exception if invalid\n", "\n", " # Store the transformed document.\n", - " transformed_documents.append(transformed_document) \n", - " print(\"\") \n", + " transformed_documents.append(transformed_document)\n", + " print(\"\")\n", "\n", " # Replace the original documents with the transformed versions of themselves (in the transformer database).\n", " for transformed_document in transformed_documents:\n", - " collection.replace_one({\"id\": {\"$eq\": transformed_document[\"id\"]}}, transformed_document)\n" + " collection.replace_one(\n", + " {\"id\": {\"$eq\": transformed_document[\"id\"]}}, transformed_document\n", + " )" ] }, { diff --git a/demo/metadata_migration/notebooks/migrate_9_1_0_to_9_2_0.ipynb b/demo/metadata_migration/notebooks/migrate_9_1_0_to_9_2_0.ipynb index 3521ca44..f4296f69 100644 --- a/demo/metadata_migration/notebooks/migrate_9_1_0_to_9_2_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_9_1_0_to_9_2_0.ipynb @@ -137,7 +137,9 @@ "# Third-party packages:\n", "import pymongo\n", "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict\n", - "from nmdc_schema.migrators.migrator_from_9_1_to_9_2 import Migrator_from_9_1_to_9_2 as Migrator\n", + "from nmdc_schema.migrators.migrator_from_9_1_to_9_2 import (\n", + " Migrator_from_9_1_to_9_2 as Migrator,\n", + ")\n", "from jsonschema import Draft7Validator\n", "from dictdiffer import diff\n", "\n", @@ -223,7 +225,9 @@ "outputs": [], "source": [ "# Mongo client for origin Mongo server.\n", - "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_server_uri, directConnection=True)\n", + "origin_mongo_client = pymongo.MongoClient(\n", + " host=cfg.origin_mongo_server_uri, directConnection=True\n", + ")\n", "\n", "# Mongo client for transformer Mongo server.\n", "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_server_uri)" @@ -243,16 +247,25 @@ "outputs": [], "source": [ "# Display the Mongo server version (running on the \"origin\" Mongo server).\n", - "print(\"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"])\n", + "print(\n", + " \"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"]\n", + ")\n", "\n", "# Sanity test: Ensure the origin database exists.\n", - "assert \"nmdc\" in origin_mongo_client.list_database_names(), \"Origin database does not exist.\"\n", + "assert (\n", + " \"nmdc\" in origin_mongo_client.list_database_names()\n", + "), \"Origin database does not exist.\"\n", "\n", "# Display the Mongo server version (running on the \"transformer\" Mongo server).\n", - "print(\"Transformer Mongo server version: \" + transformer_mongo_client.server_info()[\"version\"])\n", + "print(\n", + " \"Transformer Mongo server version: \"\n", + " + transformer_mongo_client.server_info()[\"version\"]\n", + ")\n", "\n", "# Sanity test: Ensure the transformation database does not exist.\n", - "assert \"nmdc\" not in transformer_mongo_client.list_database_names(), \"Transformation database already exists.\"" + "assert (\n", + " \"nmdc\" not in transformer_mongo_client.list_database_names()\n", + "), \"Transformation database already exists.\"" ] }, { @@ -292,7 +305,9 @@ "print(\"NMDC Schema title: \" + nmdc_jsonschema[\"title\"])\n", "print(\"NMDC Schema version: \" + nmdc_jsonschema[\"version\"])\n", "\n", - "nmdc_jsonschema_validator.check_schema(nmdc_jsonschema) # raises exception if schema is invalid" + "nmdc_jsonschema_validator.check_schema(\n", + " nmdc_jsonschema\n", + ") # raises exception if schema is invalid" ] }, { @@ -321,9 +336,15 @@ "source": [ "# Build a string containing zero or more `--excludeCollection=\"...\"` options, which can be included in a `mongodump` command.\n", "all_collection_names: list[str] = origin_mongo_client[\"nmdc\"].list_collection_names()\n", - "non_agenda_collection_names = [name for name in all_collection_names if name not in agenda_collection_names]\n", - "exclusion_options = [f\"--excludeCollection='{name}'\" for name in non_agenda_collection_names]\n", - "exclusion_options_str = \" \".join(exclusion_options) # separates each option with a space\n", + "non_agenda_collection_names = [\n", + " name for name in all_collection_names if name not in agenda_collection_names\n", + "]\n", + "exclusion_options = [\n", + " f\"--excludeCollection='{name}'\" for name in non_agenda_collection_names\n", + "]\n", + "exclusion_options_str = \" \".join(\n", + " exclusion_options\n", + ") # separates each option with a space\n", "\n", "print(exclusion_options_str)" ] @@ -370,7 +391,9 @@ "outputs": [], "source": [ "inclusion_options = [f\"--nsInclude='nmdc.{name}'\" for name in agenda_collection_names]\n", - "inclusion_options_str = \" \".join(inclusion_options) # separates each option with a space\n", + "inclusion_options_str = \" \".join(\n", + " inclusion_options\n", + ") # separates each option with a space\n", "\n", "print(inclusion_options_str)" ] @@ -435,13 +458,13 @@ " # Make a deep copy of the original document, to enable before-and-after comparison.\n", " print(original_document)\n", " copy_of_original_document = deepcopy(original_document)\n", - " \n", + "\n", " # Put the document through the transformation pipeline associated with this collection.\n", " transformed_document = original_document # initializes the variable\n", " for transformation_function in transformation_pipeline:\n", " transformed_document = transformation_function(transformed_document)\n", " print(transformed_document)\n", - " \n", + "\n", " # Compare the transformed document with a copy of the original document;\n", " # and, if there are any differences, print those differences.\n", " difference = diff(copy_of_original_document, transformed_document)\n", @@ -461,17 +484,25 @@ " # Note: `root_to_validate` is a dictionary having the shape: { \"some_collection_name\": [ some_document ] }\n", " # Reference: https://docs.python.org/3/library/stdtypes.html#dict (see the \"type constructor\" section)\n", " #\n", - " transformed_document_without_underscore_id_key = {key: value for key, value in transformed_document.items() if key != \"_id\"}\n", - " root_to_validate = dict([(collection_name, [transformed_document_without_underscore_id_key])])\n", - " nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid\n", + " transformed_document_without_underscore_id_key = {\n", + " key: value for key, value in transformed_document.items() if key != \"_id\"\n", + " }\n", + " root_to_validate = dict(\n", + " [(collection_name, [transformed_document_without_underscore_id_key])]\n", + " )\n", + " nmdc_jsonschema_validator.validate(\n", + " root_to_validate\n", + " ) # raises exception if invalid\n", "\n", " # Store the transformed document.\n", - " transformed_documents.append(transformed_document) \n", - " print(\"\") \n", + " transformed_documents.append(transformed_document)\n", + " print(\"\")\n", "\n", " # Replace the original documents with the transformed versions of themselves (in the transformer database).\n", " for transformed_document in transformed_documents:\n", - " collection.replace_one({\"id\": {\"$eq\": transformed_document[\"id\"]}}, transformed_document)\n" + " collection.replace_one(\n", + " {\"id\": {\"$eq\": transformed_document[\"id\"]}}, transformed_document\n", + " )" ] }, { diff --git a/demo/metadata_migration/notebooks/migrate_9_3_2_to_10_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_9_3_2_to_10_0_0.ipynb index 249bd18b..ef168e9d 100644 --- a/demo/metadata_migration/notebooks/migrate_9_3_2_to_10_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_9_3_2_to_10_0_0.ipynb @@ -212,7 +212,9 @@ "outputs": [], "source": [ "# Mongo client for origin MongoDB server.\n", - "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_server_uri, directConnection=True)\n", + "origin_mongo_client = pymongo.MongoClient(\n", + " host=cfg.origin_mongo_server_uri, directConnection=True\n", + ")\n", "\n", "# Mongo client for transformer MongoDB server.\n", "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_server_uri)\n", @@ -220,16 +222,26 @@ "# Perform sanity tests of those MongoDB clients' abilities to access their respective MongoDB servers.\n", "with pymongo.timeout(3):\n", " # Display the MongoDB server version (running on the \"origin\" Mongo server).\n", - " print(\"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"])\n", + " print(\n", + " \"Origin Mongo server version: \"\n", + " + origin_mongo_client.server_info()[\"version\"]\n", + " )\n", "\n", " # Sanity test: Ensure the origin database exists.\n", - " assert \"nmdc\" in origin_mongo_client.list_database_names(), \"Origin database does not exist.\"\n", + " assert (\n", + " \"nmdc\" in origin_mongo_client.list_database_names()\n", + " ), \"Origin database does not exist.\"\n", "\n", " # Display the MongoDB server version (running on the \"transformer\" Mongo server).\n", - " print(\"Transformer Mongo server version: \" + transformer_mongo_client.server_info()[\"version\"])\n", + " print(\n", + " \"Transformer Mongo server version: \"\n", + " + transformer_mongo_client.server_info()[\"version\"]\n", + " )\n", "\n", " # Sanity test: Ensure the transformation database does not exist.\n", - " assert \"nmdc\" not in transformer_mongo_client.list_database_names(), \"Transformation database already exists.\"" + " assert (\n", + " \"nmdc\" not in transformer_mongo_client.list_database_names()\n", + " ), \"Transformation database already exists.\"" ] }, { @@ -257,7 +269,9 @@ "print(\"NMDC Schema title: \" + nmdc_jsonschema[\"title\"])\n", "print(\"NMDC Schema version: \" + nmdc_jsonschema[\"version\"])\n", "\n", - "nmdc_jsonschema_validator.check_schema(nmdc_jsonschema) # raises exception if schema is invalid" + "nmdc_jsonschema_validator.check_schema(\n", + " nmdc_jsonschema\n", + ") # raises exception if schema is invalid" ] }, { @@ -398,9 +412,15 @@ " # Note: `root_to_validate` is a dictionary having the shape: { \"some_collection_name\": [ some_document ] }\n", " # Reference: https://docs.python.org/3/library/stdtypes.html#dict (see the \"type constructor\" section)\n", " #\n", - " document_without_underscore_id_key = {key: value for key, value in document.items() if key != \"_id\"}\n", - " root_to_validate = dict([(collection_name, [document_without_underscore_id_key])])\n", - " nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid" + " document_without_underscore_id_key = {\n", + " key: value for key, value in document.items() if key != \"_id\"\n", + " }\n", + " root_to_validate = dict(\n", + " [(collection_name, [document_without_underscore_id_key])]\n", + " )\n", + " nmdc_jsonschema_validator.validate(\n", + " root_to_validate\n", + " ) # raises exception if invalid" ] }, { diff --git a/demo/metadata_migration/notebooks/migrate_A_B_C_to_X_Y_Z.ipynb b/demo/metadata_migration/notebooks/migrate_A_B_C_to_X_Y_Z.ipynb index 4165e883..6bd57d55 100644 --- a/demo/metadata_migration/notebooks/migrate_A_B_C_to_X_Y_Z.ipynb +++ b/demo/metadata_migration/notebooks/migrate_A_B_C_to_X_Y_Z.ipynb @@ -41,9 +41,7 @@ "metadata": {}, "outputs": [], "source": [ - "COLLECTION_NAMES: list[str] = [\n", - " \n", - "]" + "COLLECTION_NAMES: list[str] = []" ] }, { @@ -203,7 +201,9 @@ "outputs": [], "source": [ "# Mongo client for \"origin\" MongoDB server.\n", - "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_server_uri, directConnection=True)\n", + "origin_mongo_client = pymongo.MongoClient(\n", + " host=cfg.origin_mongo_server_uri, directConnection=True\n", + ")\n", "\n", "# Mongo client for \"transformer\" MongoDB server.\n", "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_server_uri)\n", @@ -211,16 +211,26 @@ "# Perform sanity tests of those MongoDB clients' abilities to access their respective MongoDB servers.\n", "with pymongo.timeout(3):\n", " # Display the MongoDB server version (running on the \"origin\" Mongo server).\n", - " print(\"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"])\n", + " print(\n", + " \"Origin Mongo server version: \"\n", + " + origin_mongo_client.server_info()[\"version\"]\n", + " )\n", "\n", " # Sanity test: Ensure the origin database exists.\n", - " assert \"nmdc\" in origin_mongo_client.list_database_names(), \"Origin database does not exist.\"\n", + " assert (\n", + " \"nmdc\" in origin_mongo_client.list_database_names()\n", + " ), \"Origin database does not exist.\"\n", "\n", " # Display the MongoDB server version (running on the \"transformer\" Mongo server).\n", - " print(\"Transformer Mongo server version: \" + transformer_mongo_client.server_info()[\"version\"])\n", + " print(\n", + " \"Transformer Mongo server version: \"\n", + " + transformer_mongo_client.server_info()[\"version\"]\n", + " )\n", "\n", " # Sanity test: Ensure the transformation database does not exist.\n", - " assert \"nmdc\" not in transformer_mongo_client.list_database_names(), \"Transformation database already exists.\"" + " assert (\n", + " \"nmdc\" not in transformer_mongo_client.list_database_names()\n", + " ), \"Transformation database already exists.\"" ] }, { @@ -272,7 +282,9 @@ "print(\"NMDC Schema title: \" + nmdc_jsonschema[\"title\"])\n", "print(\"NMDC Schema version: \" + nmdc_jsonschema[\"version\"])\n", "\n", - "nmdc_jsonschema_validator.check_schema(nmdc_jsonschema) # raises exception if schema is invalid" + "nmdc_jsonschema_validator.check_schema(\n", + " nmdc_jsonschema\n", + ") # raises exception if schema is invalid" ] }, { @@ -369,7 +381,9 @@ "adapter = MongoAdapter(\n", " database=transformer_mongo_client[\"nmdc\"],\n", " on_collection_created=lambda name: print(f'Created collection \"{name}\"'),\n", - " on_collection_renamed=lambda old_name, name: print(f'Renamed collection \"{old_name}\" to \"{name}\"'),\n", + " on_collection_renamed=lambda old_name, name: print(\n", + " f'Renamed collection \"{old_name}\" to \"{name}\"'\n", + " ),\n", " on_collection_deleted=lambda name: print(f'Deleted collection \"{name}\"'),\n", ")\n", "\n", @@ -414,9 +428,15 @@ " # Note: `root_to_validate` is a dictionary having the shape: { \"some_collection_name\": [ some_document ] }\n", " # Reference: https://docs.python.org/3/library/stdtypes.html#dict (see the \"type constructor\" section)\n", " #\n", - " document_without_underscore_id_key = {key: value for key, value in document.items() if key != \"_id\"}\n", - " root_to_validate = dict([(collection_name, [document_without_underscore_id_key])])\n", - " nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid" + " document_without_underscore_id_key = {\n", + " key: value for key, value in document.items() if key != \"_id\"\n", + " }\n", + " root_to_validate = dict(\n", + " [(collection_name, [document_without_underscore_id_key])]\n", + " )\n", + " nmdc_jsonschema_validator.validate(\n", + " root_to_validate\n", + " ) # raises exception if invalid" ] }, { @@ -440,7 +460,9 @@ }, "outputs": [], "source": [ - "bookkeeper.record_migration_event(migrator=migrator, event=MigrationEvent.MIGRATION_STARTED)" + "bookkeeper.record_migration_event(\n", + " migrator=migrator, event=MigrationEvent.MIGRATION_STARTED\n", + ")" ] }, { @@ -520,7 +542,9 @@ }, "outputs": [], "source": [ - "bookkeeper.record_migration_event(migrator=migrator, event=MigrationEvent.MIGRATION_COMPLETED)" + "bookkeeper.record_migration_event(\n", + " migrator=migrator, event=MigrationEvent.MIGRATION_COMPLETED\n", + ")" ] } ], diff --git a/docs/nb/get_data.ipynb b/docs/nb/get_data.ipynb index 203c64b9..fd386795 100644 --- a/docs/nb/get_data.ipynb +++ b/docs/nb/get_data.ipynb @@ -58,14 +58,17 @@ "\n", "HOST = \"https://api.microbiomedata.org\"\n", "\n", + "\n", "def get_json(path, host=HOST, **kwargs):\n", " r = requests.get(host + path, **kwargs)\n", " r.raise_for_status()\n", " return r.json()\n", "\n", + "\n", "def pick(allowlist, d):\n", " return keyfilter(lambda k: k in allowlist, d)\n", "\n", + "\n", "meta = itemgetter(\"meta\")\n", "results = itemgetter(\"results\")" ] @@ -208,13 +211,12 @@ "def get_json_mql(path, filter_):\n", " return get_json(path, params={\"filter\": json.dumps(filter_)})\n", "\n", + "\n", "def resources_count(json_response):\n", " return len(json_response[\"resources\"])\n", "\n", - "resources_count(get_json_mql(\n", - " \"/nmdcschema/biosample_set\",\n", - " {\"ecosystem\": \"Engineered\"}\n", - "))" + "\n", + "resources_count(get_json_mql(\"/nmdcschema/biosample_set\", {\"ecosystem\": \"Engineered\"}))" ] }, { @@ -327,9 +329,8 @@ ], "source": [ "def id_and_ecosystem_fields(doc):\n", - " return pick(\n", - " [\"id\"] + [f for f in doc if f.startswith(\"ecosystem\")],\n", - " doc)\n", + " return pick([\"id\"] + [f for f in doc if f.startswith(\"ecosystem\")], doc)\n", + "\n", "\n", "print(\"\\nStudies filter:\\n\")\n", "json_response = get_json(\"/studies?filter=ecosystem_type:Soil\")\n", @@ -339,16 +340,15 @@ "print(\"\\nData Objects filter and sort:\\n\")\n", "\n", "json_response = get_json(\n", - " \"/data_objects?\"\n", - " \"filter=description.search:GFF\"\n", - " \"&\"\n", - " \"sort=file_size_bytes:desc\"\n", + " \"/data_objects?\" \"filter=description.search:GFF\" \"&\" \"sort=file_size_bytes:desc\"\n", ")\n", "pprint(meta(json_response))\n", - "pprint([pick(\n", - " [\"description\", \"file_size_bytes\", \"id\", \"url\"]\n", - " , r\n", - ") for r in results(json_response)][:5])\n", + "pprint(\n", + " [\n", + " pick([\"description\", \"file_size_bytes\", \"id\", \"url\"], r)\n", + " for r in results(json_response)\n", + " ][:5]\n", + ")\n", "\n", "print(\"\\nActivities filter and sort:\\n\")\n", "\n", @@ -361,15 +361,13 @@ " \"sort=ended_at_time:desc\"\n", ")\n", "pprint(meta(json_response))\n", - "pprint([\n", - " pick([\n", - " \"id\",\n", - " \"started_at_time\",\n", - " \"ended_at_time\",\n", - " \"execution_resource\",\n", - " \"type\"],\n", - " r\n", - " ) for r in results(json_response)][:5]\n", + "pprint(\n", + " [\n", + " pick(\n", + " [\"id\", \"started_at_time\", \"ended_at_time\", \"execution_resource\", \"type\"], r\n", + " )\n", + " for r in results(json_response)\n", + " ][:5]\n", ")" ] }, @@ -420,7 +418,8 @@ "source": [ "def write_jsonlines_file(path, all_results):\n", " with open(path, \"w\") as f:\n", - " f.writelines([json.dumps(doc)+\"\\n\" for doc in all_results])\n", + " f.writelines([json.dumps(doc) + \"\\n\" for doc in all_results])\n", + "\n", "\n", "cursor = \"*\"\n", "all_results = []\n", @@ -429,16 +428,13 @@ " f\"/biosamples?filter=part_of:gold:Gs0110119&cursor={cursor}\"\n", " )\n", " m, rs = meta(json_response), results(json_response)\n", - " cursor = m['next_cursor']\n", + " cursor = m[\"next_cursor\"]\n", " print(\"fetched\", len(rs), f\"results out of {m['count']} total\")\n", " all_results.extend(rs)\n", "\n", "path = \"~/biosamples_part_of_gold:Gs0110119.jsonl\"\n", "\n", - "write_jsonlines_file(\n", - " Path(path).expanduser(),\n", - " all_results\n", - ")\n", + "write_jsonlines_file(Path(path).expanduser(), all_results)\n", "\n", "subprocess.check_output(\n", " f\"head -1 {path}\",\n", @@ -496,22 +492,29 @@ ], "source": [ "def download_file(url, directory=\"~/\"):\n", - " local_filename = url.split('/')[-1]\n", + " local_filename = url.split(\"/\")[-1]\n", " with requests.get(url, stream=True) as r:\n", - " with open(Path(directory + local_filename).expanduser(), 'wb') as f:\n", + " with open(Path(directory + local_filename).expanduser(), \"wb\") as f:\n", " shutil.copyfileobj(r.raw, f)\n", "\n", " return local_filename\n", "\n", + "\n", "id_biosample = \"igsn:IEWFS000A\"\n", - "rs_ompro = results(get_json(f\"/activities?filter=type:nmdc:OmicsProcessing,has_input:{id_biosample}\"))\n", + "rs_ompro = results(\n", + " get_json(f\"/activities?filter=type:nmdc:OmicsProcessing,has_input:{id_biosample}\")\n", + ")\n", "for id_ompro in tqdm([d[\"id\"] for d in rs_ompro]):\n", " rs_act = results(get_json(f\"/activities?filter=was_informed_by:{id_ompro}\"))\n", - " for data_object_ids, activity_type in [(d[\"has_output\"], d[\"type\"]) for d in rs_act]:\n", + " for data_object_ids, activity_type in [\n", + " (d[\"has_output\"], d[\"type\"]) for d in rs_act\n", + " ]:\n", " for data_object_id in data_object_ids:\n", " do = results(get_json(f\"/data_objects?filter=id:{data_object_id}\"))[0]\n", - " print(f'downloading biosample {id_biosample} > omics processing activity {id_ompro} '\n", - " f'> {activity_type} activity > data object {data_object_id} from {do[\"url\"]}...')\n", + " print(\n", + " f\"downloading biosample {id_biosample} > omics processing activity {id_ompro} \"\n", + " f'> {activity_type} activity > data object {data_object_id} from {do[\"url\"]}...'\n", + " )\n", " download_file(do[\"url\"])" ] }, @@ -575,11 +578,11 @@ " f\"/biosamples?filter=part_of:gold:Gs0110119&cursor={cursor}\"\n", " )\n", " m, rs = meta(json_response), results(json_response)\n", - " cursor = m['next_cursor']\n", + " cursor = m[\"next_cursor\"]\n", " print(\"fetched\", len(rs), f\"results out of {m['count']} total\")\n", " all_results.extend(rs)\n", "\n", - "pprint([pick([\"id\",\"lat_lon\"], r) for r in all_results][:5])" + "pprint([pick([\"id\", \"lat_lon\"], r) for r in all_results][:5])" ] }, { @@ -652,9 +655,12 @@ "\n", " return b.getvalue()\n", "\n", - "b = load_bytes(get_json(\"/nmdcschema/data_object_set/nmdc:4b649d353b2c2385ab042682ba516d14\")[\"url\"])\n", "\n", - "for line in b.decode('utf-8').split(\"\\n\"):\n", + "b = load_bytes(\n", + " get_json(\"/nmdcschema/data_object_set/nmdc:4b649d353b2c2385ab042682ba516d14\")[\"url\"]\n", + ")\n", + "\n", + "for line in b.decode(\"utf-8\").split(\"\\n\"):\n", " print(line)" ] }, diff --git a/docs/nb/queue_and_trigger_data_jobs.ipynb b/docs/nb/queue_and_trigger_data_jobs.ipynb index 54737c78..ce9d958c 100644 --- a/docs/nb/queue_and_trigger_data_jobs.ipynb +++ b/docs/nb/queue_and_trigger_data_jobs.ipynb @@ -73,68 +73,72 @@ "\n", "load_dotenv(envfile_path)\n", "\n", - "ENV = {\n", - " k: v for k, v in os.environ.items()\n", - " if k.startswith(\"NMDC_RUNTIME_\")\n", - "}\n", - "\n", - "assert (\n", - " ENV[\"NMDC_RUNTIME_HOST\"] == \n", - " \"https://api.microbiomedata.org\"\n", - ")\n", + "ENV = {k: v for k, v in os.environ.items() if k.startswith(\"NMDC_RUNTIME_\")}\n", + "\n", + "assert ENV[\"NMDC_RUNTIME_HOST\"] == \"https://api.microbiomedata.org\"\n", "\n", "HOST = ENV[\"NMDC_RUNTIME_HOST\"]\n", "\n", + "\n", "def request_and_return_json(method, path, host=HOST, **kwargs):\n", " r = requests.request(method, host + path, **kwargs)\n", " r.raise_for_status()\n", " return r.json()\n", "\n", + "\n", "def get_json(path, host=HOST, **kwargs):\n", " return request_and_return_json(\"GET\", path, host=host, **kwargs)\n", "\n", + "\n", "def post_and_return_json(path, host=HOST, **kwargs):\n", - " return request_and_return_json(\"POST\", path, host=host, **kwargs)\n", + " return request_and_return_json(\"POST\", path, host=host, **kwargs)\n", + "\n", "\n", "def patch_and_return_json(path, host=HOST, **kwargs):\n", - " return request_and_return_json(\"PATCH\", path, host=host, **kwargs)\n", + " return request_and_return_json(\"PATCH\", path, host=host, **kwargs)\n", + "\n", "\n", "def put_and_return_json(path, host=HOST, **kwargs):\n", - " return request_and_return_json(\"PUT\", path, host=host, **kwargs)\n", + " return request_and_return_json(\"PUT\", path, host=host, **kwargs)\n", + "\n", "\n", "def auth_header(bearer_token):\n", " return {\"Authorization\": f\"Bearer {bearer_token}\"}\n", "\n", + "\n", "def get_token_for_user():\n", " response = post_and_return_json(\n", " \"/token\",\n", " data={\n", " \"grant_type\": \"password\",\n", " \"username\": ENV[\"NMDC_RUNTIME_USER\"],\n", - " \"password\": ENV[\"NMDC_RUNTIME_PASS\"]\n", - " }\n", + " \"password\": ENV[\"NMDC_RUNTIME_PASS\"],\n", + " },\n", " )\n", - " expires_minutes = response['expires']['minutes']\n", + " expires_minutes = response[\"expires\"][\"minutes\"]\n", " print(f\"Bearer token expires in {expires_minutes} minutes\")\n", " return response[\"access_token\"]\n", "\n", + "\n", "def get_token_for_site_client():\n", " response = post_and_return_json(\n", " \"/token\",\n", " data={\n", " \"grant_type\": \"client_credentials\",\n", " \"client_id\": ENV[\"NMDC_RUNTIME_SITE_CLIENT_ID\"],\n", - " \"client_secret\": ENV[\"NMDC_RUNTIME_SITE_CLIENT_SECRET\"]\n", - " }\n", + " \"client_secret\": ENV[\"NMDC_RUNTIME_SITE_CLIENT_SECRET\"],\n", + " },\n", " )\n", - " expires_minutes = response['expires']['minutes']\n", + " expires_minutes = response[\"expires\"][\"minutes\"]\n", " print(f\"Bearer token expires in {expires_minutes} minutes\")\n", " return response[\"access_token\"]\n", "\n", + "\n", "def now(as_str=False):\n", " dt = datetime.now(timezone.utc)\n", " return dt.isoformat() if as_str else dt\n", "\n", + "\n", "TOKEN_U = get_token_for_user()" ] }, @@ -173,11 +177,7 @@ "\n", "id_newsite = f'{ENV[\"NMDC_RUNTIME_USER\"]}-{secrets.token_urlsafe()}'\n", "\n", - "post_and_return_json(\n", - " \"/sites\",\n", - " json={\"id\": id_newsite},\n", - " headers=auth_header(TOKEN_U)\n", - ")\n", + "post_and_return_json(\"/sites\", json={\"id\": id_newsite}, headers=auth_header(TOKEN_U))\n", "ENV[\"NMDC_RUNTIME_SITE_ID\"] = id_newsite\n", "print(ENV[\"NMDC_RUNTIME_SITE_ID\"])" ] @@ -273,17 +273,18 @@ "source": [ "TOKEN_S = get_token_for_site_client()\n", "\n", + "\n", "def filter_jobs(filter_):\n", " return get_json(\n", - " f\"/jobs/\",\n", - " headers=auth_header(TOKEN_U),\n", - " params={\"filter\": json.dumps(filter_)})\n", + " f\"/jobs/\", headers=auth_header(TOKEN_U), params={\"filter\": json.dumps(filter_)}\n", + " )\n", + "\n", "\n", "response = filter_jobs({\"workflow.id\": \"test\"})\n", "\n", "pprint(response)\n", "\n", - "job_id = response['resources'][0]['id']\n", + "job_id = response[\"resources\"][0][\"id\"]\n", "print(job_id)" ] }, @@ -401,7 +402,7 @@ "response = patch_and_return_json(\n", " f\"/operations/{operation_id}\",\n", " json={\"done\": True, \"result\": \"code green\", \"metadata\": {\"a\": 3}},\n", - " headers=auth_header(TOKEN_S)\n", + " headers=auth_header(TOKEN_S),\n", ")\n", "pprint(response)" ] @@ -467,7 +468,7 @@ " {\"access_url\": {\"url\": \"http://example.com/path/to/thing\"}},\n", " ],\n", " },\n", - " headers=auth_header(TOKEN_S)\n", + " headers=auth_header(TOKEN_S),\n", ")\n", "pprint(response)\n", "object_id = response[\"id\"]\n", @@ -537,9 +538,9 @@ "source": [ "def filter_jobs(filter_):\n", " return get_json(\n", - " f\"/jobs/\",\n", - " headers=auth_header(TOKEN_U),\n", - " params={\"filter\": json.dumps(filter_)})\n", + " f\"/jobs/\", headers=auth_header(TOKEN_U), params={\"filter\": json.dumps(filter_)}\n", + " )\n", + "\n", "\n", "pprint(filter_jobs({\"workflow.id\": \"test\", \"config.object_id\": object_id}))" ] diff --git a/metadata-translation/notebooks/202106_curation_updates.ipynb b/metadata-translation/notebooks/202106_curation_updates.ipynb index 1e1ae9c6..06c53ef6 100644 --- a/metadata-translation/notebooks/202106_curation_updates.ipynb +++ b/metadata-translation/notebooks/202106_curation_updates.ipynb @@ -50,7 +50,7 @@ "\n", "mongo = get_mongo(run_config_frozen__normal_env)\n", "mdb = mongo.db\n", - "#set(db.list_collection_names())" + "# set(db.list_collection_names())" ] }, { @@ -83,7 +83,9 @@ "metadata": {}, "outputs": [], "source": [ - "gold_etl_latest = mdb.objects.find_one({\"name\": \"nmdc_database.json.zip\"}, sort=[(\"created_time\", -1)])" + "gold_etl_latest = mdb.objects.find_one(\n", + " {\"name\": \"nmdc_database.json.zip\"}, sort=[(\"created_time\", -1)]\n", + ")" ] }, { @@ -133,7 +135,9 @@ "source": [ "from pprint import pprint\n", "\n", - "nmdc_db_collection_names_to_drop = set(nmdc_jsonschema[\"definitions\"][\"Database\"][\"properties\"])\n", + "nmdc_db_collection_names_to_drop = set(\n", + " nmdc_jsonschema[\"definitions\"][\"Database\"][\"properties\"]\n", + ")\n", "nmdc_db_collection_names_to_drop -= {\n", " # not actually collections\n", " \"activity_set\",\n", @@ -143,7 +147,6 @@ " # big collections, loaded elsewhere\n", " \"functional_annotation_set\",\n", " \"genome_feature_set\",\n", - " \n", "}\n", "pprint(nmdc_db_collection_names_to_drop)" ] @@ -160,7 +163,8 @@ " print(f\"dropping {coll_name}, creating index\")\n", " db.drop_collection(coll_name)\n", " db[coll_name].create_index(\"id\", unique=True)\n", - " \n", + "\n", + "\n", "init_database(db, nmdc_db_collection_names_to_drop)" ] }, @@ -223,7 +227,7 @@ "metadata": {}, "outputs": [], "source": [ - "rv['biosample_set'].upserted_count" + "rv[\"biosample_set\"].upserted_count" ] }, { @@ -322,12 +326,12 @@ "new_biosample_docs = []\n", "\n", "for igsn, golds in igsn_golds.items():\n", - " igsn_curie = \"igsn:\"+igsn\n", + " igsn_curie = \"igsn:\" + igsn\n", " doc = mdb.biosample_set.find_one({\"id\": igsn_curie})\n", " doc = assoc_in(doc, [\"alternative_identifiers\"], [f\"gold:{g}\" for g in golds])\n", " doc = dissoc(doc, \"_id\")\n", " new_biosample_docs.append(doc)\n", - " \n", + "\n", "rv = mongo.add_docs({\"biosample_set\": new_biosample_docs})" ] }, @@ -347,7 +351,7 @@ "new_biosample_docs = []\n", "\n", "for igsn, golds in igsn_golds.items():\n", - " igsn_curie = \"igsn:\"+igsn\n", + " igsn_curie = \"igsn:\" + igsn\n", " doc = db.biosample_set.find_one({\"id\": {\"$in\": [f\"gold:{g}\" for g in golds]}})\n", " if doc is None:\n", " print(igsn, golds)\n", @@ -379,7 +383,9 @@ "from pymongo import DeleteMany\n", "from toolz import concat\n", "\n", - "requests = [DeleteMany({\"id\": {\"$in\": [\"gold:\"+g for g in concat(igsn_golds.values())]}})]\n", + "requests = [\n", + " DeleteMany({\"id\": {\"$in\": [\"gold:\" + g for g in concat(igsn_golds.values())]}})\n", + "]\n", "rv = mongo.db.biosample_set.bulk_write(requests)\n", "rv.deleted_count" ] @@ -413,12 +419,14 @@ "outputs": [], "source": [ "requests = []\n", - "to_replace = {\"gold:\"+k: \"igsn:\"+v for k, v in goldid_igsn.items()}\n", + "to_replace = {\"gold:\" + k: \"igsn:\" + v for k, v in goldid_igsn.items()}\n", "\n", "for doc in db.omics_processing_set.find({\"has_input\": {\"$in\": list(to_replace)}}):\n", - " operations = {\"$set\": {\n", - " \"has_input\": [to_replace.get(i, i) for i in doc[\"has_input\"]],\n", - " }}\n", + " operations = {\n", + " \"$set\": {\n", + " \"has_input\": [to_replace.get(i, i) for i in doc[\"has_input\"]],\n", + " }\n", + " }\n", " requests.append({\"filter\": {\"_id\": doc[\"_id\"]}, \"update\": operations})" ] }, @@ -520,7 +528,8 @@ "outputs": [], "source": [ "n_with_emsl_id = db.omics_processing_set.count_documents(\n", - " {\"id\": {\"$in\": [\"emsl:\"+i for i in emslid_igsn]}})" + " {\"id\": {\"$in\": [\"emsl:\" + i for i in emslid_igsn]}}\n", + ")" ] }, { @@ -531,20 +540,24 @@ "outputs": [], "source": [ "requests = []\n", - "to_replace = {\"emsl:\"+k: \"igsn:\"+v for k, v in emslid_igsn.items()}\n", - "to_replace.update({\"emsl:output_\"+k: \"igsn:\"+v for k, v in emslid_igsn.items()})\n", + "to_replace = {\"emsl:\" + k: \"igsn:\" + v for k, v in emslid_igsn.items()}\n", + "to_replace.update({\"emsl:output_\" + k: \"igsn:\" + v for k, v in emslid_igsn.items()})\n", + "\n", "\n", "def omit(blacklist, d):\n", " return keyfilter(lambda k: k not in blacklist, d)\n", "\n", + "\n", "def sans_mongo_id(d):\n", " return omit([\"_id\"], d)\n", "\n", "\n", "for doc in db.omics_processing_set.find({\"has_input\": {\"$in\": list(to_replace)}}):\n", - " operations = {\"$set\": {\n", - " \"has_input\": [to_replace.get(i, i) for i in doc[\"has_input\"]],\n", - " }}\n", + " operations = {\n", + " \"$set\": {\n", + " \"has_input\": [to_replace.get(i, i) for i in doc[\"has_input\"]],\n", + " }\n", + " }\n", " requests.append({\"filter\": {\"_id\": doc[\"_id\"]}, \"update\": operations})" ] }, @@ -575,15 +588,18 @@ "metadata": {}, "outputs": [], "source": [ - "to_fetch = [{\n", - " # >100MB\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/stegen_MetaProteomicAnalysis_activity.json\",\n", - " \"type\": \"metaproteomics_analysis_activity_set\",\n", - "}, {\n", - " # ~50KB\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/stegen_emsl_analysis_data_objects.json\",\n", - " \"type\": \"data_object_set\"\n", - "}]" + "to_fetch = [\n", + " {\n", + " # >100MB\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/stegen_MetaProteomicAnalysis_activity.json\",\n", + " \"type\": \"metaproteomics_analysis_activity_set\",\n", + " },\n", + " {\n", + " # ~50KB\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/stegen_emsl_analysis_data_objects.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + "]" ] }, { @@ -597,6 +613,7 @@ "\n", "pattern = re.compile(r\"https?://(?P[^/]+)/(?P.+)\")\n", "\n", + "\n", "def url_to_name(url):\n", " m = pattern.match(url)\n", " return f\"{'.'.join(reversed(m.group('domain').split('.')))}__{m.group('path').replace('/', '.')}\"" @@ -614,6 +631,7 @@ "\n", "import requests\n", "\n", + "\n", "def download_them_all(to_fetch):\n", " for i, spec in enumerate(to_fetch):\n", " url = spec[\"url\"]\n", @@ -621,7 +639,7 @@ " print(f\"{i+1}/{len(to_fetch)}: fetching {url}\")\n", " rv = requests.get(url)\n", " print(f\"saving as {name}\")\n", - " with open(f'/Users/dwinston/Downloads/{name}', 'w') as f:\n", + " with open(f\"/Users/dwinston/Downloads/{name}\", \"w\") as f:\n", " json.dump(rv.json(), f)" ] }, @@ -644,18 +662,26 @@ "source": [ "import requests\n", "\n", + "\n", "def check_data_object(d):\n", " rv = requests.head(\n", - " d[\"url\"], allow_redirects=True, verify=False, timeout=5, headers={\"Accept-Encoding\": \"gzip;q=0\"}\n", - " )\n", + " d[\"url\"],\n", + " allow_redirects=True,\n", + " verify=False,\n", + " timeout=5,\n", + " headers={\"Accept-Encoding\": \"gzip;q=0\"},\n", + " )\n", " if not rv.status_code == 200:\n", - " return {\"error\": {\"status_code\": rv.status_code, \"details\": \"not OK\"}, \"id\": d[\"id\"]}\n", + " return {\n", + " \"error\": {\"status_code\": rv.status_code, \"details\": \"not OK\"},\n", + " \"id\": d[\"id\"],\n", + " }\n", " if d[\"file_size_bytes\"] != int(rv.headers[\"Content-Length\"]):\n", " return {\n", " \"error\": {\n", " \"details\": \"file size different than reported\",\n", " \"file_size_actual\": rv.headers[\"Content-Length\"],\n", - " \"file_size_reported\": d[\"file_size_bytes\"]\n", + " \"file_size_reported\": d[\"file_size_bytes\"],\n", " },\n", " \"id\": d[\"id\"],\n", " }\n", @@ -670,7 +696,7 @@ "outputs": [], "source": [ "def fetch_downloaded_json(name):\n", - " with open(f'/Users/dwinston/Downloads/{name}') as f:\n", + " with open(f\"/Users/dwinston/Downloads/{name}\") as f:\n", " return json.load(f)" ] }, @@ -703,7 +729,6 @@ " \"BestProtein\": (\"best_protein\", identity),\n", " \"min(QValue)\": (\"min_q_value\", float),\n", " \"min_QValue\": (\"min_q_value\", float),\n", - " \n", " \"peptide_sequence\": (\"peptide_sequence\", identity),\n", " \"peptide_sum_masic_abundance\": (\"peptide_sum_masic_abundance\", int),\n", " \"peptide_spectral_count\": (\"peptide_spectral_count\", int),\n", @@ -749,6 +774,7 @@ "source": [ "from collections import defaultdict\n", "\n", + "\n", "def fetch_metaP_validate_and_add(to_fetch):\n", " to_add = defaultdict(list)\n", " for i, spec in enumerate(to_fetch):\n", @@ -784,13 +810,16 @@ "metadata": {}, "outputs": [], "source": [ - "to_fetch = [{\n", - " \"url\": \"https://portal.nersc.gov/project/m3408/meta/501128_1781_100340_stegen_MetaProteomicAnalysis_activity.json\",\n", - " \"type\": \"metaproteomics_analysis_activity_set\",\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/project/m3408/meta/501128_1781_100340_stegen_emsl_analysis_data_objects.json\",\n", - " \"type\": \"data_object_set\"\n", - "}]" + "to_fetch = [\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/project/m3408/meta/501128_1781_100340_stegen_MetaProteomicAnalysis_activity.json\",\n", + " \"type\": \"metaproteomics_analysis_activity_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/project/m3408/meta/501128_1781_100340_stegen_emsl_analysis_data_objects.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + "]" ] }, { @@ -840,49 +869,64 @@ "metadata": {}, "outputs": [], "source": [ - "to_fetch = [{\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/mt_annotation_objects.json\",\n", - " \"type\": \"metagenome_annotation_activity_set\"\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/mt_annotation_data_objects.json\",\n", - " \"type\": \"data_object_set\"\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/project/m3408/meta/metagenomeAssembly_activity.json\",\n", - " \"type\": \"metagenome_assembly_set\",\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/project/m3408/meta/metagenomeAssembly_data_objects.json\",\n", - " \"type\": \"data_object_set\",\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/ReadbasedAnalysis_activity.json\",\n", - " \"type\": \"read_based_analysis_activity_set\"\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/ReadbasedAnalysis_data_objects.json\",\n", - " \"type\": \"data_object_set\"\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/MAGs_activity.json\",\n", - " \"type\": \"mags_activity_set\",\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/MAGs_data_objects.json\",\n", - " \"type\": \"data_object_set\"\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/project/m3408/meta/readQC_activity.json\",\n", - " \"type\": \"read_QC_analysis_activity_set\"\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/project/m3408/meta/readQC_activity_data_objects.json\",\n", - " \"type\": \"data_object_set\"\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/img_mg_annotation_objects.json\",\n", - " \"type\": \"metagenome_annotation_activity_set\",\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/img_mg_annotation_data_objects.json\",\n", - " \"type\": \"data_object_set\",\n", - "}, {\n", - " \"url\": \"https://nmdcdemo.emsl.pnnl.gov/metabolomics/registration/gcms_metabolomics_data_products.json\",\n", - " \"type\": \"data_object_set\"\n", - "}, {\n", - " \"url\": \"https://nmdcdemo.emsl.pnnl.gov/nom/registration/ftms_nom_data_products.json\",\n", - " \"type\": \"data_object_set\"\n", - "}]" + "to_fetch = [\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/mt_annotation_objects.json\",\n", + " \"type\": \"metagenome_annotation_activity_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/mt_annotation_data_objects.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/project/m3408/meta/metagenomeAssembly_activity.json\",\n", + " \"type\": \"metagenome_assembly_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/project/m3408/meta/metagenomeAssembly_data_objects.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/ReadbasedAnalysis_activity.json\",\n", + " \"type\": \"read_based_analysis_activity_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/ReadbasedAnalysis_data_objects.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/MAGs_activity.json\",\n", + " \"type\": \"mags_activity_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/MAGs_data_objects.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/project/m3408/meta/readQC_activity.json\",\n", + " \"type\": \"read_QC_analysis_activity_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/project/m3408/meta/readQC_activity_data_objects.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/img_mg_annotation_objects.json\",\n", + " \"type\": \"metagenome_annotation_activity_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/img_mg_annotation_data_objects.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://nmdcdemo.emsl.pnnl.gov/metabolomics/registration/gcms_metabolomics_data_products.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://nmdcdemo.emsl.pnnl.gov/nom/registration/ftms_nom_data_products.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + "]" ] }, { @@ -904,6 +948,7 @@ "source": [ "from collections import defaultdict\n", "\n", + "\n", "def fetch_validate_and_add(to_fetch):\n", " to_add = defaultdict(list)\n", " for i, spec in enumerate(to_fetch):\n", @@ -938,19 +983,22 @@ "metadata": {}, "outputs": [], "source": [ - "manifests = [{\n", - " \"url\": (\n", - " \"https://nmdcdemo.emsl.pnnl.gov/metabolomics/registration/\"\n", - " \"gcms_metabolomics_metadata_products.json\"\n", - " ),\n", - " \"type\": \"metabolomics_analysis_activity_set\"\n", - "}, {\n", - " \"url\": (\n", - " \"https://nmdcdemo.emsl.pnnl.gov/nom/registration/\"\n", - " \"ftms_nom_metadata_products.json\"\n", - " ),\n", - " \"type\": \"nom_analysis_activity_set\"\n", - "}]" + "manifests = [\n", + " {\n", + " \"url\": (\n", + " \"https://nmdcdemo.emsl.pnnl.gov/metabolomics/registration/\"\n", + " \"gcms_metabolomics_metadata_products.json\"\n", + " ),\n", + " \"type\": \"metabolomics_analysis_activity_set\",\n", + " },\n", + " {\n", + " \"url\": (\n", + " \"https://nmdcdemo.emsl.pnnl.gov/nom/registration/\"\n", + " \"ftms_nom_metadata_products.json\"\n", + " ),\n", + " \"type\": \"nom_analysis_activity_set\",\n", + " },\n", + "]" ] }, { @@ -996,6 +1044,7 @@ "import requests\n", "from tqdm.notebook import tqdm\n", "\n", + "\n", "def fetch_json(url):\n", " return requests.get(url).json()\n", "\n", @@ -1006,10 +1055,7 @@ " urls = [spec[\"url\"] for spec in to_fetch]\n", "\n", " with concurrent.futures.ThreadPoolExecutor() as executor:\n", - " future_to_url = {\n", - " executor.submit(fetch_json, url): url\n", - " for url in urls\n", - " }\n", + " future_to_url = {executor.submit(fetch_json, url): url for url in urls}\n", " for future in concurrent.futures.as_completed(future_to_url):\n", " pbar.update(1)\n", " url = future_to_url[future]\n", @@ -1019,7 +1065,7 @@ " error_urls.append((url, str(e)))\n", " else:\n", " name = url_to_name(url)\n", - " with open(f'/Users/dwinston/Downloads/{name}', 'w') as f:\n", + " with open(f\"/Users/dwinston/Downloads/{name}\", \"w\") as f:\n", " json.dump(payload, f)\n", "\n", " pbar.close()\n", @@ -1045,7 +1091,7 @@ "source": [ "def fetch_downloaded_json_given_url(url):\n", " name = url_to_name(url)\n", - " with open(f'/Users/dwinston/Downloads/{name}') as f:\n", + " with open(f\"/Users/dwinston/Downloads/{name}\") as f:\n", " return json.load(f)" ] }, @@ -1059,6 +1105,7 @@ "import requests\n", "from tqdm.notebook import tqdm\n", "\n", + "\n", "def validate_and_add_parallel(to_fetch):\n", " nmdc_db = defaultdict(list)\n", " error_urls = []\n", @@ -1206,11 +1253,11 @@ "source": [ "rows = []\n", "with open(\"../src/data/2021-02-03-stegen_biosample_linking_update.csv\") as f:\n", - " next(f) # skip header row\n", + " next(f) # skip header row\n", " for row in f:\n", " line = row.strip()\n", " tokens = line.split(\",\")\n", - " if tokens[-1] == '':\n", + " if tokens[-1] == \"\":\n", " rows.append(tokens[:-1])\n", " else:\n", " rows.append(tokens)" @@ -1255,6 +1302,7 @@ "\n", "gold_pattern = re.compile(r\"Gb\\d+\")\n", "\n", + "\n", "def prefix_sample_id(s):\n", " if \":\" in s:\n", " return s\n", @@ -1273,13 +1321,15 @@ "source": [ "omics = []\n", "for i, row in enumerate(rows):\n", - " omics.append({\n", - " \"omics_id\": row[0],\n", - " \"omics_type\": row[1],\n", - " \"sample_name\": row[2],\n", - " \"sample_id\": prefix_sample_id(row[3]),\n", - " \"new\": len(row) > 4 and row[4] == \"TRUE\"\n", - " })" + " omics.append(\n", + " {\n", + " \"omics_id\": row[0],\n", + " \"omics_type\": row[1],\n", + " \"sample_name\": row[2],\n", + " \"sample_id\": prefix_sample_id(row[3]),\n", + " \"new\": len(row) > 4 and row[4] == \"TRUE\",\n", + " }\n", + " )" ] }, { @@ -1290,8 +1340,10 @@ "outputs": [], "source": [ "existing_ids = [\n", - " d[\"id\"] for d in\n", - " db.biosample_set.find({\"id\": {\"$in\": [o[\"sample_id\"] for o in omics]}}, [\"id\"])\n", + " d[\"id\"]\n", + " for d in db.biosample_set.find(\n", + " {\"id\": {\"$in\": [o[\"sample_id\"] for o in omics]}}, [\"id\"]\n", + " )\n", "]" ] }, @@ -1304,6 +1356,7 @@ "source": [ "from toolz import assoc_in, get_in\n", "\n", + "\n", "def transform_in(doc, keys, fn):\n", " initial = get_in(keys, doc)\n", " transformed = fn(initial)\n", @@ -1320,12 +1373,12 @@ "def fill_template(template, sample_id, sample_name):\n", " doc = assoc_in(template, [\"id\"], sample_id)\n", " doc = transform_in(\n", - " doc, [\"identifier\", \"has_raw_value\"],\n", - " lambda s: s.replace(\"$BIOSAMPLE_NAME\", sample_name)\n", + " doc,\n", + " [\"identifier\", \"has_raw_value\"],\n", + " lambda s: s.replace(\"$BIOSAMPLE_NAME\", sample_name),\n", " )\n", " doc = transform_in(\n", - " doc, [\"name\"],\n", - " lambda s: s.replace(\"$BIOSAMPLE_NAME\", sample_name)\n", + " doc, [\"name\"], lambda s: s.replace(\"$BIOSAMPLE_NAME\", sample_name)\n", " )\n", " return doc" ] @@ -1365,7 +1418,7 @@ "\n", "for sample_id, sample_name in new_samples.items():\n", " doc = fill_template(stegen_sample_template, sample_id, sample_name)\n", - " #doc = term_subdocs_to_id_strings(doc)\n", + " # doc = term_subdocs_to_id_strings(doc)\n", " docs.append(doc)" ] }, @@ -1378,6 +1431,7 @@ "source": [ "from toolz import get_in, assoc_in\n", "\n", + "\n", "def un_raw_value(doc, key):\n", " value = get_in([key, \"has_raw_value\"], doc)\n", " if value is not None:\n", @@ -1385,6 +1439,7 @@ " else:\n", " return doc\n", "\n", + "\n", "def re_raw_value(doc, key):\n", " value = get_in([key], doc)\n", " if value is not None and not isinstance(value, dict):\n", @@ -1393,6 +1448,7 @@ " else:\n", " return doc\n", "\n", + "\n", "raws = [\n", " \"ecosystem\",\n", " \"collection_date\",\n", @@ -1409,12 +1465,14 @@ " \"specific_ecosystem\",\n", "]\n", "timestampvalue_fields = [\n", - " p for p, spec in nmdc_jsonschema['definitions']['Biosample']['properties'].items()\n", - " if '$ref' in spec and spec[\"$ref\"].endswith(\"TimestampValue\")\n", + " p\n", + " for p, spec in nmdc_jsonschema[\"definitions\"][\"Biosample\"][\"properties\"].items()\n", + " if \"$ref\" in spec and spec[\"$ref\"].endswith(\"TimestampValue\")\n", "]\n", "textvalue_fields = [\n", - " p for p, spec in nmdc_jsonschema['definitions']['Biosample']['properties'].items()\n", - " if '$ref' in spec and spec[\"$ref\"].endswith(\"TextValue\")\n", + " p\n", + " for p, spec in nmdc_jsonschema[\"definitions\"][\"Biosample\"][\"properties\"].items()\n", + " if \"$ref\" in spec and spec[\"$ref\"].endswith(\"TextValue\")\n", "]\n", "\n", "for key in raws:\n", @@ -1450,7 +1508,7 @@ "metadata": {}, "outputs": [], "source": [ - "rv['biosample_set'].upserted_count" + "rv[\"biosample_set\"].upserted_count" ] }, { @@ -1469,7 +1527,7 @@ "outputs": [], "source": [ "omics = [\n", - " transform_in(o, [\"omics_id\"], lambda s: \"emsl:\"+s if \":\" not in s else s)\n", + " transform_in(o, [\"omics_id\"], lambda s: \"emsl:\" + s if \":\" not in s else s)\n", " for o in omics\n", "]" ] @@ -1484,8 +1542,7 @@ "omics_ids = [o[\"omics_id\"] for o in omics]\n", "\n", "found_omics_ids = [\n", - " d[\"id\"] for d in\n", - " db.omics_processing_set.find({\"id\": {\"$in\": omics_ids}},[\"id\"])\n", + " d[\"id\"] for d in db.omics_processing_set.find({\"id\": {\"$in\": omics_ids}}, [\"id\"])\n", "]" ] }, @@ -1572,7 +1629,7 @@ "metadata": {}, "outputs": [], "source": [ - "rv['omics_processing_set'].modified_count" + "rv[\"omics_processing_set\"].modified_count" ] }, { @@ -1592,11 +1649,11 @@ "source": [ "rows = []\n", "with open(\"../src/data/2021-02-04-brodie_biosample_linking_update.csv\") as f:\n", - " next(f) # skip header row\n", + " next(f) # skip header row\n", " for row in f:\n", " line = row.strip()\n", " tokens = line.split(\",\")\n", - " if tokens[-1] == '':\n", + " if tokens[-1] == \"\":\n", " rows.append(tokens[:-1])\n", " else:\n", " rows.append(tokens)" @@ -1611,12 +1668,14 @@ "source": [ "omics = []\n", "for i, row in enumerate(rows):\n", - " omics.append({\n", - " \"omics_id\": \"emsl:\" + row[0].strip(),\n", - " \"omics_type\": row[1].strip(),\n", - " \"sample_name\": row[2].strip(),\n", - " \"sample_id\": \"igsn:\" + row[3].strip(),\n", - " })" + " omics.append(\n", + " {\n", + " \"omics_id\": \"emsl:\" + row[0].strip(),\n", + " \"omics_type\": row[1].strip(),\n", + " \"sample_name\": row[2].strip(),\n", + " \"sample_id\": \"igsn:\" + row[3].strip(),\n", + " }\n", + " )" ] }, { @@ -1629,8 +1688,7 @@ "omics_ids = [o[\"omics_id\"] for o in omics]\n", "\n", "found_omics_ids = [\n", - " d[\"id\"] for d in\n", - " db.omics_processing_set.find({\"id\": {\"$in\": omics_ids}},[\"id\"])\n", + " d[\"id\"] for d in db.omics_processing_set.find({\"id\": {\"$in\": omics_ids}}, [\"id\"])\n", "]" ] }, @@ -1715,7 +1773,7 @@ "metadata": {}, "outputs": [], "source": [ - "rv['omics_processing_set'].modified_count" + "rv[\"omics_processing_set\"].modified_count" ] }, { @@ -1743,7 +1801,7 @@ "source": [ "mfilter = {\n", " \"part_of\": [\"gold:Gs0114675\"],\n", - " \"processing_institution\": \"Environmental Molecular Sciences Lab\"\n", + " \"processing_institution\": \"Environmental Molecular Sciences Lab\",\n", "}\n", "\n", "db.omics_processing_set.count_documents(filter=mfilter)" @@ -1766,7 +1824,9 @@ "omics_processing_ids = [d[\"id\"] for d in docs]\n", "data_object_ids = list(concat(d[\"has_output\"] for d in docs))\n", "\n", - "assert len(omics_processing_ids) == db.data_object_set.count_documents({\"id\": {\"$in\": data_object_ids}})" + "assert len(omics_processing_ids) == db.data_object_set.count_documents(\n", + " {\"id\": {\"$in\": data_object_ids}}\n", + ")" ] }, { @@ -1816,14 +1876,19 @@ "from datetime import datetime\n", "import re\n", "\n", - "dt_pattern = re.compile(r\"\\d{2}-(?P\\w+)-\\d{2} \\d{2}\\.\\d{2}\\.\\d{2}\\.(?P\\d+) [A|P]M\")\n", + "dt_pattern = re.compile(\n", + " r\"\\d{2}-(?P\\w+)-\\d{2} \\d{2}\\.\\d{2}\\.\\d{2}\\.(?P\\d+) [A|P]M\"\n", + ")\n", "dt_format = \"%d-%b-%y %I.%M.%S.%f %p\"\n", "\n", + "\n", "def gold_dtstr_to_iso8601(s):\n", " match = dt_pattern.search(s)\n", " first, month, rest = s.partition(match.group(\"month\"))\n", " s_new = first + month[0] + month[1:].lower() + rest\n", - " s_new = s_new.replace(match.group(\"ns\"), match.group(\"ns\")[:-3]) # truncate to microseconds\n", + " s_new = s_new.replace(\n", + " match.group(\"ns\"), match.group(\"ns\")[:-3]\n", + " ) # truncate to microseconds\n", " dt = datetime.strptime(s_new, dt_format)\n", " return dt.strftime(\"%Y-%m-%d\")" ] @@ -1836,186 +1901,171 @@ "outputs": [], "source": [ "docs_brodie_emsl = [\n", - " {\n", - " \"name\":\"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - ER_145\",\n", - " \"description\":\"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States\",\n", - " \"lat_lon\":{\n", - " \"has_raw_value\":\"38.92045766 -106.9484528\",\n", - " \"latitude\":38.92045766,\n", - " \"longitude\":-106.9484528\n", - " },\n", - " \"geo_loc_name\":\"USA: Colorado\",\n", - " \"collection_date\":\"2017-05-09\",\n", - " \"env_broad_scale\":{\n", - " \"has_raw_value\":\"ENVO_00000446\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"env_local_scale\":{\n", - " \"has_raw_value\":\"ENVO_00000292\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"env_medium\":{\n", - " \"has_raw_value\":\"ENVO_00001998\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"ecosystem\":\"Environmental\",\n", - " \"ecosystem_category\":\"Terrestrial\",\n", - " \"ecosystem_type\":\"Soil\",\n", - " \"ecosystem_subtype\":\"Unclassified\",\n", - " \"specific_ecosystem\":\"Unclassified\",\n", - " \"depth\": {\"has_numeric_value\": 5},\n", - " \"ncbi_taxonomy_name\":\"soil metagenome\",\n", - " \"community\":\"microbial communities\",\n", - " \"location\":\"The East River watershed near Crested Butte, Colorado, USA\",\n", - " \"habitat\":\"soil\",\n", - " \"sample_collection_site\":\"soil\",\n", - " \"id\":\"igsn:IEWFS000I\",\n", - " \"identifier\":\"igsn:IEWFS000I\"\n", - " },\n", - " {\n", - " \"name\":\"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - ER_147\",\n", - " \"description\":\"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States\",\n", - " \"lat_lon\":{\n", - " \"has_raw_value\":\"38.92045766 -106.9484528\",\n", - " \"latitude\":38.92045766,\n", - " \"longitude\":-106.9484528\n", - " },\n", - " \"geo_loc_name\":\"USA: Colorado\",\n", - " \"collection_date\":\"2017-05-09\",\n", - " \"env_broad_scale\":{\n", - " \"has_raw_value\":\"ENVO_00000446\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"env_local_scale\":{\n", - " \"has_raw_value\":\"ENVO_00000292\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"env_medium\":{\n", - " \"has_raw_value\":\"ENVO_00001998\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"ecosystem\":\"Environmental\",\n", - " \"ecosystem_category\":\"Terrestrial\",\n", - " \"ecosystem_type\":\"Soil\",\n", - " \"ecosystem_subtype\":\"Unclassified\",\n", - " \"specific_ecosystem\":\"Unclassified\",\n", - " \"depth\":{\"has_numeric_value\": 15},\n", - " \"ncbi_taxonomy_name\":\"soil metagenome\",\n", - " \"community\":\"microbial communities\",\n", - " \"location\":\"The East River watershed near Crested Butte, Colorado, USA\",\n", - " \"habitat\":\"soil\",\n", - " \"sample_collection_site\":\"soil\",\n", - " \"id\":\"igsn:IEWFS000K\",\n", - " \"identifier\":\"igsn:IEWFS000K\"\n", - " },\n", - " {\n", - " \"name\":\"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - ER_135\",\n", - " \"description\":\"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States\",\n", - " \"lat_lon\":{\n", - " \"has_raw_value\":\"38.92028116 -106.9489189\",\n", - " \"latitude\":38.92028116,\n", - " \"longitude\":-106.94891899\n", - " },\n", - " \"geo_loc_name\":\"USA: Colorado\",\n", - " \"collection_date\":\"2017-05-09\",\n", - " \"env_broad_scale\":{\n", - " \"has_raw_value\":\"ENVO_00000446\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"env_local_scale\":{\n", - " \"has_raw_value\":\"ENVO_00000292\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"env_medium\":{\n", - " \"has_raw_value\":\"ENVO_00001998\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"ecosystem\":\"Environmental\",\n", - " \"ecosystem_category\":\"Terrestrial\",\n", - " \"ecosystem_type\":\"Soil\",\n", - " \"ecosystem_subtype\":\"Unclassified\",\n", - " \"specific_ecosystem\":\"Unclassified\",\n", - " \"depth\":{\"has_numeric_value\": 15},\n", - " \"ncbi_taxonomy_name\":\"soil metagenome\",\n", - " \"community\":\"microbial communities\",\n", - " \"location\":\"The East River watershed near Crested Butte, Colorado, USA\",\n", - " \"habitat\":\"soil\",\n", - " \"sample_collection_site\":\"soil\",\n", - " \"id\":\"igsn:IEWFS000B\",\n", - " \"identifier\":\"igsn:IEWFS000B\"\n", - " },\n", - " {\n", - " \"name\":\"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - ER_134\",\n", - " \"description\":\"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States\",\n", - " \"lat_lon\":{\n", - " \"has_raw_value\":\"38.92028116 -106.9489189\",\n", - " \"latitude\":38.92028116,\n", - " \"longitude\":-106.9489189\n", - " },\n", - " \"geo_loc_name\":\"USA: Colorado\",\n", - " \"collection_date\":\"2017-05-09\",\n", - " \"env_broad_scale\":{\n", - " \"has_raw_value\":\"ENVO_00000446\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"env_local_scale\":{\n", - " \"has_raw_value\":\"ENVO_00000292\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"env_medium\":{\n", - " \"has_raw_value\":\"ENVO_00001998\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"ecosystem\":\"Environmental\",\n", - " \"ecosystem_category\":\"Terrestrial\",\n", - " \"ecosystem_type\":\"Soil\",\n", - " \"ecosystem_subtype\":\"Unclassified\",\n", - " \"specific_ecosystem\":\"Unclassified\",\n", - " \"depth\":{\"has_numeric_value\": 5},\n", - " \"ncbi_taxonomy_name\":\"soil metagenome\",\n", - " \"community\":\"microbial communities\",\n", - " \"location\":\"The East River watershed near Crested Butte, Colorado, USA\",\n", - " \"habitat\":\"soil\",\n", - " \"sample_collection_site\":\"soil\",\n", - " \"id\":\"igsn:IEWFS000A\",\n", - " \"identifier\":\"igsn:IEWFS000A\"\n", - " },\n", - " {\n", - " \"name\":\"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - ER_146\",\n", - " \"description\":\"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States\",\n", - " \"lat_lon\":{\n", - " \"has_raw_value\":\"38.92045766 -106.9484528\",\n", - " \"latitude\":38.92045766,\n", - " \"longitude\":-106.9484528\n", - " },\n", - " \"geo_loc_name\":\"USA: Colorado\",\n", - " \"collection_date\":\"2017-05-09\",\n", - " \"env_broad_scale\":{\n", - " \"has_raw_value\":\"ENVO_00000446\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"env_local_scale\":{\n", - " \"has_raw_value\":\"ENVO_00000292\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"env_medium\":{\n", - " \"has_raw_value\":\"ENVO_00001998\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"ecosystem\":\"Environmental\",\n", - " \"ecosystem_category\":\"Terrestrial\",\n", - " \"ecosystem_type\":\"Soil\",\n", - " \"ecosystem_subtype\":\"Unclassified\",\n", - " \"specific_ecosystem\":\"Unclassified\",\n", - " \"depth\":{\"has_numeric_value\": 5},\n", - " \"ncbi_taxonomy_name\":\"soil metagenome\",\n", - " \"community\":\"microbial communities\",\n", - " \"location\":\"The East River watershed near Crested Butte, Colorado, USA\",\n", - " \"habitat\":\"soil\",\n", - " \"sample_collection_site\":\"soil\",\n", - " \"id\":\"igsn:IEWFS000J\",\n", - " \"identifier\":\"igsn:IEWFS000J\"\n", - " }\n", + " {\n", + " \"name\": \"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - ER_145\",\n", + " \"description\": \"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States\",\n", + " \"lat_lon\": {\n", + " \"has_raw_value\": \"38.92045766 -106.9484528\",\n", + " \"latitude\": 38.92045766,\n", + " \"longitude\": -106.9484528,\n", + " },\n", + " \"geo_loc_name\": \"USA: Colorado\",\n", + " \"collection_date\": \"2017-05-09\",\n", + " \"env_broad_scale\": {\n", + " \"has_raw_value\": \"ENVO_00000446\",\n", + " \"type\": \"ControlledTermValue\",\n", + " },\n", + " \"env_local_scale\": {\n", + " \"has_raw_value\": \"ENVO_00000292\",\n", + " \"type\": \"ControlledTermValue\",\n", + " },\n", + " \"env_medium\": {\"has_raw_value\": \"ENVO_00001998\", \"type\": \"ControlledTermValue\"},\n", + " \"ecosystem\": \"Environmental\",\n", + " \"ecosystem_category\": \"Terrestrial\",\n", + " \"ecosystem_type\": \"Soil\",\n", + " \"ecosystem_subtype\": \"Unclassified\",\n", + " \"specific_ecosystem\": \"Unclassified\",\n", + " \"depth\": {\"has_numeric_value\": 5},\n", + " \"ncbi_taxonomy_name\": \"soil metagenome\",\n", + " \"community\": \"microbial communities\",\n", + " \"location\": \"The East River watershed near Crested Butte, Colorado, USA\",\n", + " \"habitat\": \"soil\",\n", + " \"sample_collection_site\": \"soil\",\n", + " \"id\": \"igsn:IEWFS000I\",\n", + " \"identifier\": \"igsn:IEWFS000I\",\n", + " },\n", + " {\n", + " \"name\": \"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - ER_147\",\n", + " \"description\": \"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States\",\n", + " \"lat_lon\": {\n", + " \"has_raw_value\": \"38.92045766 -106.9484528\",\n", + " \"latitude\": 38.92045766,\n", + " \"longitude\": -106.9484528,\n", + " },\n", + " \"geo_loc_name\": \"USA: Colorado\",\n", + " \"collection_date\": \"2017-05-09\",\n", + " \"env_broad_scale\": {\n", + " \"has_raw_value\": \"ENVO_00000446\",\n", + " \"type\": \"ControlledTermValue\",\n", + " },\n", + " \"env_local_scale\": {\n", + " \"has_raw_value\": \"ENVO_00000292\",\n", + " \"type\": \"ControlledTermValue\",\n", + " },\n", + " \"env_medium\": {\"has_raw_value\": \"ENVO_00001998\", \"type\": \"ControlledTermValue\"},\n", + " \"ecosystem\": \"Environmental\",\n", + " \"ecosystem_category\": \"Terrestrial\",\n", + " \"ecosystem_type\": \"Soil\",\n", + " \"ecosystem_subtype\": \"Unclassified\",\n", + " \"specific_ecosystem\": \"Unclassified\",\n", + " \"depth\": {\"has_numeric_value\": 15},\n", + " \"ncbi_taxonomy_name\": \"soil metagenome\",\n", + " \"community\": \"microbial communities\",\n", + " \"location\": \"The East River watershed near Crested Butte, Colorado, USA\",\n", + " \"habitat\": \"soil\",\n", + " \"sample_collection_site\": \"soil\",\n", + " \"id\": \"igsn:IEWFS000K\",\n", + " \"identifier\": \"igsn:IEWFS000K\",\n", + " },\n", + " {\n", + " \"name\": \"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - ER_135\",\n", + " \"description\": \"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States\",\n", + " \"lat_lon\": {\n", + " \"has_raw_value\": \"38.92028116 -106.9489189\",\n", + " \"latitude\": 38.92028116,\n", + " \"longitude\": -106.94891899,\n", + " },\n", + " \"geo_loc_name\": \"USA: Colorado\",\n", + " \"collection_date\": \"2017-05-09\",\n", + " \"env_broad_scale\": {\n", + " \"has_raw_value\": \"ENVO_00000446\",\n", + " \"type\": \"ControlledTermValue\",\n", + " },\n", + " \"env_local_scale\": {\n", + " \"has_raw_value\": \"ENVO_00000292\",\n", + " \"type\": \"ControlledTermValue\",\n", + " },\n", + " \"env_medium\": {\"has_raw_value\": \"ENVO_00001998\", \"type\": \"ControlledTermValue\"},\n", + " \"ecosystem\": \"Environmental\",\n", + " \"ecosystem_category\": \"Terrestrial\",\n", + " \"ecosystem_type\": \"Soil\",\n", + " \"ecosystem_subtype\": \"Unclassified\",\n", + " \"specific_ecosystem\": \"Unclassified\",\n", + " \"depth\": {\"has_numeric_value\": 15},\n", + " \"ncbi_taxonomy_name\": \"soil metagenome\",\n", + " \"community\": \"microbial communities\",\n", + " \"location\": \"The East River watershed near Crested Butte, Colorado, USA\",\n", + " \"habitat\": \"soil\",\n", + " \"sample_collection_site\": \"soil\",\n", + " \"id\": \"igsn:IEWFS000B\",\n", + " \"identifier\": \"igsn:IEWFS000B\",\n", + " },\n", + " {\n", + " \"name\": \"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - ER_134\",\n", + " \"description\": \"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States\",\n", + " \"lat_lon\": {\n", + " \"has_raw_value\": \"38.92028116 -106.9489189\",\n", + " \"latitude\": 38.92028116,\n", + " \"longitude\": -106.9489189,\n", + " },\n", + " \"geo_loc_name\": \"USA: Colorado\",\n", + " \"collection_date\": \"2017-05-09\",\n", + " \"env_broad_scale\": {\n", + " \"has_raw_value\": \"ENVO_00000446\",\n", + " \"type\": \"ControlledTermValue\",\n", + " },\n", + " \"env_local_scale\": {\n", + " \"has_raw_value\": \"ENVO_00000292\",\n", + " \"type\": \"ControlledTermValue\",\n", + " },\n", + " \"env_medium\": {\"has_raw_value\": \"ENVO_00001998\", \"type\": \"ControlledTermValue\"},\n", + " \"ecosystem\": \"Environmental\",\n", + " \"ecosystem_category\": \"Terrestrial\",\n", + " \"ecosystem_type\": \"Soil\",\n", + " \"ecosystem_subtype\": \"Unclassified\",\n", + " \"specific_ecosystem\": \"Unclassified\",\n", + " \"depth\": {\"has_numeric_value\": 5},\n", + " \"ncbi_taxonomy_name\": \"soil metagenome\",\n", + " \"community\": \"microbial communities\",\n", + " \"location\": \"The East River watershed near Crested Butte, Colorado, USA\",\n", + " \"habitat\": \"soil\",\n", + " \"sample_collection_site\": \"soil\",\n", + " \"id\": \"igsn:IEWFS000A\",\n", + " \"identifier\": \"igsn:IEWFS000A\",\n", + " },\n", + " {\n", + " \"name\": \"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - ER_146\",\n", + " \"description\": \"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States\",\n", + " \"lat_lon\": {\n", + " \"has_raw_value\": \"38.92045766 -106.9484528\",\n", + " \"latitude\": 38.92045766,\n", + " \"longitude\": -106.9484528,\n", + " },\n", + " \"geo_loc_name\": \"USA: Colorado\",\n", + " \"collection_date\": \"2017-05-09\",\n", + " \"env_broad_scale\": {\n", + " \"has_raw_value\": \"ENVO_00000446\",\n", + " \"type\": \"ControlledTermValue\",\n", + " },\n", + " \"env_local_scale\": {\n", + " \"has_raw_value\": \"ENVO_00000292\",\n", + " \"type\": \"ControlledTermValue\",\n", + " },\n", + " \"env_medium\": {\"has_raw_value\": \"ENVO_00001998\", \"type\": \"ControlledTermValue\"},\n", + " \"ecosystem\": \"Environmental\",\n", + " \"ecosystem_category\": \"Terrestrial\",\n", + " \"ecosystem_type\": \"Soil\",\n", + " \"ecosystem_subtype\": \"Unclassified\",\n", + " \"specific_ecosystem\": \"Unclassified\",\n", + " \"depth\": {\"has_numeric_value\": 5},\n", + " \"ncbi_taxonomy_name\": \"soil metagenome\",\n", + " \"community\": \"microbial communities\",\n", + " \"location\": \"The East River watershed near Crested Butte, Colorado, USA\",\n", + " \"habitat\": \"soil\",\n", + " \"sample_collection_site\": \"soil\",\n", + " \"id\": \"igsn:IEWFS000J\",\n", + " \"identifier\": \"igsn:IEWFS000J\",\n", + " },\n", "]" ] }, @@ -2028,6 +2078,7 @@ "source": [ "from toolz import get_in, assoc_in\n", "\n", + "\n", "def re_raw_value(doc, key):\n", " value = get_in([key], doc)\n", " if value is not None and not isinstance(value, dict):\n", @@ -2036,17 +2087,21 @@ " else:\n", " return doc\n", "\n", + "\n", "timestampvalue_fields = [\n", - " p for p, spec in nmdc_jsonschema['definitions']['Biosample']['properties'].items()\n", - " if '$ref' in spec and spec[\"$ref\"].endswith(\"TimestampValue\")\n", + " p\n", + " for p, spec in nmdc_jsonschema[\"definitions\"][\"Biosample\"][\"properties\"].items()\n", + " if \"$ref\" in spec and spec[\"$ref\"].endswith(\"TimestampValue\")\n", "]\n", "textvalue_fields = [\n", - " p for p, spec in nmdc_jsonschema['definitions']['Biosample']['properties'].items()\n", - " if '$ref' in spec and spec[\"$ref\"].endswith(\"TextValue\")\n", + " p\n", + " for p, spec in nmdc_jsonschema[\"definitions\"][\"Biosample\"][\"properties\"].items()\n", + " if \"$ref\" in spec and spec[\"$ref\"].endswith(\"TextValue\")\n", "]\n", "quantityvalue_fields = [\n", - " p for p, spec in nmdc_jsonschema['definitions']['Biosample']['properties'].items()\n", - " if '$ref' in spec and spec[\"$ref\"].endswith(\"QuantityValue\")\n", + " p\n", + " for p, spec in nmdc_jsonschema[\"definitions\"][\"Biosample\"][\"properties\"].items()\n", + " if \"$ref\" in spec and spec[\"$ref\"].endswith(\"QuantityValue\")\n", "]\n", "\n", "docs = docs_brodie_emsl\n", @@ -2094,7 +2149,9 @@ "metadata": {}, "outputs": [], "source": [ - "assert db.biosample_set.count_documents({\"id\": {\"$in\": [d[\"id\"] for d in docs]}}) == len(docs)" + "assert db.biosample_set.count_documents(\n", + " {\"id\": {\"$in\": [d[\"id\"] for d in docs]}}\n", + ") == len(docs)" ] }, { @@ -2192,7 +2249,9 @@ "metadata": {}, "outputs": [], "source": [ - "to_delete = [d[\"id\"] for d in db.omics_processing_set.find({\"has_input\": \"emsl:TBD\"}, [\"id\"])]\n", + "to_delete = [\n", + " d[\"id\"] for d in db.omics_processing_set.find({\"has_input\": \"emsl:TBD\"}, [\"id\"])\n", + "]\n", "print(len(to_delete))" ] }, @@ -2244,18 +2303,23 @@ "\n", "pattern = re.compile(r\"https?://(?P[^/]+)/(?P.+)\")\n", "\n", - "to_fetch = [{\n", - " \"url\": \"https://portal.nersc.gov/project/m3408/meta/metaT_activity.json\",\n", - " \"type\": \"metatranscriptome_activity_set\" # waiting on PR microbiomedata/nmdc-schema#86\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/project/m3408/meta/metaT_data_objects.json\",\n", - " \"type\": \"data_object_set\" # already mongoimported, but good to re-do via notebook\n", - "}]\n", + "to_fetch = [\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/project/m3408/meta/metaT_activity.json\",\n", + " \"type\": \"metatranscriptome_activity_set\", # waiting on PR microbiomedata/nmdc-schema#86\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/project/m3408/meta/metaT_data_objects.json\",\n", + " \"type\": \"data_object_set\", # already mongoimported, but good to re-do via notebook\n", + " },\n", + "]\n", + "\n", "\n", "def url_to_name(url):\n", " m = pattern.match(url)\n", " return f\"{'.'.join(reversed(m.group('domain').split('.')))}__{m.group('path').replace('/', '.')}\"\n", "\n", + "\n", "def download_them_all(to_fetch):\n", " for i, spec in enumerate(to_fetch):\n", " url = spec[\"url\"]\n", @@ -2263,18 +2327,21 @@ " print(f\"{i+1}/{len(to_fetch)}: fetching {url}\")\n", " rv = requests.get(url)\n", " print(f\"saving as {name}\")\n", - " with open(f'/Users/dwinston/Downloads/{name}', 'w') as f:\n", + " with open(f\"/Users/dwinston/Downloads/{name}\", \"w\") as f:\n", " json.dump(rv.json(), f)\n", - " \n", + "\n", + "\n", "def fetch_downloaded_json(name):\n", - " with open(f'/Users/dwinston/Downloads/{name}') as f:\n", + " with open(f\"/Users/dwinston/Downloads/{name}\") as f:\n", " return json.load(f)\n", - " \n", + "\n", + "\n", "def fetch_downloaded_json_given_url(url):\n", " name = url_to_name(url)\n", - " with open(f'/Users/dwinston/Downloads/{name}') as f:\n", + " with open(f\"/Users/dwinston/Downloads/{name}\") as f:\n", " return json.load(f)\n", "\n", + "\n", "def fetch_validate_and_add(to_fetch):\n", " to_add = defaultdict(list)\n", " for i, spec in enumerate(to_fetch):\n", @@ -2290,7 +2357,8 @@ " nmdc_jsonschema_validate(to_add)\n", " print(\"adding\")\n", " mongo.add_docs(to_add, validate=False)\n", - " \n", + "\n", + "\n", "download_them_all(to_fetch)\n", "fetch_validate_and_add(to_fetch)" ] @@ -2317,7 +2385,8 @@ "class FileTypeEnumBase(BaseModel):\n", " name: str\n", " description: str\n", - " filter: str # JSON-encoded data_object_set mongo collection filter document \n", + " filter: str # JSON-encoded data_object_set mongo collection filter document\n", + "\n", "\n", "class FileTypeEnum(FileTypeEnumBase):\n", " id: str" @@ -2334,10 +2403,12 @@ "\n", "from nmdc_runtime.api.core.idgen import generate_one_id\n", "\n", + "\n", "@lru_cache\n", "def _fte_id(fte_as_str):\n", " return generate_one_id(mongo.db, \"file_type_enum\")\n", "\n", + "\n", "def get_fte_id(fte):\n", " rv = _fte_id(fte.json())\n", " assert isinstance(rv, str)\n", @@ -2368,152 +2439,158 @@ " FileTypeEnumBase(\n", " name=\"FT ICR-MS analysis results\",\n", " description=\"FT ICR-MS-based metabolite assignment results table\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"nom\\/results\"}, \"description\": {\"$regex\": \"FT ICR-MS\"}})\n", + " filter=json.dumps(\n", + " {\"url\": {\"$regex\": \"nom\\/results\"}, \"description\": {\"$regex\": \"FT ICR-MS\"}}\n", + " ),\n", " ),\n", " FileTypeEnumBase(\n", " name=\"GC-MS Metabolomics Results\",\n", " description=\"GC-MS-based metabolite assignment results table\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"metabolomics\\/results\"}})\n", + " filter=json.dumps({\"url\": {\"$regex\": \"metabolomics\\/results\"}}),\n", " ),\n", " FileTypeEnumBase(\n", " name=\"Metaproteomics Workflow Statistics\",\n", " description=\"Aggregate workflow statistics file\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"QC_Metrics.tsv\", \"$options\": \"i\"}})\n", + " filter=json.dumps({\"url\": {\"$regex\": \"QC_Metrics.tsv\", \"$options\": \"i\"}}),\n", " ),\n", " FileTypeEnumBase(\n", " name=\"Protein Report\",\n", " description=\"Filtered protein report file\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"Protein_Report.tsv\"}})\n", + " filter=json.dumps({\"url\": {\"$regex\": \"Protein_Report.tsv\"}}),\n", " ),\n", " FileTypeEnumBase(\n", " name=\"Peptide Report\",\n", " description=\"Filtered peptide report file\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"Peptide_Report.tsv\"}})\n", + " filter=json.dumps({\"url\": {\"$regex\": \"Peptide_Report.tsv\"}}),\n", " ),\n", " FileTypeEnumBase(\n", " name=\"Unfiltered Metaproteomics Results\",\n", " description=\"MSGFjobs and MASIC output file\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"MSGFjobs_MASIC_resultant.tsv\"}})\n", + " filter=json.dumps({\"url\": {\"$regex\": \"MSGFjobs_MASIC_resultant.tsv\"}}),\n", " ),\n", " FileTypeEnumBase(\n", " name=\"Read Count and RPKM\",\n", " description=\"Annotation read count and RPKM per feature JSON\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"metat_out_json\\/output.json\"}})\n", + " filter=json.dumps({\"url\": {\"$regex\": \"metat_out_json\\/output.json\"}}),\n", " ),\n", " FileTypeEnumBase(\n", " name=\"QC non-rRNA R2\",\n", " description=\"QC removed rRNA reads (R2) fastq\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"filtered_R2.fastq\"}})\n", + " filter=json.dumps({\"url\": {\"$regex\": \"filtered_R2.fastq\"}}),\n", " ),\n", " FileTypeEnumBase(\n", " name=\"QC non-rRNA R1\",\n", " description=\"QC removed rRNA reads (R1) fastq\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"filtered_R1.fastq\"}})\n", + " filter=json.dumps({\"url\": {\"$regex\": \"filtered_R1.fastq\"}}),\n", " ),\n", " FileTypeEnumBase(\n", " name=\"Metagenome Bins\",\n", " description=\"Metagenome bin contigs fasta\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"(hqmq\\_bin\\.zip)|(bins\\.\\d+\\.fa)\"}})\n", + " filter=json.dumps({\"url\": {\"$regex\": \"(hqmq\\_bin\\.zip)|(bins\\.\\d+\\.fa)\"}}),\n", " ),\n", " FileTypeEnumBase(\n", " name=\"CheckM Statistics\",\n", " description=\"CheckM statistics report\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"checkm_qa.out\"}})\n", - " ), \n", + " filter=json.dumps({\"url\": {\"$regex\": \"checkm_qa.out\"}}),\n", + " ),\n", " FileTypeEnumBase(\n", " name=\"Krona Plot\",\n", " description=\"[GOTTCHA2] krona plot HTML file\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"gottcha2.*krona.html\"}})\n", + " filter=json.dumps({\"url\": {\"$regex\": \"gottcha2.*krona.html\"}}),\n", " ),\n", " FileTypeEnumBase(\n", " name=\"Krona Plot\",\n", " description=\"[Kraken2] krona plot HTML file\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"kraken2.*krona.html\"}})\n", + " filter=json.dumps({\"url\": {\"$regex\": \"kraken2.*krona.html\"}}),\n", " ),\n", " FileTypeEnumBase(\n", " name=\"Classification Report\",\n", " description=\"[Kraken2] output report file\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"kraken2.*report.tsv\"}})\n", - " ), \n", + " filter=json.dumps({\"url\": {\"$regex\": \"kraken2.*report.tsv\"}}),\n", + " ),\n", " FileTypeEnumBase(\n", " name=\"Taxonomic Classification\",\n", " description=\"[Kraken2] output read classification file\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"kraken2.*classification.tsv\"}})\n", - " ), \n", + " filter=json.dumps({\"url\": {\"$regex\": \"kraken2.*classification.tsv\"}}),\n", + " ),\n", " FileTypeEnumBase(\n", " name=\"Krona Plot\",\n", " description=\"[Centrifuge] krona plot HTML file\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"centrifuge.*krona.html\"}})\n", - " ), \n", + " filter=json.dumps({\"url\": {\"$regex\": \"centrifuge.*krona.html\"}}),\n", + " ),\n", " FileTypeEnumBase(\n", " name=\"Classification Report\",\n", " description=\"[Centrifuge] output report file\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"centrifuge.*report.tsv\"}})\n", - " ), \n", + " filter=json.dumps({\"url\": {\"$regex\": \"centrifuge.*report.tsv\"}}),\n", + " ),\n", " FileTypeEnumBase(\n", " name=\"Taxonomic Classification\",\n", " description=\"[Centrifuge] output read classification file\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"centrifuge.*classification.tsv\"}})\n", - " ), \n", + " filter=json.dumps({\"url\": {\"$regex\": \"centrifuge.*classification.tsv\"}}),\n", + " ),\n", " FileTypeEnumBase(\n", " name=\"Structural Annotation GFF\",\n", " description=\"GFF3 format file with structural annotations\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"annotation\\/.*structural_annotation\\.gff\"}})\n", - " ), \n", + " filter=json.dumps(\n", + " {\"url\": {\"$regex\": \"annotation\\/.*structural_annotation\\.gff\"}}\n", + " ),\n", + " ),\n", " FileTypeEnumBase(\n", " name=\"Functional Annotation GFF\",\n", " description=\"GFF3 format file with functional annotations\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"annotation\\/.*functional_annotation\\.gff\"}})\n", - " ), \n", + " filter=json.dumps(\n", + " {\"url\": {\"$regex\": \"annotation\\/.*functional_annotation\\.gff\"}}\n", + " ),\n", + " ),\n", " FileTypeEnumBase(\n", " name=\"Annotation Amino Acid FASTA\",\n", " description=\"FASTA amino acid file for annotated proteins\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"annotation.*\\.faa\"}})\n", - " ), \n", + " filter=json.dumps({\"url\": {\"$regex\": \"annotation.*\\.faa\"}}),\n", + " ),\n", " FileTypeEnumBase(\n", " name=\"Annotation Enzyme Commission\",\n", " description=\"Tab delimited file for EC annotation\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"_ec.tsv\"}})\n", - " ), \n", + " filter=json.dumps({\"url\": {\"$regex\": \"_ec.tsv\"}}),\n", + " ),\n", " FileTypeEnumBase(\n", " name=\"Annotation KEGG Orthology\",\n", " description=\"Tab delimited file for KO annotation\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"_ko.tsv\"}})\n", - " ), \n", + " filter=json.dumps({\"url\": {\"$regex\": \"_ko.tsv\"}}),\n", + " ),\n", " FileTypeEnumBase(\n", " name=\"Assembly Coverage BAM\",\n", " description=\"Sorted bam file of reads mapping back to the final assembly\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"pairedMapped_sorted.bam\"}})\n", - " ), \n", + " filter=json.dumps({\"url\": {\"$regex\": \"pairedMapped_sorted.bam\"}}),\n", + " ),\n", " FileTypeEnumBase(\n", " name=\"Assembly AGP\",\n", " description=\"An AGP format file describes the assembly\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"assembly.agp\"}})\n", - " ), \n", + " filter=json.dumps({\"url\": {\"$regex\": \"assembly.agp\"}}),\n", + " ),\n", " FileTypeEnumBase(\n", " name=\"Assembly Scaffolds\",\n", " description=\"Final assembly scaffolds fasta\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"assembly_scaffolds.fna\"}})\n", - " ), \n", + " filter=json.dumps({\"url\": {\"$regex\": \"assembly_scaffolds.fna\"}}),\n", + " ),\n", " FileTypeEnumBase(\n", " name=\"Assembly Contigs\",\n", " description=\"Final assembly contigs fasta\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"assembly_contigs.fna\"}})\n", - " ), \n", + " filter=json.dumps({\"url\": {\"$regex\": \"assembly_contigs.fna\"}}),\n", + " ),\n", " FileTypeEnumBase(\n", " name=\"Assembly Coverage Stats\",\n", " description=\"Assembled contigs coverage information\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"mapping_stats.txt\"}})\n", + " filter=json.dumps({\"url\": {\"$regex\": \"mapping_stats.txt\"}}),\n", " ),\n", " FileTypeEnumBase(\n", " name=\"Filtered Sequencing Reads\",\n", " description=\"Reads QC result fastq (clean data)\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"filtered.fastq.gz\"}})\n", + " filter=json.dumps({\"url\": {\"$regex\": \"filtered.fastq.gz\"}}),\n", " ),\n", " FileTypeEnumBase(\n", " name=\"QC Statistics\",\n", " description=\"Reads QC summary statistics\",\n", - " filter=json.dumps({\"url\": {\"$regex\": \"filterStats.txt\"}})\n", + " filter=json.dumps({\"url\": {\"$regex\": \"filterStats.txt\"}}),\n", " ),\n", "]\n", "\n", @@ -2576,7 +2653,8 @@ "class FileTypeEnumBase(BaseModel):\n", " name: str\n", " description: str\n", - " filter: str # JSON-encoded data_object_set mongo collection filter document \n", + " filter: str # JSON-encoded data_object_set mongo collection filter document\n", + "\n", "\n", "class FileTypeEnum(FileTypeEnumBase):\n", " id: str" @@ -2593,8 +2671,11 @@ "\n", "from toolz import dissoc\n", "\n", + "\n", "def fte_matches(fte):\n", - " return [dissoc(d, \"_id\") for d in mongo.db.data_object_set.find(json.loads(fte.filter))]" + " return [\n", + " dissoc(d, \"_id\") for d in mongo.db.data_object_set.find(json.loads(fte.filter))\n", + " ]" ] }, { @@ -2714,7 +2795,7 @@ "for doc in docs:\n", " pi = doc[\"principal_investigator\"]\n", " pi_name = pi[\"has_raw_value\"]\n", - " image_name = \"_\".join(reversed(pi_name.lower().split(\" \"))) + '.jpg'\n", + " image_name = \"_\".join(reversed(pi_name.lower().split(\" \"))) + \".jpg\"\n", " url = base_url + image_name\n", " if requests.head(url).status_code == 200:\n", " doc[\"principal_investigator\"][\"profile_image_url\"] = url\n", @@ -2749,7 +2830,9 @@ "metadata": {}, "outputs": [], "source": [ - "db.study_set.count_documents({\"principal_investigator.profile_image_url\": {\"$exists\": True}})" + "db.study_set.count_documents(\n", + " {\"principal_investigator.profile_image_url\": {\"$exists\": True}}\n", + ")" ] }, { @@ -2785,7 +2868,7 @@ "# for line in f:\n", "# do_id, url = line.strip().split(\",\")\n", "# do_ids.append(do_id)\n", - " \n", + "\n", "# print(f\"{len(docs)} listed\")\n", "\n", "# docs = [dissoc(d, \"_id\") for d in db.data_object_set.find({\"id\": {\"$in\": do_ids}})]\n", @@ -2806,23 +2889,32 @@ "\n", "urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)\n", "\n", + "\n", "def get_file_size_bytes(d, header_ok=True):\n", " url = d[\"url\"].replace(\"https://\", \"http://\")\n", " try:\n", " rv = requests.head(\n", - " url, allow_redirects=True, verify=False, timeout=5, headers={\"Accept-Encoding\": \"gzip;q=0\"}\n", + " url,\n", + " allow_redirects=True,\n", + " verify=False,\n", + " timeout=5,\n", + " headers={\"Accept-Encoding\": \"gzip;q=0\"},\n", " )\n", " if not rv.status_code == 200:\n", " return {\"no_ok_response\": [d[\"id\"]]}\n", "\n", " if header_ok:\n", " try:\n", - " return {\"data\": [assoc(d, 'file_size_bytes', int(rv.headers['Content-Length']))]}\n", + " return {\n", + " \"data\": [\n", + " assoc(d, \"file_size_bytes\", int(rv.headers[\"Content-Length\"]))\n", + " ]\n", + " }\n", " except KeyError:\n", " pass\n", - " \n", - " #rv = requests.get(url, allow_redirects=True, verify=False, timeout=0.5)\n", - " #return {\"data\": [assoc(d, 'file_size_bytes', len(rv.content))]}\n", + "\n", + " # rv = requests.get(url, allow_redirects=True, verify=False, timeout=0.5)\n", + " # return {\"data\": [assoc(d, 'file_size_bytes', len(rv.content))]}\n", " return {\"no_header_content_length\": [d[\"id\"]]}\n", " except Exception as e:\n", " return {\"error\": [(d, str(e))]}" @@ -2857,12 +2949,7 @@ "pbar = tqdm(total=len(docs))\n", "\n", "with concurrent.futures.ThreadPoolExecutor() as executor:\n", - " future_to_doc = {\n", - " executor.submit(\n", - " get_file_size_bytes, doc\n", - " ): doc\n", - " for doc in docs\n", - " }\n", + " future_to_doc = {executor.submit(get_file_size_bytes, doc): doc for doc in docs}\n", " print(\"created futures...\")\n", " for future in concurrent.futures.as_completed(future_to_doc):\n", " pbar.update(1)\n", @@ -3009,8 +3096,11 @@ "\n", "from nmdc_runtime.api.core.util import pick\n", "\n", + "\n", "def study_summary(doc):\n", - " return pick([\"id\", \"principal_investigator\", \"name\", \"publications\", \"websites\"], doc)" + " return pick(\n", + " [\"id\", \"principal_investigator\", \"name\", \"publications\", \"websites\"], doc\n", + " )" ] }, { @@ -3031,258 +3121,271 @@ "metadata": {}, "outputs": [], "source": [ - "commands = [{\n", - " # row 6\n", - " \"update\": \"study_set\",\n", - " \"updates\": [{\n", - " \"q\": {\n", - " \"id\": stegen_study_id\n", - " },\n", - " \"u\": {\n", - " \"$addToSet\": {\n", - " \"publications\": \"https://doi.org/10.1371/journal.pone.0228165\"\n", + "commands = [\n", + " {\n", + " # row 6\n", + " \"update\": \"study_set\",\n", + " \"updates\": [\n", + " {\n", + " \"q\": {\"id\": stegen_study_id},\n", + " \"u\": {\n", + " \"$addToSet\": {\n", + " \"publications\": \"https://doi.org/10.1371/journal.pone.0228165\"\n", + " }\n", + " },\n", " }\n", - " },\n", - " }],\n", - " \"comment\": \"add to studies/stegen/publications\" \n", - "}, {\n", - " # row 7\n", - " \"update\": \"study_set\",\n", - " \"updates\": [{\n", - " \"q\": {\n", - " \"id\": stegen_study_id\n", - " },\n", - " \"u\": {\n", - " \"$set\": {\n", - " \"description\": \"\"\"\\\n", + " ],\n", + " \"comment\": \"add to studies/stegen/publications\",\n", + " },\n", + " {\n", + " # row 7\n", + " \"update\": \"study_set\",\n", + " \"updates\": [\n", + " {\n", + " \"q\": {\"id\": stegen_study_id},\n", + " \"u\": {\n", + " \"$set\": {\n", + " \"description\": \"\"\"\\\n", "This research project, led by James Stegen at PNNL, aimed to understand how molecular-scale processes govern the biogeochemical function of subsurface groundwater-surface water mixing zones (i.e., the hyporheic zone). This project was conducted along the Columbia River in Eastern Washington State, which exhibits variation in microbiome composition, biogeochemical activity, and substrate biogeochemistry, making it an ideal environment for studying biogeochemical hotspots. To capture a range of biogeochemical activities, samples were collected from areas with dense vegetation and virtually no vegetation.\n", "\n", "This project’s long-term goal is to develop models that can simulate impacts of disturbance on river corridor hydro-biogeochemistry by understanding fundamental molecular processes that lead to emergent function. This project is part of PNNL’s River Corridor Hydrobiogeochemistry Science Focus Area (https://www.pnnl.gov/projects/river-corridor-hydrobiogeochemistry-science-focus-area).\n", "\"\"\"\n", + " }\n", + " },\n", " }\n", - " },\n", - " }],\n", - " \"comment\": \"replace studies/stegen/description\"\n", - "}, {\n", - " # row 8\n", - " \"update\": \"study_set\",\n", - " \"updates\": [{\n", - " \"q\": {\n", - " \"id\": wrighton_study_id\n", - " },\n", - " \"u\": {\n", - " \"$set\": {\n", - " \"description\": \"\"\"\\\n", + " ],\n", + " \"comment\": \"replace studies/stegen/description\",\n", + " },\n", + " {\n", + " # row 8\n", + " \"update\": \"study_set\",\n", + " \"updates\": [\n", + " {\n", + " \"q\": {\"id\": wrighton_study_id},\n", + " \"u\": {\n", + " \"$set\": {\n", + " \"description\": \"\"\"\\\n", "This project aims to improve the understanding of microbial diversity and metabolism in deep shale, with implications for novel enzyme discovery and energy development. This project was conducted along two Appalachian basin shales, the Marcellus and Utica/Point Pleasant formations in Pennsylvania and Ohio, respectively. Samples were collected from input and produced fluids up to a year after hydraulic fracturing at varying depths and locations (4 wells, 2 basin shales).\n", "\"\"\"\n", + " }\n", + " },\n", " }\n", - " },\n", - " }],\n", - " \"comment\": \"replace studies/wrighton/description\"\n", - "}, {\n", - " # row 9\n", - " \"update\": \"study_set\",\n", - " \"updates\": [{\n", - " \"q\": {\n", - " \"id\": brodie_study_id\n", - " },\n", - " \"u\": {\n", - " \"$addToSet\": {\n", - " \"publications\": \"https://doi.org/10.21952/WTR/1573029\"\n", + " ],\n", + " \"comment\": \"replace studies/wrighton/description\",\n", + " },\n", + " {\n", + " # row 9\n", + " \"update\": \"study_set\",\n", + " \"updates\": [\n", + " {\n", + " \"q\": {\"id\": brodie_study_id},\n", + " \"u\": {\n", + " \"$addToSet\": {\n", + " \"publications\": \"https://doi.org/10.21952/WTR/1573029\"\n", + " }\n", + " },\n", " }\n", - " },\n", - " }],\n", - " \"comment\": \"add to studies/brodie/publications\"\n", - "}, {\n", - " # row 10\n", - " \"update\": \"study_set\",\n", - " \"updates\": [{\n", - " \"q\": {\n", - " \"id\": brodie_study_id\n", - " },\n", - " \"u\": {\n", - " \"$set\": {\n", - " \"description\": \"\"\"\\\n", + " ],\n", + " \"comment\": \"add to studies/brodie/publications\",\n", + " },\n", + " {\n", + " # row 10\n", + " \"update\": \"study_set\",\n", + " \"updates\": [\n", + " {\n", + " \"q\": {\"id\": brodie_study_id},\n", + " \"u\": {\n", + " \"$set\": {\n", + " \"description\": \"\"\"\\\n", "This research project aimed to understand how snow accumulation and snowmelt influences the mobilization of nitrogen through the soil microbiome in a mountainous catchment at the East River Watershed in Colorado. This project sought to identify bacteria, archaea, and fungi that were associated with the microbial biomass bloom that occurs during winter and the biomass crash following snowmelt. This project also sought to understand whether the traits that govern microbial community assembly during and after snowmelt were phylogenetically conserved. Samples were collected during winter, the snowmelt period, and after snowmelt in spring, from an area that transitioned from an upland hillslope to a riparian floodplain.\n", "\n", "This project is part of the Watershed Function Science Focus Area: https://watershed.lbl.gov/.\n", "\"\"\"\n", + " }\n", + " },\n", " }\n", - " },\n", - " }],\n", - " \"comment\": \"replace studies/brodie/description\"\n", - "}, {\n", - " # row 11\n", - " \"update\": \"study_set\",\n", - " \"updates\": [{\n", - " \"q\": {\n", - " \"id\": bioscales_study_id\n", - " },\n", - " \"u\": {\n", - " \"$set\": {\n", - " \"description\": \"\"\"\\\n", + " ],\n", + " \"comment\": \"replace studies/brodie/description\",\n", + " },\n", + " {\n", + " # row 11\n", + " \"update\": \"study_set\",\n", + " \"updates\": [\n", + " {\n", + " \"q\": {\"id\": bioscales_study_id},\n", + " \"u\": {\n", + " \"$set\": {\n", + " \"description\": \"\"\"\\\n", "The goal of this Bio-Scales Pilot Project study is to understand how plant traits modify the microbiome and in particular how the coupled plant-soil-microbial system influences nitrogen transformation patterns and fluxes.\n", "\"\"\"\n", + " }\n", + " },\n", " }\n", - " },\n", - " }],\n", - " \"comment\": \"replace studies/bioscales/description\"\n", - "}, {\n", - " # row 12\n", - " \"update\": \"study_set\",\n", - " \"updates\": [{\n", - " \"q\": {\n", - " \"id\": bioscales_study_id\n", - " },\n", - " \"u\": {\n", - " \"$addToSet\": {\n", - " \"websites\": {\n", - " \"$each\": [\n", - " \"https://www.ornl.gov/staff-profile/mitchel-j-doktycz\",\n", - " \"https://www.ornl.gov/section/bioimaging-and-analytics\",\n", - " \"https://pmiweb.ornl.gov/\",\n", - " \"https://www.ornl.gov/project/bio-scales\",\n", - " ]\n", - " }\n", + " ],\n", + " \"comment\": \"replace studies/bioscales/description\",\n", + " },\n", + " {\n", + " # row 12\n", + " \"update\": \"study_set\",\n", + " \"updates\": [\n", + " {\n", + " \"q\": {\"id\": bioscales_study_id},\n", + " \"u\": {\n", + " \"$addToSet\": {\n", + " \"websites\": {\n", + " \"$each\": [\n", + " \"https://www.ornl.gov/staff-profile/mitchel-j-doktycz\",\n", + " \"https://www.ornl.gov/section/bioimaging-and-analytics\",\n", + " \"https://pmiweb.ornl.gov/\",\n", + " \"https://www.ornl.gov/project/bio-scales\",\n", + " ]\n", + " }\n", + " }\n", + " },\n", " }\n", - " },\n", - " }],\n", - " \"comment\": \"add to studies/bioscales/websites\"\n", - "}, {\n", - " # row 14 (row 13 done elsewhere)\n", - " \"update\": \"study_set\",\n", - " \"updates\": [{\n", - " \"q\": {\n", - " \"id\": bioscales_study_id\n", - " },\n", - " \"u\": {\n", - " \"$set\": {\n", - " \"principal_investigator.has_raw_value\": \"Mitchel J. Doktycz\"\n", + " ],\n", + " \"comment\": \"add to studies/bioscales/websites\",\n", + " },\n", + " {\n", + " # row 14 (row 13 done elsewhere)\n", + " \"update\": \"study_set\",\n", + " \"updates\": [\n", + " {\n", + " \"q\": {\"id\": bioscales_study_id},\n", + " \"u\": {\n", + " \"$set\": {\n", + " \"principal_investigator.has_raw_value\": \"Mitchel J. Doktycz\"\n", + " }\n", + " },\n", " }\n", - " },\n", - " }],\n", - " \"comment\": \"replace studies/bioscales/principal_investigator name\"\n", - "}, {\n", - " # row 15\n", - " \"update\": \"study_set\",\n", - " \"updates\": [{\n", - " \"q\": {\n", - " \"id\": microbes_persist_sfa_study_id\n", - " },\n", - " \"u\": {\n", - " \"$addToSet\": {\n", - " \"websites\": \"https://sc-programs.llnl.gov/biological-and-environmental-research-at-llnl/soil-microbiome\"\n", + " ],\n", + " \"comment\": \"replace studies/bioscales/principal_investigator name\",\n", + " },\n", + " {\n", + " # row 15\n", + " \"update\": \"study_set\",\n", + " \"updates\": [\n", + " {\n", + " \"q\": {\"id\": microbes_persist_sfa_study_id},\n", + " \"u\": {\n", + " \"$addToSet\": {\n", + " \"websites\": \"https://sc-programs.llnl.gov/biological-and-environmental-research-at-llnl/soil-microbiome\"\n", + " }\n", + " },\n", " }\n", - " },\n", - " }],\n", - " \"comment\": \"add to studies/microbes_persist_sfa/websites\"\n", - "}, {\n", - " # row 16\n", - " \"update\": \"study_set\",\n", - " \"updates\": [{\n", - " \"q\": {\n", - " \"id\": microbes_persist_sfa_study_id\n", - " },\n", - " \"u\": {\n", - " \"$set\": {\n", - " \"description\": \"\"\"\\\n", + " ],\n", + " \"comment\": \"add to studies/microbes_persist_sfa/websites\",\n", + " },\n", + " {\n", + " # row 16\n", + " \"update\": \"study_set\",\n", + " \"updates\": [\n", + " {\n", + " \"q\": {\"id\": microbes_persist_sfa_study_id},\n", + " \"u\": {\n", + " \"$set\": {\n", + " \"description\": \"\"\"\\\n", "The Microbes Persist: Systems Biology of the Soil Microbiome SFA seeks to understand how microbial ecophysiology, population dynamics, and microbe–mineral–organic matter interactions regulate the persistence of microbial residues in soil under changing moisture regimes.\n", "\"\"\"\n", + " }\n", + " },\n", " }\n", - " },\n", - " }],\n", - " \"comment\": \"replace studies/microbes_persist_sfa/description\"\n", - "}, {\n", - " # row 19 (rows 17 and 18 done elsewhere)\n", - " \"update\": \"study_set\",\n", - " \"updates\": [{\n", - " \"q\": {\n", - " \"id\": plant_microbe_interfaces_sfa_study_id\n", - " },\n", - " \"u\": {\n", - " \"$addToSet\": {\n", - " \"websites\": \"https://pmiweb.ornl.gov/pmi-project-aims/\"\n", + " ],\n", + " \"comment\": \"replace studies/microbes_persist_sfa/description\",\n", + " },\n", + " {\n", + " # row 19 (rows 17 and 18 done elsewhere)\n", + " \"update\": \"study_set\",\n", + " \"updates\": [\n", + " {\n", + " \"q\": {\"id\": plant_microbe_interfaces_sfa_study_id},\n", + " \"u\": {\n", + " \"$addToSet\": {\n", + " \"websites\": \"https://pmiweb.ornl.gov/pmi-project-aims/\"\n", + " }\n", + " },\n", " }\n", - " },\n", - " }],\n", - " \"comment\": \"add to studies/plant_microbe_interfaces_sfa/websites\"\n", - "}, {\n", - " # row 20\n", - " \"update\": \"study_set\",\n", - " \"updates\": [{\n", - " \"q\": {\n", - " \"id\": plant_microbe_interfaces_sfa_study_id\n", - " },\n", - " \"u\": {\n", - " \"$set\": {\n", - " \"description\": \"\"\"\\\n", + " ],\n", + " \"comment\": \"add to studies/plant_microbe_interfaces_sfa/websites\",\n", + " },\n", + " {\n", + " # row 20\n", + " \"update\": \"study_set\",\n", + " \"updates\": [\n", + " {\n", + " \"q\": {\"id\": plant_microbe_interfaces_sfa_study_id},\n", + " \"u\": {\n", + " \"$set\": {\n", + " \"description\": \"\"\"\\\n", "The goal of the Plant-Microbe Interfaces SFA is to gain a deeper understanding of the diversity and functioning of mutually beneficial interactions between plants and microbes in the rhizosphere.\n", "\"\"\"\n", + " }\n", + " },\n", " }\n", - " },\n", - " }],\n", - " \"comment\": \"set studies/plant_microbe_interfaces_sfa/description\"\n", - "}, {\n", - " # row 21\n", - " \"update\": \"study_set\",\n", - " \"updates\": [{\n", - " \"q\": {\n", - " \"id\": spruce_study_id\n", - " },\n", - " \"u\": {\n", - " \"$addToSet\": {\n", - " \"websites\": \"https://mnspruce.ornl.gov/project/overview\"\n", + " ],\n", + " \"comment\": \"set studies/plant_microbe_interfaces_sfa/description\",\n", + " },\n", + " {\n", + " # row 21\n", + " \"update\": \"study_set\",\n", + " \"updates\": [\n", + " {\n", + " \"q\": {\"id\": spruce_study_id},\n", + " \"u\": {\n", + " \"$addToSet\": {\n", + " \"websites\": \"https://mnspruce.ornl.gov/project/overview\"\n", + " }\n", + " },\n", " }\n", - " },\n", - " }],\n", - " \"comment\": \"add to studies/spruce/webites\"\n", - "}, {\n", - " # row 22\n", - " \"update\": \"study_set\",\n", - " \"updates\": [{\n", - " \"q\": {\n", - " \"id\": spruce_study_id\n", - " },\n", - " \"u\": {\n", - " \"$set\": {\n", - " \"description\": \"\"\"\\\n", + " ],\n", + " \"comment\": \"add to studies/spruce/webites\",\n", + " },\n", + " {\n", + " # row 22\n", + " \"update\": \"study_set\",\n", + " \"updates\": [\n", + " {\n", + " \"q\": {\"id\": spruce_study_id},\n", + " \"u\": {\n", + " \"$set\": {\n", + " \"description\": \"\"\"\\\n", "The Spruce and Peatland Responses Under Changing Environments (SPRUCE) experiment is the primary component of the Terrestrial Ecosystem Science Scientific Focus Area of ORNL's Climate Change Program, focused on terrestrial ecosystems and the mechanisms that underlie their responses to climatic change. This project seeks to assess the response of northern peatland ecosystems to increases in temperature and exposures to elevated atmospheric CO2 concentrations.\n", "\"\"\"\n", + " }\n", + " },\n", " }\n", - " },\n", - " }],\n", - " \"comment\": \"set studies/spruce/description\"\n", - "}, {\n", - " # row 23\n", - " \"update\": \"study_set\",\n", - " \"updates\": [{\n", - " \"q\": {\n", - " \"id\": watershed_sfa_study_id\n", - " },\n", - " \"u\": {\n", - " \"$addToSet\": {\n", - " \"websites\": \"https://watershed.lbl.gov/about/\"\n", + " ],\n", + " \"comment\": \"set studies/spruce/description\",\n", + " },\n", + " {\n", + " # row 23\n", + " \"update\": \"study_set\",\n", + " \"updates\": [\n", + " {\n", + " \"q\": {\"id\": watershed_sfa_study_id},\n", + " \"u\": {\"$addToSet\": {\"websites\": \"https://watershed.lbl.gov/about/\"}},\n", " }\n", - " },\n", - " }],\n", - " \"comment\": \"add to studies/watershed_sfa/webites\"\n", - "}, {\n", - " # row 24\n", - " \"update\": \"study_set\",\n", - " \"updates\": [{\n", - " \"q\": {\n", - " \"id\": watershed_sfa_study_id\n", - " },\n", - " \"u\": {\n", - " \"$set\": {\n", - " \"description\": \"\"\"\\\n", + " ],\n", + " \"comment\": \"add to studies/watershed_sfa/webites\",\n", + " },\n", + " {\n", + " # row 24\n", + " \"update\": \"study_set\",\n", + " \"updates\": [\n", + " {\n", + " \"q\": {\"id\": watershed_sfa_study_id},\n", + " \"u\": {\n", + " \"$set\": {\n", + " \"description\": \"\"\"\\\n", "The Watershed Function Scientific SFA is developing a predictive understanding of how mountainous watersheds retain and release water, nutrients, carbon, and metals. In particular, the SFA is developing understanding and tools to measure and predict how droughts, early snowmelt, and other perturbations impact downstream water availability and biogeochemical cycling at episodic to decadal timescales.\n", "\"\"\"\n", + " }\n", + " },\n", " }\n", - " },\n", - " }],\n", - " \"comment\": \"set studies/watershed_sfa/description\"\n", - "}]" + " ],\n", + " \"comment\": \"set studies/watershed_sfa/description\",\n", + " },\n", + "]" ] }, { @@ -3317,7 +3420,7 @@ "source": [ "from toolz import assoc\n", "\n", - "_tmp_commands = [assoc(c, 'update', tmp_coll) for c in commands]" + "_tmp_commands = [assoc(c, \"update\", tmp_coll) for c in commands]" ] }, { @@ -3405,15 +3508,17 @@ " reader = csv.DictReader(f)\n", " include = False\n", " for row in reader:\n", - " if row['Term'] == 'study title':\n", + " if row[\"Term\"] == \"study title\":\n", " include = True\n", " if include:\n", - " rows.append({\n", - " 'name': row[\"Study\"].lower().replace(' ','_').replace('-','_'),\n", - " 'id': f'gold:{row[\"GOLD Study ID\"]}',\n", - " 'field': re.findall(r\"\\w+\", row[\"Term\"])[-1].lower(),\n", - " 'value': row['Value'].strip(),\n", - " })" + " rows.append(\n", + " {\n", + " \"name\": row[\"Study\"].lower().replace(\" \", \"_\").replace(\"-\", \"_\"),\n", + " \"id\": f'gold:{row[\"GOLD Study ID\"]}',\n", + " \"field\": re.findall(r\"\\w+\", row[\"Term\"])[-1].lower(),\n", + " \"value\": row[\"Value\"].strip(),\n", + " }\n", + " )" ] }, { @@ -3423,7 +3528,7 @@ "metadata": {}, "outputs": [], "source": [ - "{r['field'] for r in rows}" + "{r[\"field\"] for r in rows}" ] }, { @@ -3452,16 +3557,15 @@ "for row in rows:\n", " c = {\n", " \"update\": \"study_set\",\n", - " \"updates\": [{\n", - " \"q\": {\"id\": row[\"id\"]},\n", - " \"u\": {}\n", - " }],\n", + " \"updates\": [{\"q\": {\"id\": row[\"id\"]}, \"u\": {}}],\n", " }\n", " if row[\"field\"] in {\"title\", \"description\"}:\n", " c[\"updates\"][0][\"u\"] = {\"$set\": {row[\"field\"]: row[\"value\"]}}\n", " c[\"comment\"] = f'set studies/{row[\"name\"]}/{row[\"field\"]}'\n", " elif row[\"field\"] == \"citation\":\n", - " c[\"updates\"][0][\"u\"] = {\"$set\": {\"doi\": {\"has_raw_value\": urlify(row[\"value\"])}}}\n", + " c[\"updates\"][0][\"u\"] = {\n", + " \"$set\": {\"doi\": {\"has_raw_value\": urlify(row[\"value\"])}}\n", + " }\n", " c[\"comment\"] = f'set studies/{row[\"name\"]}/doi'\n", " elif row[\"field\"] == \"publication\":\n", " c[\"updates\"][0][\"u\"] = {\"$addToSet\": {\"publications\": urlify(row[\"value\"])}}\n", @@ -3485,7 +3589,7 @@ "db.drop_collection(tmp_coll)\n", "db[tmp_coll].insert_many(docs)\n", "db[tmp_coll].create_index(\"id\", unique=True)\n", - "_tmp_commands = [assoc(c, 'update', tmp_coll) for c in commands]\n", + "_tmp_commands = [assoc(c, \"update\", tmp_coll) for c in commands]\n", "rvs = []\n", "for c in _tmp_commands:\n", " rvs.append(db.command(c))\n", @@ -3528,10 +3632,11 @@ "def local_file_to_api_object(file_info):\n", " return lftao(context, file_info)\n", "\n", - "#obj = local_file_to_api_object({\"storage_path\": storage_path, \"mime_type\": 'text/csv'})\n", "\n", - "#doc = db.objects.find_one({\"id\": obj[\"id\"]})\n", - "#assert doc[\"name\"] == Path(storage_path).name" + "# obj = local_file_to_api_object({\"storage_path\": storage_path, \"mime_type\": 'text/csv'})\n", + "\n", + "# doc = db.objects.find_one({\"id\": obj[\"id\"]})\n", + "# assert doc[\"name\"] == Path(storage_path).name" ] }, { @@ -3561,7 +3666,10 @@ "metadata": {}, "outputs": [], "source": [ - "docs = [assoc_in(d, [\"doi\", \"has_raw_value\"], urlify(d[\"doi\"][\"has_raw_value\"])) for d in docs]" + "docs = [\n", + " assoc_in(d, [\"doi\", \"has_raw_value\"], urlify(d[\"doi\"][\"has_raw_value\"]))\n", + " for d in docs\n", + "]" ] }, { @@ -3626,7 +3734,9 @@ "metadata": {}, "outputs": [], "source": [ - "ids_biosamples__no_part_of = db.biosample_set.distinct(\"id\", {\"part_of\": {\"$exists\": False}})" + "ids_biosamples__no_part_of = db.biosample_set.distinct(\n", + " \"id\", {\"part_of\": {\"$exists\": False}}\n", + ")" ] }, { @@ -3646,7 +3756,9 @@ "metadata": {}, "outputs": [], "source": [ - "db.omics_processing_set.count_documents({\"has_input\": {\"$in\": ids_biosamples__no_part_of}})" + "db.omics_processing_set.count_documents(\n", + " {\"has_input\": {\"$in\": ids_biosamples__no_part_of}}\n", + ")" ] }, { @@ -3686,7 +3798,10 @@ "source": [ "from toolz import dissoc\n", "\n", - "docs = [dissoc(d, \"_id\") for d in db.biosample_set.find({\"id\": {\"$in\": list(biosample__part_of.keys())}})]" + "docs = [\n", + " dissoc(d, \"_id\")\n", + " for d in db.biosample_set.find({\"id\": {\"$in\": list(biosample__part_of.keys())}})\n", + "]" ] }, { @@ -3767,7 +3882,7 @@ "metadata": {}, "outputs": [], "source": [ - "ids_biosamples = [d[\"id\"]for d in db.biosample_set.find({\"part_of\": study_id})]" + "ids_biosamples = [d[\"id\"] for d in db.biosample_set.find({\"part_of\": study_id})]" ] }, { @@ -3787,7 +3902,9 @@ "metadata": {}, "outputs": [], "source": [ - "ids_omics_processings_via_study = [d[\"id\"] for d in db.omics_processing_set.find({\"part_of\": study_id})]" + "ids_omics_processings_via_study = [\n", + " d[\"id\"] for d in db.omics_processing_set.find({\"part_of\": study_id})\n", + "]" ] }, { @@ -3808,7 +3925,8 @@ "outputs": [], "source": [ "ids_omics_processings_via_biosamples = [\n", - " d[\"id\"] for d in db.omics_processing_set.find({\"has_input\": {\"$in\": ids_biosamples}})\n", + " d[\"id\"]\n", + " for d in db.omics_processing_set.find({\"has_input\": {\"$in\": ids_biosamples}})\n", "]" ] }, @@ -3829,7 +3947,9 @@ "metadata": {}, "outputs": [], "source": [ - "ids_omics_processings = list(set(ids_omics_processings_via_study) | set(ids_omics_processings_via_biosamples))" + "ids_omics_processings = list(\n", + " set(ids_omics_processings_via_study) | set(ids_omics_processings_via_biosamples)\n", + ")" ] }, { @@ -3851,13 +3971,16 @@ "source": [ "from toolz import concat\n", "\n", - "ids_data_objects_from_omics_processings = list(concat([\n", - " d[\"has_output\"] for d in db.omics_processing_set.find({\n", - " \"id\": {\n", - " \"$in\": ids_omics_processings\n", - " }\n", - " }, [\"has_output\"])\n", - "]))" + "ids_data_objects_from_omics_processings = list(\n", + " concat(\n", + " [\n", + " d[\"has_output\"]\n", + " for d in db.omics_processing_set.find(\n", + " {\"id\": {\"$in\": ids_omics_processings}}, [\"has_output\"]\n", + " )\n", + " ]\n", + " )\n", + ")" ] }, { @@ -3886,8 +4009,10 @@ " print(coll_name)\n", " db[coll_name].create_index(\"was_informed_by\")\n", " ids_analyses[coll_name] = [\n", - " d[\"id\"] for d in\n", - " db[coll_name].find({\"was_informed_by\": {\"$in\": ids_omics_processings}})\n", + " d[\"id\"]\n", + " for d in db[coll_name].find(\n", + " {\"was_informed_by\": {\"$in\": ids_omics_processings}}\n", + " )\n", " ]" ] }, @@ -3903,13 +4028,18 @@ "ids_data_objects_from_analyses = []\n", "\n", "for coll_name, ids_analysis_set in ids_analyses.items():\n", - " ids_data_objects_from_analyses.extend(list(concat([\n", - " d[\"has_output\"] for d in db[coll_name].find({\n", - " \"id\": {\n", - " \"$in\": ids_analysis_set\n", - " }\n", - " }, [\"has_output\"])\n", - " ])))" + " ids_data_objects_from_analyses.extend(\n", + " list(\n", + " concat(\n", + " [\n", + " d[\"has_output\"]\n", + " for d in db[coll_name].find(\n", + " {\"id\": {\"$in\": ids_analysis_set}}, [\"has_output\"]\n", + " )\n", + " ]\n", + " )\n", + " )\n", + " )" ] }, { @@ -3929,7 +4059,9 @@ "metadata": {}, "outputs": [], "source": [ - "ids_data_objects = list(set(ids_data_objects_from_omics_processings) | set(ids_data_objects_from_analyses))" + "ids_data_objects = list(\n", + " set(ids_data_objects_from_omics_processings) | set(ids_data_objects_from_analyses)\n", + ")" ] }, { @@ -3939,7 +4071,9 @@ "metadata": {}, "outputs": [], "source": [ - "len(ids_data_objects), db.data_object_set.count_documents({\"id\": {\"$in\": ids_data_objects}})" + "len(ids_data_objects), db.data_object_set.count_documents(\n", + " {\"id\": {\"$in\": ids_data_objects}}\n", + ")" ] }, { @@ -3949,7 +4083,9 @@ "metadata": {}, "outputs": [], "source": [ - "assert len(ids_data_objects) == db.data_object_set.count_documents({\"id\": {\"$in\": ids_data_objects}})" + "assert len(ids_data_objects) == db.data_object_set.count_documents(\n", + " {\"id\": {\"$in\": ids_data_objects}}\n", + ")" ] }, { @@ -3989,7 +4125,9 @@ "outputs": [], "source": [ "studies = [dissoc(d, \"_id\") for d in db.study_set.find({\"id\": study_id})]\n", - "biosamples = [dissoc(d, \"_id\") for d in db.biosample_set.find({\"id\": {\"$in\": ids_biosamples}})]" + "biosamples = [\n", + " dissoc(d, \"_id\") for d in db.biosample_set.find({\"id\": {\"$in\": ids_biosamples}})\n", + "]" ] }, { @@ -4097,9 +4235,11 @@ "metadata": {}, "outputs": [], "source": [ - "list(mdb.omics_processing_set.find(\n", - " {\"id\": {\"$in\": [\"emsl:512156\", \"emsl:512155\", \"emsl:504850\", \"emsl:502966\"]}}\n", - "))" + "list(\n", + " mdb.omics_processing_set.find(\n", + " {\"id\": {\"$in\": [\"emsl:512156\", \"emsl:512155\", \"emsl:504850\", \"emsl:502966\"]}}\n", + " )\n", + ")" ] }, { diff --git a/metadata-translation/notebooks/202106_workflow_execution_demo.ipynb b/metadata-translation/notebooks/202106_workflow_execution_demo.ipynb index 4f5d45b0..a61c7597 100644 --- a/metadata-translation/notebooks/202106_workflow_execution_demo.ipynb +++ b/metadata-translation/notebooks/202106_workflow_execution_demo.ipynb @@ -150,13 +150,15 @@ "def fetch_downloaded_json(url, save_dir):\n", " with open(os.path.join(save_dir, url_to_name(url))) as f:\n", " return json.load(f)\n", - " \n", + "\n", + "\n", "type_collections = {\n", " f'nmdc:{spec[\"items\"][\"$ref\"].split(\"/\")[-1]}': collection_name\n", " for collection_name, spec in nmdc_jsonschema[\"properties\"].items()\n", " if collection_name.endswith(\"_set\")\n", "}\n", "\n", + "\n", "def load_url(url, timeout):\n", " return requests.get(url, timeout=timeout)" ] @@ -168,7 +170,9 @@ "metadata": {}, "outputs": [], "source": [ - "rv = requests.get(\"https://data.microbiomedata.org/data/nmdc:mga0wa96/annotation/annotations.json\")" + "rv = requests.get(\n", + " \"https://data.microbiomedata.org/data/nmdc:mga0wa96/annotation/annotations.json\"\n", + ")" ] }, { @@ -198,7 +202,7 @@ "metadata": {}, "outputs": [], "source": [ - "strsizes = [line.split('\\t')[0] for line in sizes]" + "strsizes = [line.split(\"\\t\")[0] for line in sizes]" ] }, { @@ -243,7 +247,6 @@ "import concurrent.futures\n", "\n", "\n", - "\n", "def validate_json_urls(urls):\n", " validator = Draft7Validator(get_nmdc_schema())\n", " validation_errors = {}\n", @@ -260,7 +263,10 @@ " validation_errors[url] = f\"Exception: {exc}\"\n", " continue\n", " if response.status_code != 200:\n", - " validation_errors[url] = {\"status_code\": response.status_code, \"details\": response.text}\n", + " validation_errors[url] = {\n", + " \"status_code\": response.status_code,\n", + " \"details\": response.text,\n", + " }\n", " else:\n", " try:\n", " response.json()\n", @@ -269,7 +275,7 @@ " validation_errors[url] = \"Invalid JSON\"\n", " except Exception as exc:\n", " validation_errors[url] = f\"Could not write to file: {exc}\"\n", - " \n", + "\n", " pbar.close()\n", "\n", " for url in tqdm(set(urls) - set(validation_errors)):\n", @@ -300,7 +306,7 @@ " for coll_name, coll_docs in docs.items():\n", " errors = list(validator.iter_errors({coll_name: coll_docs}))\n", " validation_errors[coll_name] = [e.message for e in errors]\n", - " \n", + "\n", " return validation_errors" ] }, @@ -314,8 +320,7 @@ "from pathlib import Path\n", "\n", "filepath = Path.home().joinpath(\n", - " \"Dropbox\", \"diary\", \"21\", \"09\",\n", - " \"scanon-annotations-urls.txt\"\n", + " \"Dropbox\", \"diary\", \"21\", \"09\", \"scanon-annotations-urls.txt\"\n", ")\n", "with open(filepath) as f:\n", " urls = [line.strip() for line in f if line.strip()]" @@ -349,8 +354,7 @@ "from pathlib import Path\n", "\n", "filepath = Path.home().joinpath(\n", - " \"Dropbox\", \"diary\", \"21\", \"09\",\n", - " \"2021-09-15-scanon-nmdc-metadata-file-urls.txt\"\n", + " \"Dropbox\", \"diary\", \"21\", \"09\", \"2021-09-15-scanon-nmdc-metadata-file-urls.txt\"\n", ")\n", "with open(filepath) as f:\n", " urls = [line.strip() for line in f if line.strip()]" @@ -365,12 +369,14 @@ "source": [ "import json\n", "\n", + "\n", "def result_for_url_to_json_file(data, url):\n", - " with open(f'/Users/dwinston/Downloads/{url_to_name(url)}', 'w') as f:\n", - " json.dump(data.json(), f)\n", - " \n", + " with open(f\"/Users/dwinston/Downloads/{url_to_name(url)}\", \"w\") as f:\n", + " json.dump(data.json(), f)\n", + "\n", + "\n", "def fetch_downloaded_json(url):\n", - " with open(f'/Users/dwinston/Downloads/{url_to_name(url)}') as f:\n", + " with open(f\"/Users/dwinston/Downloads/{url_to_name(url)}\") as f:\n", " return json.load(f)" ] }, @@ -386,9 +392,11 @@ "from tqdm.notebook import tqdm\n", "import requests\n", "\n", + "\n", "def load_url(url, timeout):\n", " return requests.get(url, timeout=timeout)\n", "\n", + "\n", "pbar = tqdm(total=len(urls))\n", "\n", "with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:\n", @@ -399,7 +407,7 @@ " data = future.result()\n", " result_for_url_to_json_file(data, url)\n", " except Exception as exc:\n", - " print('%r generated an exception: %s' % (url, exc))\n", + " print(\"%r generated an exception: %s\" % (url, exc))\n", " else:\n", " print(f\"saved {url} data\")\n", " pbar.update(1)\n", @@ -427,8 +435,7 @@ "\n", "type_collections = {\n", " f'nmdc:{spec[\"items\"][\"$ref\"].split(\"/\")[-1]}'.lower(): collection_name\n", - " for collection_name, spec in\n", - " nmdc_jsonschema[\"properties\"].items()\n", + " for collection_name, spec in nmdc_jsonschema[\"properties\"].items()\n", " if collection_name.endswith(\"_set\")\n", "}\n", "\n", @@ -443,7 +450,7 @@ " else:\n", " docs[collection_name] = [doc]\n", " del docs[\"activity_set\"]\n", - " \n", + "\n", " _ = nmdc_jsonschema_validate(docs)" ] }, @@ -493,22 +500,31 @@ "metadata": {}, "outputs": [], "source": [ - "ids = [d[\"_id\"] for d in mdb.data_object_set.find({\"data_object_type\": {\"$exists\": True}}, [\"_id\"])]\n", + "ids = [\n", + " d[\"_id\"]\n", + " for d in mdb.data_object_set.find({\"data_object_type\": {\"$exists\": True}}, [\"_id\"])\n", + "]\n", "\n", - "stats_docs = list(mdb.data_object_set.aggregate([\n", - " {\"$match\": {\"data_object_type\": {\"$exists\": True}}},\n", - " {\"$group\": {\n", - " \"_id\": \"$data_object_type\",\n", - " \"size_total\": {\"$sum\": \"$file_size_bytes\"},\n", - " \"count\": {\"$sum\": 1}\n", - " }},\n", - "]))\n", + "stats_docs = list(\n", + " mdb.data_object_set.aggregate(\n", + " [\n", + " {\"$match\": {\"data_object_type\": {\"$exists\": True}}},\n", + " {\n", + " \"$group\": {\n", + " \"_id\": \"$data_object_type\",\n", + " \"size_total\": {\"$sum\": \"$file_size_bytes\"},\n", + " \"count\": {\"$sum\": 1},\n", + " }\n", + " },\n", + " ]\n", + " )\n", + ")\n", "\n", "for sdoc in stats_docs:\n", " edoc = mdb.file_type_enum.find_one({\"id\": sdoc[\"_id\"]})\n", " sdoc[\"name\"] = edoc[\"name\"]\n", " sdoc[\"description\"] = edoc[\"description\"]\n", - " \n", + "\n", "for d in stats_docs:\n", " print(\"{name} ({description}):\".format(**d))\n", " print(f\"total size (TB): {(d['size_total'] / (1024 * 1024 * 1024 * 1024)):.3}\")\n", @@ -539,7 +555,7 @@ "rv = mdb.users.replace_one(\n", " {\"username\": username},\n", " {\"username\": username, \"hashed_password\": get_password_hash(password)},\n", - " upsert=True\n", + " upsert=True,\n", ")\n", "rv" ] @@ -566,7 +582,11 @@ "\n", "dt_past = datetime.now(timezone.utc) - timedelta(days=1)\n", "\n", - "mdb.operations.insert_one(Operation(id=generate_one_id(mdb, \"op\"), expire_time=dt_past).dict(exclude_unset=True))" + "mdb.operations.insert_one(\n", + " Operation(id=generate_one_id(mdb, \"op\"), expire_time=dt_past).dict(\n", + " exclude_unset=True\n", + " )\n", + ")" ] }, { @@ -606,10 +626,12 @@ "\n", "pattern = re.compile(r\"https?://(?P[^/]+)/(?P.+)\")\n", "\n", + "\n", "def url_to_name(url):\n", " m = pattern.match(url)\n", " return f\"{'.'.join(reversed(m.group('domain').split('.')))}__{m.group('path').replace('/', '.')}\"\n", "\n", + "\n", "def download_them_all(to_fetch, dirname):\n", " results = []\n", " for i, spec in enumerate(to_fetch):\n", @@ -624,6 +646,7 @@ " results.append(dict(**spec, filename=filename))\n", " return results\n", "\n", + "\n", "def generate_drsobject_metadata_for(to_fetch):\n", " with TemporaryDirectory() as dirname:\n", " to_fetch = download_them_all(to_fetch, dirname)\n", @@ -634,7 +657,9 @@ " \"access_methods\": [{\"access_url\": {\"url\": spec[\"url\"]}}],\n", " \"description\": spec[\"type\"],\n", " }\n", - " print(f\"{i+1}/{len(to_fetch)}: generating DrsObject metdata for {spec['url']}\")\n", + " print(\n", + " f\"{i+1}/{len(to_fetch)}: generating DrsObject metdata for {spec['url']}\"\n", + " )\n", " drsobjects.append(drs_metadata_for(spec[\"filename\"], base=base))\n", " return drsobjects" ] @@ -646,15 +671,18 @@ "metadata": {}, "outputs": [], "source": [ - "to_fetch = [{\n", - " # >100MB\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/stegen_MetaProteomicAnalysis_activity.json\",\n", - " \"type\": \"metaproteomics_analysis_activity_set\",\n", - "}, {\n", - " # ~50KB\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/stegen_emsl_analysis_data_objects.json\",\n", - " \"type\": \"data_object_set\"\n", - "}]" + "to_fetch = [\n", + " {\n", + " # >100MB\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/stegen_MetaProteomicAnalysis_activity.json\",\n", + " \"type\": \"metaproteomics_analysis_activity_set\",\n", + " },\n", + " {\n", + " # ~50KB\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/stegen_emsl_analysis_data_objects.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + "]" ] }, { @@ -751,7 +779,11 @@ "metadata": {}, "outputs": [], "source": [ - "mdb.objects.find_one({\"name\": \"gov.nersc.portal__cfs.m3408.meta.stegen_MetaProteomicAnalysis_activity.json\"})" + "mdb.objects.find_one(\n", + " {\n", + " \"name\": \"gov.nersc.portal__cfs.m3408.meta.stegen_MetaProteomicAnalysis_activity.json\"\n", + " }\n", + ")" ] }, { @@ -770,7 +802,9 @@ "outputs": [], "source": [ "object_id = \"sys0b109\"\n", - "rv = client.request(\"PUT\", f\"/objects/{object_id}/types\", [\"metaproteomics_analysis_activity_set\"])\n", + "rv = client.request(\n", + " \"PUT\", f\"/objects/{object_id}/types\", [\"metaproteomics_analysis_activity_set\"]\n", + ")\n", "rv.content" ] }, @@ -935,7 +969,7 @@ "for path in tqdm(paths):\n", " meta = drs_metadata_for(path)\n", " url = f'{url_base}/{meta[\"name\"]}'\n", - " \n", + "\n", " rv = requests.head(\n", " url, allow_redirects=True, timeout=5, headers={\"Accept-Encoding\": \"gzip;q=0\"}\n", " )\n", @@ -943,17 +977,16 @@ " raise Exception(f\"url {url} not OK\")\n", "\n", " try:\n", - " size_bytes = int(rv.headers['Content-Length'])\n", + " size_bytes = int(rv.headers[\"Content-Length\"])\n", " except KeyError:\n", " raise Exception(f\"no content-length response for {url}\")\n", - " \n", + "\n", " if size_bytes != meta[\"size\"]:\n", - " raise Exception(f'size of local file {path} ({meta[\"size\"]}) does not match size at {url} ({size_bytes})')\n", - " \n", - " objects.append(DrsObjectIn(\n", - " access_methods=[{\"access_url\": {\"url\": url}}],\n", - " **meta\n", - " ))" + " raise Exception(\n", + " f'size of local file {path} ({meta[\"size\"]}) does not match size at {url} ({size_bytes})'\n", + " )\n", + "\n", + " objects.append(DrsObjectIn(access_methods=[{\"access_url\": {\"url\": url}}], **meta))" ] }, { @@ -996,11 +1029,14 @@ "paths = glob(os.path.expanduser(\"~/mongoexport/2021-10-14/*.jsonl.gz\"))\n", "names = [Path(p).name for p in paths]\n", "bundle_content_objects = [\n", - " dissoc(mdb.objects.find_one(\n", - " filter={\"name\": n},\n", - " projection=[\"id\", \"name\", \"size\", \"checksums\", \"created_time\"],\n", - " sort=[(\"created_time\", -1)]\n", - " ), \"_id\")\n", + " dissoc(\n", + " mdb.objects.find_one(\n", + " filter={\"name\": n},\n", + " projection=[\"id\", \"name\", \"size\", \"checksums\", \"created_time\"],\n", + " sort=[(\"created_time\", -1)],\n", + " ),\n", + " \"_id\",\n", + " )\n", " for n in names\n", "]" ] @@ -1039,7 +1075,9 @@ " if c[\"type\"] == \"sha256\"\n", "]\n", "if len(checksums) != len(bundle_content_objects):\n", - " raise Exception(\"Contents of bundle must have sha-256 checksums to compute bundle checksum\")" + " raise Exception(\n", + " \"Contents of bundle must have sha-256 checksums to compute bundle checksum\"\n", + " )" ] }, { @@ -1177,7 +1215,12 @@ "def local_file_to_api_object(file_info):\n", " return lftao(context, file_info)\n", "\n", - "obj = next(local_file_to_api_object({\"storage_path\": storage_path, \"mime_type\": 'application/json'}))\n", + "\n", + "obj = next(\n", + " local_file_to_api_object(\n", + " {\"storage_path\": storage_path, \"mime_type\": \"application/json\"}\n", + " )\n", + ")\n", "\n", "doc = mdb.objects.find_one({\"id\": obj[\"id\"]})\n", "assert doc[\"name\"] == Path(storage_path).name" @@ -1399,15 +1442,16 @@ " ExpiresIn=expires_in,\n", " )\n", "\n", + "\n", "s3client = get_s3_client()\n", "\n", "response = s3client.list_buckets()\n", - "for space in response['Buckets']:\n", - " print(space['Name'])\n", + "for space in response[\"Buckets\"]:\n", + " print(space[\"Name\"])\n", "print(f\"\\nusing Bucket {API_SITE_BUCKET}\\n\")\n", "response = s3client.list_objects(Bucket=API_SITE_BUCKET)\n", - "for obj in response['Contents']:\n", - " print(obj['Key'])" + "for obj in response[\"Contents\"]:\n", + " print(obj[\"Key\"])" ] }, { @@ -1445,6 +1489,7 @@ "tic = time()\n", "\n", "from dotenv import load_dotenv\n", + "\n", "load_dotenv(os.path.expanduser(\"~/.nmdc_mongo.env\"))" ] }, @@ -1481,7 +1526,7 @@ " fetch_json,\n", " get_db,\n", " reset_database,\n", - " snake_case_set_name\n", + " snake_case_set_name,\n", ")" ] }, @@ -1505,21 +1550,27 @@ "\n", "existing_set_names = set(dbschema[\"properties\"])\n", "\n", - "for object_without_set in (defined_object_names - set(set_for_object_name.keys())):\n", + "for object_without_set in defined_object_names - set(set_for_object_name.keys()):\n", " proposed_set_name = snake_case_set_name(object_without_set)\n", " if proposed_set_name not in existing_set_names:\n", " dbschema[\"properties\"][proposed_set_name] = {\n", - " \"description\": (f\"This property links a database object to the set of\"\n", - " f\" {object_without_set} objects within it.\"),\n", + " \"description\": (\n", + " f\"This property links a database object to the set of\"\n", + " f\" {object_without_set} objects within it.\"\n", + " ),\n", " \"items\": {\"$ref\": f\"#/definitions/{object_without_set}\"},\n", " \"type\": \"array\",\n", " }\n", - " \n", - "dbschema = assoc_in(dbschema, [\"definitions\", \"ControlledTermValue\", \"properties\", \"term\", \"type\"], \"string\")\n", + "\n", + "dbschema = assoc_in(\n", + " dbschema,\n", + " [\"definitions\", \"ControlledTermValue\", \"properties\", \"term\", \"type\"],\n", + " \"string\",\n", + ")\n", "del dbschema[\"definitions\"][\"ControlledTermValue\"][\"properties\"][\"term\"][\"$ref\"]\n", "\n", "# 'k' not capitalized upstream perhaps. should conform!\n", - "#dbschema = assoc_in(dbschema, [\"definitions\", \"MetagenomeAssembly\", \"properties\", \"scaf_l_gt50k\", \"type\"], \"number\")" + "# dbschema = assoc_in(dbschema, [\"definitions\", \"MetagenomeAssembly\", \"properties\", \"scaf_l_gt50k\", \"type\"], \"number\")" ] }, { @@ -1575,8 +1626,10 @@ "metadata": {}, "outputs": [], "source": [ - "rv = requests.post(\"http://localhost:8080/auth/token\",\n", - " {\"grant_type\": \"password\", \"username\": \"\", \"password\": \"\"})" + "rv = requests.post(\n", + " \"http://localhost:8080/auth/token\",\n", + " {\"grant_type\": \"password\", \"username\": \"\", \"password\": \"\"},\n", + ")" ] }, { @@ -1616,17 +1669,21 @@ "metadata": {}, "outputs": [], "source": [ - "rv = requests.post(\"http://localhost:8080/uploadObjects\", json={\n", - " \"results\": [\n", - " {\n", - " \"type\": \"Schema\",\n", - " \"content\": {\n", - " \"name\": \"Biosample\",\n", - " \"schema\": collschemas[\"biosample_set\"]\n", - " }\n", - " }\n", - " ]\n", - "}, headers=auth_header)" + "rv = requests.post(\n", + " \"http://localhost:8080/uploadObjects\",\n", + " json={\n", + " \"results\": [\n", + " {\n", + " \"type\": \"Schema\",\n", + " \"content\": {\n", + " \"name\": \"Biosample\",\n", + " \"schema\": collschemas[\"biosample_set\"],\n", + " },\n", + " }\n", + " ]\n", + " },\n", + " headers=auth_header,\n", + ")" ] }, { @@ -1647,19 +1704,24 @@ "outputs": [], "source": [ "import json\n", - "with open('/Users/dwinston/Desktop/cordra-upload.json','w') as f:\n", - " json.dump({\n", - " \"results\": [\n", - " {\n", - " \"id\": \"test/activity\",\n", - " \"type\": \"Schema\",\n", - " \"content\": {\n", - " \"name\": \"Activity\",\n", - " \"schema\": collschemas[\"activity_set\"]\n", - " }\n", - " }\n", - " ]\n", - " }, f, indent=2)" + "\n", + "with open(\"/Users/dwinston/Desktop/cordra-upload.json\", \"w\") as f:\n", + " json.dump(\n", + " {\n", + " \"results\": [\n", + " {\n", + " \"id\": \"test/activity\",\n", + " \"type\": \"Schema\",\n", + " \"content\": {\n", + " \"name\": \"Activity\",\n", + " \"schema\": collschemas[\"activity_set\"],\n", + " },\n", + " }\n", + " ]\n", + " },\n", + " f,\n", + " indent=2,\n", + " )" ] }, { @@ -1670,19 +1732,21 @@ "outputs": [], "source": [ "import json\n", - "with open('/Users/dwinston/Desktop/cordra-upload.json','w') as f:\n", - " json.dump({\n", - " \"results\": [\n", - " {\n", - " \"id\": \"test/study\",\n", - " \"type\": \"Schema\",\n", - " \"content\": {\n", - " \"name\": \"Study\",\n", - " \"schema\": collschemas[\"study_set\"]\n", - " }\n", - " }\n", - " ]\n", - " }, f, indent=2)" + "\n", + "with open(\"/Users/dwinston/Desktop/cordra-upload.json\", \"w\") as f:\n", + " json.dump(\n", + " {\n", + " \"results\": [\n", + " {\n", + " \"id\": \"test/study\",\n", + " \"type\": \"Schema\",\n", + " \"content\": {\"name\": \"Study\", \"schema\": collschemas[\"study_set\"]},\n", + " }\n", + " ]\n", + " },\n", + " f,\n", + " indent=2,\n", + " )" ] }, { @@ -1692,7 +1756,9 @@ "metadata": {}, "outputs": [], "source": [ - "rv = requests.get(\"http://localhost:8080/search?query=type:%22Schema%22\", headers=auth_header)" + "rv = requests.get(\n", + " \"http://localhost:8080/search?query=type:%22Schema%22\", headers=auth_header\n", + ")" ] }, { @@ -1726,7 +1792,8 @@ "outputs": [], "source": [ "import json\n", - "with open('/Users/dwinston/Desktop/cordra-upload.json','w') as f:\n", + "\n", + "with open(\"/Users/dwinston/Desktop/cordra-upload.json\", \"w\") as f:\n", " json.dump(template, f, indent=2)" ] }, @@ -1747,7 +1814,9 @@ "metadata": {}, "outputs": [], "source": [ - "rv = requests.post(\"http://localhost:8080/uploadObjects\", json=template, headers=auth_header)\n", + "rv = requests.post(\n", + " \"http://localhost:8080/uploadObjects\", json=template, headers=auth_header\n", + ")\n", "rv" ] }, diff --git a/metadata-translation/notebooks/202109_metadata_ingest.ipynb b/metadata-translation/notebooks/202109_metadata_ingest.ipynb index e4ee0733..ede58caa 100644 --- a/metadata-translation/notebooks/202109_metadata_ingest.ipynb +++ b/metadata-translation/notebooks/202109_metadata_ingest.ipynb @@ -100,12 +100,12 @@ "site_id = \"\"\n", "\n", "mdb.users.insert_one(\n", - " UserInDB(\n", - " username=username,\n", - " hashed_password=get_password_hash(password),\n", - " site_admin=[site_id ],\n", - " ).dict(exclude_unset=True)\n", - " )\n", + " UserInDB(\n", + " username=username,\n", + " hashed_password=get_password_hash(password),\n", + " site_admin=[site_id],\n", + " ).dict(exclude_unset=True)\n", + ")\n", "mdb.sites.insert_one(SiteInDB(id=site_id).dict(exclude_unset=True))" ] }, @@ -127,8 +127,7 @@ "from pathlib import Path\n", "\n", "filepath = Path.home().joinpath(\n", - " \"Dropbox\", \"diary\", \"21\", \"09\",\n", - " \"2021-09-15-scanon-nmdc-metadata-file-urls.txt\"\n", + " \"Dropbox\", \"diary\", \"21\", \"09\", \"2021-09-15-scanon-nmdc-metadata-file-urls.txt\"\n", ")\n", "with open(filepath) as f:\n", " urls = [line.strip() for line in f if line.strip()]" @@ -163,6 +162,7 @@ "\n", "pattern = re.compile(r\"https?://(?P[^/]+)/(?P.+)\")\n", "\n", + "\n", "def url_to_name(url):\n", " m = pattern.match(url)\n", " return (\n", @@ -178,9 +178,11 @@ "class HttpResponseNotOk(Exception):\n", " pass\n", "\n", + "\n", "class HttpResponseNotJson(Exception):\n", " pass\n", "\n", + "\n", "def response_to_json(response):\n", " if response.status_code != 200:\n", " raise HttpResponseNotOk()\n", @@ -197,6 +199,7 @@ " json.dump(json_data, f)\n", " return filepath\n", "\n", + "\n", "def json_clean(d, model, exclude_unset=False):\n", " return json.loads(model(**d).json(exclude_unset=exclude_unset))" ] @@ -226,8 +229,8 @@ " filepath,\n", " {\n", " \"access_methods\": [{\"access_url\": {\"url\": url}}],\n", - " \"name\": Path(filepath).name.replace(\":\",\"-\")\n", - " }\n", + " \"name\": Path(filepath).name.replace(\":\", \"-\"),\n", + " },\n", " )\n", " )\n", " result[url] = {\"result\": drs_object_in}\n", @@ -246,10 +249,9 @@ "for url, doc in tqdm(list(result.items())):\n", " if \"error\" in doc:\n", " continue\n", - " \n", + "\n", " drs_object_in = doc[\"result\"]\n", - " rv = client.create_object(\n", - " json.loads(drs_object_in.json(exclude_unset=True)))\n", + " rv = client.create_object(json.loads(drs_object_in.json(exclude_unset=True)))\n", " response[url] = rv.status_code\n", "\n", "all(v == 201 for v in response.values())" @@ -272,11 +274,11 @@ "source": [ "type_collections = {\n", " f'nmdc:{spec[\"items\"][\"$ref\"].split(\"/\")[-1]}': collection_name\n", - " for collection_name, spec in\n", - " nmdc_jsonschema[\"properties\"].items()\n", + " for collection_name, spec in nmdc_jsonschema[\"properties\"].items()\n", " if collection_name.endswith(\"_set\")\n", "}\n", "\n", + "\n", "def specialize_activity_set_docs(docs):\n", " if \"activity_set\" in docs:\n", " for doc in docs[\"activity_set\"]:\n", @@ -316,9 +318,7 @@ " docs = client.get_object_bytes(drs_id).json()\n", " docs = specialize_activity_set_docs(docs)\n", " _ = nmdc_jsonschema_validate(docs)\n", - " response[drs_id] = client.ensure_object_tag(\n", - " drs_id, \"schema#/definitions/Database\"\n", - " )\n", + " response[drs_id] = client.ensure_object_tag(drs_id, \"schema#/definitions/Database\")\n", "all(v is None or v.status_code == 200 for v in response.values())" ] }, @@ -351,9 +351,7 @@ " docs = client.get_object_bytes(drs_id).json()\n", " docs = specialize_activity_set_docs(docs)\n", " _ = nmdc_jsonschema_validate(docs)\n", - " response[drs_id] = client.ensure_object_tag(\n", - " drs_id, \"metadata-in\"\n", - " )\n", + " response[drs_id] = client.ensure_object_tag(drs_id, \"metadata-in\")\n", "all(v is None or v.status_code == 200 for v in response.values())" ] }, @@ -364,10 +362,12 @@ "metadata": {}, "outputs": [], "source": [ - "mdb.jobs.count_documents({\n", - " \"workflow.id\": \"portal-etl-1.0.0\",\n", - " \"config.object_id\": {\"$in\": list(drs_object_id.values())}\n", - "}) == len(drs_object_id.values())" + "mdb.jobs.count_documents(\n", + " {\n", + " \"workflow.id\": \"portal-etl-1.0.0\",\n", + " \"config.object_id\": {\"$in\": list(drs_object_id.values())},\n", + " }\n", + ") == len(drs_object_id.values())" ] }, { @@ -388,10 +388,15 @@ "from nmdc_runtime.api.models.util import ListRequest\n", "\n", "max_page_size = 1000\n", - "lr = ListRequest(filter=json.dumps({\n", - " \"workflow.id\": \"portal-etl-1.0.0\",\n", - " \"config.object_id\": {\"$in\": list(drs_object_id.values())}\n", - "}), max_page_size=max_page_size)\n", + "lr = ListRequest(\n", + " filter=json.dumps(\n", + " {\n", + " \"workflow.id\": \"portal-etl-1.0.0\",\n", + " \"config.object_id\": {\"$in\": list(drs_object_id.values())},\n", + " }\n", + " ),\n", + " max_page_size=max_page_size,\n", + ")\n", "jobs = []\n", "while True:\n", " rv = client.list_jobs(lr.dict()).json()\n", @@ -401,7 +406,7 @@ " break\n", " else:\n", " lr.page_token = rv[\"next_page_token\"]\n", - " \n", + "\n", " # safety escape\n", " if len(jobs) == len(drs_object_id.values()):\n", " break" @@ -456,6 +461,7 @@ "source": [ "from toolz import dissoc\n", "\n", + "\n", "def mongo_add_docs_result_as_dict(rv):\n", " return {\n", " collection_name: dissoc(bulk_write_result.bulk_api_result, \"upserted\")\n", @@ -478,7 +484,7 @@ "\n", "for doc in tqdm(job_ops):\n", " op = Operation[ResultT, JobOperationMetadata](**doc)\n", - " \n", + "\n", " docs = client.get_object_bytes(op.metadata.job.config[\"object_id\"]).json()\n", " docs = specialize_activity_set_docs(docs)\n", " op_result[op.id] = mongo.add_docs(docs, validate=True)" @@ -503,11 +509,9 @@ " if client.operation_is_done(op_id):\n", " print(\"op\", op_id, \"marked as done already. Skipping...\")\n", " continue\n", - " \n", + "\n", " op_patch = UpdateOperationRequest(\n", - " done=True,\n", - " result=mongo_add_docs_result_as_dict(rv),\n", - " metadata={\"done_at\": now}\n", + " done=True, result=mongo_add_docs_result_as_dict(rv), metadata={\"done_at\": now}\n", " )\n", " op_patch_result[op_id] = client.update_operation(op_id, op_patch).json()" ] @@ -548,8 +552,7 @@ "from pathlib import Path\n", "\n", "filepath = Path.home().joinpath(\n", - " \"Dropbox\", \"diary\", \"21\", \"09\",\n", - " \"scanon-annotations-urls.txt\"\n", + " \"Dropbox\", \"diary\", \"21\", \"09\", \"scanon-annotations-urls.txt\"\n", ")\n", "with open(filepath) as f:\n", " anno_additions_urls = [line.strip() for line in f if line.strip()]" @@ -579,15 +582,15 @@ "urlpath = \"https://portal.nersc.gov/project/m3408/meta/anno2/\"\n", "rv = requests.get(f\"{urlpath}?C=M;O=D\")\n", "\n", - "soup = BeautifulSoup(rv.text, 'html.parser')\n", + "soup = BeautifulSoup(rv.text, \"html.parser\")\n", "\n", - "anno_fixes_urls = [] \n", + "anno_fixes_urls = []\n", "\n", "for tr in soup.find_all(\"tr\"):\n", " tds = tr.find_all(\"td\")\n", " if len(tds) != 5:\n", " continue\n", - " \n", + "\n", " _, td_name, td_last_modified, td_size, _ = tds\n", " if td_last_modified.text.startswith(\"2021-09\"):\n", " name = td_name.a.text\n", @@ -662,29 +665,24 @@ "import json\n", "\n", "doc = {\n", - " \"aliases\": None,\n", - " \"description\": \"fix biosamples INSDC ID Mongo update\",\n", - " \"mime_type\": \"application/json\",\n", - " \"name\": \"fix_biosample_insdc_ids.json\",\n", - " \"access_methods\": [\n", - " {\n", - " \"access_url\": {\n", - " \"url\": \"https://portal.nersc.gov/project/m3408/meta/fix_biosample_insdc_ids.json\"\n", - " },\n", - " \"region\": None,\n", - " \"type\": \"https\"\n", - " }\n", - " ],\n", - " \"checksums\": [\n", - " {\n", - " \"checksum\": \"8aca72ffe32265e2c2a6a4de9ae47a53\",\n", - " \"type\": \"md5\"\n", - " }\n", - " ],\n", - " \"created_time\": \"2021-10-13T23:34:13.740Z\",\n", - " \"size\": 47968,\n", - " \"updated_time\": \"2021-10-13T23:34:13.740Z\",\n", - " \"version\": None\n", + " \"aliases\": None,\n", + " \"description\": \"fix biosamples INSDC ID Mongo update\",\n", + " \"mime_type\": \"application/json\",\n", + " \"name\": \"fix_biosample_insdc_ids.json\",\n", + " \"access_methods\": [\n", + " {\n", + " \"access_url\": {\n", + " \"url\": \"https://portal.nersc.gov/project/m3408/meta/fix_biosample_insdc_ids.json\"\n", + " },\n", + " \"region\": None,\n", + " \"type\": \"https\",\n", + " }\n", + " ],\n", + " \"checksums\": [{\"checksum\": \"8aca72ffe32265e2c2a6a4de9ae47a53\", \"type\": \"md5\"}],\n", + " \"created_time\": \"2021-10-13T23:34:13.740Z\",\n", + " \"size\": 47968,\n", + " \"updated_time\": \"2021-10-13T23:34:13.740Z\",\n", + " \"version\": None,\n", "}" ] }, @@ -723,10 +721,9 @@ "for url, doc in tqdm(list(drs_object_in_for.items())):\n", " if \"error\" in doc:\n", " continue\n", - " \n", + "\n", " drs_object_in = doc[\"result\"]\n", - " rv = client.create_object(\n", - " json.loads(drs_object_in.json(exclude_unset=True)))\n", + " rv = client.create_object(json.loads(drs_object_in.json(exclude_unset=True)))\n", " create_drs_object_response[url] = rv.status_code" ] }, @@ -771,7 +768,13 @@ "metadata": {}, "outputs": [], "source": [ - "us = [u.replace(\"https://data.microbiomedata.org/data/\",\"/project/projectdirs/m3408/ficus/pipeline_products/\") for u in anno_additions_urls]" + "us = [\n", + " u.replace(\n", + " \"https://data.microbiomedata.org/data/\",\n", + " \"/project/projectdirs/m3408/ficus/pipeline_products/\",\n", + " )\n", + " for u in anno_additions_urls\n", + "]" ] }, { @@ -791,7 +794,12 @@ "metadata": {}, "outputs": [], "source": [ - "us = [u.replace(\"https://portal.nersc.gov/project/m3408/\",\"/project/projectdirs/m3408/www/\") for u in anno_fixes_urls]" + "us = [\n", + " u.replace(\n", + " \"https://portal.nersc.gov/project/m3408/\", \"/project/projectdirs/m3408/www/\"\n", + " )\n", + " for u in anno_fixes_urls\n", + "]" ] }, { @@ -826,6 +834,7 @@ " \"https://portal.nersc.gov/project/m3408/\": \"/Users/dwinston/nmdc_files/2021-09-scanon-meta/www/\",\n", "}\n", "\n", + "\n", "def load_local_json(url):\n", " path = url\n", " for before, after in prefixes_url_to_local.items():\n", @@ -866,25 +875,24 @@ "\n", "skip = True\n", "for url, drs_id in tqdm(list(drs_object_id.items())):\n", - " if url == \"https://portal.nersc.gov/project/m3408/meta/anno2/503568_186507_features.json\":\n", + " if (\n", + " url\n", + " == \"https://portal.nersc.gov/project/m3408/meta/anno2/503568_186507_features.json\"\n", + " ):\n", " skip = False\n", " print(\"skipping\", url, \"...\")\n", " continue\n", " if skip:\n", " continue\n", - " \n", + "\n", " print(\"loading bytes for\", url, \"...\")\n", " docs = load_local_json(url)\n", " print(docs.keys())\n", " print(\"loaded. validating...\")\n", " _ = nmdc_jsonschema_validate(docs)\n", " print(\"validated. ensuring tags...\")\n", - " response[drs_id] = client.ensure_object_tag(\n", - " drs_id, \"schema#/definitions/Database\"\n", - " )\n", - " response[drs_id] = client.ensure_object_tag(\n", - " drs_id, \"metadata-in\"\n", - " )\n", + " response[drs_id] = client.ensure_object_tag(drs_id, \"schema#/definitions/Database\")\n", + " response[drs_id] = client.ensure_object_tag(drs_id, \"metadata-in\")\n", " print(\"done with\", url)\n", "all(v is None or v.status_code == 200 for v in response.values())" ] @@ -897,7 +905,10 @@ "outputs": [], "source": [ "for url, drs_id in drs_object_id.items():\n", - " if url == \"https://portal.nersc.gov/project/m3408/meta/anno2/503568_186507_features.json\":\n", + " if (\n", + " url\n", + " == \"https://portal.nersc.gov/project/m3408/meta/anno2/503568_186507_features.json\"\n", + " ):\n", " print(drs_id)" ] }, @@ -926,7 +937,7 @@ "metadata": {}, "outputs": [], "source": [ - "with open(\"drs_object_ids_to_ingest.json\",\"w\") as f:\n", + "with open(\"drs_object_ids_to_ingest.json\", \"w\") as f:\n", " json.dump(drs_object_ids_to_ingest, f)" ] }, @@ -948,10 +959,12 @@ "metadata": {}, "outputs": [], "source": [ - "mdb.jobs.count_documents({\n", - " \"workflow.id\": \"portal-etl-1.0.0\",\n", - " \"config.object_id\": {\"$in\": drs_object_ids_to_ingest}\n", - "}) == len(drs_object_ids_to_ingest)" + "mdb.jobs.count_documents(\n", + " {\n", + " \"workflow.id\": \"portal-etl-1.0.0\",\n", + " \"config.object_id\": {\"$in\": drs_object_ids_to_ingest},\n", + " }\n", + ") == len(drs_object_ids_to_ingest)" ] }, { @@ -964,10 +977,15 @@ "from nmdc_runtime.api.models.util import ListRequest\n", "\n", "max_page_size = 1000\n", - "lr = ListRequest(filter=json.dumps({\n", - " \"workflow.id\": \"portal-etl-1.0.0\",\n", - " \"config.object_id\": {\"$in\": drs_object_ids_to_ingest}\n", - "}), max_page_size=max_page_size)\n", + "lr = ListRequest(\n", + " filter=json.dumps(\n", + " {\n", + " \"workflow.id\": \"portal-etl-1.0.0\",\n", + " \"config.object_id\": {\"$in\": drs_object_ids_to_ingest},\n", + " }\n", + " ),\n", + " max_page_size=max_page_size,\n", + ")\n", "jobs = []\n", "while True:\n", " rv = client.list_jobs(lr.dict()).json()\n", @@ -977,7 +995,7 @@ " break\n", " else:\n", " lr.page_token = rv[\"next_page_token\"]\n", - " \n", + "\n", " # safety escape\n", " if len(jobs) == len(drs_object_ids_to_ingest):\n", " break\n", @@ -1004,11 +1022,15 @@ "metadata": {}, "outputs": [], "source": [ - "job_ops = list(mdb.operations.find({\n", - " \"metadata.job.workflow.id\": \"portal-etl-1.0.0\",\n", - " \"metadata.job.config.object_id\": {\"$in\": drs_object_ids_to_ingest},\n", - " \"done\": False\n", - "}))\n", + "job_ops = list(\n", + " mdb.operations.find(\n", + " {\n", + " \"metadata.job.workflow.id\": \"portal-etl-1.0.0\",\n", + " \"metadata.job.config.object_id\": {\"$in\": drs_object_ids_to_ingest},\n", + " \"done\": False,\n", + " }\n", + " )\n", + ")\n", "\n", "len(job_ops)" ] @@ -1039,14 +1061,16 @@ " op_result[op.id] = mongo.add_docs(docs, validate=False, replace=False)\n", " del docs\n", " gc.collect()\n", - " \n", + "\n", " if client.operation_is_done(op.id):\n", " print(\"op\", op.id, \"marked as done already. Skipping...\")\n", " else:\n", " op_patch = UpdateOperationRequest(\n", " done=True,\n", " result=mongo_add_docs_result_as_dict(op_result[op.id]),\n", - " metadata={\"done_at\": datetime.now(timezone.utc).isoformat(timespec=\"seconds\")}\n", + " metadata={\n", + " \"done_at\": datetime.now(timezone.utc).isoformat(timespec=\"seconds\")\n", + " },\n", " )\n", " op_patch_result[op.id] = client.update_operation(op.id, op_patch).json()\n", " print(\"op\", op.id, \"marked as done.\")" @@ -1084,7 +1108,9 @@ "source": [ "from toolz import dissoc\n", "\n", - "omics_processing_docs = [dissoc(d, \"_id\") for d in mdb_staging[\"gold.omics_processing_set\"].find()]" + "omics_processing_docs = [\n", + " dissoc(d, \"_id\") for d in mdb_staging[\"gold.omics_processing_set\"].find()\n", + "]" ] }, { @@ -1105,7 +1131,9 @@ "metadata": {}, "outputs": [], "source": [ - "omics_processing_docs_for_spruce = [d for d in omics_processing_docs if \"gold:Gs0110138\" in d.get(\"part_of\" ,[])]" + "omics_processing_docs_for_spruce = [\n", + " d for d in omics_processing_docs if \"gold:Gs0110138\" in d.get(\"part_of\", [])\n", + "]" ] }, { @@ -1135,13 +1163,15 @@ "for ompro_doc in tqdm(omics_processing_docs_for_spruce):\n", " project_id = ompro_doc[\"id\"]\n", " activity_docs = [\n", - " dissoc(d, \"_id\") for d in\n", - " mdb.read_QC_analysis_activity_set.find({\"was_informed_by\": project_id})\n", + " dissoc(d, \"_id\")\n", + " for d in mdb.read_QC_analysis_activity_set.find({\"was_informed_by\": project_id})\n", " ]\n", " for adoc in activity_docs:\n", " assert len(adoc.get(\"has_input\", [])) == 1\n", " data_object_id = adoc.get(\"has_input\")[0]\n", - " docs_to_add[\"omics_processing_set\"].append(assoc(ompro_doc, \"has_output\", [data_object_id]))" + " docs_to_add[\"omics_processing_set\"].append(\n", + " assoc(ompro_doc, \"has_output\", [data_object_id])\n", + " )" ] }, { @@ -1183,7 +1213,7 @@ "metadata": {}, "outputs": [], "source": [ - "rv['omics_processing_set'].upserted_count" + "rv[\"omics_processing_set\"].upserted_count" ] }, { @@ -1193,7 +1223,9 @@ "metadata": {}, "outputs": [], "source": [ - "ompro_ids_not_added = {d[\"id\"] for d in omics_processing_docs_for_spruce} - {d[\"id\"] for d in docs_to_add[\"omics_processing_set\"]}" + "ompro_ids_not_added = {d[\"id\"] for d in omics_processing_docs_for_spruce} - {\n", + " d[\"id\"] for d in docs_to_add[\"omics_processing_set\"]\n", + "}" ] }, { @@ -1266,9 +1298,9 @@ "metadata": {}, "outputs": [], "source": [ - "(mdb.omics_processing_set.count_documents({\"has_output.0\": {\"$exists\": True}})\n", - " ==\n", - " mdb.omics_processing_set.count_documents({})\n", + "(\n", + " mdb.omics_processing_set.count_documents({\"has_output.0\": {\"$exists\": True}})\n", + " == mdb.omics_processing_set.count_documents({})\n", ")" ] }, @@ -1300,10 +1332,17 @@ "import json\n", "from pathlib import Path\n", "\n", - "with open(Path(\"~\").expanduser().joinpath(\n", - " 'Dropbox', 'diary', '21', '10',\n", - " '2021-09-14-stegen_emsl_analysis_data_objects.json'\n", - ")) as f:\n", + "with open(\n", + " Path(\"~\")\n", + " .expanduser()\n", + " .joinpath(\n", + " \"Dropbox\",\n", + " \"diary\",\n", + " \"21\",\n", + " \"10\",\n", + " \"2021-09-14-stegen_emsl_analysis_data_objects.json\",\n", + " )\n", + ") as f:\n", " docs = json.load(f)" ] }, @@ -1419,7 +1458,11 @@ "metadata": {}, "outputs": [], "source": [ - "names = [n for n in mdb.list_collection_names() if n.endswith(\"_set\") and mdb[n].estimated_document_count() > 0]" + "names = [\n", + " n\n", + " for n in mdb.list_collection_names()\n", + " if n.endswith(\"_set\") and mdb[n].estimated_document_count() > 0\n", + "]" ] }, { diff --git a/metadata-translation/notebooks/NMDC-GOLD-data-counts.ipynb b/metadata-translation/notebooks/NMDC-GOLD-data-counts.ipynb index 6ca9c558..6494c3c9 100644 --- a/metadata-translation/notebooks/NMDC-GOLD-data-counts.ipynb +++ b/metadata-translation/notebooks/NMDC-GOLD-data-counts.ipynb @@ -253,7 +253,9 @@ ], "source": [ "## import data from the 2020-10-16 dump\n", - "df = pds.read_csv(\"/Users/wdduncan/Desktop/biosample-join-packages.tsv\", sep='\\t', dtype=str)\n", + "df = pds.read_csv(\n", + " \"/Users/wdduncan/Desktop/biosample-join-packages.tsv\", sep=\"\\t\", dtype=str\n", + ")\n", "df.head()" ] }, @@ -381,7 +383,9 @@ } ], "source": [ - "countDf = pds.DataFrame(df.count(), columns=['count']).sort_values(by='count', ascending=False)\n", + "countDf = pds.DataFrame(df.count(), columns=[\"count\"]).sort_values(\n", + " by=\"count\", ascending=False\n", + ")\n", "countDf" ] }, @@ -440,7 +444,7 @@ } ], "source": [ - "len(countDf[countDf['count']>0])" + "len(countDf[countDf[\"count\"] > 0])" ] }, { @@ -538,7 +542,7 @@ "metadata": {}, "outputs": [], "source": [ - "countDf['pct rank'] = countDf['count'].rank(pct=True)" + "countDf[\"pct rank\"] = countDf[\"count\"].rank(pct=True)" ] }, { @@ -690,7 +694,7 @@ ], "source": [ "%matplotlib inline\n", - "countDf[['count']].plot.barh(figsize=(10,7))" + "countDf[[\"count\"]].plot.barh(figsize=(10, 7))" ] }, { @@ -724,7 +728,7 @@ ], "source": [ "%matplotlib inline\n", - "countDf[countDf['count']>0][['count']].plot.barh(figsize=(25,20))" + "countDf[countDf[\"count\"] > 0][[\"count\"]].plot.barh(figsize=(25, 20))" ] }, { @@ -747,7 +751,7 @@ } ], "source": [ - "countDf[countDf['count']>0].loc['pressure']" + "countDf[countDf[\"count\"] > 0].loc[\"pressure\"]" ] }, { @@ -1042,7 +1046,7 @@ } ], "source": [ - "countDf[countDf['count']>129]" + "countDf[countDf[\"count\"] > 129]" ] }, { @@ -1063,7 +1067,7 @@ } ], "source": [ - "len(countDf[countDf['count']>129])" + "len(countDf[countDf[\"count\"] > 129])" ] }, { @@ -1096,7 +1100,7 @@ } ], "source": [ - "countDf[countDf['count']>129][['count']].plot.barh(figsize=(25,20))" + "countDf[countDf[\"count\"] > 129][[\"count\"]].plot.barh(figsize=(25, 20))" ] }, { @@ -1117,7 +1121,7 @@ } ], "source": [ - "len(countDf[countDf['count']>129])" + "len(countDf[countDf[\"count\"] > 129])" ] }, { @@ -1370,7 +1374,7 @@ } ], "source": [ - "countDf[countDf['count']>129][['count']].cumsum()" + "countDf[countDf[\"count\"] > 129][[\"count\"]].cumsum()" ] }, { @@ -1380,7 +1384,7 @@ "metadata": {}, "outputs": [], "source": [ - "countDf['cum pct'] = 100*(countDf['count'].cumsum() / countDf['count'].sum())\n", + "countDf[\"cum pct\"] = 100 * (countDf[\"count\"].cumsum() / countDf[\"count\"].sum())\n", "# countDf['count'].sum()" ] }, @@ -1599,7 +1603,7 @@ } ], "source": [ - "countDf[countDf['cum pct'] < 91]" + "countDf[countDf[\"cum pct\"] < 91]" ] }, { @@ -1620,7 +1624,7 @@ } ], "source": [ - "len(countDf[countDf['cum pct'] <= 99])" + "len(countDf[countDf[\"cum pct\"] <= 99])" ] }, { @@ -1655,7 +1659,7 @@ } ], "source": [ - "countDf[countDf['cum pct'] <= 99][['count']].plot.barh(figsize=(25,20))" + "countDf[countDf[\"cum pct\"] <= 99][[\"count\"]].plot.barh(figsize=(25, 20))" ] }, { @@ -1677,7 +1681,7 @@ ], "source": [ "pds.options.display.max_columns = 500\n", - "df['water_alkalinity_method'].value_counts()" + "df[\"water_alkalinity_method\"].value_counts()" ] }, { @@ -1820,7 +1824,9 @@ ], "source": [ "# list(df['subsurface_depth'].unique())\n", - "df[pds.notnull(df.salinity)][['biosample_id', 'gold_id','salinity', 'salinity_concentration']]" + "df[pds.notnull(df.salinity)][\n", + " [\"biosample_id\", \"gold_id\", \"salinity\", \"salinity_concentration\"]\n", + "]" ] }, { diff --git a/metadata-translation/notebooks/archive/generate-gold-sample-json.ipynb b/metadata-translation/notebooks/archive/generate-gold-sample-json.ipynb index 2f1099be..1514bc16 100644 --- a/metadata-translation/notebooks/archive/generate-gold-sample-json.ipynb +++ b/metadata-translation/notebooks/archive/generate-gold-sample-json.ipynb @@ -39,51 +39,51 @@ "metadata": {}, "outputs": [], "source": [ - "subset_cols = \\\n", - " ['biosample_id',\n", - " 'biosample_name',\n", - " 'description',\n", - " 'add_date',\n", - " 'mod_date',\n", - " 'ecosystem_path_id',\n", - " 'ecosystem',\n", - " 'ecosystem_category',\n", - " 'ecosystem_type',\n", - " 'ecosystem_subtype',\n", - " 'specific_ecosystem',\n", - " 'habitat',\n", - " 'location',\n", - " 'community',\n", - " 'ncbi_taxonomy_name',\n", - " 'geographic_location',\n", - " 'latitude',\n", - " 'longitude',\n", - " 'sample_collection_site',\n", - " 'identifier',\n", - " 'sample_collection_year',\n", - " 'sample_collection_month',\n", - " 'sample_collection_day',\n", - " 'sample_collection_hour',\n", - " 'sample_collection_minute',\n", - " 'host_name',\n", - " 'depth',\n", - " 'subsurface_depth',\n", - " 'altitude',\n", - " 'temperature_range',\n", - " 'proport_woa_temperature',\n", - " 'biogas_temperature',\n", - " 'growth_temperature',\n", - " 'soil_annual_season_temp',\n", - " 'water_samp_store_temp',\n", - " 'biogas_retention_time',\n", - " 'salinity',\n", - " 'pressure',\n", - " 'ph',\n", - " 'chlorophyll_concentration',\n", - " 'nitrate_concentration',\n", - " 'oxygen_concentration',\n", - " 'salinity_concentration'\n", - " ]\n" + "subset_cols = [\n", + " \"biosample_id\",\n", + " \"biosample_name\",\n", + " \"description\",\n", + " \"add_date\",\n", + " \"mod_date\",\n", + " \"ecosystem_path_id\",\n", + " \"ecosystem\",\n", + " \"ecosystem_category\",\n", + " \"ecosystem_type\",\n", + " \"ecosystem_subtype\",\n", + " \"specific_ecosystem\",\n", + " \"habitat\",\n", + " \"location\",\n", + " \"community\",\n", + " \"ncbi_taxonomy_name\",\n", + " \"geographic_location\",\n", + " \"latitude\",\n", + " \"longitude\",\n", + " \"sample_collection_site\",\n", + " \"identifier\",\n", + " \"sample_collection_year\",\n", + " \"sample_collection_month\",\n", + " \"sample_collection_day\",\n", + " \"sample_collection_hour\",\n", + " \"sample_collection_minute\",\n", + " \"host_name\",\n", + " \"depth\",\n", + " \"subsurface_depth\",\n", + " \"altitude\",\n", + " \"temperature_range\",\n", + " \"proport_woa_temperature\",\n", + " \"biogas_temperature\",\n", + " \"growth_temperature\",\n", + " \"soil_annual_season_temp\",\n", + " \"water_samp_store_temp\",\n", + " \"biogas_retention_time\",\n", + " \"salinity\",\n", + " \"pressure\",\n", + " \"ph\",\n", + " \"chlorophyll_concentration\",\n", + " \"nitrate_concentration\",\n", + " \"oxygen_concentration\",\n", + " \"salinity_concentration\",\n", + "]" ] }, { @@ -92,7 +92,7 @@ "metadata": {}, "outputs": [], "source": [ - "nrows = 5 # set to None for all records\n", + "nrows = 5 # set to None for all records\n", "save_file_name = \"output/schema-test.json\"\n", "df = dop.make_dataframe(file_name, subset_cols=subset_cols, nrows=nrows)" ] @@ -331,7 +331,7 @@ } ], "source": [ - "df.head() # peek at data" + "df.head() # peek at data" ] }, { @@ -410,7 +410,8 @@ "source": [ "## print out a single record for viewing\n", "for record in dictdf:\n", - " print(json.dumps(record, indent=4)); break" + " print(json.dumps(record, indent=4))\n", + " break" ] }, { @@ -426,7 +427,9 @@ "metadata": {}, "outputs": [], "source": [ - "json_list = dop.make_json_string_list(dictdf, nmdc.Biosample, 'biosample_id', 'biosample_name')" + "json_list = dop.make_json_string_list(\n", + " dictdf, nmdc.Biosample, \"biosample_id\", \"biosample_name\"\n", + ")" ] }, { @@ -592,7 +595,7 @@ } ], "source": [ - "print(json_list[1]) ## peek at data" + "print(json_list[1]) ## peek at data" ] }, { diff --git a/metadata-translation/notebooks/archive/translate-30_FICUS_Proposals_Metadata_4_Emiley_Chris_11082019.ipynb b/metadata-translation/notebooks/archive/translate-30_FICUS_Proposals_Metadata_4_Emiley_Chris_11082019.ipynb index 888b1a2f..6de6c853 100644 --- a/metadata-translation/notebooks/archive/translate-30_FICUS_Proposals_Metadata_4_Emiley_Chris_11082019.ipynb +++ b/metadata-translation/notebooks/archive/translate-30_FICUS_Proposals_Metadata_4_Emiley_Chris_11082019.ipynb @@ -12,6 +12,7 @@ "from hashlib import md5\n", "from pandasql import sqldf\n", "\n", + "\n", "def pysqldf(q):\n", " return sqldf(q, globals())" ] @@ -29,7 +30,9 @@ "metadata": {}, "outputs": [], "source": [ - "ficusdf = pds.read_excel(\"data/30_FICUS_Proposals_Metadata_4_Emiley_Chris_11082019.xlsx\")" + "ficusdf = pds.read_excel(\n", + " \"data/30_FICUS_Proposals_Metadata_4_Emiley_Chris_11082019.xlsx\"\n", + ")" ] }, { @@ -71,8 +74,22 @@ "metadata": {}, "outputs": [], "source": [ - "gold_elevels = ['ecosystem', 'ecosystem_category', 'ecosystem_type', 'ecosystem_subtype', 'specific_ecosystem']\n", - "id_fields = ['gold_id', 'ecosystem_path_id', 'biosample_id', 'organism_id', 'analysis_project_id', 'submission_id', 'img_taxon_id']" + "gold_elevels = [\n", + " \"ecosystem\",\n", + " \"ecosystem_category\",\n", + " \"ecosystem_type\",\n", + " \"ecosystem_subtype\",\n", + " \"specific_ecosystem\",\n", + "]\n", + "id_fields = [\n", + " \"gold_id\",\n", + " \"ecosystem_path_id\",\n", + " \"biosample_id\",\n", + " \"organism_id\",\n", + " \"analysis_project_id\",\n", + " \"submission_id\",\n", + " \"img_taxon_id\",\n", + "]" ] }, { @@ -90,16 +107,18 @@ "source": [ "def make_row_list(row, subset_list=[]):\n", " if len(subset_list) > 0: # only get values in subset list\n", - " row_list = [str(v).lower().strip()\n", - " for k,v in row.to_dict().items() \n", - " if k in subset_list]\n", + " row_list = [\n", + " str(v).lower().strip() for k, v in row.to_dict().items() if k in subset_list\n", + " ]\n", " else:\n", " row_list = [str(e) for e in list(row)]\n", - " \n", + "\n", " return row_list\n", "\n", "\n", - "def make_ficus_row_iri(row, id_field_list, prefix=\"http://purl.obolibrary.org/obo/GOLD_\"):\n", + "def make_ficus_row_iri(\n", + " row, id_field_list, prefix=\"http://purl.obolibrary.org/obo/GOLD_\"\n", + "):\n", " row_list = make_row_list(row, id_field_list)\n", " return make_iri(\"_\".join(row_list), prefix)\n", "\n", @@ -111,7 +130,9 @@ " return \"\"\n", "\n", "\n", - "def make_class_iri(row, gold_elevel_list, prefix=\"http://purl.obolibrary.org/obo/GOLD_\"):\n", + "def make_class_iri(\n", + " row, gold_elevel_list, prefix=\"http://purl.obolibrary.org/obo/GOLD_\"\n", + "):\n", " row_hash = make_row_hash(row, gold_elevel_list)\n", " return make_iri(row_hash, prefix)\n", "\n", @@ -119,15 +140,15 @@ "def make_row_hash(row, subset_list=[]):\n", " row_list = make_row_list(row, subset_list)\n", "\n", - " temp = [\"\" if pds.isnull(e) else e for e in row_list] # replace NaNs with ''\n", + " temp = [\"\" if pds.isnull(e) else e for e in row_list] # replace NaNs with ''\n", " temp = \"\".join(temp)\n", - "# print(\"row: \", temp)\n", + " # print(\"row: \", temp)\n", " return make_hash(temp)\n", "\n", "\n", "def make_hash(val):\n", " if len(val) > 0:\n", - " hash = md5(val.encode('utf-8'))\n", + " hash = md5(val.encode(\"utf-8\"))\n", " return str(hash.hexdigest())\n", " else:\n", " return \"\"\n", @@ -137,8 +158,7 @@ " annotation_dict = {}\n", " for val in value_list:\n", " val = str(val).lower().strip()\n", - " annotation_dict[val] = \\\n", - " {'iri': make_iri(str(val), prefix=prefix), 'label': val}\n", + " annotation_dict[val] = {\"iri\": make_iri(str(val), prefix=prefix), \"label\": val}\n", " return annotation_dict" ] }, @@ -244,8 +264,12 @@ "metadata": {}, "outputs": [], "source": [ - "ficusdf['row_iri'] = ficusdf.apply(lambda row: make_ficus_row_iri(row, id_fields), axis=1)\n", - "ficusdf['class_iri'] = ficusdf.apply(lambda row: make_class_iri(row, gold_elevels), axis=1)" + "ficusdf[\"row_iri\"] = ficusdf.apply(\n", + " lambda row: make_ficus_row_iri(row, id_fields), axis=1\n", + ")\n", + "ficusdf[\"class_iri\"] = ficusdf.apply(\n", + " lambda row: make_class_iri(row, gold_elevels), axis=1\n", + ")" ] }, { @@ -362,9 +386,18 @@ ], "source": [ "## examine output\n", - "pds.set_option('display.max_colwidth', 1000)\n", + "pds.set_option(\"display.max_colwidth\", 1000)\n", "ficusdf.class_iri.head()\n", - "ficusdf[['ecosystem', 'ecosystem_category', 'ecosystem_type', 'ecosystem_subtype', 'specific_ecosystem', 'class_iri']].head()" + "ficusdf[\n", + " [\n", + " \"ecosystem\",\n", + " \"ecosystem_category\",\n", + " \"ecosystem_type\",\n", + " \"ecosystem_subtype\",\n", + " \"specific_ecosystem\",\n", + " \"class_iri\",\n", + " ]\n", + "].head()" ] }, { @@ -381,7 +414,7 @@ "metadata": {}, "outputs": [], "source": [ - "g = Graph() # instantiate graph" + "g = Graph() # instantiate graph" ] }, { @@ -399,8 +432,8 @@ "source": [ "for k, v in annotation_dict.items():\n", " ## note: the value is a dict with keys 'iri' and 'label'\n", - " g.add((URIRef(v['iri']), RDF.type, OWL.AnnotationProperty))\n", - " g.add((URIRef(v['iri']), RDFS.label, Literal(v['label'])))" + " g.add((URIRef(v[\"iri\"]), RDF.type, OWL.AnnotationProperty))\n", + " g.add((URIRef(v[\"iri\"]), RDFS.label, Literal(v[\"label\"])))" ] }, { @@ -416,18 +449,23 @@ "metadata": {}, "outputs": [], "source": [ - "for (ix, row) in ficusdf.iterrows(): # ontdf.head(100).itertuples():\n", - " if len(row.row_iri) > 0: \n", - " row_iri = URIRef(row['row_iri']) \n", - " g.add((row_iri, RDF.type, OWL.NamedIndividual)) # add instance iri to graph\n", - " \n", - " if len(row.class_iri) > 0: \n", - " class_iri = URIRef(row['class_iri'])\n", - " g.add((row_iri, RDF.type, class_iri)) # add type the row instantiates\n", - " \n", - " for k, v in annotation_dict.items(): # field values in spreadsheet as annotation values\n", + "for ix, row in ficusdf.iterrows(): # ontdf.head(100).itertuples():\n", + " if len(row.row_iri) > 0:\n", + " row_iri = URIRef(row[\"row_iri\"])\n", + " g.add((row_iri, RDF.type, OWL.NamedIndividual)) # add instance iri to graph\n", + "\n", + " if len(row.class_iri) > 0:\n", + " class_iri = URIRef(row[\"class_iri\"])\n", + " g.add((row_iri, RDF.type, class_iri)) # add type the row instantiates\n", + "\n", + " for (\n", + " k,\n", + " v,\n", + " ) in (\n", + " annotation_dict.items()\n", + " ): # field values in spreadsheet as annotation values\n", " field_val = row[k]\n", - " annotation_iri = URIRef(v['iri'])\n", + " annotation_iri = URIRef(v[\"iri\"])\n", " g.add((row_iri, annotation_iri, Literal(field_val)))" ] }, @@ -438,7 +476,7 @@ "outputs": [], "source": [ "## save graph (note: different formatats (e.g., turtle) are possible)\n", - "g.serialize(destination='output/FICUS-projects-translation.owl', format='xml')" + "g.serialize(destination=\"output/FICUS-projects-translation.owl\", format=\"xml\")" ] }, { diff --git a/metadata-translation/notebooks/changesheets-example.ipynb b/metadata-translation/notebooks/changesheets-example.ipynb index fa46ca22..08eb717a 100644 --- a/metadata-translation/notebooks/changesheets-example.ipynb +++ b/metadata-translation/notebooks/changesheets-example.ipynb @@ -56,7 +56,12 @@ "from dotenv import dotenv_values\n", "from pymongo import MongoClient\n", "from pymongo.database import Database as MongoDatabase\n", - "from nmdc_runtime.api.core.metadata import load_changesheet, update_mongo_db, mongo_update_command_for, copy_docs_in_update_cmd" + "from nmdc_runtime.api.core.metadata import (\n", + " load_changesheet,\n", + " update_mongo_db,\n", + " mongo_update_command_for,\n", + " copy_docs_in_update_cmd,\n", + ")" ] }, { @@ -73,7 +78,11 @@ "config[\"MONGO_HOST\"]\n", "\n", "# create mongo client\n", - "client = MongoClient(host=config[\"MONGO_HOST\"], username=config[\"MONGO_USERNAME\"], password=config[\"MONGO_PASSWORD\"])\n", + "client = MongoClient(\n", + " host=config[\"MONGO_HOST\"],\n", + " username=config[\"MONGO_USERNAME\"],\n", + " password=config[\"MONGO_PASSWORD\"],\n", + ")\n", "mongodb = client[\"nmdc\"]" ] }, @@ -99,21 +108,25 @@ "source": [ "# helper functions\n", "\n", + "\n", "# wraps the mongo_update_command_for and update_mongo_db into\n", "# a single function to process the change sheet\n", - "def process_changesheet(changeDf, mdb: MongoDatabase, temp_db: MongoDatabase, print_update_cmd=False):\n", + "def process_changesheet(\n", + " changeDf, mdb: MongoDatabase, temp_db: MongoDatabase, print_update_cmd=False\n", + "):\n", " update_cmd = mongo_update_command_for(changeDf)\n", - " \n", - " # used for debugging \n", + "\n", + " # used for debugging\n", " if print_update_cmd:\n", " for id_, cmd in update_cmd.items():\n", - " print('id:', id_)\n", + " print(\"id:\", id_)\n", " print(cmd)\n", - " print('\\n')\n", - " \n", + " print(\"\\n\")\n", + "\n", " copy_docs_in_update_cmd(update_cmd, mdb, temp_db)\n", " return update_mongo_db(temp_db, update_cmd)\n", "\n", + "\n", "# puts the change sheet results in dataframe\n", "def print_results(results, print_before=True, print_after=True, print_errors=True):\n", " for i, result in enumerate(results):\n", @@ -138,7 +151,7 @@ "source": [ "# set dataframe display options\n", "pd.set_option(\"display.max_columns\", None)\n", - "pd.set_option('display.width', 1000)" + "pd.set_option(\"display.width\", 1000)" ] }, { @@ -313,7 +326,7 @@ } ], "source": [ - "pd.read_csv(\"data/changesheet-replace.tsv\", sep=\"\\t\", dtype=\"string\").fillna('')" + "pd.read_csv(\"data/changesheet-replace.tsv\", sep=\"\\t\", dtype=\"string\").fillna(\"\")" ] }, { @@ -1204,7 +1217,7 @@ } ], "source": [ - "pd.read_csv(\"data/changesheet-insert.tsv\", sep=\"\\t\", dtype=\"string\").fillna('')" + "pd.read_csv(\"data/changesheet-insert.tsv\", sep=\"\\t\", dtype=\"string\").fillna(\"\")" ] }, { @@ -1611,7 +1624,9 @@ } ], "source": [ - "print_results(process_changesheet(sheetDf, mongodb, temp_db), print_before=True, print_after=True)" + "print_results(\n", + " process_changesheet(sheetDf, mongodb, temp_db), print_before=True, print_after=True\n", + ")" ] }, { @@ -1970,7 +1985,7 @@ } ], "source": [ - "pd.read_csv(\"data/changesheet-remove-property.tsv\", sep=\"\\t\", dtype=\"string\").fillna('')" + "pd.read_csv(\"data/changesheet-remove-property.tsv\", sep=\"\\t\", dtype=\"string\").fillna(\"\")" ] }, { diff --git a/metadata-translation/notebooks/changesheets-testing.ipynb b/metadata-translation/notebooks/changesheets-testing.ipynb index 1f48921e..2576df7a 100644 --- a/metadata-translation/notebooks/changesheets-testing.ipynb +++ b/metadata-translation/notebooks/changesheets-testing.ipynb @@ -26,7 +26,12 @@ "from dotenv import dotenv_values\n", "from pymongo import MongoClient\n", "from pymongo.database import Database as MongoDatabase\n", - "from nmdc_runtime.api.core.metadata import load_changesheet, update_mongo_db, mongo_update_command_for, copy_docs_in_update_cmd" + "from nmdc_runtime.api.core.metadata import (\n", + " load_changesheet,\n", + " update_mongo_db,\n", + " mongo_update_command_for,\n", + " copy_docs_in_update_cmd,\n", + ")" ] }, { @@ -66,7 +71,11 @@ "metadata": {}, "outputs": [], "source": [ - "client = MongoClient(host=config[\"MONGO_HOST\"], username=config[\"MONGO_USERNAME\"], password=config[\"MONGO_PASSWORD\"])\n", + "client = MongoClient(\n", + " host=config[\"MONGO_HOST\"],\n", + " username=config[\"MONGO_USERNAME\"],\n", + " password=config[\"MONGO_PASSWORD\"],\n", + ")\n", "mongodb = client[\"nmdc\"]" ] }, @@ -113,7 +122,8 @@ " update_cmd = mongo_update_command_for(changeDf)\n", " copy_docs_in_update_cmd(update_cmd, mdb, temp_db)\n", " return update_mongo_db(temp_db, update_cmd)\n", - " \n", + "\n", + "\n", "# for id_, cmd in update_cmd.items():\n", "# print('id:', id_)\n", "# print(cmd)\n", @@ -158,7 +168,7 @@ "outputs": [], "source": [ "pd.set_option(\"display.max_columns\", None)\n", - "pd.set_option('display.width', 1000)" + "pd.set_option(\"display.width\", 1000)" ] }, { @@ -252,7 +262,7 @@ } ], "source": [ - "pd.read_csv(\"data/changesheet-with-separator1.tsv\", sep=\"\\t\", dtype=\"string\").fillna('')" + "pd.read_csv(\"data/changesheet-with-separator1.tsv\", sep=\"\\t\", dtype=\"string\").fillna(\"\")" ] }, { @@ -1169,7 +1179,9 @@ } ], "source": [ - "pd.read_csv(\"data/changesheet-without-separator1.tsv\", sep=\"\\t\", dtype=\"string\").fillna('')" + "pd.read_csv(\"data/changesheet-without-separator1.tsv\", sep=\"\\t\", dtype=\"string\").fillna(\n", + " \"\"\n", + ")" ] }, { @@ -2362,7 +2374,9 @@ } ], "source": [ - "pd.read_csv(\"data/changesheet-without-separator3.tsv\", sep=\"\\t\", dtype=\"string\").fillna('')" + "pd.read_csv(\"data/changesheet-without-separator3.tsv\", sep=\"\\t\", dtype=\"string\").fillna(\n", + " \"\"\n", + ")" ] }, { @@ -2998,7 +3012,7 @@ ], "source": [ "sheetDf = load_changesheet(\"data/changesheet-without-separator3.tsv\", mongodb)\n", - "sheetDf " + "sheetDf" ] }, { @@ -3519,7 +3533,9 @@ } ], "source": [ - "pd.read_csv(\"data/changesheet-array-item-nested-attributes.tsv\", sep=\"\\t\", dtype=\"string\").fillna('')" + "pd.read_csv(\n", + " \"data/changesheet-array-item-nested-attributes.tsv\", sep=\"\\t\", dtype=\"string\"\n", + ").fillna(\"\")" ] }, { @@ -3818,7 +3834,9 @@ } ], "source": [ - "pd.read_csv(\"data/changesheet-update-pi-websites.tsv\", sep=\"\\t\", dtype=\"string\").fillna('')" + "pd.read_csv(\"data/changesheet-update-pi-websites.tsv\", sep=\"\\t\", dtype=\"string\").fillna(\n", + " \"\"\n", + ")" ] }, { @@ -4246,7 +4264,7 @@ } ], "source": [ - "pd.read_csv(\"data/changesheet-remove-item.tsv\", sep=\"\\t\", dtype=\"string\").fillna('')" + "pd.read_csv(\"data/changesheet-remove-item.tsv\", sep=\"\\t\", dtype=\"string\").fillna(\"\")" ] }, { @@ -4635,7 +4653,7 @@ } ], "source": [ - "pd.read_csv(\"data/changesheet-remove-property.tsv\", sep=\"\\t\", dtype=\"string\").fillna('')" + "pd.read_csv(\"data/changesheet-remove-property.tsv\", sep=\"\\t\", dtype=\"string\").fillna(\"\")" ] }, { @@ -5024,7 +5042,7 @@ } ], "source": [ - "pd.read_csv(\"data/changesheet-replace.tsv\", sep=\"\\t\", dtype=\"string\").fillna('')" + "pd.read_csv(\"data/changesheet-replace.tsv\", sep=\"\\t\", dtype=\"string\").fillna(\"\")" ] }, { @@ -5432,7 +5450,7 @@ } ], "source": [ - "pd.read_csv(\"data/changesheet-insert.tsv\", sep=\"\\t\", dtype=\"string\").fillna('')" + "pd.read_csv(\"data/changesheet-insert.tsv\", sep=\"\\t\", dtype=\"string\").fillna(\"\")" ] }, { diff --git a/metadata-translation/notebooks/ensure_biosample_set_study_id.ipynb b/metadata-translation/notebooks/ensure_biosample_set_study_id.ipynb index bd8650ed..f3c01689 100644 --- a/metadata-translation/notebooks/ensure_biosample_set_study_id.ipynb +++ b/metadata-translation/notebooks/ensure_biosample_set_study_id.ipynb @@ -9,6 +9,7 @@ "import os\n", "\n", "from dotenv import load_dotenv\n", + "\n", "load_dotenv(os.path.expanduser(\"~/.nmdc_mongo.env\"))\n", "\n", "from nmdc_mongo import get_db\n", @@ -26,8 +27,8 @@ "biosample_study = {}\n", "\n", "for odoc in db_share.omics_processing_set.find():\n", - " biosample_id = odoc.get(\"has_input\",[None])[0]\n", - " study_id = odoc.get(\"part_of\",[None])[0]\n", + " biosample_id = odoc.get(\"has_input\", [None])[0]\n", + " study_id = odoc.get(\"part_of\", [None])[0]\n", " if biosample_id and study_id:\n", " biosample_study[biosample_id] = study_id" ] @@ -101,7 +102,7 @@ " fetch_json,\n", " get_db,\n", " reset_database,\n", - " snake_case_set_name\n", + " snake_case_set_name,\n", ")\n", "\n", "from nmdc_mongo.admin import admin_client, reset_database_schema\n", @@ -119,21 +120,27 @@ "\n", "existing_set_names = set(dbschema[\"properties\"])\n", "\n", - "for object_without_set in (defined_object_names - set(set_for_object_name.keys())):\n", + "for object_without_set in defined_object_names - set(set_for_object_name.keys()):\n", " proposed_set_name = snake_case_set_name(object_without_set)\n", " if proposed_set_name not in existing_set_names:\n", " dbschema[\"properties\"][proposed_set_name] = {\n", - " \"description\": (f\"This property links a database object to the set of\"\n", - " f\" {object_without_set} objects within it.\"),\n", + " \"description\": (\n", + " f\"This property links a database object to the set of\"\n", + " f\" {object_without_set} objects within it.\"\n", + " ),\n", " \"items\": {\"$ref\": f\"#/definitions/{object_without_set}\"},\n", " \"type\": \"array\",\n", " }\n", - " \n", - "dbschema = assoc_in(dbschema, [\"definitions\", \"ControlledTermValue\", \"properties\", \"term\", \"type\"], \"string\")\n", + "\n", + "dbschema = assoc_in(\n", + " dbschema,\n", + " [\"definitions\", \"ControlledTermValue\", \"properties\", \"term\", \"type\"],\n", + " \"string\",\n", + ")\n", "del dbschema[\"definitions\"][\"ControlledTermValue\"][\"properties\"][\"term\"][\"$ref\"]\n", "\n", "# 'k' not capitalized upstream perhaps. should conform!\n", - "#dbschema = assoc_in(dbschema, [\"definitions\", \"MetagenomeAssembly\", \"properties\", \"scaf_l_gt50k\", \"type\"], \"number\")" + "# dbschema = assoc_in(dbschema, [\"definitions\", \"MetagenomeAssembly\", \"properties\", \"scaf_l_gt50k\", \"type\"], \"number\")" ] }, { @@ -142,7 +149,9 @@ "metadata": {}, "outputs": [], "source": [ - "dbschema = assoc_in(dbschema, [\"definitions\", \"Biosample\", \"properties\", \"_study_id\", \"type\"], \"string\")\n", + "dbschema = assoc_in(\n", + " dbschema, [\"definitions\", \"Biosample\", \"properties\", \"_study_id\", \"type\"], \"string\"\n", + ")\n", "\n", "collschemas = collschemas_for(dbschema)" ] @@ -314,7 +323,9 @@ } ], "source": [ - "db_share.biosample_set.count_documents({\"_study_id\": {\"$in\": [brodie, stegen, wrighton]}})" + "db_share.biosample_set.count_documents(\n", + " {\"_study_id\": {\"$in\": [brodie, stegen, wrighton]}}\n", + ")" ] }, { diff --git a/metadata-translation/notebooks/ghissue_252_253_linked_samples.ipynb b/metadata-translation/notebooks/ghissue_252_253_linked_samples.ipynb index 7e06bf41..1b9b788d 100644 --- a/metadata-translation/notebooks/ghissue_252_253_linked_samples.ipynb +++ b/metadata-translation/notebooks/ghissue_252_253_linked_samples.ipynb @@ -9,6 +9,7 @@ "import os\n", "\n", "from dotenv import load_dotenv\n", + "\n", "load_dotenv(os.path.expanduser(\"~/.nmdc_mongo.env\"))\n", "\n", "from nmdc_mongo import get_db\n", @@ -30,46 +31,48 @@ "from google.auth.transport.requests import Request\n", "\n", "TOKEN_FILE = os.path.expanduser(\"~/token.nmdc-gcloud-api.pickle\")\n", - "CREDENTIALS_FILE = os.path.expanduser('~/nmdc-gcloud-api-credentials.json')\n", + "CREDENTIALS_FILE = os.path.expanduser(\"~/nmdc-gcloud-api-credentials.json\")\n", "\n", "# If modifying these scopes, delete the token*.pickle file.\n", "SCOPES = [\n", - " 'https://www.googleapis.com/auth/spreadsheets.readonly',\n", - " 'https://www.googleapis.com/auth/drive.readonly'\n", + " \"https://www.googleapis.com/auth/spreadsheets.readonly\",\n", + " \"https://www.googleapis.com/auth/drive.readonly\",\n", "]\n", "\n", - "def get_gcloud_api_creds(scopes=SCOPES, token_file=TOKEN_FILE, credentials_file=CREDENTIALS_FILE):\n", + "\n", + "def get_gcloud_api_creds(\n", + " scopes=SCOPES, token_file=TOKEN_FILE, credentials_file=CREDENTIALS_FILE\n", + "):\n", " creds = None\n", " # The file token.pickle stores the user's access and refresh tokens, and is\n", " # created automatically when the authorization flow completes for the first\n", " # time.\n", " if os.path.exists(token_file):\n", - " with open(token_file, 'rb') as token:\n", + " with open(token_file, \"rb\") as token:\n", " creds = pickle.load(token)\n", " # If there are no (valid) credentials available, let the user log in.\n", " if not creds or not creds.valid:\n", " if creds and creds.expired and creds.refresh_token:\n", " creds.refresh(Request())\n", " else:\n", - " flow = InstalledAppFlow.from_client_secrets_file(\n", - " credentials_file, scopes)\n", + " flow = InstalledAppFlow.from_client_secrets_file(credentials_file, scopes)\n", " creds = flow.run_local_server(port=0)\n", " # Save the credentials for the next run\n", - " with open(token_file, 'wb') as token:\n", + " with open(token_file, \"wb\") as token:\n", " pickle.dump(creds, token)\n", " return creds\n", "\n", + "\n", "def get_sheet_values(sheet_id, sheet_range):\n", " creds = get_gcloud_api_creds()\n", - " sheets_service = build('sheets', 'v4', credentials=creds)\n", + " sheets_service = build(\"sheets\", \"v4\", credentials=creds)\n", "\n", " sheet = sheets_service.spreadsheets()\n", - " result = sheet.values().get(spreadsheetId=sheet_id,\n", - " range=sheet_range).execute()\n", - " values = result.get('values', [])\n", + " result = sheet.values().get(spreadsheetId=sheet_id, range=sheet_range).execute()\n", + " values = result.get(\"values\", [])\n", "\n", " if not values:\n", - " print('No data found.')\n", + " print(\"No data found.\")\n", " return []\n", " else:\n", " return [row for row in values]" @@ -81,7 +84,9 @@ "metadata": {}, "outputs": [], "source": [ - "rows = get_sheet_values(sheet_id='1nZOJYiC2QN0hOn5nDj9y9mteWeGyzQQls17zH5mESww', sheet_range='Sheet1!A:E')" + "rows = get_sheet_values(\n", + " sheet_id=\"1nZOJYiC2QN0hOn5nDj9y9mteWeGyzQQls17zH5mESww\", sheet_range=\"Sheet1!A:E\"\n", + ")" ] }, { @@ -97,7 +102,7 @@ "\n", "def get_file(file_id):\n", " creds = get_gcloud_api_creds()\n", - " drive_service = build('drive', 'v3', credentials=creds)\n", + " drive_service = build(\"drive\", \"v3\", credentials=creds)\n", "\n", " request = drive_service.files().get_media(fileId=file_id)\n", " f = io.BytesIO()\n", @@ -106,8 +111,8 @@ " while done is False:\n", " status, done = downloader.next_chunk()\n", " print(\"Download %d%%.\" % int(status.progress() * 100))\n", - " \n", - " return f\n" + "\n", + " return f" ] }, { @@ -127,15 +132,15 @@ "import json\n", "from pprint import pprint\n", "\n", - "f = get_file('1XoSHcImd9LRlZb2nYNWucGtTzgqmdMd0')\n", + "f = get_file(\"1XoSHcImd9LRlZb2nYNWucGtTzgqmdMd0\")\n", "s = f.getvalue().decode(\"utf-8\")\n", "f.close()\n", "try:\n", " stegen_sample_template = json.loads(s)\n", "except json.JSONDecodeError:\n", " stegen_sample_template = json.loads(\n", - " s.replace('\\n', '')\\\n", - " .replace(\"$BIOSAMPLE_ID\", '\"$BIOSAMPLE_ID\"')\\\n", + " s.replace(\"\\n\", \"\")\n", + " .replace(\"$BIOSAMPLE_ID\", '\"$BIOSAMPLE_ID\"')\n", " .replace(\"“\", '\"')\n", " )" ] @@ -150,6 +155,7 @@ "\n", "gold_pattern = re.compile(r\"Gb\\d+\")\n", "\n", + "\n", "def prefix_sample_id(s):\n", " if \":\" in s:\n", " return s\n", @@ -167,15 +173,17 @@ "source": [ "omics = []\n", "for i, row in enumerate(rows):\n", - " if i == 0: # skip header row\n", + " if i == 0: # skip header row\n", " continue\n", - " omics.append({\n", - " \"omics_id\": row[0],\n", - " \"omics_type\": row[1],\n", - " \"sample_name\": row[2],\n", - " \"sample_id\": prefix_sample_id(row[3]),\n", - " \"new\": len(row) > 4 and row[4] == \"TRUE\"\n", - " })" + " omics.append(\n", + " {\n", + " \"omics_id\": row[0],\n", + " \"omics_type\": row[1],\n", + " \"sample_name\": row[2],\n", + " \"sample_id\": prefix_sample_id(row[3]),\n", + " \"new\": len(row) > 4 and row[4] == \"TRUE\",\n", + " }\n", + " )" ] }, { @@ -185,8 +193,10 @@ "outputs": [], "source": [ "existing_ids = [\n", - " d[\"id\"] for d in\n", - " db_share.biosample_set.find({\"id\": {\"$in\": [o[\"sample_id\"] for o in omics]}}, [\"id\"])\n", + " d[\"id\"]\n", + " for d in db_share.biosample_set.find(\n", + " {\"id\": {\"$in\": [o[\"sample_id\"] for o in omics]}}, [\"id\"]\n", + " )\n", "]" ] }, @@ -197,7 +207,7 @@ "outputs": [], "source": [ "# not true on upserts.\n", - "# assert {o[\"sample_id\"] for o in omics if o[\"new\"]} == {o[\"sample_id\"] for o in omics} - set(existing_ids) " + "# assert {o[\"sample_id\"] for o in omics if o[\"new\"]} == {o[\"sample_id\"] for o in omics} - set(existing_ids)" ] }, { @@ -208,6 +218,7 @@ "source": [ "from toolz import assoc_in, get_in\n", "\n", + "\n", "def transform_in(doc, keys, fn):\n", " initial = get_in(keys, doc)\n", " transformed = fn(initial)\n", @@ -223,12 +234,12 @@ "def fill_template(template, sample_id, sample_name):\n", " doc = assoc_in(template, [\"id\"], sample_id)\n", " doc = transform_in(\n", - " doc, [\"identifier\", \"has_raw_value\"],\n", - " lambda s: s.replace(\"$BIOSAMPLE_NAME\", sample_name)\n", + " doc,\n", + " [\"identifier\", \"has_raw_value\"],\n", + " lambda s: s.replace(\"$BIOSAMPLE_NAME\", sample_name),\n", " )\n", " doc = transform_in(\n", - " doc, [\"name\"],\n", - " lambda s: s.replace(\"$BIOSAMPLE_NAME\", sample_name)\n", + " doc, [\"name\"], lambda s: s.replace(\"$BIOSAMPLE_NAME\", sample_name)\n", " )\n", " return doc" ] @@ -241,10 +252,9 @@ "source": [ "def term_subdocs_to_id_strings(doc):\n", " keys_with_term_ids = [\n", - " k for k in doc\n", - " if isinstance(doc[k], dict)\n", - " and \"term\" in doc[k]\n", - " and \"id\" in doc[k][\"term\"]\n", + " k\n", + " for k in doc\n", + " if isinstance(doc[k], dict) and \"term\" in doc[k] and \"id\" in doc[k][\"term\"]\n", " ]\n", " for k in keys_with_term_ids:\n", " doc = assoc_in(doc, [k, \"term\"], doc[k][\"term\"][\"id\"])\n", @@ -278,14 +288,16 @@ "source": [ "def compare_doc_to_mongo_collection_validator(mongo_collection, doc):\n", " doc_items = sorted([(k, v) for k, v in doc.items()])\n", - " validator_items = sorted([\n", - " (k, v) for k, v in\n", - " validator_for(mongo_collection)['properties'].items()\n", - " if k in list(doc)\n", - " ])\n", + " validator_items = sorted(\n", + " [\n", + " (k, v)\n", + " for k, v in validator_for(mongo_collection)[\"properties\"].items()\n", + " if k in list(doc)\n", + " ]\n", + " )\n", " idx_vi = 0\n", " for k, v in doc_items:\n", - " print(\"\\n#####\"+k+\"\\n\")\n", + " print(\"\\n#####\" + k + \"\\n\")\n", " print(\"doc value:\")\n", " pprint(k)\n", " print()\n", @@ -322,7 +334,11 @@ "from nmdc_mongo import dbschema, validator_for, collschemas_for\n", "from nmdc_mongo.admin import admin_client, reset_database_schema\n", "\n", - "dbschema = assoc_in(dbschema, [\"definitions\", \"ControlledTermValue\", \"properties\", \"term\", \"type\"], \"string\")\n", + "dbschema = assoc_in(\n", + " dbschema,\n", + " [\"definitions\", \"ControlledTermValue\", \"properties\", \"term\", \"type\"],\n", + " \"string\",\n", + ")\n", "del dbschema[\"definitions\"][\"ControlledTermValue\"][\"properties\"][\"term\"][\"$ref\"]\n", "collschemas = collschemas_for(dbschema)\n", "\n", @@ -339,6 +355,7 @@ "source": [ "from toolz import get_in, assoc_in\n", "\n", + "\n", "def un_raw_value(doc, key):\n", " value = get_in([key, \"has_raw_value\"], doc)\n", " if value is not None:\n", @@ -346,6 +363,7 @@ " else:\n", " return doc\n", "\n", + "\n", "raws = [\n", " \"ecosystem\",\n", " \"collection_date\",\n", @@ -372,7 +390,7 @@ "outputs": [], "source": [ "import jsonschema\n", - " \n", + "\n", "jsonschema.validate({\"biosample_set\": docs}, schema=dbschema)" ] }, @@ -384,7 +402,9 @@ "source": [ "from pymongo import ReplaceOne\n", "\n", - "rv = db_share.biosample_set.bulk_write([ReplaceOne({\"id\": doc[\"id\"]}, doc, upsert=True) for doc in docs])" + "rv = db_share.biosample_set.bulk_write(\n", + " [ReplaceOne({\"id\": doc[\"id\"]}, doc, upsert=True) for doc in docs]\n", + ")" ] }, { @@ -421,7 +441,7 @@ "outputs": [], "source": [ "omics = [\n", - " transform_in(o, [\"omics_id\"], lambda s: \"emsl:\"+s if \":\" not in s else s)\n", + " transform_in(o, [\"omics_id\"], lambda s: \"emsl:\" + s if \":\" not in s else s)\n", " for o in omics\n", "]" ] @@ -435,8 +455,8 @@ "omics_ids = [o[\"omics_id\"] for o in omics]\n", "\n", "found_omics_ids = [\n", - " d[\"id\"] for d in\n", - " db_share.omics_processing_set.find({\"id\": {\"$in\": omics_ids}},[\"id\"])\n", + " d[\"id\"]\n", + " for d in db_share.omics_processing_set.find({\"id\": {\"$in\": omics_ids}}, [\"id\"])\n", "]" ] }, @@ -476,7 +496,10 @@ " omics_type = get_in([\"omics_type\"], doc)\n", " updates = omics_updates[doc[\"id\"]]\n", " if omics_type != updates[\"omics_type\"]:\n", - " replacing_omics_type[doc[\"id\"]] = {\"from\": omics_type, \"to\": updates[\"omics_type\"]}\n", + " replacing_omics_type[doc[\"id\"]] = {\n", + " \"from\": omics_type,\n", + " \"to\": updates[\"omics_type\"],\n", + " }\n", " doc = assoc_in(doc, [\"omics_type\"], updates[\"omics_type\"])\n", " doc = assoc_in(doc, [\"has_input\"], [updates[\"sample_id\"]])\n", " requests.append(ReplaceOne({\"id\": doc[\"id\"]}, dissoc(doc, \"_id\")))" @@ -571,7 +594,9 @@ "metadata": {}, "outputs": [], "source": [ - "rows = get_sheet_values(sheet_id=\"1Gj6jwU5d8kdYhq8zuvjTB3T4-nt3Ru7NhxYpTeR9eRw\", sheet_range='Sheet1!A:D')" + "rows = get_sheet_values(\n", + " sheet_id=\"1Gj6jwU5d8kdYhq8zuvjTB3T4-nt3Ru7NhxYpTeR9eRw\", sheet_range=\"Sheet1!A:D\"\n", + ")" ] }, { @@ -582,14 +607,16 @@ "source": [ "omics = []\n", "for i, row in enumerate(rows):\n", - " if i == 0: # skip header row\n", + " if i == 0: # skip header row\n", " continue\n", - " omics.append({\n", - " \"omics_id\": \"emsl:\" + row[0].strip(),\n", - " \"omics_type\": row[1].strip(),\n", - " \"sample_name\": row[2].strip(),\n", - " \"sample_id\": \"igsn:\" + row[3].strip(),\n", - " })" + " omics.append(\n", + " {\n", + " \"omics_id\": \"emsl:\" + row[0].strip(),\n", + " \"omics_type\": row[1].strip(),\n", + " \"sample_name\": row[2].strip(),\n", + " \"sample_id\": \"igsn:\" + row[3].strip(),\n", + " }\n", + " )" ] }, { @@ -601,8 +628,8 @@ "omics_ids = [o[\"omics_id\"] for o in omics]\n", "\n", "found_omics_ids = [\n", - " d[\"id\"] for d in\n", - " db_share.omics_processing_set.find({\"id\": {\"$in\": omics_ids}},[\"id\"])\n", + " d[\"id\"]\n", + " for d in db_share.omics_processing_set.find({\"id\": {\"$in\": omics_ids}}, [\"id\"])\n", "]" ] }, @@ -640,7 +667,10 @@ " omics_type = get_in([\"omics_type\"], doc)\n", " updates = omics_updates[doc[\"id\"]]\n", " if omics_type != updates[\"omics_type\"]:\n", - " replacing_omics_type[doc[\"id\"]] = {\"from\": omics_type, \"to\": updates[\"omics_type\"]}\n", + " replacing_omics_type[doc[\"id\"]] = {\n", + " \"from\": omics_type,\n", + " \"to\": updates[\"omics_type\"],\n", + " }\n", " doc = assoc_in(doc, [\"omics_type\"], updates[\"omics_type\"])\n", " doc = assoc_in(doc, [\"has_input\"], [updates[\"sample_id\"]])\n", " requests.append(ReplaceOne({\"id\": doc[\"id\"]}, dissoc(doc, \"_id\")))" diff --git a/metadata-translation/notebooks/ghissue_255.ipynb b/metadata-translation/notebooks/ghissue_255.ipynb index 5e6b04da..af85fdc7 100644 --- a/metadata-translation/notebooks/ghissue_255.ipynb +++ b/metadata-translation/notebooks/ghissue_255.ipynb @@ -9,6 +9,7 @@ "import os\n", "\n", "from dotenv import load_dotenv\n", + "\n", "load_dotenv(os.path.expanduser(\"~/.nmdc_mongo.env\"))\n", "\n", "from nmdc_mongo import get_db\n", @@ -36,7 +37,7 @@ "source": [ "mfilter = {\n", " \"part_of\": [\"gold:Gs0114675\"],\n", - " \"processing_institution\": \"Environmental Molecular Sciences Lab\"\n", + " \"processing_institution\": \"Environmental Molecular Sciences Lab\",\n", "}\n", "\n", "db_share.omics_processing_set.count_documents(filter=mfilter)" @@ -60,7 +61,9 @@ "omics_processing_ids = [d[\"id\"] for d in docs]\n", "data_object_ids = list(concat(d[\"has_output\"] for d in docs))\n", "\n", - "assert len(omics_processing_ids) == db_share.data_object_set.count_documents({\"id\": {\"$in\": data_object_ids}})" + "assert len(omics_processing_ids) == db_share.data_object_set.count_documents(\n", + " {\"id\": {\"$in\": data_object_ids}}\n", + ")" ] }, { diff --git a/metadata-translation/notebooks/ghissue_272.ipynb b/metadata-translation/notebooks/ghissue_272.ipynb index 42e21588..aa8fa1cf 100644 --- a/metadata-translation/notebooks/ghissue_272.ipynb +++ b/metadata-translation/notebooks/ghissue_272.ipynb @@ -9,6 +9,7 @@ "import os\n", "\n", "from dotenv import load_dotenv\n", + "\n", "load_dotenv(os.path.expanduser(\"~/.nmdc_mongo.env\"))\n", "\n", "from nmdc_mongo import get_db, add_to_db\n", @@ -23,196 +24,181 @@ "outputs": [], "source": [ "biosamples = [\n", - " {\n", - " \"name\":\"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - \",\n", - " \"description\":\"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States\",\n", - " \"lat_lon\":{\n", - " \"has_raw_value\":\"38.9206 -106.9489\",\n", - " \"latitude\":38.9206,\n", - " \"longitude\":-106.9489\n", - " },\n", - " \"geo_loc_name\":\"USA: Colorado\",\n", - " \"collection_date\":\"2017-05-09\",\n", - " \"env_broad_scale\":{\n", - " \"has_raw_value\":\"ENVO_00000446\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"env_local_scale\":{\n", - " \"has_raw_value\":\"ENVO_00000292\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"env_medium\":{\n", - " \"has_raw_value\":\"ENVO_00001998\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"ecosystem\":\"Environmental\",\n", - " \"ecosystem_category\":\"Terrestrial\",\n", - " \"ecosystem_type\":\"Soil\",\n", - " \"ecosystem_subtype\":\"Unclassified\",\n", - " \"specific_ecosystem\":\"Unclassified\",\n", - " \"depth\":15,\n", - " \"ncbi_taxonomy_name\":\"soil metagenome\",\n", - " \"community\":\"microbial communities\",\n", - " \"location\":\"The East River watershed near Crested Butte, Colorado, USA\",\n", - " \"habitat\":\"soil\",\n", - " \"sample_collection_site\":\"soil\",\n", - " \"add_date\":\"22-Jun-18 04.28.47.015000 PM\",\n", - " \"mod_date\":\"01-Oct-19 09.41.01.459000 AM\",\n", - " \"id\":\"igsn:IEWFS000I\",\n", - " \"identifier\":\"igsn:IEWFS000I\"\n", - " },\n", - " {\n", - " \"name\":\"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - \",\n", - " \"description\":\"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States\",\n", - " \"lat_lon\":{\n", - " \"has_raw_value\":\"38.9206 -106.9489\",\n", - " \"latitude\":38.9206,\n", - " \"longitude\":-106.9489\n", - " },\n", - " \"geo_loc_name\":\"USA: Colorado\",\n", - " \"collection_date\":\"2017-05-09\",\n", - " \"env_broad_scale\":{\n", - " \"has_raw_value\":\"ENVO_00000446\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"env_local_scale\":{\n", - " \"has_raw_value\":\"ENVO_00000292\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"env_medium\":{\n", - " \"has_raw_value\":\"ENVO_00001998\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"ecosystem\":\"Environmental\",\n", - " \"ecosystem_category\":\"Terrestrial\",\n", - " \"ecosystem_type\":\"Soil\",\n", - " \"ecosystem_subtype\":\"Unclassified\",\n", - " \"specific_ecosystem\":\"Unclassified\",\n", - " \"depth\":15,\n", - " \"ncbi_taxonomy_name\":\"soil metagenome\",\n", - " \"community\":\"microbial communities\",\n", - " \"location\":\"The East River watershed near Crested Butte, Colorado, USA\",\n", - " \"habitat\":\"soil\",\n", - " \"sample_collection_site\":\"soil\",\n", - " \"add_date\":\"22-Jun-18 04.28.47.015000 PM\",\n", - " \"mod_date\":\"01-Oct-19 09.41.01.459000 AM\",\n", - " \"id\":\"igsn:IEWFS000K\",\n", - " \"identifier\":\"igsn:IEWFS000K\"\n", - " },\n", - " {\n", - " \"name\":\"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - \",\n", - " \"description\":\"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States\",\n", - " \"lat_lon\":{\n", - " \"has_raw_value\":\"38.9206 -106.9489\",\n", - " \"latitude\":38.9206,\n", - " \"longitude\":-106.9489\n", - " },\n", - " \"geo_loc_name\":\"USA: Colorado\",\n", - " \"collection_date\":\"2017-05-09\",\n", - " \"env_broad_scale\":{\n", - " \"has_raw_value\":\"ENVO_00000446\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"env_local_scale\":{\n", - " \"has_raw_value\":\"ENVO_00000292\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"env_medium\":{\n", - " \"has_raw_value\":\"ENVO_00001998\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"ecosystem\":\"Environmental\",\n", - " \"ecosystem_category\":\"Terrestrial\",\n", - " \"ecosystem_type\":\"Soil\",\n", - " \"ecosystem_subtype\":\"Unclassified\",\n", - " \"specific_ecosystem\":\"Unclassified\",\n", - " \"depth\":15,\n", - " \"ncbi_taxonomy_name\":\"soil metagenome\",\n", - " \"community\":\"microbial communities\",\n", - " \"location\":\"The East River watershed near Crested Butte, Colorado, USA\",\n", - " \"habitat\":\"soil\",\n", - " \"sample_collection_site\":\"soil\",\n", - " \"add_date\":\"22-Jun-18 04.28.47.015000 PM\",\n", - " \"mod_date\":\"01-Oct-19 09.41.01.459000 AM\",\n", - " \"id\":\"igsn:IEWFS000B\",\n", - " \"identifier\":\"igsn:IEWFS000B\"\n", - " },\n", - " {\n", - " \"name\":\"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - \",\n", - " \"description\":\"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States\",\n", - " \"lat_lon\":{\n", - " \"has_raw_value\":\"38.9206 -106.9489\",\n", - " \"latitude\":38.9206,\n", - " \"longitude\":-106.9489\n", - " },\n", - " \"geo_loc_name\":\"USA: Colorado\",\n", - " \"collection_date\":\"2017-05-09\",\n", - " \"env_broad_scale\":{\n", - " \"has_raw_value\":\"ENVO_00000446\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"env_local_scale\":{\n", - " \"has_raw_value\":\"ENVO_00000292\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"env_medium\":{\n", - " \"has_raw_value\":\"ENVO_00001998\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"ecosystem\":\"Environmental\",\n", - " \"ecosystem_category\":\"Terrestrial\",\n", - " \"ecosystem_type\":\"Soil\",\n", - " \"ecosystem_subtype\":\"Unclassified\",\n", - " \"specific_ecosystem\":\"Unclassified\",\n", - " \"depth\":15,\n", - " \"ncbi_taxonomy_name\":\"soil metagenome\",\n", - " \"community\":\"microbial communities\",\n", - " \"location\":\"The East River watershed near Crested Butte, Colorado, USA\",\n", - " \"habitat\":\"soil\",\n", - " \"sample_collection_site\":\"soil\",\n", - " \"add_date\":\"22-Jun-18 04.28.47.015000 PM\",\n", - " \"mod_date\":\"01-Oct-19 09.41.01.459000 AM\",\n", - " \"id\":\"igsn:IEWFS000A\",\n", - " \"identifier\":\"igsn:IEWFS000A\"\n", - " },\n", - " {\n", - " \"name\":\"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - \",\n", - " \"description\":\"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States\",\n", - " \"lat_lon\":{\n", - " \"has_raw_value\":\"38.9206 -106.9489\",\n", - " \"latitude\":38.9206,\n", - " \"longitude\":-106.9489\n", - " },\n", - " \"geo_loc_name\":\"USA: Colorado\",\n", - " \"collection_date\":\"2017-05-09\",\n", - " \"env_broad_scale\":{\n", - " \"has_raw_value\":\"ENVO_00000446\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"env_local_scale\":{\n", - " \"has_raw_value\":\"ENVO_00000292\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"env_medium\":{\n", - " \"has_raw_value\":\"ENVO_00001998\",\n", - " \"type\":\"ControlledTermValue\"\n", - " },\n", - " \"ecosystem\":\"Environmental\",\n", - " \"ecosystem_category\":\"Terrestrial\",\n", - " \"ecosystem_type\":\"Soil\",\n", - " \"ecosystem_subtype\":\"Unclassified\",\n", - " \"specific_ecosystem\":\"Unclassified\",\n", - " \"depth\":15,\n", - " \"ncbi_taxonomy_name\":\"soil metagenome\",\n", - " \"community\":\"microbial communities\",\n", - " \"location\":\"The East River watershed near Crested Butte, Colorado, USA\",\n", - " \"habitat\":\"soil\",\n", - " \"sample_collection_site\":\"soil\",\n", - " \"add_date\":\"22-Jun-18 04.28.47.015000 PM\",\n", - " \"mod_date\":\"01-Oct-19 09.41.01.459000 AM\",\n", - " \"id\":\"igsn:IEWFS000J\",\n", - " \"identifier\":\"igsn:IEWFS000J\"\n", - " }\n", + " {\n", + " \"name\": \"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - \",\n", + " \"description\": \"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States\",\n", + " \"lat_lon\": {\n", + " \"has_raw_value\": \"38.9206 -106.9489\",\n", + " \"latitude\": 38.9206,\n", + " \"longitude\": -106.9489,\n", + " },\n", + " \"geo_loc_name\": \"USA: Colorado\",\n", + " \"collection_date\": \"2017-05-09\",\n", + " \"env_broad_scale\": {\n", + " \"has_raw_value\": \"ENVO_00000446\",\n", + " \"type\": \"ControlledTermValue\",\n", + " },\n", + " \"env_local_scale\": {\n", + " \"has_raw_value\": \"ENVO_00000292\",\n", + " \"type\": \"ControlledTermValue\",\n", + " },\n", + " \"env_medium\": {\"has_raw_value\": \"ENVO_00001998\", \"type\": \"ControlledTermValue\"},\n", + " \"ecosystem\": \"Environmental\",\n", + " \"ecosystem_category\": \"Terrestrial\",\n", + " \"ecosystem_type\": \"Soil\",\n", + " \"ecosystem_subtype\": \"Unclassified\",\n", + " \"specific_ecosystem\": \"Unclassified\",\n", + " \"depth\": 15,\n", + " \"ncbi_taxonomy_name\": \"soil metagenome\",\n", + " \"community\": \"microbial communities\",\n", + " \"location\": \"The East River watershed near Crested Butte, Colorado, USA\",\n", + " \"habitat\": \"soil\",\n", + " \"sample_collection_site\": \"soil\",\n", + " \"add_date\": \"22-Jun-18 04.28.47.015000 PM\",\n", + " \"mod_date\": \"01-Oct-19 09.41.01.459000 AM\",\n", + " \"id\": \"igsn:IEWFS000I\",\n", + " \"identifier\": \"igsn:IEWFS000I\",\n", + " },\n", + " {\n", + " \"name\": \"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - \",\n", + " \"description\": \"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States\",\n", + " \"lat_lon\": {\n", + " \"has_raw_value\": \"38.9206 -106.9489\",\n", + " \"latitude\": 38.9206,\n", + " \"longitude\": -106.9489,\n", + " },\n", + " \"geo_loc_name\": \"USA: Colorado\",\n", + " \"collection_date\": \"2017-05-09\",\n", + " \"env_broad_scale\": {\n", + " \"has_raw_value\": \"ENVO_00000446\",\n", + " \"type\": \"ControlledTermValue\",\n", + " },\n", + " \"env_local_scale\": {\n", + " \"has_raw_value\": \"ENVO_00000292\",\n", + " \"type\": \"ControlledTermValue\",\n", + " },\n", + " \"env_medium\": {\"has_raw_value\": \"ENVO_00001998\", \"type\": \"ControlledTermValue\"},\n", + " \"ecosystem\": \"Environmental\",\n", + " \"ecosystem_category\": \"Terrestrial\",\n", + " \"ecosystem_type\": \"Soil\",\n", + " \"ecosystem_subtype\": \"Unclassified\",\n", + " \"specific_ecosystem\": \"Unclassified\",\n", + " \"depth\": 15,\n", + " \"ncbi_taxonomy_name\": \"soil metagenome\",\n", + " \"community\": \"microbial communities\",\n", + " \"location\": \"The East River watershed near Crested Butte, Colorado, USA\",\n", + " \"habitat\": \"soil\",\n", + " \"sample_collection_site\": \"soil\",\n", + " \"add_date\": \"22-Jun-18 04.28.47.015000 PM\",\n", + " \"mod_date\": \"01-Oct-19 09.41.01.459000 AM\",\n", + " \"id\": \"igsn:IEWFS000K\",\n", + " \"identifier\": \"igsn:IEWFS000K\",\n", + " },\n", + " {\n", + " \"name\": \"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - \",\n", + " \"description\": \"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States\",\n", + " \"lat_lon\": {\n", + " \"has_raw_value\": \"38.9206 -106.9489\",\n", + " \"latitude\": 38.9206,\n", + " \"longitude\": -106.9489,\n", + " },\n", + " \"geo_loc_name\": \"USA: Colorado\",\n", + " \"collection_date\": \"2017-05-09\",\n", + " \"env_broad_scale\": {\n", + " \"has_raw_value\": \"ENVO_00000446\",\n", + " \"type\": \"ControlledTermValue\",\n", + " },\n", + " \"env_local_scale\": {\n", + " \"has_raw_value\": \"ENVO_00000292\",\n", + " \"type\": \"ControlledTermValue\",\n", + " },\n", + " \"env_medium\": {\"has_raw_value\": \"ENVO_00001998\", \"type\": \"ControlledTermValue\"},\n", + " \"ecosystem\": \"Environmental\",\n", + " \"ecosystem_category\": \"Terrestrial\",\n", + " \"ecosystem_type\": \"Soil\",\n", + " \"ecosystem_subtype\": \"Unclassified\",\n", + " \"specific_ecosystem\": \"Unclassified\",\n", + " \"depth\": 15,\n", + " \"ncbi_taxonomy_name\": \"soil metagenome\",\n", + " \"community\": \"microbial communities\",\n", + " \"location\": \"The East River watershed near Crested Butte, Colorado, USA\",\n", + " \"habitat\": \"soil\",\n", + " \"sample_collection_site\": \"soil\",\n", + " \"add_date\": \"22-Jun-18 04.28.47.015000 PM\",\n", + " \"mod_date\": \"01-Oct-19 09.41.01.459000 AM\",\n", + " \"id\": \"igsn:IEWFS000B\",\n", + " \"identifier\": \"igsn:IEWFS000B\",\n", + " },\n", + " {\n", + " \"name\": \"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - \",\n", + " \"description\": \"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States\",\n", + " \"lat_lon\": {\n", + " \"has_raw_value\": \"38.9206 -106.9489\",\n", + " \"latitude\": 38.9206,\n", + " \"longitude\": -106.9489,\n", + " },\n", + " \"geo_loc_name\": \"USA: Colorado\",\n", + " \"collection_date\": \"2017-05-09\",\n", + " \"env_broad_scale\": {\n", + " \"has_raw_value\": \"ENVO_00000446\",\n", + " \"type\": \"ControlledTermValue\",\n", + " },\n", + " \"env_local_scale\": {\n", + " \"has_raw_value\": \"ENVO_00000292\",\n", + " \"type\": \"ControlledTermValue\",\n", + " },\n", + " \"env_medium\": {\"has_raw_value\": \"ENVO_00001998\", \"type\": \"ControlledTermValue\"},\n", + " \"ecosystem\": \"Environmental\",\n", + " \"ecosystem_category\": \"Terrestrial\",\n", + " \"ecosystem_type\": \"Soil\",\n", + " \"ecosystem_subtype\": \"Unclassified\",\n", + " \"specific_ecosystem\": \"Unclassified\",\n", + " \"depth\": 15,\n", + " \"ncbi_taxonomy_name\": \"soil metagenome\",\n", + " \"community\": \"microbial communities\",\n", + " \"location\": \"The East River watershed near Crested Butte, Colorado, USA\",\n", + " \"habitat\": \"soil\",\n", + " \"sample_collection_site\": \"soil\",\n", + " \"add_date\": \"22-Jun-18 04.28.47.015000 PM\",\n", + " \"mod_date\": \"01-Oct-19 09.41.01.459000 AM\",\n", + " \"id\": \"igsn:IEWFS000A\",\n", + " \"identifier\": \"igsn:IEWFS000A\",\n", + " },\n", + " {\n", + " \"name\": \"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - \",\n", + " \"description\": \"Soil microbial communities from the East River watershed near Crested Butte, Colorado, United States\",\n", + " \"lat_lon\": {\n", + " \"has_raw_value\": \"38.9206 -106.9489\",\n", + " \"latitude\": 38.9206,\n", + " \"longitude\": -106.9489,\n", + " },\n", + " \"geo_loc_name\": \"USA: Colorado\",\n", + " \"collection_date\": \"2017-05-09\",\n", + " \"env_broad_scale\": {\n", + " \"has_raw_value\": \"ENVO_00000446\",\n", + " \"type\": \"ControlledTermValue\",\n", + " },\n", + " \"env_local_scale\": {\n", + " \"has_raw_value\": \"ENVO_00000292\",\n", + " \"type\": \"ControlledTermValue\",\n", + " },\n", + " \"env_medium\": {\"has_raw_value\": \"ENVO_00001998\", \"type\": \"ControlledTermValue\"},\n", + " \"ecosystem\": \"Environmental\",\n", + " \"ecosystem_category\": \"Terrestrial\",\n", + " \"ecosystem_type\": \"Soil\",\n", + " \"ecosystem_subtype\": \"Unclassified\",\n", + " \"specific_ecosystem\": \"Unclassified\",\n", + " \"depth\": 15,\n", + " \"ncbi_taxonomy_name\": \"soil metagenome\",\n", + " \"community\": \"microbial communities\",\n", + " \"location\": \"The East River watershed near Crested Butte, Colorado, USA\",\n", + " \"habitat\": \"soil\",\n", + " \"sample_collection_site\": \"soil\",\n", + " \"add_date\": \"22-Jun-18 04.28.47.015000 PM\",\n", + " \"mod_date\": \"01-Oct-19 09.41.01.459000 AM\",\n", + " \"id\": \"igsn:IEWFS000J\",\n", + " \"identifier\": \"igsn:IEWFS000J\",\n", + " },\n", "]" ] }, @@ -231,7 +217,9 @@ "metadata": {}, "outputs": [], "source": [ - "assert db_share.biosample_set.count_documents({\"id\": {\"$in\": [d[\"id\"] for d in biosamples]}}) == len(biosamples)" + "assert db_share.biosample_set.count_documents(\n", + " {\"id\": {\"$in\": [d[\"id\"] for d in biosamples]}}\n", + ") == len(biosamples)" ] } ], diff --git a/metadata-translation/notebooks/gold-biosample-null-value-analysis.ipynb b/metadata-translation/notebooks/gold-biosample-null-value-analysis.ipynb index a9d3c04c..2a5e1fe1 100644 --- a/metadata-translation/notebooks/gold-biosample-null-value-analysis.ipynb +++ b/metadata-translation/notebooks/gold-biosample-null-value-analysis.ipynb @@ -284,10 +284,31 @@ "outputs": [], "source": [ "## exclude columns that we don't care about\n", - "unwanted = ['BIOSAMPLE_ID', 'BIOSAMPLE_NAME', 'ADD_DATE', 'MOD_DATE', 'DESCRIPTION', 'ECOSYSTEM', \n", - " 'ECOSYSTEM_CATEGORY', 'ECOSYSTEM_TYPE', 'ECOSYSTEM_SUBTYPE', 'SPECIFIC_ECOSYSTEM', \n", - " 'PUBLIC_SP_COUNT', 'IS_TEST', 'ADMIN_DAP_COUNT', 'PUBLIC_AP_COUNT', 'ADMIN_SP_COUNT', 'MOD_BY',\n", - " 'ACTIVE', 'GOLD_ID', 'IS_PUBLIC', 'SUBMITTER_ID', 'PUBLIC_DAP_COUNT', 'ADMIN_AP_COUNT', 'SPECIMEN']\n", + "unwanted = [\n", + " \"BIOSAMPLE_ID\",\n", + " \"BIOSAMPLE_NAME\",\n", + " \"ADD_DATE\",\n", + " \"MOD_DATE\",\n", + " \"DESCRIPTION\",\n", + " \"ECOSYSTEM\",\n", + " \"ECOSYSTEM_CATEGORY\",\n", + " \"ECOSYSTEM_TYPE\",\n", + " \"ECOSYSTEM_SUBTYPE\",\n", + " \"SPECIFIC_ECOSYSTEM\",\n", + " \"PUBLIC_SP_COUNT\",\n", + " \"IS_TEST\",\n", + " \"ADMIN_DAP_COUNT\",\n", + " \"PUBLIC_AP_COUNT\",\n", + " \"ADMIN_SP_COUNT\",\n", + " \"MOD_BY\",\n", + " \"ACTIVE\",\n", + " \"GOLD_ID\",\n", + " \"IS_PUBLIC\",\n", + " \"SUBMITTER_ID\",\n", + " \"PUBLIC_DAP_COUNT\",\n", + " \"ADMIN_AP_COUNT\",\n", + " \"SPECIMEN\",\n", + "]\n", "cols = [c for c in df.columns if c not in unwanted]" ] }, @@ -309,7 +330,7 @@ "source": [ "## get null counts\n", "series = subdf.isnull().sum().sort_values()\n", - "countdf = pds.DataFrame({'field': series.index, 'null_count': series.values})" + "countdf = pds.DataFrame({\"field\": series.index, \"null_count\": series.values})" ] }, { @@ -319,7 +340,7 @@ "outputs": [], "source": [ "## get counts of non-nulls\n", - "countdf['non_null_count'] = len(df) - countdf.null_count" + "countdf[\"non_null_count\"] = len(df) - countdf.null_count" ] }, { @@ -329,9 +350,12 @@ "outputs": [], "source": [ "## get perecentage of nulls\n", - "countdf['percent_null'] = (countdf['null_count'] / (countdf['null_count'] + countdf['non_null_count'])) * 100\n", - "countdf['percent_null'] = \\\n", - " countdf['percent_null'].map(lambda x: \"0%\" if np.isinf(x) else \"{0:.2f}%\".format(x)) # format as percent" + "countdf[\"percent_null\"] = (\n", + " countdf[\"null_count\"] / (countdf[\"null_count\"] + countdf[\"non_null_count\"])\n", + ") * 100\n", + "countdf[\"percent_null\"] = countdf[\"percent_null\"].map(\n", + " lambda x: \"0%\" if np.isinf(x) else \"{0:.2f}%\".format(x)\n", + ") # format as percent" ] }, { @@ -2173,8 +2197,8 @@ } ], "source": [ - "pds.set_option('display.max_rows', None)\n", - "countdf.sort_values(by=['non_null_count'], ascending=False)" + "pds.set_option(\"display.max_rows\", None)\n", + "countdf.sort_values(by=[\"non_null_count\"], ascending=False)" ] }, { diff --git a/metadata-translation/notebooks/gold_ids_to_igsns.ipynb b/metadata-translation/notebooks/gold_ids_to_igsns.ipynb index 1a40ca73..c31b2b1c 100644 --- a/metadata-translation/notebooks/gold_ids_to_igsns.ipynb +++ b/metadata-translation/notebooks/gold_ids_to_igsns.ipynb @@ -33,6 +33,7 @@ "tic = time()\n", "\n", "from dotenv import load_dotenv\n", + "\n", "load_dotenv(os.path.expanduser(\"~/.nmdc_mongo.env\"))" ] }, @@ -58,7 +59,7 @@ " fetch_json,\n", " get_db,\n", " reset_database,\n", - " snake_case_set_name\n", + " snake_case_set_name,\n", ")" ] }, @@ -81,21 +82,27 @@ "\n", "existing_set_names = set(dbschema[\"properties\"])\n", "\n", - "for object_without_set in (defined_object_names - set(set_for_object_name.keys())):\n", + "for object_without_set in defined_object_names - set(set_for_object_name.keys()):\n", " proposed_set_name = snake_case_set_name(object_without_set)\n", " if proposed_set_name not in existing_set_names:\n", " dbschema[\"properties\"][proposed_set_name] = {\n", - " \"description\": (f\"This property links a database object to the set of\"\n", - " f\" {object_without_set} objects within it.\"),\n", + " \"description\": (\n", + " f\"This property links a database object to the set of\"\n", + " f\" {object_without_set} objects within it.\"\n", + " ),\n", " \"items\": {\"$ref\": f\"#/definitions/{object_without_set}\"},\n", " \"type\": \"array\",\n", " }\n", - " \n", - "dbschema = assoc_in(dbschema, [\"definitions\", \"ControlledTermValue\", \"properties\", \"term\", \"type\"], \"string\")\n", + "\n", + "dbschema = assoc_in(\n", + " dbschema,\n", + " [\"definitions\", \"ControlledTermValue\", \"properties\", \"term\", \"type\"],\n", + " \"string\",\n", + ")\n", "del dbschema[\"definitions\"][\"ControlledTermValue\"][\"properties\"][\"term\"][\"$ref\"]\n", "\n", "# 'k' not capitalized upstream perhaps. should conform!\n", - "#dbschema = assoc_in(dbschema, [\"definitions\", \"MetagenomeAssembly\", \"properties\", \"scaf_l_gt50k\", \"type\"], \"number\")" + "# dbschema = assoc_in(dbschema, [\"definitions\", \"MetagenomeAssembly\", \"properties\", \"scaf_l_gt50k\", \"type\"], \"number\")" ] }, { @@ -126,7 +133,7 @@ "metadata": {}, "outputs": [], "source": [ - "with ZipFile('../src/data/nmdc_database.json.zip') as myzip:\n", + "with ZipFile(\"../src/data/nmdc_database.json.zip\") as myzip:\n", " # may be e.g. 'metadata-translation/src/bin/output/nmdc_database.json' rather than 'nmdc_database.json'\n", " name = next(n for n in myzip.namelist() if n.endswith(\"nmdc_database.json\"))\n", " with myzip.open(name) as f:\n", @@ -149,7 +156,7 @@ "metadata": {}, "outputs": [], "source": [ - "#add_to_db(nmdc_database[\"study_set\"], get_db(\"dwinston_share\"), \"study_set\")" + "# add_to_db(nmdc_database[\"study_set\"], get_db(\"dwinston_share\"), \"study_set\")" ] }, { @@ -160,7 +167,7 @@ "source": [ "from nmdc_mongo import validator_for\n", "\n", - "#validator_for(db.study_set)" + "# validator_for(db.study_set)" ] }, { @@ -196,7 +203,7 @@ " docs = nmdc_database[collection]\n", " object_types = {d.get(\"type\", \"nmdc:\")[5:] for d in docs} - {\"\"}\n", " if any(d for d in docs if \"type\" not in d):\n", - " print(\"some\",collection,\"docs have no type\")\n", + " print(\"some\", collection, \"docs have no type\")\n", " print(collection, object_types)" ] }, @@ -344,22 +351,30 @@ " if \"lat_lon\" in d_new:\n", " d_new[\"lat_lon\"].pop(\"type\", None)\n", " for k_float in (\n", - " \"asm_score\", \"ctg_logsum\", \"ctg_powsum\", \"gap_pct\", \"gc_avg\", \"gc_std\",\n", - " \"scaf_logsum\", \"scaf_powsum\"):\n", + " \"asm_score\",\n", + " \"ctg_logsum\",\n", + " \"ctg_powsum\",\n", + " \"gap_pct\",\n", + " \"gc_avg\",\n", + " \"gc_std\",\n", + " \"scaf_logsum\",\n", + " \"scaf_powsum\",\n", + " ):\n", " if k_float in d_new:\n", - " d_new[k_float] = float(d_new[k_float]) \n", + " d_new[k_float] = float(d_new[k_float])\n", " keys_with_term_ids = [\n", - " k for k in d_new\n", + " k\n", + " for k in d_new\n", " if isinstance(d_new[k], dict)\n", " and \"term\" in d_new[k]\n", " and \"id\" in d_new[k][\"term\"]\n", " ]\n", " for k in keys_with_term_ids:\n", " d_new = assoc_in(d_new, [k, \"term\"], d_new[k][\"term\"][\"id\"])\n", - " \n", + "\n", " key = target_collection[type_] if type_ else source_collection\n", " docs_per_target[key].append(d_new)\n", - " \n", + "\n", " for collection_name, docs in docs_per_target.items():\n", " print(collection_name)\n", " payload = fetch_and_validate_json(docs, collection_name=collection_name)\n", @@ -417,7 +432,9 @@ "\n", "gold_id_pattern = re.compile(r\"Gb\\d+\")\n", "\n", - "with open('../src/data/FICUS_Soil_Gs0135149_Brodie-12-23-2020_PS.xlsx - Brodie_Gs0135149_Soil_Metadata.csv') as f:\n", + "with open(\n", + " \"../src/data/FICUS_Soil_Gs0135149_Brodie-12-23-2020_PS.xlsx - Brodie_Gs0135149_Soil_Metadata.csv\"\n", + ") as f:\n", " reader = csv.reader(f)\n", " for row in reader:\n", " gold_id = row[GOLD_ID_IDX]\n", @@ -441,9 +458,12 @@ "source": [ "from datetime import datetime\n", "\n", - "dt_pattern = re.compile(r\"\\d{2}-(?P\\w+)-\\d{2} \\d{2}\\.\\d{2}\\.\\d{2}\\.(?P\\d+) [A|P]M\")\n", + "dt_pattern = re.compile(\n", + " r\"\\d{2}-(?P\\w+)-\\d{2} \\d{2}\\.\\d{2}\\.\\d{2}\\.(?P\\d+) [A|P]M\"\n", + ")\n", "dt_format = \"%d-%b-%y %I.%M.%S.%f %p\"\n", "\n", + "\n", "def order_timestamps(timestamps):\n", " if not all(isinstance(ts, str) for ts in timestamps):\n", " raise Exception(f\"{timestamps} not strings\")\n", @@ -452,7 +472,9 @@ " match = dt_pattern.search(ts)\n", " first, month, rest = ts.partition(match.group(\"month\"))\n", " ts_new = first + month[0] + month[1:].lower() + rest\n", - " ts_new = ts_new.replace(match.group(\"ns\"), match.group(\"ns\")[:-3]) # truncate to microseconds\n", + " ts_new = ts_new.replace(\n", + " match.group(\"ns\"), match.group(\"ns\")[:-3]\n", + " ) # truncate to microseconds\n", " as_datetimes.append(datetime.strptime(ts_new, dt_format))\n", " sorted_dts = sorted(as_datetimes)\n", " return [dt.strftime(dt_format) for dt in sorted_dts]" @@ -476,19 +498,22 @@ "\n", "er_xna_pattern = re.compile(r\"ER_[D|R]NA_\\d+$\")\n", "\n", + "\n", "def rstrip_name_ER_ID(d):\n", " s = get_in([\"name\"], d)\n", " s_new = er_xna_pattern.split(s)[0] if er_xna_pattern.search(s) else s\n", " return assoc_in(d, [\"name\"], s_new)\n", "\n", + "\n", "def capitalize_location(d):\n", " s = get_in([\"location\"], d)\n", " if s is not None:\n", - " s_new = (s[0].upper() + s[1:])\n", + " s_new = s[0].upper() + s[1:]\n", " return assoc_in(d, [\"location\"], s_new)\n", " else:\n", " return d\n", "\n", + "\n", "pipeline = compose(\n", " capitalize_location,\n", " rstrip_name_ER_ID,\n", @@ -515,13 +540,15 @@ "merged_biosample_docs = []\n", "\n", "for igsn, golds in igsn_golds.items():\n", - " igsn_curie = \"igsn:\"+igsn\n", - " to_change = list(db.biosample_set.find({\"id\": {\"$in\": [f\"gold:{g}\" for g in golds]}}))\n", - " \n", + " igsn_curie = \"igsn:\" + igsn\n", + " to_change = list(\n", + " db.biosample_set.find({\"id\": {\"$in\": [f\"gold:{g}\" for g in golds]}})\n", + " )\n", + "\n", " # No merge needed, just change of id.\n", " if len(to_change) == 1:\n", " merged = assoc_in(to_change[0], [\"id\"], igsn_curie)\n", - " #merged = assoc_in(merged, [\"identifier\"], igsn_curie)\n", + " # merged = assoc_in(merged, [\"identifier\"], igsn_curie)\n", " merged_biosample_docs.append(merged)\n", " continue\n", " elif len(to_change) == 0:\n", @@ -531,7 +558,7 @@ " distilled = list(map(pipeline, to_change))\n", " result = list(diff(distilled[0], distilled[1]))\n", " assert result == []\n", - " \n", + "\n", " # Produce a merged document\n", " earlier_ts, _ = order_timestamps([get_in([\"add_date\"], d) for d in to_change])\n", " merged = assoc_in(distilled[0], [\"add_date\"], earlier_ts)\n", @@ -539,9 +566,9 @@ " merged = assoc_in(merged, [\"mod_date\"], later_ts)\n", " merged = assoc_in(merged, [\"id\"], igsn_curie)\n", " merged = assoc_in(merged, [\"identifier\"], igsn_curie)\n", - " \n", + "\n", " merged_biosample_docs.append(merged)\n", - " merged = None # defense against accidental reuse during next iteration.\n", + " merged = None # defense against accidental reuse during next iteration.\n", "\n", "assert len(merged_biosample_docs) == len(igsn_golds)" ] @@ -573,7 +600,9 @@ "from pymongo import DeleteMany, InsertOne\n", "from toolz import concat\n", "\n", - "requests = [DeleteMany({\"id\": {\"$in\": [\"gold:\"+g for g in concat(igsn_golds.values())]}})]\n", + "requests = [\n", + " DeleteMany({\"id\": {\"$in\": [\"gold:\" + g for g in concat(igsn_golds.values())]}})\n", + "]\n", "requests.extend([InsertOne(d) for d in merged_biosample_docs])\n", "result = db.biosample_set.bulk_write(requests)\n", "result.deleted_count, result.inserted_count" @@ -605,12 +634,14 @@ "outputs": [], "source": [ "requests = []\n", - "to_replace = {\"gold:\"+k: \"igsn:\"+v for k, v in goldid_igsn.items()}\n", + "to_replace = {\"gold:\" + k: \"igsn:\" + v for k, v in goldid_igsn.items()}\n", "\n", "for doc in db.omics_processing_set.find({\"has_input\": {\"$in\": list(to_replace)}}):\n", - " operations = {\"$set\": {\n", - " \"has_input\": [to_replace.get(i, i) for i in doc[\"has_input\"]],\n", - " }}\n", + " operations = {\n", + " \"$set\": {\n", + " \"has_input\": [to_replace.get(i, i) for i in doc[\"has_input\"]],\n", + " }\n", + " }\n", " requests.append({\"filter\": {\"_id\": doc[\"_id\"]}, \"update\": operations})" ] }, @@ -665,7 +696,9 @@ "\n", "emsl_ids_pattern = re.compile(r\"\\d+\")\n", "\n", - "with open('../src/data/FICUS_Soil_Gs0135149_Brodie-12-23-2020_PS.xlsx - Brodie_Gs0135149_Soil_Metadata.csv') as f:\n", + "with open(\n", + " \"../src/data/FICUS_Soil_Gs0135149_Brodie-12-23-2020_PS.xlsx - Brodie_Gs0135149_Soil_Metadata.csv\"\n", + ") as f:\n", " reader = csv.reader(f)\n", " for row in reader:\n", " emsl_ids = row[EMSL_IDS_IDX]\n", @@ -695,7 +728,8 @@ "outputs": [], "source": [ "n_with_emsl_id = db.omics_processing_set.count_documents(\n", - " {\"id\": {\"$in\": [\"emsl:\"+i for i in emslid_igsn]}})" + " {\"id\": {\"$in\": [\"emsl:\" + i for i in emslid_igsn]}}\n", + ")" ] }, { @@ -705,20 +739,24 @@ "outputs": [], "source": [ "requests = []\n", - "to_replace = {\"emsl:\"+k: \"igsn:\"+v for k, v in emslid_igsn.items()}\n", - "to_replace.update({\"emsl:output_\"+k: \"igsn:\"+v for k, v in emslid_igsn.items()})\n", + "to_replace = {\"emsl:\" + k: \"igsn:\" + v for k, v in emslid_igsn.items()}\n", + "to_replace.update({\"emsl:output_\" + k: \"igsn:\" + v for k, v in emslid_igsn.items()})\n", + "\n", "\n", "def omit(blacklist, d):\n", " return keyfilter(lambda k: k not in blacklist, d)\n", "\n", + "\n", "def sans_mongo_id(d):\n", " return omit([\"_id\"], d)\n", "\n", "\n", "for doc in db.omics_processing_set.find({\"has_input\": {\"$in\": list(to_replace)}}):\n", - " operations = {\"$set\": {\n", - " \"has_input\": [to_replace.get(i, i) for i in doc[\"has_input\"]],\n", - " }}\n", + " operations = {\n", + " \"$set\": {\n", + " \"has_input\": [to_replace.get(i, i) for i in doc[\"has_input\"]],\n", + " }\n", + " }\n", " requests.append({\"filter\": {\"_id\": doc[\"_id\"]}, \"update\": operations})" ] }, @@ -740,8 +778,9 @@ "outputs": [], "source": [ "from nmdc_mongo import validator_for\n", + "\n", "db_share = get_db(\"dwinston_share\")\n", - "#validator_for(db_share.biosample_set)" + "# validator_for(db_share.biosample_set)" ] }, { @@ -755,7 +794,7 @@ "admin_client = MongoClient(\n", " host=os.getenv(\"NMDC_MONGO_HOST\"),\n", " username=\"nmdc-admin\",\n", - " password=os.getenv(\"NMDC_MONGO_ADMIN_PWD\")\n", + " password=os.getenv(\"NMDC_MONGO_ADMIN_PWD\"),\n", ")\n", "admin_dwinston_share = admin_client[\"dwinston_share\"]" ] @@ -819,7 +858,9 @@ "source": [ "from nmdc_mongo.admin import reset_database_schema\n", "\n", - "reset_database_schema(admin_client[\"dwinston_share\"], target_collection_names, collschemas)" + "reset_database_schema(\n", + " admin_client[\"dwinston_share\"], target_collection_names, collschemas\n", + ")" ] }, { diff --git a/metadata-translation/notebooks/metaP_stegen.ipynb b/metadata-translation/notebooks/metaP_stegen.ipynb index 99c273f7..5a52c64b 100644 --- a/metadata-translation/notebooks/metaP_stegen.ipynb +++ b/metadata-translation/notebooks/metaP_stegen.ipynb @@ -9,6 +9,7 @@ "import os\n", "\n", "from dotenv import load_dotenv\n", + "\n", "load_dotenv(os.path.expanduser(\"~/.nmdc_mongo.env\"))\n", "\n", "from nmdc_mongo import get_db\n", @@ -35,7 +36,7 @@ " fetch_json,\n", " get_db,\n", " reset_database,\n", - " snake_case_set_name\n", + " snake_case_set_name,\n", ")" ] }, @@ -45,7 +46,11 @@ "metadata": {}, "outputs": [], "source": [ - "dbschema = assoc_in(dbschema, [\"definitions\", \"ControlledTermValue\", \"properties\", \"term\", \"type\"], \"string\")\n", + "dbschema = assoc_in(\n", + " dbschema,\n", + " [\"definitions\", \"ControlledTermValue\", \"properties\", \"term\", \"type\"],\n", + " \"string\",\n", + ")\n", "del dbschema[\"definitions\"][\"ControlledTermValue\"][\"properties\"][\"term\"][\"$ref\"]\n", "collschemas = collschemas_for(dbschema)" ] @@ -56,15 +61,18 @@ "metadata": {}, "outputs": [], "source": [ - "to_fetch = [{\n", - " # >200MB\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/stegen_MetaProteomicAnalysis_activity.json\",\n", - " \"type\": \"metaproteomics_analysis_activity_set\",\n", - "}, {\n", - " # ~50KB\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/stegen_emsl_analysis_data_objects.json\",\n", - " \"type\": \"data_object_set\"\n", - "}]" + "to_fetch = [\n", + " {\n", + " # >200MB\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/stegen_MetaProteomicAnalysis_activity.json\",\n", + " \"type\": \"metaproteomics_analysis_activity_set\",\n", + " },\n", + " {\n", + " # ~50KB\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/stegen_emsl_analysis_data_objects.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + "]" ] }, { @@ -81,7 +89,6 @@ " \"SpectralCount\": (\"peptide_spectral_count\", int),\n", " \"BestProtein\": (\"best_protein\", identity),\n", " \"min(QValue)\": (\"min_q_value\", float),\n", - " \n", " \"peptide_sequence\": (\"peptide_sequence\", identity),\n", " \"peptide_sum_masic_abundance\": (\"peptide_sum_masic_abundance\", int),\n", " \"peptide_spectral_count\": (\"peptide_spectral_count\", int),\n", @@ -183,13 +190,16 @@ "source": [ "# 33rd analysis\n", "\n", - "to_fetch = [{\n", - " \"url\": \"https://portal.nersc.gov/project/m3408/meta/501128_1781_100340_stegen_MetaProteomicAnalysis_activity.json\",\n", - " \"type\": \"metaproteomics_analysis_activity_set\",\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/project/m3408/meta/501128_1781_100340_stegen_emsl_analysis_data_objects.json\",\n", - " \"type\": \"data_object_set\"\n", - "}]" + "to_fetch = [\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/project/m3408/meta/501128_1781_100340_stegen_MetaProteomicAnalysis_activity.json\",\n", + " \"type\": \"metaproteomics_analysis_activity_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/project/m3408/meta/501128_1781_100340_stegen_emsl_analysis_data_objects.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + "]" ] }, { diff --git a/metadata-translation/notebooks/metadata_db_admin.ipynb b/metadata-translation/notebooks/metadata_db_admin.ipynb index 60ed62a3..ac866588 100644 --- a/metadata-translation/notebooks/metadata_db_admin.ipynb +++ b/metadata-translation/notebooks/metadata_db_admin.ipynb @@ -15,7 +15,7 @@ "admin_client = MongoClient(\n", " host=os.getenv(\"NMDC_MONGO_HOST\"),\n", " username=\"nmdc-admin\",\n", - " password=os.getenv(\"NMDC_MONGO_ADMIN_PWD\")\n", + " password=os.getenv(\"NMDC_MONGO_ADMIN_PWD\"),\n", ")\n", "admin_db = admin_client[\"admin\"]" ] @@ -28,26 +28,41 @@ "source": [ "# SOMEDAY MAYBE add `authenticationRestrictions` of IP address / CIDR range per user\n", "\n", + "\n", "def create_ro_user(username, pwd=\"\"):\n", - " admin_db.command(\"createUser\", f\"{username}\", pwd=pwd, roles=[\n", - " {\"role\": \"read\", \"db\": f\"{username}_scratch\"},\n", - " {\"role\": \"read\", \"db\": f\"{username}_dev\"},\n", - " {\"role\": \"read\", \"db\": f\"{username}_share\"},\n", - " ])\n", + " admin_db.command(\n", + " \"createUser\",\n", + " f\"{username}\",\n", + " pwd=pwd,\n", + " roles=[\n", + " {\"role\": \"read\", \"db\": f\"{username}_scratch\"},\n", + " {\"role\": \"read\", \"db\": f\"{username}_dev\"},\n", + " {\"role\": \"read\", \"db\": f\"{username}_share\"},\n", + " ],\n", + " )\n", + "\n", "\n", "def create_rw_user(username, pwd=\"\"):\n", - " admin_db.command(\"createUser\", f\"{username}\", pwd=pwd, roles=[\n", - " {\"role\": \"readWrite\", \"db\": f\"{username}_scratch\"},\n", - " {\"role\": \"readWrite\", \"db\": f\"{username}_dev\"},\n", - " {\"role\": \"readWrite\", \"db\": f\"{username}_share\"},\n", - " ])\n", + " admin_db.command(\n", + " \"createUser\",\n", + " f\"{username}\",\n", + " pwd=pwd,\n", + " roles=[\n", + " {\"role\": \"readWrite\", \"db\": f\"{username}_scratch\"},\n", + " {\"role\": \"readWrite\", \"db\": f\"{username}_dev\"},\n", + " {\"role\": \"readWrite\", \"db\": f\"{username}_share\"},\n", + " ],\n", + " )\n", + "\n", "\n", "def usernames():\n", " return sorted(doc[\"user\"] for doc in admin_db.command(\"usersInfo\")[\"users\"])\n", "\n", + "\n", "def username_stems():\n", " return sorted({u[:-3] for u in usernames() if u.endswith(\"_rw\")})\n", "\n", + "\n", "def grant_read_roles_for_share_dbs(username):\n", " stems = username_stems()\n", " if not stems:\n", @@ -55,13 +70,16 @@ " admin_db.command(\n", " \"grantRolesToUser\",\n", " username,\n", - " roles=[{\"role\": \"read\", \"db\": f\"{stem}_share\"} for stem in username_stems()])\n", + " roles=[{\"role\": \"read\", \"db\": f\"{stem}_share\"} for stem in username_stems()],\n", + " )\n", + "\n", "\n", "def ensure_share_reads():\n", - " for (stem, suffix) in itertools.product(username_stems(), (\"_ro\", \"_rw\")):\n", - " username = stem+suffix\n", + " for stem, suffix in itertools.product(username_stems(), (\"_ro\", \"_rw\")):\n", + " username = stem + suffix\n", " grant_read_roles_for_share_dbs(username)\n", "\n", + "\n", "def nwordspass(n=5, sep=\"-\", words_file=\"/usr/share/dict/words\"):\n", " with open(words_file) as f:\n", " lines = f.readlines()\n", @@ -69,16 +87,17 @@ " result = sep.join(random.sample(words, n))\n", " return result\n", "\n", + "\n", "def ensure_users(email):\n", - " username_stem = email.split('@')[0]\n", + " username_stem = email.split(\"@\")[0]\n", " names = set(usernames())\n", - " result = {\"email\": email} \n", - " user_ro = username_stem+\"_ro\"\n", + " result = {\"email\": email}\n", + " user_ro = username_stem + \"_ro\"\n", " if user_ro not in names:\n", " pwd = nwordspass()\n", " create_ro_user(user_ro, pwd=pwd)\n", " result[user_ro] = pwd\n", - " user_rw = username_stem+\"_rw\"\n", + " user_rw = username_stem + \"_rw\"\n", " if user_rw not in names:\n", " pwd = nwordspass()\n", " create_rw_user(user_rw, pwd=pwd)\n", @@ -102,7 +121,7 @@ "metadata": {}, "outputs": [], "source": [ - "#username_stems()" + "# username_stems()" ] }, { diff --git a/metadata-translation/notebooks/mongo_etl_demo.ipynb b/metadata-translation/notebooks/mongo_etl_demo.ipynb index 6d8aac75..f0b71ff5 100644 --- a/metadata-translation/notebooks/mongo_etl_demo.ipynb +++ b/metadata-translation/notebooks/mongo_etl_demo.ipynb @@ -33,6 +33,7 @@ "tic = time()\n", "\n", "from dotenv import load_dotenv\n", + "\n", "load_dotenv(os.path.expanduser(\"~/.nmdc_mongo.env\"))" ] }, @@ -55,7 +56,7 @@ " fetch_json,\n", " get_db,\n", " reset_database,\n", - " snake_case_set_name\n", + " snake_case_set_name,\n", ")" ] }, @@ -78,17 +79,23 @@ "\n", "existing_set_names = set(dbschema[\"properties\"])\n", "\n", - "for object_without_set in (defined_object_names - set(set_for_object_name.keys())):\n", + "for object_without_set in defined_object_names - set(set_for_object_name.keys()):\n", " proposed_set_name = snake_case_set_name(object_without_set)\n", " if proposed_set_name not in existing_set_names:\n", " dbschema[\"properties\"][proposed_set_name] = {\n", - " \"description\": (f\"This property links a database object to the set of\"\n", - " f\" {object_without_set} objects within it.\"),\n", + " \"description\": (\n", + " f\"This property links a database object to the set of\"\n", + " f\" {object_without_set} objects within it.\"\n", + " ),\n", " \"items\": {\"$ref\": f\"#/definitions/{object_without_set}\"},\n", " \"type\": \"array\",\n", " }\n", "\n", - "dbschema = assoc_in(dbschema, [\"definitions\", \"ControlledTermValue\", \"properties\", \"term\", \"type\"], \"string\")\n", + "dbschema = assoc_in(\n", + " dbschema,\n", + " [\"definitions\", \"ControlledTermValue\", \"properties\", \"term\", \"type\"],\n", + " \"string\",\n", + ")\n", "del dbschema[\"definitions\"][\"ControlledTermValue\"][\"properties\"][\"term\"][\"$ref\"]\n", "\n", "# 'k' not capitalized upstream perhaps. should conform!\n", @@ -136,49 +143,64 @@ "metadata": {}, "outputs": [], "source": [ - "to_fetch = [{\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/mt_annotation_objects.json\",\n", - " \"type\": \"metagenome_annotation_activity_set\"\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/mt_annotation_data_objects.json\",\n", - " \"type\": \"data_object_set\"\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/project/m3408/meta/metagenomeAssembly_activity.json\",\n", - " \"type\": \"metagenome_assembly_set\",\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/project/m3408/meta/metagenomeAssembly_data_objects.json\",\n", - " \"type\": \"data_object_set\",\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/ReadbasedAnalysis_activity.json\",\n", - " \"type\": \"read_based_analysis_activity_set\"\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/ReadbasedAnalysis_data_objects.json\",\n", - " \"type\": \"data_object_set\"\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/MAGs_activity.json\",\n", - " \"type\": \"mags_activity_set\",\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/MAGs_data_objects.json\",\n", - " \"type\": \"data_object_set\"\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/project/m3408/meta/readQC_activity.json\",\n", - " \"type\": \"read_QC_analysis_activity_set\"\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/project/m3408/meta/readQC_activity_data_objects.json\",\n", - " \"type\": \"data_object_set\"\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/img_mg_annotation_objects.json\",\n", - " \"type\": \"metagenome_annotation_activity_set\",\n", - "}, {\n", - " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/img_mg_annotation_data_objects.json\",\n", - " \"type\": \"data_object_set\",\n", - "}, {\n", - " \"url\": \"https://nmdcdemo.emsl.pnnl.gov/metabolomics/registration/gcms_metabolomics_data_products.json\",\n", - " \"type\": \"data_object_set\"\n", - "}, {\n", - " \"url\": \"https://nmdcdemo.emsl.pnnl.gov/nom/registration/ftms_nom_data_products.json\",\n", - " \"type\": \"data_object_set\"\n", - "}]" + "to_fetch = [\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/mt_annotation_objects.json\",\n", + " \"type\": \"metagenome_annotation_activity_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/mt_annotation_data_objects.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/project/m3408/meta/metagenomeAssembly_activity.json\",\n", + " \"type\": \"metagenome_assembly_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/project/m3408/meta/metagenomeAssembly_data_objects.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/ReadbasedAnalysis_activity.json\",\n", + " \"type\": \"read_based_analysis_activity_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/ReadbasedAnalysis_data_objects.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/MAGs_activity.json\",\n", + " \"type\": \"mags_activity_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/MAGs_data_objects.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/project/m3408/meta/readQC_activity.json\",\n", + " \"type\": \"read_QC_analysis_activity_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/project/m3408/meta/readQC_activity_data_objects.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/img_mg_annotation_objects.json\",\n", + " \"type\": \"metagenome_annotation_activity_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://portal.nersc.gov/cfs/m3408/meta/img_mg_annotation_data_objects.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://nmdcdemo.emsl.pnnl.gov/metabolomics/registration/gcms_metabolomics_data_products.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + " {\n", + " \"url\": \"https://nmdcdemo.emsl.pnnl.gov/nom/registration/ftms_nom_data_products.json\",\n", + " \"type\": \"data_object_set\",\n", + " },\n", + "]" ] }, { @@ -526,11 +548,16 @@ "source": [ "from tqdm.notebook import tqdm\n", "\n", - "error_urls = fetch_conform_and_persist_from_manifest({\n", - " \"url_manifest\": (\"https://nmdcdemo.emsl.pnnl.gov/metabolomics/registration/\"\n", - " \"gcms_metabolomics_metadata_products.json\"),\n", - " \"type\": \"metabolomics_analysis_activity_set\"\n", - "}, db)\n", + "error_urls = fetch_conform_and_persist_from_manifest(\n", + " {\n", + " \"url_manifest\": (\n", + " \"https://nmdcdemo.emsl.pnnl.gov/metabolomics/registration/\"\n", + " \"gcms_metabolomics_metadata_products.json\"\n", + " ),\n", + " \"type\": \"metabolomics_analysis_activity_set\",\n", + " },\n", + " db,\n", + ")\n", "len(error_urls)" ] }, @@ -567,11 +594,16 @@ "source": [ "from tqdm.notebook import tqdm\n", "\n", - "error_urls = fetch_conform_and_persist_from_manifest({\n", - " \"url_manifest\": (\"https://nmdcdemo.emsl.pnnl.gov/nom/registration/\"\n", - " \"ftms_nom_metadata_products.json\"),\n", - " \"type\": \"nom_analysis_activity_set\"\n", - "}, db)\n", + "error_urls = fetch_conform_and_persist_from_manifest(\n", + " {\n", + " \"url_manifest\": (\n", + " \"https://nmdcdemo.emsl.pnnl.gov/nom/registration/\"\n", + " \"ftms_nom_metadata_products.json\"\n", + " ),\n", + " \"type\": \"nom_analysis_activity_set\",\n", + " },\n", + " db,\n", + ")\n", "len(error_urls)" ] }, @@ -591,8 +623,7 @@ "outputs": [], "source": [ "target_collection_names = [\n", - " name for name in db.list_collection_names()\n", - " if db[name].count_documents({}) > 0\n", + " name for name in db.list_collection_names() if db[name].count_documents({}) > 0\n", "]" ] }, diff --git a/metadata-translation/notebooks/test-changesheet-with-separator.ipynb b/metadata-translation/notebooks/test-changesheet-with-separator.ipynb index 8fa3b7ee..22c78937 100644 --- a/metadata-translation/notebooks/test-changesheet-with-separator.ipynb +++ b/metadata-translation/notebooks/test-changesheet-with-separator.ipynb @@ -10,6 +10,7 @@ "# append path info to access api\n", "import os, sys\n", "from git_root import git_root\n", + "\n", "sys.path.append(os.path.abspath(git_root(\"nmdc_runtime/api/core\")))" ] }, @@ -44,11 +45,11 @@ "metadata": {}, "outputs": [], "source": [ - "with open('data/study-data1.json') as f:\n", + "with open(\"data/study-data1.json\") as f:\n", " study1 = json.load(f)\n", - "with open('data/study-data2.json') as f:\n", + "with open(\"data/study-data2.json\") as f:\n", " study2 = json.load(f)\n", - "with open('data/study-data3.json') as f:\n", + "with open(\"data/study-data3.json\") as f:\n", " study3 = json.load(f)" ] }, @@ -67,12 +68,12 @@ "metadata": {}, "outputs": [], "source": [ - "def get_study_data(_id:str):\n", + "def get_study_data(_id: str):\n", " global study1, study2, study3\n", " for s in (study1, study2, study3):\n", - " if s['id'] == _id.strip():\n", + " if s[\"id\"] == _id.strip():\n", " return s\n", - " return None # if not found" + " return None # if not found" ] }, { @@ -190,7 +191,7 @@ } ], "source": [ - "sheetDf = load_changesheet('data/changesheet-with-separator1.tsv')\n", + "sheetDf = load_changesheet(\"data/changesheet-with-separator1.tsv\")\n", "sheetDf" ] }, @@ -274,8 +275,8 @@ } ], "source": [ - "_id = grouped[0][0] # id is first element\n", - "data1 = get_study_data(_id) # get data for id\n", + "_id = grouped[0][0] # id is first element\n", + "data1 = get_study_data(_id) # get data for id\n", "data1" ] }, @@ -457,8 +458,8 @@ } ], "source": [ - "_id = grouped[1][0] # id is first element\n", - "data2 = get_study_data(_id) # get data for id\n", + "_id = grouped[1][0] # id is first element\n", + "data2 = get_study_data(_id) # get data for id\n", "data2" ] }, @@ -658,12 +659,11 @@ ], "source": [ "for g in grouped:\n", - " _id = g[0] # id is the first element\n", - " changeDf = g[1] # dataframe is the second element\n", - " data = get_study_data(_id) # get data for id\n", + " _id = g[0] # id is the first element\n", + " changeDf = g[1] # dataframe is the second element\n", + " data = get_study_data(_id) # get data for id\n", " print(changeDf)\n", - " print(json.dumps(update_data(data2, changeDf), indent=2))\n", - " " + " print(json.dumps(update_data(data2, changeDf), indent=2))" ] }, { diff --git a/metadata-translation/notebooks/test-changesheet-without-separator.ipynb b/metadata-translation/notebooks/test-changesheet-without-separator.ipynb index 37398d25..ab84a40c 100644 --- a/metadata-translation/notebooks/test-changesheet-without-separator.ipynb +++ b/metadata-translation/notebooks/test-changesheet-without-separator.ipynb @@ -26,7 +26,12 @@ "from dotenv import dotenv_values\n", "from pymongo import MongoClient\n", "from pymongo.database import Database as MongoDatabase\n", - "from nmdc_runtime.api.core.metadata import load_changesheet, update_mongo_db, mongo_update_command_for, copy_docs_in_update_cmd" + "from nmdc_runtime.api.core.metadata import (\n", + " load_changesheet,\n", + " update_mongo_db,\n", + " mongo_update_command_for,\n", + " copy_docs_in_update_cmd,\n", + ")" ] }, { @@ -66,7 +71,11 @@ "metadata": {}, "outputs": [], "source": [ - "client = MongoClient(host=config[\"MONGO_HOST\"], username=config[\"MONGO_USERNAME\"], password=config[\"MONGO_PASSWORD\"])\n", + "client = MongoClient(\n", + " host=config[\"MONGO_HOST\"],\n", + " username=config[\"MONGO_USERNAME\"],\n", + " password=config[\"MONGO_PASSWORD\"],\n", + ")\n", "mongodb = client[\"nmdc_etl_staging\"]" ] }, @@ -150,7 +159,7 @@ "outputs": [], "source": [ "pd.set_option(\"display.max_columns\", None)\n", - "pd.set_option('display.width', 1000)" + "pd.set_option(\"display.width\", 1000)" ] }, { @@ -499,7 +508,9 @@ } ], "source": [ - "pd.read_csv(\"data/changesheet-without-separator3.tsv\", sep=\"\\t\", dtype=\"string\").fillna('')" + "pd.read_csv(\"data/changesheet-without-separator3.tsv\", sep=\"\\t\", dtype=\"string\").fillna(\n", + " \"\"\n", + ")" ] }, { @@ -511,7 +522,7 @@ }, "outputs": [], "source": [ - "sheetDf = load_changesheet('data/changesheet-without-separator3.tsv', mongodb)\n", + "sheetDf = load_changesheet(\"data/changesheet-without-separator3.tsv\", mongodb)\n", "# sheetDf" ] }, diff --git a/metadata-translation/notebooks/test-output.ipynb b/metadata-translation/notebooks/test-output.ipynb index 3e22fb4b..9dc9351a 100644 --- a/metadata-translation/notebooks/test-output.ipynb +++ b/metadata-translation/notebooks/test-output.ipynb @@ -42,6 +42,7 @@ "import lib.data_operations as dop\n", "from pandasql import sqldf\n", "\n", + "\n", "def pysqldf(q):\n", " return sqldf(q, globals())" ] @@ -80,11 +81,22 @@ }, "outputs": [], "source": [ - "study = dop.make_dataframe(\"export.sql/STUDY_DATA_TABLE.dsv\", file_archive_name=\"data/nmdc-version2.zip\")\n", - "contact = dop.make_dataframe(\"export.sql/CONTACT_DATA_TABLE.dsv\", file_archive_name=\"data/nmdc-version2.zip\")\n", - "project = dop.make_dataframe(\"export.sql/PROJECT_DATA_TABLE.dsv\", file_archive_name=\"data/nmdc-version2.zip\")\n", - "project_biosample = dop.make_dataframe(\"export.sql/PROJECT_BIOSAMPLE_DATA_TABLE.dsv\", file_archive_name=\"data/nmdc-version2.zip\")\n", - "biosample = dop.make_dataframe(\"export.sql/BIOSAMPLE_DATA_TABLE.dsv\", file_archive_name=\"data/nmdc-version2.zip\")\n", + "study = dop.make_dataframe(\n", + " \"export.sql/STUDY_DATA_TABLE.dsv\", file_archive_name=\"data/nmdc-version2.zip\"\n", + ")\n", + "contact = dop.make_dataframe(\n", + " \"export.sql/CONTACT_DATA_TABLE.dsv\", file_archive_name=\"data/nmdc-version2.zip\"\n", + ")\n", + "project = dop.make_dataframe(\n", + " \"export.sql/PROJECT_DATA_TABLE.dsv\", file_archive_name=\"data/nmdc-version2.zip\"\n", + ")\n", + "project_biosample = dop.make_dataframe(\n", + " \"export.sql/PROJECT_BIOSAMPLE_DATA_TABLE.dsv\",\n", + " file_archive_name=\"data/nmdc-version2.zip\",\n", + ")\n", + "biosample = dop.make_dataframe(\n", + " \"export.sql/BIOSAMPLE_DATA_TABLE.dsv\", file_archive_name=\"data/nmdc-version2.zip\"\n", + ")\n", "proposals = dop.make_dataframe(\"data/JGI-EMSL-FICUS-proposals.fnl.tsv\")" ] }, @@ -311,7 +323,7 @@ }, "outputs": [], "source": [ - "study_dictdf = study.to_dict(orient=\"records\") # transorm dataframe to dictionary" + "study_dictdf = study.to_dict(orient=\"records\") # transorm dataframe to dictionary" ] }, { @@ -350,13 +362,29 @@ "outputs": [], "source": [ "## specify characteristics\n", - "characteristics = \\\n", - " ['gold_study_name', 'principal_investigator_name', 'add_date', 'mod_date', 'doi',\n", - " 'ecosystem', 'ecosystem_category', 'ecosystem_type', 'ecosystem_subtype', 'specific_ecosystem', 'ecosystem_path_id']\n", + "characteristics = [\n", + " \"gold_study_name\",\n", + " \"principal_investigator_name\",\n", + " \"add_date\",\n", + " \"mod_date\",\n", + " \"doi\",\n", + " \"ecosystem\",\n", + " \"ecosystem_category\",\n", + " \"ecosystem_type\",\n", + " \"ecosystem_subtype\",\n", + " \"specific_ecosystem\",\n", + " \"ecosystem_path_id\",\n", + "]\n", "\n", "## create list of json string objects\n", - "study_json_list = dop.make_json_string_list\\\n", - " (study_dictdf, nmdc.Study, id_key='gold_id', name_key='study_name', description_key=\"description\", characteristic_fields=characteristics)" + "study_json_list = dop.make_json_string_list(\n", + " study_dictdf,\n", + " nmdc.Study,\n", + " id_key=\"gold_id\",\n", + " name_key=\"study_name\",\n", + " description_key=\"description\",\n", + " characteristic_fields=characteristics,\n", + ")" ] }, { @@ -392,7 +420,9 @@ }, "outputs": [], "source": [ - "dop.save_json_string_list(\"output/nmdc-json/study.json\", study_json_list) # save json string list to file" + "dop.save_json_string_list(\n", + " \"output/nmdc-json/study.json\", study_json_list\n", + ") # save json string list to file" ] }, { @@ -497,7 +527,7 @@ }, "outputs": [], "source": [ - "project_dictdf = project.to_dict(orient=\"records\") # transorm dataframe to dictionary" + "project_dictdf = project.to_dict(orient=\"records\") # transorm dataframe to dictionary" ] }, { @@ -516,13 +546,26 @@ "outputs": [], "source": [ "## specify characteristics\n", - "characteristics = \\\n", - " ['add_date', 'mod_date', 'completion_date', 'ncbi_project_name', 'omics_type', 'principal_investigator_name', 'processing_institution']\n", + "characteristics = [\n", + " \"add_date\",\n", + " \"mod_date\",\n", + " \"completion_date\",\n", + " \"ncbi_project_name\",\n", + " \"omics_type\",\n", + " \"principal_investigator_name\",\n", + " \"processing_institution\",\n", + "]\n", "\n", "## create list of json string objects\n", - "project_json_list = dop.make_json_string_list\\\n", - " (project_dictdf, nmdc.OmicsProcessing, id_key='gold_id', name_key='project_name', \n", - " part_of_key=\"study_gold_id\", description_key=\"description\", characteristic_fields=characteristics)" + "project_json_list = dop.make_json_string_list(\n", + " project_dictdf,\n", + " nmdc.OmicsProcessing,\n", + " id_key=\"gold_id\",\n", + " name_key=\"project_name\",\n", + " part_of_key=\"study_gold_id\",\n", + " description_key=\"description\",\n", + " characteristic_fields=characteristics,\n", + ")" ] }, { @@ -558,7 +601,9 @@ }, "outputs": [], "source": [ - "dop.save_json_string_list(\"output/nmdc-json/omics_processing.json\", project_json_list) # save json string list to file" + "dop.save_json_string_list(\n", + " \"output/nmdc-json/omics_processing.json\", project_json_list\n", + ") # save json string list to file" ] }, { @@ -711,7 +756,9 @@ }, "outputs": [], "source": [ - "biosample_dictdf = biosampledf.to_dict(orient=\"records\") # transorm dataframe to dictionary" + "biosample_dictdf = biosampledf.to_dict(\n", + " orient=\"records\"\n", + ") # transorm dataframe to dictionary" ] }, { @@ -730,48 +777,48 @@ "outputs": [], "source": [ "## specify characteristics\n", - "characteristics = \\\n", - " ['add_date',\n", - " 'mod_date',\n", - " 'ecosystem_path_id',\n", - " 'ecosystem',\n", - " 'ecosystem_category',\n", - " 'ecosystem_type',\n", - " 'ecosystem_subtype',\n", - " 'specific_ecosystem',\n", - " 'habitat',\n", - " 'location',\n", - " 'community',\n", - " 'ncbi_taxonomy_name',\n", - " 'geographic_location',\n", - " 'latitude',\n", - " 'longitude',\n", - " 'sample_collection_site',\n", - " 'identifier',\n", - " 'sample_collection_year',\n", - " 'sample_collection_month',\n", - " 'sample_collection_day',\n", - " 'sample_collection_hour',\n", - " 'sample_collection_minute',\n", - " 'host_name',\n", - " 'depth',\n", - " 'subsurface_depth',\n", - " 'altitude',\n", - " 'temperature_range',\n", - " 'proport_woa_temperature',\n", - " 'biogas_temperature',\n", - " 'growth_temperature',\n", - " 'soil_annual_season_temp',\n", - " 'water_samp_store_temp',\n", - " 'biogas_retention_time',\n", - " 'salinity',\n", - " 'pressure',\n", - " 'ph',\n", - " 'chlorophyll_concentration',\n", - " 'nitrate_concentration',\n", - " 'oxygen_concentration',\n", - " 'salinity_concentration'\n", - " ]" + "characteristics = [\n", + " \"add_date\",\n", + " \"mod_date\",\n", + " \"ecosystem_path_id\",\n", + " \"ecosystem\",\n", + " \"ecosystem_category\",\n", + " \"ecosystem_type\",\n", + " \"ecosystem_subtype\",\n", + " \"specific_ecosystem\",\n", + " \"habitat\",\n", + " \"location\",\n", + " \"community\",\n", + " \"ncbi_taxonomy_name\",\n", + " \"geographic_location\",\n", + " \"latitude\",\n", + " \"longitude\",\n", + " \"sample_collection_site\",\n", + " \"identifier\",\n", + " \"sample_collection_year\",\n", + " \"sample_collection_month\",\n", + " \"sample_collection_day\",\n", + " \"sample_collection_hour\",\n", + " \"sample_collection_minute\",\n", + " \"host_name\",\n", + " \"depth\",\n", + " \"subsurface_depth\",\n", + " \"altitude\",\n", + " \"temperature_range\",\n", + " \"proport_woa_temperature\",\n", + " \"biogas_temperature\",\n", + " \"growth_temperature\",\n", + " \"soil_annual_season_temp\",\n", + " \"water_samp_store_temp\",\n", + " \"biogas_retention_time\",\n", + " \"salinity\",\n", + " \"pressure\",\n", + " \"ph\",\n", + " \"chlorophyll_concentration\",\n", + " \"nitrate_concentration\",\n", + " \"oxygen_concentration\",\n", + " \"salinity_concentration\",\n", + "]" ] }, { @@ -790,9 +837,15 @@ "outputs": [], "source": [ "## create list of json string objects\n", - "biosample_json_list = dop.make_json_string_list\\\n", - " (biosample_dictdf, nmdc.Biosample, id_key='gold_id', name_key='biosample_name', \n", - " part_of_key=\"project_gold_ids\", description_key=\"description\", characteristic_fields=characteristics)" + "biosample_json_list = dop.make_json_string_list(\n", + " biosample_dictdf,\n", + " nmdc.Biosample,\n", + " id_key=\"gold_id\",\n", + " name_key=\"biosample_name\",\n", + " part_of_key=\"project_gold_ids\",\n", + " description_key=\"description\",\n", + " characteristic_fields=characteristics,\n", + ")" ] }, { @@ -828,7 +881,9 @@ }, "outputs": [], "source": [ - "dop.save_json_string_list(\"output/nmdc-json/biosample.json\", biosample_json_list) # save json string list to file" + "dop.save_json_string_list(\n", + " \"output/nmdc-json/biosample.json\", biosample_json_list\n", + ") # save json string list to file" ] }, { diff --git a/metadata-translation/notebooks/test-pipeline-scratch.ipynb b/metadata-translation/notebooks/test-pipeline-scratch.ipynb index b9728999..2bfa14b5 100644 --- a/metadata-translation/notebooks/test-pipeline-scratch.ipynb +++ b/metadata-translation/notebooks/test-pipeline-scratch.ipynb @@ -7,8 +7,9 @@ "outputs": [], "source": [ "import os, sys\n", - "sys.path.append(os.path.abspath('../src/bin/lib/')) # add path to lib\n", - "sys.path.append(os.path.abspath('../../schema/')) # add path nmdc.py" + "\n", + "sys.path.append(os.path.abspath(\"../src/bin/lib/\")) # add path to lib\n", + "sys.path.append(os.path.abspath(\"../../schema/\")) # add path nmdc.py" ] }, { @@ -24,11 +25,17 @@ "from dotted_dict import DottedDict\n", "from collections import namedtuple\n", "import data_operations as dop\n", - "from data_operations import make_dataframe, make_dataframe_from_spec_file, unpivot_dataframe\n", + "from data_operations import (\n", + " make_dataframe,\n", + " make_dataframe_from_spec_file,\n", + " unpivot_dataframe,\n", + ")\n", "import pandas as pds\n", "import numpy as np\n", "\n", "from pandasql import sqldf\n", + "\n", + "\n", "def pysqldf(q):\n", " return sqldf(q, globals())" ] @@ -40,7 +47,7 @@ "outputs": [], "source": [ "spec_file = \"../src/bin/lib/nmdc_data_source.yaml\"\n", - "with open(spec_file, 'r') as input_file:\n", + "with open(spec_file, \"r\") as input_file:\n", " spec = DottedDict(yaml.load(input_file, Loader=Loader))" ] }, @@ -74,7 +81,7 @@ "outputs": [], "source": [ "# read data from saved file\n", - "mdf = pds.read_csv('../src/data/nmdc_merged_data.tsv.zip', sep='\\t', dtype=str)\n", + "mdf = pds.read_csv(\"../src/data/nmdc_merged_data.tsv.zip\", sep=\"\\t\", dtype=str)\n", "# mdf = pds.read_csv('../src/data/nmdc_merged_data.tsv.zip', sep='\\t', dtype=str, nrows=100)" ] }, @@ -107,10 +114,13 @@ "metadata": {}, "outputs": [], "source": [ - "tagsdf = pds.DataFrame({\n", - " 'user_id':[1,2,1,3,3,3,4,5],\n", - " 'content_id':[1,1,2,2,2,2,3,4],\n", - " 'tag':['cool','nice','clever','clever','not-bad', '', 'foo', 'bar']})\n" + "tagsdf = pds.DataFrame(\n", + " {\n", + " \"user_id\": [1, 2, 1, 3, 3, 3, 4, 5],\n", + " \"content_id\": [1, 1, 2, 2, 2, 2, 3, 4],\n", + " \"tag\": [\"cool\", \"nice\", \"clever\", \"clever\", \"not-bad\", \"\", \"foo\", \"bar\"],\n", + " }\n", + ")" ] }, { @@ -236,8 +246,10 @@ "# tagsdf['tags'] = tagsdf.groupby('content_id')['tag'].apply(lambda tags: ','.join( filter(None, tags) ))\n", "# tagsdf['tags'] = tagsdf.groupby('content_id')['tag'].apply(lambda tags: ','.join(tags))\n", "# tagsdf['tags'] = tagsdf.groupby(\"content_id\").agg({'tag': lambda tag: ','.join(tag)})\n", - "groups = pds.DataFrame(tagsdf.groupby('content_id')['tag'].apply(lambda tags: ','.join( filter(None, tags) ))).reset_index()\n", - "groups.rename(columns={'tag':'tags'}, inplace=True)" + "groups = pds.DataFrame(\n", + " tagsdf.groupby(\"content_id\")[\"tag\"].apply(lambda tags: \",\".join(filter(None, tags)))\n", + ").reset_index()\n", + "groups.rename(columns={\"tag\": \"tags\"}, inplace=True)" ] }, { @@ -318,7 +330,7 @@ "metadata": {}, "outputs": [], "source": [ - "tagsdf = pds.merge(tagsdf, groups, how='inner', on='content_id')" + "tagsdf = pds.merge(tagsdf, groups, how=\"inner\", on=\"content_id\")" ] }, { @@ -459,7 +471,7 @@ "metadata": {}, "outputs": [], "source": [ - "tagsdf.drop(columns=['tag'], inplace=True)" + "tagsdf.drop(columns=[\"tag\"], inplace=True)" ] }, { @@ -664,9 +676,9 @@ "metadata": {}, "outputs": [], "source": [ - "study_table = dop.extract_table(mdf, 'study_table')\n", - "jgi_emsl_table = dop.extract_table(mdf, 'ficus_jgi_emsl')\n", - "emsl_table = dop.extract_table(mdf, 'ficus_emsl')" + "study_table = dop.extract_table(mdf, \"study_table\")\n", + "jgi_emsl_table = dop.extract_table(mdf, \"ficus_jgi_emsl\")\n", + "emsl_table = dop.extract_table(mdf, \"ficus_emsl\")" ] }, { @@ -964,8 +976,8 @@ "metadata": {}, "outputs": [], "source": [ - "study_table_splice = study_table[['study_id', 'gold_id']].copy()\n", - "jgi_emsl_table_splice = jgi_emsl_table[['gold_study_id', 'emsl_proposal_id']]" + "study_table_splice = study_table[[\"study_id\", \"gold_id\"]].copy()\n", + "jgi_emsl_table_splice = jgi_emsl_table[[\"gold_study_id\", \"emsl_proposal_id\"]]" ] }, { @@ -974,8 +986,13 @@ "metadata": {}, "outputs": [], "source": [ - "temp1_df = \\\n", - " pds.merge(jgi_emsl_table_splice, study_table_splice, how='inner', left_on='gold_study_id', right_on='gold_id')" + "temp1_df = pds.merge(\n", + " jgi_emsl_table_splice,\n", + " study_table_splice,\n", + " how=\"inner\",\n", + " left_on=\"gold_study_id\",\n", + " right_on=\"gold_id\",\n", + ")" ] }, { @@ -1145,17 +1162,17 @@ "metadata": {}, "outputs": [], "source": [ - "study_table = dop.extract_table(mdf, 'study_table')\n", - "contact_table = dop.extract_table(mdf, 'contact_table')\n", - "proposals_table = dop.extract_table(mdf, 'proposals_table')\n", - "project_table = dop.extract_table(mdf, 'project_table')\n", - "jgi_emsl_table = dop.extract_table(mdf, 'ficus_jgi_emsl')\n", - "emsl_table = dop.extract_table(mdf, 'ficus_emsl')\n", - "faa_table = dop.extract_table(mdf, 'ficus_faa_table')\n", - "fna_table = dop.extract_table(mdf, 'ficus_fna_table')\n", - "fastq_table = dop.extract_table(mdf, 'ficus_fasq_table')\n", - "project_biosample_table = dop.extract_table(mdf, 'project_biosample_table')\n", - "biosample_table = dop.extract_table(mdf, 'biosample_table')" + "study_table = dop.extract_table(mdf, \"study_table\")\n", + "contact_table = dop.extract_table(mdf, \"contact_table\")\n", + "proposals_table = dop.extract_table(mdf, \"proposals_table\")\n", + "project_table = dop.extract_table(mdf, \"project_table\")\n", + "jgi_emsl_table = dop.extract_table(mdf, \"ficus_jgi_emsl\")\n", + "emsl_table = dop.extract_table(mdf, \"ficus_emsl\")\n", + "faa_table = dop.extract_table(mdf, \"ficus_faa_table\")\n", + "fna_table = dop.extract_table(mdf, \"ficus_fna_table\")\n", + "fastq_table = dop.extract_table(mdf, \"ficus_fasq_table\")\n", + "project_biosample_table = dop.extract_table(mdf, \"project_biosample_table\")\n", + "biosample_table = dop.extract_table(mdf, \"biosample_table\")" ] }, { @@ -1178,8 +1195,12 @@ } ], "source": [ - "data_objects = dop.make_data_objects_dataframe(faa_table, fna_table, fastq_table, project_table)\n", - "data_objects_dictdf = data_objects.to_dict(orient=\"records\") # transorm dataframe to dictionary" + "data_objects = dop.make_data_objects_dataframe(\n", + " faa_table, fna_table, fastq_table, project_table\n", + ")\n", + "data_objects_dictdf = data_objects.to_dict(\n", + " orient=\"records\"\n", + ") # transorm dataframe to dictionary" ] }, { @@ -1219,9 +1240,9 @@ "metadata": {}, "outputs": [], "source": [ - "pds.set_option('display.width', 10000)\n", - "pds.set_option('display.max_colwidth', 2000)\n", - "files_df.sort_values(by='gold_project_id')" + "pds.set_option(\"display.width\", 10000)\n", + "pds.set_option(\"display.max_colwidth\", 2000)\n", + "files_df.sort_values(by=\"gold_project_id\")" ] }, { @@ -1231,7 +1252,7 @@ "outputs": [], "source": [ "# groups = data_objects.groupby('gold_project_id')['file_id']\n", - "groups = data_objects.groupby('gold_project_id')\n", + "groups = data_objects.groupby(\"gold_project_id\")\n", "# pds.DataFrame(groups).sort_values(by=0)\n", "# pds.DataFrame(groups)" ] @@ -1243,10 +1264,11 @@ "outputs": [], "source": [ "# for g in groups: print(type(g[1]))\n", - "for idx, g in enumerate(groups): \n", + "for idx, g in enumerate(groups):\n", " ser = g[1].file_id\n", " print(list(ser))\n", - " if idx > 0: break\n", + " if idx > 0:\n", + " break\n", "# for g in groups: print(g[1])\n", "# groups = files_df.groupby('gold_project_id')['file_ids']\n", "# pds.DataFrame(groups).sort_values(by=0)\n", @@ -1259,7 +1281,7 @@ "metadata": {}, "outputs": [], "source": [ - "fgroups = data_objects.groupby('gold_project_id')['file_id']" + "fgroups = data_objects.groupby(\"gold_project_id\")[\"file_id\"]" ] }, { @@ -1268,11 +1290,12 @@ "metadata": {}, "outputs": [], "source": [ - "for idx, g in enumerate(fgroups): \n", + "for idx, g in enumerate(fgroups):\n", " ser = g[1]\n", " print(ser)\n", " print(list(ser))\n", - " if idx > 0: break" + " if idx > 0:\n", + " break" ] }, { @@ -1281,7 +1304,9 @@ "metadata": {}, "outputs": [], "source": [ - "pds.DataFrame(fgroups.apply(lambda x:','.join(filter(None, x)))).drop_duplicates().reset_index()" + "pds.DataFrame(\n", + " fgroups.apply(lambda x: \",\".join(filter(None, x)))\n", + ").drop_duplicates().reset_index()" ] }, { @@ -1311,7 +1336,9 @@ "metadata": {}, "outputs": [], "source": [ - "project = dop.make_project_dataframe (project_table, study_table, contact_table, data_objects)" + "project = dop.make_project_dataframe(\n", + " project_table, study_table, contact_table, data_objects\n", + ")" ] }, { @@ -1321,7 +1348,7 @@ "outputs": [], "source": [ "# project.output_files.unique()\n", - "project[['output_files']].head()" + "project[[\"output_files\"]].head()" ] }, { @@ -1330,11 +1357,14 @@ "metadata": {}, "outputs": [], "source": [ - "groups = data_objects.groupby('gold_project_id')['file_id']\n", + "groups = data_objects.groupby(\"gold_project_id\")[\"file_id\"]\n", "\n", - "output_files = \\\n", - " pds.DataFrame(groups.apply(lambda x:','.join(filter(None, x)))).drop_duplicates().reset_index()\n", - "output_files.rename(columns={'file_id': 'output_files'}, inplace=True)" + "output_files = (\n", + " pds.DataFrame(groups.apply(lambda x: \",\".join(filter(None, x))))\n", + " .drop_duplicates()\n", + " .reset_index()\n", + ")\n", + "output_files.rename(columns={\"file_id\": \"output_files\"}, inplace=True)" ] }, { diff --git a/metadata-translation/notebooks/test-pipeline-temp.ipynb b/metadata-translation/notebooks/test-pipeline-temp.ipynb index 92c22edb..fcce7bb6 100644 --- a/metadata-translation/notebooks/test-pipeline-temp.ipynb +++ b/metadata-translation/notebooks/test-pipeline-temp.ipynb @@ -7,7 +7,8 @@ "outputs": [], "source": [ "import os, sys\n", - "sys.path.append(os.path.abspath('../src/bin/lib/')) # add path to lib" + "\n", + "sys.path.append(os.path.abspath(\"../src/bin/lib/\")) # add path to lib" ] }, { @@ -25,6 +26,8 @@ "import data_operations as dop\n", "from pandasql import sqldf\n", "import pandas as pds\n", + "\n", + "\n", "def pysqldf(q):\n", " return sqldf(q, globals())" ] @@ -36,7 +39,7 @@ "outputs": [], "source": [ "spec_file = \"../src/bin/lib/nmdc_data_source.yaml\"\n", - "with open(spec_file, 'r') as input_file:\n", + "with open(spec_file, \"r\") as input_file:\n", " spec = DottedDict(yaml.load(input_file, Loader=Loader))" ] }, @@ -48,7 +51,7 @@ "source": [ "## put all data in one dataframe; for testing grap 20 rows\n", "# mdf = dop.make_dataframe_from_spec_file (spec_file, nrows=20)\n", - "mdf = dop.make_dataframe_from_spec_file (spec_file)\n", + "mdf = dop.make_dataframe_from_spec_file(spec_file)\n", "# mdf.head()" ] }, @@ -83,15 +86,15 @@ "outputs": [], "source": [ "## get tables from merged dataframe\n", - "study_table = dop.extract_table(mdf, 'study_table')\n", - "contact_table = dop.extract_table(mdf, 'contact_table')\n", - "proposals_table = dop.extract_table(mdf, 'proposals_table')\n", - "project_table = dop.extract_table(mdf, 'project_table')\n", - "project_biosample_table = dop.extract_table(mdf, 'project_biosample_table')\n", - "biosample_table = dop.extract_table(mdf, 'biosample_table')\n", - "faa_table = dop.extract_table(mdf, 'ficus_faa_table')\n", - "fna_table = dop.extract_table(mdf, 'ficus_fna_table')\n", - "fasq_table = dop.extract_table(mdf, 'ficus_fasq_table')" + "study_table = dop.extract_table(mdf, \"study_table\")\n", + "contact_table = dop.extract_table(mdf, \"contact_table\")\n", + "proposals_table = dop.extract_table(mdf, \"proposals_table\")\n", + "project_table = dop.extract_table(mdf, \"project_table\")\n", + "project_biosample_table = dop.extract_table(mdf, \"project_biosample_table\")\n", + "biosample_table = dop.extract_table(mdf, \"biosample_table\")\n", + "faa_table = dop.extract_table(mdf, \"ficus_faa_table\")\n", + "fna_table = dop.extract_table(mdf, \"ficus_fna_table\")\n", + "fasq_table = dop.extract_table(mdf, \"ficus_fasq_table\")" ] }, { @@ -137,8 +140,8 @@ "metadata": {}, "outputs": [], "source": [ - "project_biosample_table = dop.extract_table(mdf, 'project_biosample_table')\n", - "biosample_table = dop.extract_table(mdf, 'biosample_table')" + "project_biosample_table = dop.extract_table(mdf, \"project_biosample_table\")\n", + "biosample_table = dop.extract_table(mdf, \"biosample_table\")" ] }, { @@ -147,7 +150,9 @@ "metadata": {}, "outputs": [], "source": [ - "biosample = dop.make_biosample_dataframe(biosample_table, project_biosample_table, project_table)" + "biosample = dop.make_biosample_dataframe(\n", + " biosample_table, project_biosample_table, project_table\n", + ")" ] }, { @@ -178,9 +183,9 @@ "metadata": {}, "outputs": [], "source": [ - "faa_table = dop.extract_table(mdf, 'ficus_faa_table')\n", - "fna_table = dop.extract_table(mdf, 'ficus_fna_table')\n", - "fasq_table = dop.extract_table(mdf, 'ficus_fasq_table')\n", + "faa_table = dop.extract_table(mdf, \"ficus_faa_table\")\n", + "fna_table = dop.extract_table(mdf, \"ficus_fna_table\")\n", + "fasq_table = dop.extract_table(mdf, \"ficus_fasq_table\")\n", "data_objects_table = pds.concat([faa_table, fna_table, fasq_table], axis=0)\n", "# data_objects.head()" ] @@ -191,7 +196,9 @@ "metadata": {}, "outputs": [], "source": [ - "data_objects = dop.make_data_objects_datafame(data_objects_table, project) # NB: using project not project_table\n", + "data_objects = dop.make_data_objects_datafame(\n", + " data_objects_table, project\n", + ") # NB: using project not project_table\n", "# data_objects.head()" ] }, diff --git a/metadata-translation/notebooks/test-pipeline.ipynb b/metadata-translation/notebooks/test-pipeline.ipynb index e71ba177..ddff9f50 100644 --- a/metadata-translation/notebooks/test-pipeline.ipynb +++ b/metadata-translation/notebooks/test-pipeline.ipynb @@ -7,8 +7,9 @@ "outputs": [], "source": [ "import os, sys\n", - "sys.path.append(os.path.abspath('../src/bin/lib/')) # add path to data_opertations.py\n", - "sys.path.append(os.path.abspath('../../schema/')) # add path nmdc.py" + "\n", + "sys.path.append(os.path.abspath(\"../src/bin/lib/\")) # add path to data_opertations.py\n", + "sys.path.append(os.path.abspath(\"../../schema/\")) # add path nmdc.py" ] }, { @@ -31,6 +32,8 @@ "import jsonasobj\n", "\n", "from pandasql import sqldf\n", + "\n", + "\n", "def pysqldf(q):\n", " return sqldf(q, globals())" ] @@ -49,7 +52,7 @@ "outputs": [], "source": [ "spec_file = \"../src/bin/lib/nmdc_data_source.yaml\"\n", - "with open(spec_file, 'r') as input_file:\n", + "with open(spec_file, \"r\") as input_file:\n", " spec = DottedDict(yaml.load(input_file, Loader=Loader))" ] }, @@ -85,7 +88,7 @@ "outputs": [], "source": [ "# read data from saved file\n", - "mdf = pds.read_csv('../src/data/nmdc_merged_data.tsv.zip', sep='\\t', dtype=str)\n", + "mdf = pds.read_csv(\"../src/data/nmdc_merged_data.tsv.zip\", sep=\"\\t\", dtype=str)\n", "# mdf = pds.read_csv('../src/data/nmdc_merged_data.tsv.zip', sep='\\t', dtype=str, nrows=100)" ] }, @@ -109,7 +112,7 @@ } ], "source": [ - "mdf.nmdc_data_source.unique() ## list of the data sources in merged" + "mdf.nmdc_data_source.unique() ## list of the data sources in merged" ] }, { @@ -125,17 +128,17 @@ "metadata": {}, "outputs": [], "source": [ - "study_table = dop.extract_table(mdf, 'study_table')\n", - "contact_table = dop.extract_table(mdf, 'contact_table')\n", - "proposals_table = dop.extract_table(mdf, 'proposals_table')\n", - "project_table = dop.extract_table(mdf, 'project_table')\n", - "jgi_emsl_table = dop.extract_table(mdf, 'ficus_jgi_emsl')\n", - "emsl_table = dop.extract_table(mdf, 'ficus_emsl')\n", - "faa_table = dop.extract_table(mdf, 'ficus_faa_table')\n", - "fna_table = dop.extract_table(mdf, 'ficus_fna_table')\n", - "fastq_table = dop.extract_table(mdf, 'ficus_fastq_table')\n", - "project_biosample_table = dop.extract_table(mdf, 'project_biosample_table')\n", - "biosample_table = dop.extract_table(mdf, 'biosample_table')\n", + "study_table = dop.extract_table(mdf, \"study_table\")\n", + "contact_table = dop.extract_table(mdf, \"contact_table\")\n", + "proposals_table = dop.extract_table(mdf, \"proposals_table\")\n", + "project_table = dop.extract_table(mdf, \"project_table\")\n", + "jgi_emsl_table = dop.extract_table(mdf, \"ficus_jgi_emsl\")\n", + "emsl_table = dop.extract_table(mdf, \"ficus_emsl\")\n", + "faa_table = dop.extract_table(mdf, \"ficus_faa_table\")\n", + "fna_table = dop.extract_table(mdf, \"ficus_fna_table\")\n", + "fastq_table = dop.extract_table(mdf, \"ficus_fastq_table\")\n", + "project_biosample_table = dop.extract_table(mdf, \"project_biosample_table\")\n", + "biosample_table = dop.extract_table(mdf, \"biosample_table\")\n", "# biosample_table.columns" ] }, @@ -153,7 +156,7 @@ "outputs": [], "source": [ "study = dop.make_study_dataframe(study_table, contact_table, proposals_table)\n", - "study_dictdf = study.to_dict(orient=\"records\") # transorm dataframe to dictionary\n", + "study_dictdf = study.to_dict(orient=\"records\") # transorm dataframe to dictionary\n", "# study.gold_id" ] }, @@ -164,19 +167,24 @@ "outputs": [], "source": [ "## specify attributes\n", - "attributes = \\\n", - " ['gold_study_name', 'principal_investigator_name', 'add_date', 'mod_date', 'doi',\n", - " 'ecosystem', 'ecosystem_category', 'ecosystem_type', 'ecosystem_subtype', 'specific_ecosystem']\n", + "attributes = [\n", + " \"gold_study_name\",\n", + " \"principal_investigator_name\",\n", + " \"add_date\",\n", + " \"mod_date\",\n", + " \"doi\",\n", + " \"ecosystem\",\n", + " \"ecosystem_category\",\n", + " \"ecosystem_type\",\n", + " \"ecosystem_subtype\",\n", + " \"specific_ecosystem\",\n", + "]\n", "\n", - "constructor = \\\n", - " {\n", - " 'id': 'gold_id',\n", - " 'name': 'study_name',\n", - " 'description': 'description'\n", - " }\n", + "constructor = {\"id\": \"gold_id\", \"name\": \"study_name\", \"description\": \"description\"}\n", "\n", - "study_json_list = dop.make_json_string_list\\\n", - " (study_dictdf, nmdc.Study, constructor_map=constructor, attribute_fields=attributes)" + "study_json_list = dop.make_json_string_list(\n", + " study_dictdf, nmdc.Study, constructor_map=constructor, attribute_fields=attributes\n", + ")" ] }, { @@ -203,7 +211,9 @@ "metadata": {}, "outputs": [], "source": [ - "dop.save_json_string_list(\"output/test-pipeline/gold_study.json\", study_json_list) # save json string list to file" + "dop.save_json_string_list(\n", + " \"output/test-pipeline/gold_study.json\", study_json_list\n", + ") # save json string list to file" ] }, { @@ -220,7 +230,7 @@ "outputs": [], "source": [ "emsl = dop.make_emsl_dataframe(emsl_table, jgi_emsl_table, study_table)\n", - "emsl_dictdf = emsl.to_dict(orient=\"records\") # transorm dataframe to dictionary" + "emsl_dictdf = emsl.to_dict(orient=\"records\") # transorm dataframe to dictionary" ] }, { @@ -250,24 +260,26 @@ "metadata": {}, "outputs": [], "source": [ - "attributes = \\\n", - " [\n", - " #'file_size_bytes',\n", - " # {'part_of': ({'id': 'gold_study_id'}, nmdc.Study)},\n", - " # {'has_output': ({'id': 'data_object_id'}, nmdc.DataObject)}\n", - " {'part_of': 'gold_study_id'},\n", - " {'has_output': 'data_object_id'}\n", - " ]\n", + "attributes = [\n", + " #'file_size_bytes',\n", + " # {'part_of': ({'id': 'gold_study_id'}, nmdc.Study)},\n", + " # {'has_output': ({'id': 'data_object_id'}, nmdc.DataObject)}\n", + " {\"part_of\": \"gold_study_id\"},\n", + " {\"has_output\": \"data_object_id\"},\n", + "]\n", "\n", - "constructor = \\\n", - " {\n", - " 'id': 'dataset_id',\n", - " 'name': 'dataset_name',\n", - " 'description': 'dataset_type_description'\n", - " }\n", + "constructor = {\n", + " \"id\": \"dataset_id\",\n", + " \"name\": \"dataset_name\",\n", + " \"description\": \"dataset_type_description\",\n", + "}\n", "\n", - "emsl_project_json_list = dop.make_json_string_list\\\n", - " (emsl_dictdf, nmdc.OmicsProcessing, constructor_map=constructor, attribute_fields=attributes)" + "emsl_project_json_list = dop.make_json_string_list(\n", + " emsl_dictdf,\n", + " nmdc.OmicsProcessing,\n", + " constructor_map=constructor,\n", + " attribute_fields=attributes,\n", + ")" ] }, { @@ -292,7 +304,9 @@ "metadata": {}, "outputs": [], "source": [ - "dop.save_json_string_list(\"output/test-pipeline/emsl_omics_processing.json\", emsl_project_json_list) # save json string list to file" + "dop.save_json_string_list(\n", + " \"output/test-pipeline/emsl_omics_processing.json\", emsl_project_json_list\n", + ") # save json string list to file" ] }, { @@ -308,20 +322,20 @@ "metadata": {}, "outputs": [], "source": [ - "attributes = \\\n", - " [\n", - " 'file_size_bytes'\n", - " ]\n", + "attributes = [\"file_size_bytes\"]\n", "\n", - "constructor = \\\n", - " {\n", - " 'id': 'data_object_id',\n", - " 'name': 'data_object_name',\n", - " 'description': 'dataset_type_description'\n", - " }\n", + "constructor = {\n", + " \"id\": \"data_object_id\",\n", + " \"name\": \"data_object_name\",\n", + " \"description\": \"dataset_type_description\",\n", + "}\n", "\n", - "emsl_data_object_json_list = dop.make_json_string_list\\\n", - " (emsl_dictdf, nmdc.DataObject, constructor_map=constructor, attribute_fields=attributes)" + "emsl_data_object_json_list = dop.make_json_string_list(\n", + " emsl_dictdf,\n", + " nmdc.DataObject,\n", + " constructor_map=constructor,\n", + " attribute_fields=attributes,\n", + ")" ] }, { @@ -346,7 +360,9 @@ "metadata": {}, "outputs": [], "source": [ - "dop.save_json_string_list(\"output/test-pipeline/emsl_data_objects.json\", emsl_data_object_json_list) # save json string list to file" + "dop.save_json_string_list(\n", + " \"output/test-pipeline/emsl_data_objects.json\", emsl_data_object_json_list\n", + ") # save json string list to file" ] }, { @@ -371,8 +387,12 @@ "metadata": {}, "outputs": [], "source": [ - "data_objects = dop.make_data_objects_dataframe(faa_table, fna_table, fastq_table, project_table)\n", - "data_objects_dictdf = data_objects.to_dict(orient=\"records\") # transorm dataframe to dictionary" + "data_objects = dop.make_data_objects_dataframe(\n", + " faa_table, fna_table, fastq_table, project_table\n", + ")\n", + "data_objects_dictdf = data_objects.to_dict(\n", + " orient=\"records\"\n", + ") # transorm dataframe to dictionary" ] }, { @@ -401,20 +421,20 @@ "metadata": {}, "outputs": [], "source": [ - "attributes = \\\n", - " [\n", - " 'file_size_bytes'\n", - " ]\n", + "attributes = [\"file_size_bytes\"]\n", "\n", - "constructor = \\\n", - " {\n", - " 'id': 'file_id',\n", - " 'name': 'file_name',\n", - " 'description': 'file_type_description'\n", - " }\n", + "constructor = {\n", + " \"id\": \"file_id\",\n", + " \"name\": \"file_name\",\n", + " \"description\": \"file_type_description\",\n", + "}\n", "\n", - "data_objects_json_list = dop.make_json_string_list\\\n", - " (data_objects_dictdf, nmdc.DataObject, constructor_map=constructor, attribute_fields=attributes)" + "data_objects_json_list = dop.make_json_string_list(\n", + " data_objects_dictdf,\n", + " nmdc.DataObject,\n", + " constructor_map=constructor,\n", + " attribute_fields=attributes,\n", + ")" ] }, { @@ -439,7 +459,9 @@ "metadata": {}, "outputs": [], "source": [ - "dop.save_json_string_list(\"output/test-pipeline/faa_fna_fastq_data_objects.json\", data_objects_json_list) # save json string list to file" + "dop.save_json_string_list(\n", + " \"output/test-pipeline/faa_fna_fastq_data_objects.json\", data_objects_json_list\n", + ") # save json string list to file" ] }, { @@ -464,7 +486,9 @@ "metadata": {}, "outputs": [], "source": [ - "project = dop.make_project_dataframe(project_table, study_table, contact_table, data_objects)\n", + "project = dop.make_project_dataframe(\n", + " project_table, study_table, contact_table, data_objects\n", + ")\n", "# project[pds.isnull(project.output_file_ids)]\n", "# project = project[project.nmdc_record_id == \"115128\"] # test if output_file_ids is null\n", "# project.output_file_ids.unique()\n", @@ -477,7 +501,7 @@ "metadata": {}, "outputs": [], "source": [ - "project_dictdf = project.to_dict(orient=\"records\") # transorm dataframe to dictionary\n", + "project_dictdf = project.to_dict(orient=\"records\") # transorm dataframe to dictionary\n", "# project.columns" ] }, @@ -488,31 +512,29 @@ "outputs": [], "source": [ "## specify characteristics\n", - "attributes = \\\n", - " [\n", - " # {'part_of': ({'id': 'study_gold_id'}, nmdc.Study)},\n", - " # {'has_output': ({'id': 'output_file_ids'}, nmdc.DataObject)},\n", - " {'part_of': 'study_gold_id'},\n", - " {'has_output': 'output_file_ids'},\n", - " 'add_date', \n", - " 'mod_date', \n", - " 'completion_date', \n", - " 'ncbi_project_name', \n", - " 'omics_type', \n", - " 'principal_investigator_name',\n", - " 'processing_institution'\n", - " ]\n", + "attributes = [\n", + " # {'part_of': ({'id': 'study_gold_id'}, nmdc.Study)},\n", + " # {'has_output': ({'id': 'output_file_ids'}, nmdc.DataObject)},\n", + " {\"part_of\": \"study_gold_id\"},\n", + " {\"has_output\": \"output_file_ids\"},\n", + " \"add_date\",\n", + " \"mod_date\",\n", + " \"completion_date\",\n", + " \"ncbi_project_name\",\n", + " \"omics_type\",\n", + " \"principal_investigator_name\",\n", + " \"processing_institution\",\n", + "]\n", "\n", "\n", - "constructor = \\\n", - " {\n", - " 'id': 'gold_id',\n", - " 'name': 'project_name',\n", - " 'description': 'description'\n", - " }\n", + "constructor = {\"id\": \"gold_id\", \"name\": \"project_name\", \"description\": \"description\"}\n", "\n", - "project_json_list = dop.make_json_string_list\\\n", - " (project_dictdf, nmdc.OmicsProcessing, constructor_map=constructor, attribute_fields=attributes)" + "project_json_list = dop.make_json_string_list(\n", + " project_dictdf,\n", + " nmdc.OmicsProcessing,\n", + " constructor_map=constructor,\n", + " attribute_fields=attributes,\n", + ")" ] }, { @@ -537,7 +559,9 @@ "metadata": {}, "outputs": [], "source": [ - "dop.save_json_string_list(\"output/test-pipeline/gold_omics_processing.json\", project_json_list) # save json string list to file" + "dop.save_json_string_list(\n", + " \"output/test-pipeline/gold_omics_processing.json\", project_json_list\n", + ") # save json string list to file" ] }, { @@ -553,7 +577,9 @@ "metadata": {}, "outputs": [], "source": [ - "biosample = dop.make_biosample_dataframe(biosample_table, project_biosample_table, project_table)" + "biosample = dop.make_biosample_dataframe(\n", + " biosample_table, project_biosample_table, project_table\n", + ")" ] }, { @@ -562,7 +588,9 @@ "metadata": {}, "outputs": [], "source": [ - "biosample_dictdf = biosample.to_dict(orient=\"records\") # transorm dataframe to dictionary\n", + "biosample_dictdf = biosample.to_dict(\n", + " orient=\"records\"\n", + ") # transorm dataframe to dictionary\n", "# biosample_dictdf[0] ## peek at dict data" ] }, @@ -573,103 +601,102 @@ "outputs": [], "source": [ "## specify attributes\n", - "attributes = \\\n", - " [\n", - " 'add_date',\n", - " 'mod_date',\n", - " 'collection_date',\n", - " 'ecosystem',\n", - " 'ecosystem_dcategory',\n", - " 'ecosystem_type',\n", - " 'ecosystem_subtype',\n", - " 'specific_ecosystem',\n", - " 'habitat',\n", - " 'location',\n", - " 'community',\n", - " 'ncbi_taxonomy_name',\n", - " 'geographic_location',\n", - " 'sample_collection_site',\n", - " 'identifier',\n", - " 'host_name',\n", - " 'depth',\n", - " 'subsurface_depth',\n", - " 'altitude',\n", - " 'proport_woa_temperature',\n", - " 'biogas_temperature',\n", - " 'growth_temperature',\n", - " 'water_samp_store_temp',\n", - " 'biogas_retention_time',\n", - " 'salinity',\n", - " 'pressure',\n", - " 'ph',\n", - " 'chlorophyll_concentration',\n", - " 'nitrate_concentration',\n", - " 'oxygen_concentration',\n", - " 'salinity_concentration',\n", - " 'sample_volume',\n", - " 'sample_weight_dna_ext',\n", - " 'sampling_strategy',\n", - " 'soil_link_climate_info',\n", - " 'soil_misc_param',\n", - " 'soil_misc_param ',\n", - " 'soil_water_content',\n", - " 'soluble_iron_micromol',\n", - " 'subsurface_depth2',\n", - " 'tot_nitrogen',\n", - " 'tot_org_carbon',\n", - " 'water_alkalinity',\n", - " 'water_alkalinity_method',\n", - " 'water_alkyl_diethers',\n", - " 'water_aminopept_act',\n", - " 'water_ammonium',\n", - " 'water_bacterial_carbon_prod',\n", - " 'water_bishomohopanol',\n", - " 'water_bromide',\n", - " 'water_calcium',\n", - " 'water_carbon_nitrog_ratio',\n", - " 'water_chem_administration',\n", - " 'water_chloride',\n", - " 'water_density',\n", - " 'water_diether_lipids',\n", - " 'water_diss_carbon_dioxide',\n", - " 'water_diss_hydrogen',\n", - " 'water_diss_inorg_carbon',\n", - " 'water_diss_inorg_phosphorus',\n", - " 'water_diss_org_carbon',\n", - " 'water_diss_org_nitrogen',\n", - " 'water_glucosidase_activity',\n", - " 'water_magnesium',\n", - " 'water_mean_frict_vel',\n", - " 'water_mean_peak_frict_vel',\n", - " 'water_misc_parameter',\n", - " 'water_n_alkanes',\n", - " 'water_nitrite',\n", - " 'water_org_matter',\n", - " 'water_org_nitrogen',\n", - " 'water_organism_count',\n", - " 'water_oxy_stat_sample',\n", - " 'water_part_org_carbon',\n", - " 'water_perturbation',\n", - " 'water_petroleum_hydrocarbon',\n", - " 'water_phaeopigments',\n", - " 'water_phosplipid_fatt_acid',\n", - " 'water_potassium',\n", - " 'water_redox_potential',\n", - " 'water_samp_store_dur',\n", - " 'water_samp_store_loc',\n", - " 'water_size_frac_low',\n", - " 'water_size_frac_up',\n", - " 'water_sodium',\n", - " 'water_sulfate',\n", - " 'water_sulfide',\n", - " 'water_tidal_stage',\n", - " 'water_tot_depth_water_col',\n", - " 'water_tot_diss_nitro',\n", - " 'water_tot_phosphorus',\n", - " 'water_turbidity',\n", - " {'part_of': 'project_gold_ids'}\n", + "attributes = [\n", + " \"add_date\",\n", + " \"mod_date\",\n", + " \"collection_date\",\n", + " \"ecosystem\",\n", + " \"ecosystem_dcategory\",\n", + " \"ecosystem_type\",\n", + " \"ecosystem_subtype\",\n", + " \"specific_ecosystem\",\n", + " \"habitat\",\n", + " \"location\",\n", + " \"community\",\n", + " \"ncbi_taxonomy_name\",\n", + " \"geographic_location\",\n", + " \"sample_collection_site\",\n", + " \"identifier\",\n", + " \"host_name\",\n", + " \"depth\",\n", + " \"subsurface_depth\",\n", + " \"altitude\",\n", + " \"proport_woa_temperature\",\n", + " \"biogas_temperature\",\n", + " \"growth_temperature\",\n", + " \"water_samp_store_temp\",\n", + " \"biogas_retention_time\",\n", + " \"salinity\",\n", + " \"pressure\",\n", + " \"ph\",\n", + " \"chlorophyll_concentration\",\n", + " \"nitrate_concentration\",\n", + " \"oxygen_concentration\",\n", + " \"salinity_concentration\",\n", + " \"sample_volume\",\n", + " \"sample_weight_dna_ext\",\n", + " \"sampling_strategy\",\n", + " \"soil_link_climate_info\",\n", + " \"soil_misc_param\",\n", + " \"soil_misc_param \",\n", + " \"soil_water_content\",\n", + " \"soluble_iron_micromol\",\n", + " \"subsurface_depth2\",\n", + " \"tot_nitrogen\",\n", + " \"tot_org_carbon\",\n", + " \"water_alkalinity\",\n", + " \"water_alkalinity_method\",\n", + " \"water_alkyl_diethers\",\n", + " \"water_aminopept_act\",\n", + " \"water_ammonium\",\n", + " \"water_bacterial_carbon_prod\",\n", + " \"water_bishomohopanol\",\n", + " \"water_bromide\",\n", + " \"water_calcium\",\n", + " \"water_carbon_nitrog_ratio\",\n", + " \"water_chem_administration\",\n", + " \"water_chloride\",\n", + " \"water_density\",\n", + " \"water_diether_lipids\",\n", + " \"water_diss_carbon_dioxide\",\n", + " \"water_diss_hydrogen\",\n", + " \"water_diss_inorg_carbon\",\n", + " \"water_diss_inorg_phosphorus\",\n", + " \"water_diss_org_carbon\",\n", + " \"water_diss_org_nitrogen\",\n", + " \"water_glucosidase_activity\",\n", + " \"water_magnesium\",\n", + " \"water_mean_frict_vel\",\n", + " \"water_mean_peak_frict_vel\",\n", + " \"water_misc_parameter\",\n", + " \"water_n_alkanes\",\n", + " \"water_nitrite\",\n", + " \"water_org_matter\",\n", + " \"water_org_nitrogen\",\n", + " \"water_organism_count\",\n", + " \"water_oxy_stat_sample\",\n", + " \"water_part_org_carbon\",\n", + " \"water_perturbation\",\n", + " \"water_petroleum_hydrocarbon\",\n", + " \"water_phaeopigments\",\n", + " \"water_phosplipid_fatt_acid\",\n", + " \"water_potassium\",\n", + " \"water_redox_potential\",\n", + " \"water_samp_store_dur\",\n", + " \"water_samp_store_loc\",\n", + " \"water_size_frac_low\",\n", + " \"water_size_frac_up\",\n", + " \"water_sodium\",\n", + " \"water_sulfate\",\n", + " \"water_sulfide\",\n", + " \"water_tidal_stage\",\n", + " \"water_tot_depth_water_col\",\n", + " \"water_tot_diss_nitro\",\n", + " \"water_tot_phosphorus\",\n", + " \"water_turbidity\",\n", + " {\"part_of\": \"project_gold_ids\"},\n", " # {'part_of': ({'id': 'project_gold_ids'}, nmdc.OmicsProcessing)}\n", - " ]\n", + "]\n", "\n", "# removed in version 5: 'temperature_range', 'soil_annual_season_temp'" ] @@ -692,7 +719,7 @@ "source": [ "## create map betweeen gold fields and mixs terms\n", "mapping_df = dop.make_dataframe(\"../src/data/GOLD-to-mixs-map.tsv\")\n", - "attr_map = dop.make_gold_to_mixs_map(attributes, mapping_df, 'biosample')" + "attr_map = dop.make_gold_to_mixs_map(attributes, mapping_df, \"biosample\")" ] }, { @@ -702,16 +729,18 @@ "outputs": [], "source": [ "## create dict of constructor args\n", - "constructor = \\\n", - " {\n", - " 'id': 'gold_id',\n", - " 'name': 'biosample_name',\n", - " 'description': 'description',\n", - " 'env_broad_scale': [{'has_raw_value':'env_broad_scale'}, nmdc.ControlledTermValue],\n", - " 'env_local_scale': [{'has_raw_value':'env_local_scale'}, nmdc.ControlledTermValue],\n", - " 'env_medium': [{'has_raw_value': 'env_medium'}, nmdc.ControlledTermValue],\n", - " 'lat_lon': [{'latitude': 'latitude', 'longitude': 'longitude', 'has_raw_value': 'lat_lon'}, nmdc.GeolocationValue],\n", - " }" + "constructor = {\n", + " \"id\": \"gold_id\",\n", + " \"name\": \"biosample_name\",\n", + " \"description\": \"description\",\n", + " \"env_broad_scale\": [{\"has_raw_value\": \"env_broad_scale\"}, nmdc.ControlledTermValue],\n", + " \"env_local_scale\": [{\"has_raw_value\": \"env_local_scale\"}, nmdc.ControlledTermValue],\n", + " \"env_medium\": [{\"has_raw_value\": \"env_medium\"}, nmdc.ControlledTermValue],\n", + " \"lat_lon\": [\n", + " {\"latitude\": \"latitude\", \"longitude\": \"longitude\", \"has_raw_value\": \"lat_lon\"},\n", + " nmdc.GeolocationValue,\n", + " ],\n", + "}" ] }, { @@ -721,8 +750,13 @@ "outputs": [], "source": [ "## create list of json string objects\n", - "biosample_json_list = dop.make_json_string_list \\\n", - " (biosample_dictdf, nmdc.Biosample, constructor_map=constructor, attribute_fields=attributes, attribute_map=attr_map)" + "biosample_json_list = dop.make_json_string_list(\n", + " biosample_dictdf,\n", + " nmdc.Biosample,\n", + " constructor_map=constructor,\n", + " attribute_fields=attributes,\n", + " attribute_map=attr_map,\n", + ")" ] }, { @@ -747,7 +781,9 @@ "metadata": {}, "outputs": [], "source": [ - "dop.save_json_string_list(\"output/test-pipeline/biosample.json\", biosample_json_list) # save json string list to file" + "dop.save_json_string_list(\n", + " \"output/test-pipeline/biosample.json\", biosample_json_list\n", + ") # save json string list to file" ] }, { @@ -771,7 +807,7 @@ "outputs": [], "source": [ "## navigate to test output directory\n", - "os.chdir('output/test-pipeline/')" + "os.chdir(\"output/test-pipeline/\")" ] }, { @@ -811,7 +847,7 @@ "outputs": [], "source": [ "biosample_set = None\n", - "with open('../test-five-biosamples/biosample.json', 'r') as f:\n", + "with open(\"../test-five-biosamples/biosample.json\", \"r\") as f:\n", " biosample_set = json.load(f)" ] }, @@ -858,7 +894,7 @@ "outputs": [], "source": [ "project_set = None\n", - "with open('../test-five-biosamples/project.json', 'r') as f:\n", + "with open(\"../test-five-biosamples/project.json\", \"r\") as f:\n", " project_set = json.load(f)" ] }, @@ -904,7 +940,7 @@ "outputs": [], "source": [ "study_set = None\n", - "with open('../test-five-biosamples/study.json', 'r') as f:\n", + "with open(\"../test-five-biosamples/study.json\", \"r\") as f:\n", " study_set = json.load(f)" ] }, @@ -960,7 +996,7 @@ "outputs": [], "source": [ "data_object_set = None\n", - "with open('../test-five-biosamples/data_object.json', 'r') as f:\n", + "with open(\"../test-five-biosamples/data_object.json\", \"r\") as f:\n", " data_object_set = json.load(f)" ] }, @@ -991,7 +1027,7 @@ "outputs": [], "source": [ "## emsl projects\n", - "# !jq '.[0]' emsl_omics_processing.json " + "# !jq '.[0]' emsl_omics_processing.json" ] }, { @@ -1009,13 +1045,12 @@ "metadata": {}, "outputs": [], "source": [ - "database = \\\n", - " {\n", - " \"study_set\": [*study_set], \n", - " \"omics_processing_set\": [*project_set], \n", - " \"biosample_set\": [*biosample_set], \n", - " \"data_object_set\": [*data_object_set]\n", - " }" + "database = {\n", + " \"study_set\": [*study_set],\n", + " \"omics_processing_set\": [*project_set],\n", + " \"biosample_set\": [*biosample_set],\n", + " \"data_object_set\": [*data_object_set],\n", + "}" ] }, { @@ -1024,7 +1059,7 @@ "metadata": {}, "outputs": [], "source": [ - "with open('nmdc-02.json', 'w') as fp:\n", + "with open(\"nmdc-02.json\", \"w\") as fp:\n", " json.dump(database, fp)" ] }, diff --git a/metadata-translation/notebooks/translate-EMSL-data.ipynb b/metadata-translation/notebooks/translate-EMSL-data.ipynb index f90c3a14..3420e1a2 100644 --- a/metadata-translation/notebooks/translate-EMSL-data.ipynb +++ b/metadata-translation/notebooks/translate-EMSL-data.ipynb @@ -18,7 +18,8 @@ "outputs": [], "source": [ "import os, sys\n", - "sys.path.append(os.path.abspath('../src/bin/lib/')) # add path to lib" + "\n", + "sys.path.append(os.path.abspath(\"../src/bin/lib/\")) # add path to lib" ] }, { @@ -34,6 +35,7 @@ "import data_operations as dop\n", "from pandasql import sqldf\n", "\n", + "\n", "def pysqldf(q):\n", " return sqldf(q, globals())" ] @@ -54,7 +56,9 @@ "metadata": {}, "outputs": [], "source": [ - "study = dop.make_dataframe(\"export.sql/STUDY_DATA_TABLE.dsv\", file_archive_name=\"../src/data/nmdc-version2.zip\")" + "study = dop.make_dataframe(\n", + " \"export.sql/STUDY_DATA_TABLE.dsv\", file_archive_name=\"../src/data/nmdc-version2.zip\"\n", + ")" ] }, { @@ -96,12 +100,19 @@ "source": [ "## load emsl instrument run data\n", "## the spreadsheet contains multiple tab, so I have to load using pandas and the clean the columnn names\n", - "emsl = pds.concat(pds.read_excel(\"../src/data/EMSL_FICUS_project_process_data_export.xlsx\", \n", - " sheet_name=None), ignore_index=True)\n", + "emsl = pds.concat(\n", + " pds.read_excel(\n", + " \"../src/data/EMSL_FICUS_project_process_data_export.xlsx\", sheet_name=None\n", + " ),\n", + " ignore_index=True,\n", + ")\n", "emsl = dop.clean_dataframe_column_names(emsl)\n", "\n", "## load mapping spreadsheet\n", - "jgi_emsl = dop.make_dataframe(\"../src/data/FICUS - JGI-EMSL Proposal - Gold Study - ID mapping and PI.xlsx\", file_type=\"excel\")\n" + "jgi_emsl = dop.make_dataframe(\n", + " \"../src/data/FICUS - JGI-EMSL Proposal - Gold Study - ID mapping and PI.xlsx\",\n", + " file_type=\"excel\",\n", + ")" ] }, { @@ -178,7 +189,9 @@ "metadata": {}, "outputs": [], "source": [ - "emsl.rename(columns={\"experimental_data_type\":\"omics_type\"}, inplace=True) # rename column" + "emsl.rename(\n", + " columns={\"experimental_data_type\": \"omics_type\"}, inplace=True\n", + ") # rename column" ] }, { @@ -187,7 +200,9 @@ "metadata": {}, "outputs": [], "source": [ - "emsl.rename(columns={\"dataset_file_size_bytes\":\"file_size\"}, inplace=True) # rename column" + "emsl.rename(\n", + " columns={\"dataset_file_size_bytes\": \"file_size\"}, inplace=True\n", + ") # rename column" ] }, { @@ -196,7 +211,9 @@ "metadata": {}, "outputs": [], "source": [ - "emsl[\"processing_institution\"] = \"Environmental Molecular Sciences Lab\" # add processing institution" + "emsl[\"processing_institution\"] = (\n", + " \"Environmental Molecular Sciences Lab\" # add processing institution\n", + ")" ] }, { @@ -206,7 +223,9 @@ "outputs": [], "source": [ "emsl[\"data_object_id\"] = \"output_\"\n", - "emsl[\"data_object_id\"] = emsl[\"data_object_id\"] + emsl[\"dataset_id\"].map(str) # build data object id" + "emsl[\"data_object_id\"] = emsl[\"data_object_id\"] + emsl[\"dataset_id\"].map(\n", + " str\n", + ") # build data object id" ] }, { @@ -216,7 +235,9 @@ "outputs": [], "source": [ "emsl[\"data_object_name\"] = \"output: \"\n", - "emsl[\"data_object_name\"] = emsl[\"data_object_name\"] + emsl[\"dataset_name\"].map(str) # build data object name" + "emsl[\"data_object_name\"] = emsl[\"data_object_name\"] + emsl[\"dataset_name\"].map(\n", + " str\n", + ") # build data object name" ] }, { @@ -263,13 +284,19 @@ ], "source": [ "## specify characteristics\n", - "characteristics = \\\n", - " ['omics_type', 'instrument_name', 'processing_institution']\n", + "characteristics = [\"omics_type\", \"instrument_name\", \"processing_institution\"]\n", "\n", "## create list of json string objects\n", - "omics_processing_dict_list = dop.make_nmdc_dict_list\\\n", - " (emsl_dictdf, nmdc.OmicsProcessing, id_key='dataset_id', name_key='dataset_name', description_key=\"dataset_type_description\",\n", - " part_of_key=\"gold_study_id\", has_output_key=\"data_object_id\", characteristic_fields=characteristics)" + "omics_processing_dict_list = dop.make_nmdc_dict_list(\n", + " emsl_dictdf,\n", + " nmdc.OmicsProcessing,\n", + " id_key=\"dataset_id\",\n", + " name_key=\"dataset_name\",\n", + " description_key=\"dataset_type_description\",\n", + " part_of_key=\"gold_study_id\",\n", + " has_output_key=\"data_object_id\",\n", + " characteristic_fields=characteristics,\n", + ")" ] }, { @@ -285,7 +312,7 @@ "metadata": {}, "outputs": [], "source": [ - "omics_processing_dict_list[0] # peek at data" + "omics_processing_dict_list[0] # peek at data" ] }, { @@ -302,12 +329,16 @@ "outputs": [], "source": [ "## specify characteristics\n", - "characteristics = ['file_size']\n", + "characteristics = [\"file_size\"]\n", "\n", "## create list of dictionaries\n", - "data_objects_dict_list = dop.make_nmdc_dict_list\\\n", - " (emsl_dictdf, nmdc.DataObject, id_key='data_object_id', \n", - " name_key='data_object_name', characteristic_fields=characteristics)" + "data_objects_dict_list = dop.make_nmdc_dict_list(\n", + " emsl_dictdf,\n", + " nmdc.DataObject,\n", + " id_key=\"data_object_id\",\n", + " name_key=\"data_object_name\",\n", + " characteristic_fields=characteristics,\n", + ")" ] }, { @@ -333,7 +364,9 @@ "outputs": [], "source": [ "## load omics processing json into dict list\n", - "omics_processing_file_data = dop.load_dict_from_json_file(\"output/nmdc-json/omics_processing.json\")" + "omics_processing_file_data = dop.load_dict_from_json_file(\n", + " \"output/nmdc-json/omics_processing.json\"\n", + ")" ] }, { @@ -367,7 +400,7 @@ } ], "source": [ - "omics_processing_file_data[0] # peek at data" + "omics_processing_file_data[0] # peek at data" ] }, { @@ -401,7 +434,9 @@ "metadata": {}, "outputs": [], "source": [ - "updated_omics_processing_json_list = dop.convert_dict_list_to_json_list(updated_omics_processing)" + "updated_omics_processing_json_list = dop.convert_dict_list_to_json_list(\n", + " updated_omics_processing\n", + ")" ] }, { @@ -410,7 +445,9 @@ "metadata": {}, "outputs": [], "source": [ - "dop.save_json_string_list(\"output/nmdc-json/omics_processing.json\", updated_omics_processing_json_list) # save json string list to file" + "dop.save_json_string_list(\n", + " \"output/nmdc-json/omics_processing.json\", updated_omics_processing_json_list\n", + ") # save json string list to file" ] }, { @@ -427,7 +464,9 @@ "outputs": [], "source": [ "## load data objects json into dict list\n", - "data_objects_file_data = dop.load_dict_from_json_file(\"output/nmdc-json/data_objects.json\")" + "data_objects_file_data = dop.load_dict_from_json_file(\n", + " \"output/nmdc-json/data_objects.json\"\n", + ")" ] }, { @@ -470,7 +509,9 @@ "metadata": {}, "outputs": [], "source": [ - "updated_data_objects_json_list = dop.convert_dict_list_to_json_list(updated_data_objects)" + "updated_data_objects_json_list = dop.convert_dict_list_to_json_list(\n", + " updated_data_objects\n", + ")" ] }, { @@ -479,7 +520,9 @@ "metadata": {}, "outputs": [], "source": [ - "dop.save_json_string_list(\"output/nmdc-json/data_objects.json\", updated_data_objects_json_list) # save json string list to file\n" + "dop.save_json_string_list(\n", + " \"output/nmdc-json/data_objects.json\", updated_data_objects_json_list\n", + ") # save json string list to file" ] } ], diff --git a/metadata-translation/notebooks/translate-GOLD-data-objects.ipynb b/metadata-translation/notebooks/translate-GOLD-data-objects.ipynb index fd197017..0479f3ff 100644 --- a/metadata-translation/notebooks/translate-GOLD-data-objects.ipynb +++ b/metadata-translation/notebooks/translate-GOLD-data-objects.ipynb @@ -18,7 +18,8 @@ "outputs": [], "source": [ "import os, sys\n", - "sys.path.append(os.path.abspath('../src/bin/lib/')) # add path to lib" + "\n", + "sys.path.append(os.path.abspath(\"../src/bin/lib/\")) # add path to lib" ] }, { @@ -34,6 +35,7 @@ "import data_operations as dop\n", "from pandasql import sqldf\n", "\n", + "\n", "def pysqldf(q):\n", " return sqldf(q, globals())" ] @@ -54,8 +56,13 @@ "metadata": {}, "outputs": [], "source": [ - "study = dop.make_dataframe(\"export.sql/STUDY_DATA_TABLE.dsv\", file_archive_name=\"../src/data/nmdc-version2.zip\")\n", - "project = dop.make_dataframe(\"export.sql/PROJECT_DATA_TABLE.dsv\", file_archive_name=\"../src/data/nmdc-version2.zip\")" + "study = dop.make_dataframe(\n", + " \"export.sql/STUDY_DATA_TABLE.dsv\", file_archive_name=\"../src/data/nmdc-version2.zip\"\n", + ")\n", + "project = dop.make_dataframe(\n", + " \"export.sql/PROJECT_DATA_TABLE.dsv\",\n", + " file_archive_name=\"../src/data/nmdc-version2.zip\",\n", + ")" ] }, { @@ -205,7 +212,9 @@ "metadata": {}, "outputs": [], "source": [ - "data_objects_dictdf = data_objects.to_dict(orient=\"records\") # transorm dataframe to dictionary" + "data_objects_dictdf = data_objects.to_dict(\n", + " orient=\"records\"\n", + ") # transorm dataframe to dictionary" ] }, { @@ -238,11 +247,17 @@ ], "source": [ "## specify characteristics\n", - "characteristics = ['file_size']\n", + "characteristics = [\"file_size\"]\n", "\n", "## create list of json string objects\n", - "data_objects_json_list = dop.make_json_string_list\\\n", - " (data_objects_dictdf, nmdc.DataObject, id_key='file_id', name_key='file_name', description_key=\"file_type_description\", characteristic_fields=characteristics)" + "data_objects_json_list = dop.make_json_string_list(\n", + " data_objects_dictdf,\n", + " nmdc.DataObject,\n", + " id_key=\"file_id\",\n", + " name_key=\"file_name\",\n", + " description_key=\"file_type_description\",\n", + " characteristic_fields=characteristics,\n", + ")" ] }, { @@ -260,7 +275,9 @@ "metadata": {}, "outputs": [], "source": [ - "dop.save_json_string_list(\"output/nmdc-json/data_objects.json\", data_objects_json_list) # save json string list to file" + "dop.save_json_string_list(\n", + " \"output/nmdc-json/data_objects.json\", data_objects_json_list\n", + ") # save json string list to file" ] }, { @@ -291,7 +308,7 @@ "metadata": {}, "outputs": [], "source": [ - "omics_dict_list[0] ## peek at data" + "omics_dict_list[0] ## peek at data" ] }, { @@ -323,10 +340,14 @@ "outputs": [], "source": [ "## iterate over dataframe and create a has_output key for dictionary items with matching project ids\n", - "for (ix, gold_project_id, file_ids) in files_df.itertuples():\n", + "for ix, gold_project_id, file_ids in files_df.itertuples():\n", " for omics_dict in omics_dict_list:\n", - " if gold_project_id == omics_dict[\"id\"]: # compare project id to id of current dict object\n", - " omics_dict[\"has_output\"] = file_ids.split() # create list of file ids associated with project id" + " if (\n", + " gold_project_id == omics_dict[\"id\"]\n", + " ): # compare project id to id of current dict object\n", + " omics_dict[\"has_output\"] = (\n", + " file_ids.split()\n", + " ) # create list of file ids associated with project id" ] }, { @@ -351,7 +372,7 @@ "metadata": {}, "outputs": [], "source": [ - "project_json_list = [] # list to hold individual json objects\n", + "project_json_list = [] # list to hold individual json objects\n", "for omics_dict in omics_dict_list:\n", " project_json_list.append(json.dumps(omics_dict))" ] @@ -362,7 +383,9 @@ "metadata": {}, "outputs": [], "source": [ - "dop.save_json_string_list(\"output/nmdc-json/omics_processing.json\", project_json_list) # save json string list to file" + "dop.save_json_string_list(\n", + " \"output/nmdc-json/omics_processing.json\", project_json_list\n", + ") # save json string list to file" ] } ], diff --git a/metadata-translation/notebooks/translate-GOLD-study-project-biosample.ipynb b/metadata-translation/notebooks/translate-GOLD-study-project-biosample.ipynb index 35b3ff3b..665bf1f1 100644 --- a/metadata-translation/notebooks/translate-GOLD-study-project-biosample.ipynb +++ b/metadata-translation/notebooks/translate-GOLD-study-project-biosample.ipynb @@ -18,7 +18,8 @@ "outputs": [], "source": [ "import os, sys\n", - "sys.path.append(os.path.abspath('../src/bin/lib/')) # add path to lib" + "\n", + "sys.path.append(os.path.abspath(\"../src/bin/lib/\")) # add path to lib" ] }, { @@ -34,6 +35,7 @@ "import data_operations as dop\n", "from pandasql import sqldf\n", "\n", + "\n", "def pysqldf(q):\n", " return sqldf(q, globals())" ] @@ -54,11 +56,25 @@ "metadata": {}, "outputs": [], "source": [ - "study = dop.make_dataframe(\"export.sql/STUDY_DATA_TABLE.dsv\", file_archive_name=\"../src/data/nmdc-version2.zip\")\n", - "contact = dop.make_dataframe(\"export.sql/CONTACT_DATA_TABLE.dsv\", file_archive_name=\"../src/data/nmdc-version2.zip\")\n", - "project = dop.make_dataframe(\"export.sql/PROJECT_DATA_TABLE.dsv\", file_archive_name=\"../src/data/nmdc-version2.zip\")\n", - "project_biosample = dop.make_dataframe(\"export.sql/PROJECT_BIOSAMPLE_DATA_TABLE.dsv\", file_archive_name=\"../src/data/nmdc-version2.zip\")\n", - "biosample = dop.make_dataframe(\"export.sql/BIOSAMPLE_DATA_TABLE.dsv\", file_archive_name=\"../src/data/nmdc-version2.zip\")\n", + "study = dop.make_dataframe(\n", + " \"export.sql/STUDY_DATA_TABLE.dsv\", file_archive_name=\"../src/data/nmdc-version2.zip\"\n", + ")\n", + "contact = dop.make_dataframe(\n", + " \"export.sql/CONTACT_DATA_TABLE.dsv\",\n", + " file_archive_name=\"../src/data/nmdc-version2.zip\",\n", + ")\n", + "project = dop.make_dataframe(\n", + " \"export.sql/PROJECT_DATA_TABLE.dsv\",\n", + " file_archive_name=\"../src/data/nmdc-version2.zip\",\n", + ")\n", + "project_biosample = dop.make_dataframe(\n", + " \"export.sql/PROJECT_BIOSAMPLE_DATA_TABLE.dsv\",\n", + " file_archive_name=\"../src/data/nmdc-version2.zip\",\n", + ")\n", + "biosample = dop.make_dataframe(\n", + " \"export.sql/BIOSAMPLE_DATA_TABLE.dsv\",\n", + " file_archive_name=\"../src/data/nmdc-version2.zip\",\n", + ")\n", "proposals = dop.make_dataframe(\"../src/data/JGI-EMSL-FICUS-proposals.fnl.tsv\")" ] }, @@ -195,7 +211,7 @@ "metadata": {}, "outputs": [], "source": [ - "study_dictdf = study.to_dict(orient=\"records\") # transorm dataframe to dictionary" + "study_dictdf = study.to_dict(orient=\"records\") # transorm dataframe to dictionary" ] }, { @@ -229,13 +245,29 @@ ], "source": [ "## specify characteristics\n", - "characteristics = \\\n", - " ['gold_study_name', 'principal_investigator_name', 'add_date', 'mod_date', 'doi',\n", - " 'ecosystem', 'ecosystem_category', 'ecosystem_type', 'ecosystem_subtype', 'specific_ecosystem', 'ecosystem_path_id']\n", + "characteristics = [\n", + " \"gold_study_name\",\n", + " \"principal_investigator_name\",\n", + " \"add_date\",\n", + " \"mod_date\",\n", + " \"doi\",\n", + " \"ecosystem\",\n", + " \"ecosystem_category\",\n", + " \"ecosystem_type\",\n", + " \"ecosystem_subtype\",\n", + " \"specific_ecosystem\",\n", + " \"ecosystem_path_id\",\n", + "]\n", "\n", "## create list of json string objects\n", - "study_json_list = dop.make_json_string_list\\\n", - " (study_dictdf, nmdc.Study, id_key='gold_id', name_key='study_name', description_key=\"description\", characteristic_fields=characteristics)" + "study_json_list = dop.make_json_string_list(\n", + " study_dictdf,\n", + " nmdc.Study,\n", + " id_key=\"gold_id\",\n", + " name_key=\"study_name\",\n", + " description_key=\"description\",\n", + " characteristic_fields=characteristics,\n", + ")" ] }, { @@ -253,7 +285,9 @@ "metadata": {}, "outputs": [], "source": [ - "dop.save_json_string_list(\"output/nmdc-json/study.json\", study_json_list) # save json string list to file" + "dop.save_json_string_list(\n", + " \"output/nmdc-json/study.json\", study_json_list\n", + ") # save json string list to file" ] }, { @@ -313,7 +347,7 @@ "metadata": {}, "outputs": [], "source": [ - "project_dictdf = project.to_dict(orient=\"records\") # transorm dataframe to dictionary" + "project_dictdf = project.to_dict(orient=\"records\") # transorm dataframe to dictionary" ] }, { @@ -323,13 +357,26 @@ "outputs": [], "source": [ "## specify characteristics\n", - "characteristics = \\\n", - " ['add_date', 'mod_date', 'completion_date', 'ncbi_project_name', 'omics_type', 'principal_investigator_name', 'processing_institution']\n", + "characteristics = [\n", + " \"add_date\",\n", + " \"mod_date\",\n", + " \"completion_date\",\n", + " \"ncbi_project_name\",\n", + " \"omics_type\",\n", + " \"principal_investigator_name\",\n", + " \"processing_institution\",\n", + "]\n", "\n", "## create list of json string objects\n", - "project_json_list = dop.make_json_string_list\\\n", - " (project_dictdf, nmdc.OmicsProcessing, id_key='gold_id', name_key='project_name', \n", - " part_of_key=\"study_gold_id\", description_key=\"description\", characteristic_fields=characteristics)" + "project_json_list = dop.make_json_string_list(\n", + " project_dictdf,\n", + " nmdc.OmicsProcessing,\n", + " id_key=\"gold_id\",\n", + " name_key=\"project_name\",\n", + " part_of_key=\"study_gold_id\",\n", + " description_key=\"description\",\n", + " characteristic_fields=characteristics,\n", + ")" ] }, { @@ -347,7 +394,9 @@ "metadata": {}, "outputs": [], "source": [ - "dop.save_json_string_list(\"output/nmdc-json/omics_processing.json\", project_json_list) # save json string list to file" + "dop.save_json_string_list(\n", + " \"output/nmdc-json/omics_processing.json\", project_json_list\n", + ") # save json string list to file" ] }, { @@ -473,7 +522,9 @@ "metadata": {}, "outputs": [], "source": [ - "biosample_dictdf = biosampledf.to_dict(orient=\"records\") # transorm dataframe to dictionary" + "biosample_dictdf = biosampledf.to_dict(\n", + " orient=\"records\"\n", + ") # transorm dataframe to dictionary" ] }, { @@ -483,48 +534,48 @@ "outputs": [], "source": [ "## specify characteristics\n", - "characteristics = \\\n", - " ['add_date',\n", - " 'mod_date',\n", - " 'ecosystem_path_id',\n", - " 'ecosystem',\n", - " 'ecosystem_category',\n", - " 'ecosystem_type',\n", - " 'ecosystem_subtype',\n", - " 'specific_ecosystem',\n", - " 'habitat',\n", - " 'location',\n", - " 'community',\n", - " 'ncbi_taxonomy_name',\n", - " 'geographic_location',\n", - " 'latitude',\n", - " 'longitude',\n", - " 'sample_collection_site',\n", - " 'identifier',\n", - " 'sample_collection_year',\n", - " 'sample_collection_month',\n", - " 'sample_collection_day',\n", - " 'sample_collection_hour',\n", - " 'sample_collection_minute',\n", - " 'host_name',\n", - " 'depth',\n", - " 'subsurface_depth',\n", - " 'altitude',\n", - " 'temperature_range',\n", - " 'proport_woa_temperature',\n", - " 'biogas_temperature',\n", - " 'growth_temperature',\n", - " 'soil_annual_season_temp',\n", - " 'water_samp_store_temp',\n", - " 'biogas_retention_time',\n", - " 'salinity',\n", - " 'pressure',\n", - " 'ph',\n", - " 'chlorophyll_concentration',\n", - " 'nitrate_concentration',\n", - " 'oxygen_concentration',\n", - " 'salinity_concentration'\n", - " ]" + "characteristics = [\n", + " \"add_date\",\n", + " \"mod_date\",\n", + " \"ecosystem_path_id\",\n", + " \"ecosystem\",\n", + " \"ecosystem_category\",\n", + " \"ecosystem_type\",\n", + " \"ecosystem_subtype\",\n", + " \"specific_ecosystem\",\n", + " \"habitat\",\n", + " \"location\",\n", + " \"community\",\n", + " \"ncbi_taxonomy_name\",\n", + " \"geographic_location\",\n", + " \"latitude\",\n", + " \"longitude\",\n", + " \"sample_collection_site\",\n", + " \"identifier\",\n", + " \"sample_collection_year\",\n", + " \"sample_collection_month\",\n", + " \"sample_collection_day\",\n", + " \"sample_collection_hour\",\n", + " \"sample_collection_minute\",\n", + " \"host_name\",\n", + " \"depth\",\n", + " \"subsurface_depth\",\n", + " \"altitude\",\n", + " \"temperature_range\",\n", + " \"proport_woa_temperature\",\n", + " \"biogas_temperature\",\n", + " \"growth_temperature\",\n", + " \"soil_annual_season_temp\",\n", + " \"water_samp_store_temp\",\n", + " \"biogas_retention_time\",\n", + " \"salinity\",\n", + " \"pressure\",\n", + " \"ph\",\n", + " \"chlorophyll_concentration\",\n", + " \"nitrate_concentration\",\n", + " \"oxygen_concentration\",\n", + " \"salinity_concentration\",\n", + "]" ] }, { @@ -534,9 +585,15 @@ "outputs": [], "source": [ "## create list of json string objects\n", - "biosample_json_list = dop.make_json_string_list\\\n", - " (biosample_dictdf, nmdc.Biosample, id_key='gold_id', name_key='biosample_name', \n", - " part_of_key=\"project_gold_ids\", description_key=\"description\", characteristic_fields=characteristics)" + "biosample_json_list = dop.make_json_string_list(\n", + " biosample_dictdf,\n", + " nmdc.Biosample,\n", + " id_key=\"gold_id\",\n", + " name_key=\"biosample_name\",\n", + " part_of_key=\"project_gold_ids\",\n", + " description_key=\"description\",\n", + " characteristic_fields=characteristics,\n", + ")" ] }, { @@ -554,7 +611,9 @@ "metadata": {}, "outputs": [], "source": [ - "dop.save_json_string_list(\"output/nmdc-json/biosample.json\", biosample_json_list) # save json string list to file" + "dop.save_json_string_list(\n", + " \"output/nmdc-json/biosample.json\", biosample_json_list\n", + ") # save json string list to file" ] }, { diff --git a/metadata-translation/src/bin/lib/__init__.py b/metadata-translation/src/bin/lib/__init__.py index 57c9eca4..10520879 100644 --- a/metadata-translation/src/bin/lib/__init__.py +++ b/metadata-translation/src/bin/lib/__init__.py @@ -1,13 +1,14 @@ import os -print('**** init: ', os.path.abspath(".")) -#print('**** module file: ', os.path.abspath(".")) +print("**** init: ", os.path.abspath(".")) +# print('**** module file: ', os.path.abspath(".")) -#import inspect -#inspect.getfile(nmdc) + +# import inspect +# inspect.getfile(nmdc) from importlib.machinery import SourceFileLoader import importlib -#spec = importlib.util.spec_from_file_location("module.name", "/path/to/file.py") -#print('*** spec: ', spec) +# spec = importlib.util.spec_from_file_location("module.name", "/path/to/file.py") +# print('*** spec: ', spec) diff --git a/metadata-translation/src/bin/lib/data_operations.py b/metadata-translation/src/bin/lib/data_operations.py index 8b5f611b..72fbd261 100644 --- a/metadata-translation/src/bin/lib/data_operations.py +++ b/metadata-translation/src/bin/lib/data_operations.py @@ -20,11 +20,21 @@ import nmdc -def make_dataframe (file_name, subset_cols=[], exclude_cols=[], nrows=None, lowercase_col_names=True, - replace_spaces=True, file_type="tsv", delimiter="\t", sheet_name=0, file_archive_name=""): +def make_dataframe( + file_name, + subset_cols=[], + exclude_cols=[], + nrows=None, + lowercase_col_names=True, + replace_spaces=True, + file_type="tsv", + delimiter="\t", + sheet_name=0, + file_archive_name="", +): """ Builds a pandas dataframe from the designated file. - + Args: file_name: The name of the file containing the data for the dataframe. If the file is not in the same directory, then specify the path as part of the file name. subset_cols: Specifies a specific of subset of columns to be included in the dataframe. @@ -40,45 +50,61 @@ def make_dataframe (file_name, subset_cols=[], exclude_cols=[], nrows=None, lowe Pandas dataframe """ ## normalize paramaters for use with pandas - if len(subset_cols) < 1: subset_cols = None - if len(exclude_cols) < 1: exclude_cols = None - + if len(subset_cols) < 1: + subset_cols = None + if len(exclude_cols) < 1: + exclude_cols = None + ## check if file is contained in an archive file_archive = None if len(file_archive_name) > 1: file_archive = zipfile.ZipFile(file_archive_name, "r") - + ## load data from file if "tsv" == file_type.lower() or "csv" == file_type.lower(): if None != file_archive: - df = pds.read_csv(file_archive.open(file_name), sep=delimiter, nrows=nrows, comment='#') + df = pds.read_csv( + file_archive.open(file_name), sep=delimiter, nrows=nrows, comment="#" + ) else: - df = pds.read_csv(file_name, sep=delimiter, nrows=nrows, comment='#') + df = pds.read_csv(file_name, sep=delimiter, nrows=nrows, comment="#") elif "excel" == file_type.lower(): if None != file_archive: - df = pds.read_excel(file_archive.open(file_name), sheet_name=sheet_name, nrows=nrows) + df = pds.read_excel( + file_archive.open(file_name), sheet_name=sheet_name, nrows=nrows + ) else: df = pds.read_excel(file_name, sheet_name=sheet_name, nrows=nrows) elif "multi-sheet-excel" == file_type.lower(): if None != file_archive: df = pds.concat( - pds.read_excel(file_archive.open(file_name), sheet_name=None, ignore_index=True, nrows=nrows)) + pds.read_excel( + file_archive.open(file_name), + sheet_name=None, + ignore_index=True, + nrows=nrows, + ) + ) else: - df = pds.concat(pds.read_excel(file_name, sheet_name=None, ignore_index=True, nrows=nrows)) - + df = pds.concat( + pds.read_excel( + file_name, sheet_name=None, ignore_index=True, nrows=nrows + ) + ) + ## clean column names df = clean_dataframe_column_names(df, lowercase_col_names, replace_spaces) - + ## create subset of columns ## note: since column names are case sensitive, this needs to happen after cleaning column names if subset_cols: df = df[subset_cols] - + ## return dataframe return df -def clean_dataframe_column_names (df, lowercase_col_names=True, replace_spaces=True): +def clean_dataframe_column_names(df, lowercase_col_names=True, replace_spaces=True): """ Changes the column names of a dataframe into a standard format. The default settings change the column names to: - lower case @@ -90,41 +116,50 @@ def clean_dataframe_column_names (df, lowercase_col_names=True, replace_spaces=T Returns: Pandas dataframe """ - + ## clean column names if lowercase_col_names: df.columns = [c.strip().lower() for c in df.columns] - + if replace_spaces: df.columns = [c.replace(" ", "_") for c in df.columns] - + return df -def make_dataframe_dictionary (file_name, subset_cols=[], exclude_cols=[], nrows=None, lowercase_col_names=True, - replace_spaces=True, file_type="tsv", delimiter="\t", sheet_name=0, - file_archive_name=""): +def make_dataframe_dictionary( + file_name, + subset_cols=[], + exclude_cols=[], + nrows=None, + lowercase_col_names=True, + replace_spaces=True, + file_type="tsv", + delimiter="\t", + sheet_name=0, + file_archive_name="", +): """ Builds a dictionary based on the structure of the pandas dataframe generated from the designated file. The dictionary is oriented for records. E.g.: [ { - 'col1': 1, + 'col1': 1, 'col2': 0.5 - }, + }, { - 'col1': 2, + 'col1': 2, 'col2': 0.75 } ] - Essentially, this function is a shortcut for calling make_dataframe() and then transforming the result into a dictionary. + Essentially, this function is a shortcut for calling make_dataframe() and then transforming the result into a dictionary. E.g.: df = make_dataframe(file_name) dictdf = dictdf = df.to_dict(orient="records") - - + + Args: file_name: The name of the file containing the data for the dataframe. If the file is not in the same directory, then specify the path as part of the file name. subset_cols: Specifies a specific of subset of columns to be included in the dataframe. @@ -139,19 +174,30 @@ def make_dataframe_dictionary (file_name, subset_cols=[], exclude_cols=[], nrows Returns: Dictionary built from a Pandas dataframe. """ - df = make_dataframe(file_name, subset_cols=[], exclude_cols=[], nrows=None, lowercase_col_names=True, \ - replace_spaces=True, file_type="tsv", delimiter=delimiter, sheet_name=sheet_name, - file_archive_name=file_archive_name) + df = make_dataframe( + file_name, + subset_cols=[], + exclude_cols=[], + nrows=None, + lowercase_col_names=True, + replace_spaces=True, + file_type="tsv", + delimiter=delimiter, + sheet_name=sheet_name, + file_archive_name=file_archive_name, + ) return df.to_dict(orient="records") -def make_json_string_list (dictionary, - nmdc_class, - constructor_map={}, - attribute_fields=[], - attribute_map={}, - remove_key_attributes=True, - add_attribute=True): +def make_json_string_list( + dictionary, + nmdc_class, + constructor_map={}, + attribute_fields=[], + attribute_map={}, + remove_key_attributes=True, + add_attribute=True, +): """ Takes a dictionary in which each item is a record and returns a list of json strings build from each record. Args: @@ -167,21 +213,22 @@ def make_json_string_list (dictionary, remove_key_attributes: Specifies whether to remove the named keys (e.g, id_key, part_of_key) from the attributes list. Returns: A list in which each item is a json string. - + """ - dict_list = \ - make_nmdc_dict_list(dictionary, - nmdc_class, - constructor_map=constructor_map, - attribute_fields=attribute_fields, - attribute_map=attribute_map, - remove_key_attributes=remove_key_attributes, - add_attribute=add_attribute) - + dict_list = make_nmdc_dict_list( + dictionary, + nmdc_class, + constructor_map=constructor_map, + attribute_fields=attribute_fields, + attribute_map=attribute_map, + remove_key_attributes=remove_key_attributes, + add_attribute=add_attribute, + ) + return convert_dict_list_to_json_list(dict_list) -def convert_dict_list_to_json_list (dict_list): +def convert_dict_list_to_json_list(dict_list): """ Takes a list of dictionaries, converts each dictionary into json, and returns a list the json strings. Args: @@ -190,16 +237,16 @@ def convert_dict_list_to_json_list (dict_list): A list in which each item is a json string. """ json_list = [] # list to hold json - + ## iterate over dict list for d in dict_list: json_list.append(json.dumps(d)) - + ## return final list return json_list -def make_lat_lon (latitude, longitude): +def make_lat_lon(latitude, longitude): # latitude = "" if pds.isnull(latitude) else str(latitude).strip().replace('\n', '') # longitude = "" if pds.isnull(longitude) else str(longitude).strip().replace('\n', '') latitude = None if pds.isnull(latitude) else float(latitude) @@ -211,13 +258,15 @@ def make_lat_lon (latitude, longitude): return None -def make_nmdc_dict_list (dictionary, - nmdc_class, - constructor_map={}, - attribute_fields=[], - attribute_map={}, - remove_key_attributes=True, - add_attribute=True): +def make_nmdc_dict_list( + dictionary, + nmdc_class, + constructor_map={}, + attribute_fields=[], + attribute_map={}, + remove_key_attributes=True, + add_attribute=True, +): """ Takes a dictionary in which each item is a record and returns a list of dictionaries that conform to the nmdc schema. Args: @@ -234,13 +283,13 @@ def make_nmdc_dict_list (dictionary, add_attribute: Specifies whether an attributes in the attribute_fields list should be added to the nmdc class if not already present. Returns: A list in which each item is a dictionary that conforms to the nmdc schema - + """ - def map_slot_to_entity (slot_map, record): + def map_slot_to_entity(slot_map, record): """ - Connects a slot to an entity whose type is specified in a map/dict. - Example 2, the map: + Connects a slot to an entity whose type is specified in a map/dict. + Example 2, the map: {'part_of': project_gold_ids'} specifies that the part_of slot connects to the record's project_ids values. @@ -252,39 +301,42 @@ def map_slot_to_entity (slot_map, record): slot_name = list(slot_map.keys())[0] slot_value = list(slot_map.values())[0] referenced_entity = None - + ## if the slot values is not a tuple, return record's values if type(()) != type(slot_value): ## if no value found in the record, simply return none - if pds.isnull(record[slot_value]): return None + if pds.isnull(record[slot_value]): + return None - if slot_name in ['part_of', 'has_input', 'has_output']: - referenced_entity = record[slot_value].split(',') + if slot_name in ["part_of", "has_input", "has_output"]: + referenced_entity = record[slot_value].split(",") else: referenced_entity = record[slot_value] - + ## if the slot value is a tuple, then construct an object if type(()) == type(slot_value): param_dict = slot_value[0] nmdc_class = slot_value[1] - id_field = param_dict['id'] + id_field = param_dict["id"] ## if no value found in the record, simply return none - if pds.isnull(record[id_field]): return None - - if slot_name in ['part_of', 'has_input', 'has_output']: - id_values = record[id_field].split(',') - referenced_entity = [nmdc_class(**{'id': id_val}) for id_val in id_values] + if pds.isnull(record[id_field]): + return None + + if slot_name in ["part_of", "has_input", "has_output"]: + id_values = record[id_field].split(",") + referenced_entity = [ + nmdc_class(**{"id": id_val}) for id_val in id_values + ] for r in referenced_entity: - setattr(r, 'type', nmdc_class.class_class_curie) # add type info + setattr(r, "type", nmdc_class.class_class_curie) # add type info else: - referenced_entity = nmdc_class(**{'id':record[id_field]}) - setattr(referenced_entity, 'type', nmdc_class.class_class_curie) + referenced_entity = nmdc_class(**{"id": record[id_field]}) + setattr(referenced_entity, "type", nmdc_class.class_class_curie) return referenced_entity - - def make_constructor_args (constructor_map, record): + def make_constructor_args(constructor_map, record): ## for every mapping between a key and data field create a dict ## of the parameters needed to instantiate the class constructor_dict = {} @@ -292,41 +344,41 @@ def make_constructor_args (constructor_map, record): ## if the fields is a list, index 0 is param dict, index 1 is the class ## e.g., lat_lon': ['lat_lon', 'GeolocationValue'], or lat_lon': ['lat_lon', nmdc.GeolocationValue] if type([]) == type(field): - params = \ - { - key: "" if pds.isnull(record[value]) else record[value] # we don't want null/NaN to be a value in the constructor - for key, value in field[0].items() - } + params = { + key: ( + "" if pds.isnull(record[value]) else record[value] + ) # we don't want null/NaN to be a value in the constructor + for key, value in field[0].items() + } ## check if nmdc class is being passed as a string e.g., lat_lon': ['lat_lon', 'GeolocationValue'] - if type('') == type(field[1]): + if type("") == type(field[1]): constructor_type = getattr(nmdc, field[1]) else: constructor_type = field[1] - constructor_obj = constructor_type(**params) # create object from type + constructor_obj = constructor_type(**params) # create object from type constructor_obj.type = constructor_type.class_class_curie constructor_dict[key] = constructor_obj else: # print(field) constructor_dict[key] = record[field] - + return constructor_dict - - - def make_attribute_value (data_value): + + def make_attribute_value(data_value): """ Local function used to create attribute_value object linked the the raw value. """ - #print(obj, key, value) + # print(obj, key, value) av = nmdc.AttributeValue() av.has_raw_value = data_value - + return av - + ## add a 'type' slot to the nmdc class ## this allows us to easily inspect the type of entity in the json - setattr(nmdc_class, 'type', None) - + setattr(nmdc_class, "type", None) + ## by default, we don't want the constructors for the class ## to also be attributes of the object, these keys link objects other objects for key in constructor_map.keys(): @@ -336,14 +388,17 @@ def make_attribute_value (data_value): ## add attribute to the nmdc class if not present if add_attribute: for af in attribute_fields: - if type({}) == type(af): af = list(af.keys())[0] # needed for attributes given as a dict - if not hasattr(nmdc_class, str(af)): setattr(nmdc_class, str(af), None) + if type({}) == type(af): + af = list(af.keys())[0] # needed for attributes given as a dict + if not hasattr(nmdc_class, str(af)): + setattr(nmdc_class, str(af), None) ### TODO ! throw a a warning ### if len(attribute_map) > 0: for af in attribute_map.values(): - if not hasattr(nmdc_class, af): setattr(nmdc_class, af, None) - + if not hasattr(nmdc_class, af): + setattr(nmdc_class, af, None) + ## for each record in the dictionary, create an object of type nmdc_class and put the object into the list dict_list = [] # list to hold individual dictionary objects for record in dictionary: @@ -354,10 +409,17 @@ def make_attribute_value (data_value): else: obj = nmdc_class() - obj.type = nmdc_class.class_class_curie ## add info about the type of entity it is - + obj.type = ( + nmdc_class.class_class_curie + ) ## add info about the type of entity it is + for key, item in record.items(): - if (not pds.isnull(item)) and ('' != item) and (not (item is None)) and (key in attribute_fields): + if ( + (not pds.isnull(item)) + and ("" != item) + and (not (item is None)) + and (key in attribute_fields) + ): av = make_attribute_value(item) ## check if attribute has been mapped to a mixs term @@ -369,24 +431,29 @@ def make_attribute_value (data_value): setattr(obj, key, av) else: setattr(obj, key, av) - + ## go though the attribute list and link slots to entities specified in slot map/dict ## for example: {'part_of': ({'id': 'project_gold_ids'}, nmdc.OmicsProcessing)} for af in attribute_fields: if type({}) == type(af): slot_name = list(af.keys())[0] - if 'part_of' == slot_name: obj.part_of = map_slot_to_entity (af, record) - if 'has_input' == slot_name: obj.has_input = map_slot_to_entity(af, record) - if 'has_output' == slot_name: obj.has_output = map_slot_to_entity(af, record) - - dict_obj = json.loads(jsonasobj.as_json(obj)) # in order to not save empty values you need to convert to json + if "part_of" == slot_name: + obj.part_of = map_slot_to_entity(af, record) + if "has_input" == slot_name: + obj.has_input = map_slot_to_entity(af, record) + if "has_output" == slot_name: + obj.has_output = map_slot_to_entity(af, record) + + dict_obj = json.loads( + jsonasobj.as_json(obj) + ) # in order to not save empty values you need to convert to json dict_list.append(dict_obj) # and then loads json to get dict; this may be a bug - + ## return final list return dict_list -def save_json_string_list (file_name, json_list): +def save_json_string_list(file_name, json_list): """ Saves a list of json strings to a file. Args: @@ -402,7 +469,7 @@ def save_json_string_list (file_name, json_list): json.dump(json.loads(json_str), f, indent=2) -def load_dict_from_json_file (file_name): +def load_dict_from_json_file(file_name): """ Creates and returns from a json file. Args: @@ -414,112 +481,130 @@ def load_dict_from_json_file (file_name): return json.load(json_file) -def make_dataframe_from_spec_file (data_spec_file, nrows=None): - def make_df_from_file (data_source, nrows): +def make_dataframe_from_spec_file(data_spec_file, nrows=None): + def make_df_from_file(data_source, nrows): file_type = data_source.file_type fname = data_source.file_name - - if 'file_archive_name' in data_source.keys(): + + if "file_archive_name" in data_source.keys(): farchive = data_source.file_archive_name - df = make_dataframe(fname, file_archive_name=farchive, file_type=file_type, nrows=nrows) + df = make_dataframe( + fname, file_archive_name=farchive, file_type=file_type, nrows=nrows + ) else: df = make_dataframe(fname, file_type=file_type, nrows=nrows) - + return df - - def make_df (source, source_type='file_name'): + + def make_df(source, source_type="file_name"): name = source[0] data = source[1] data_source = source[1].data_source - + if source_type not in data_source.keys(): return None - + ## get data from file - if 'file_name' in data_source.keys(): + if "file_name" in data_source.keys(): df = make_df_from_file(data_source, nrows=nrows) - + ## add extra columns - if 'append_columns' in data.keys(): - for col in data.append_columns: df[col.name] = col.value - + if "append_columns" in data.keys(): + for col in data.append_columns: + df[col.name] = col.value + ## rename columns - if 'rename_slots' in data.keys(): - for slot in data.rename_slots: df.rename(columns={slot.old_name:slot.new_name}, inplace=True) - + if "rename_slots" in data.keys(): + for slot in data.rename_slots: + df.rename(columns={slot.old_name: slot.new_name}, inplace=True) + ## filter rows by specific values - if 'filters' in data.keys(): - for fltr in data.filters: df = df[df[fltr.field].isin(fltr.values)] + if "filters" in data.keys(): + for fltr in data.filters: + df = df[df[fltr.field].isin(fltr.values)] ## select a subset of the columns - if 'subset_cols' in data.keys(): + if "subset_cols" in data.keys(): df = df[data.subset_cols] - + ## add 'nmdc_record_id' as a primary key - if 'id_key' in data.keys(): - df['nmdc_record_id'] = df[data.id_key] - df['nmdc_record_id'] = df['nmdc_record_id'].astype(str) # ensure all keys are strings + if "id_key" in data.keys(): + df["nmdc_record_id"] = df[data.id_key] + df["nmdc_record_id"] = df["nmdc_record_id"].astype( + str + ) # ensure all keys are strings else: - df.index.name = 'nmdc_record_id' # rename the current index + df.index.name = "nmdc_record_id" # rename the current index df.reset_index(inplace=True) # turn index into a column - df['nmdc_record_id'] = df['nmdc_record_id'].astype(str) # ensure all keys are strings - + df["nmdc_record_id"] = df["nmdc_record_id"].astype( + str + ) # ensure all keys are strings + return df - - with open(data_spec_file, 'r') as input_file: + + with open(data_spec_file, "r") as input_file: spec = DottedDict(yaml.load(input_file, Loader=Loader)) - - Data_source = namedtuple('Data_source', 'data name') - + + Data_source = namedtuple("Data_source", "data name") + dataframes = [] for source in spec.data_sources.items(): df = make_df(source) ds = Data_source(df, source[0]) dataframes.append(ds) # print(source[0], len(df)) - + merged_df = merge_dataframes(dataframes) return merged_df -def merge_dataframes (dataframes, data_source_names=[]): - merged_df = pds.DataFrame(columns=['nmdc_data_source', 'nmdc_record_id', 'attribute', 'value']) - +def merge_dataframes(dataframes, data_source_names=[]): + merged_df = pds.DataFrame( + columns=["nmdc_data_source", "nmdc_record_id", "attribute", "value"] + ) + for idx, df in enumerate(dataframes): - if 'pandas.core.frame.DataFrame' == type(df): + if "pandas.core.frame.DataFrame" == type(df): data_source_name = data_source_names[idx] data = df else: data_source_name = df.name data = df.data - + ## convert data into an EAV structure - eavdf = data.melt(id_vars=['nmdc_record_id'], var_name='attribute') - eavdf['nmdc_data_source'] = data_source_name + eavdf = data.melt(id_vars=["nmdc_record_id"], var_name="attribute") + eavdf["nmdc_data_source"] = data_source_name # print(data_source_name, len(eavdf)) merged_df = merged_df.append(eavdf, ignore_index=True) - + return merged_df -def unpivot_dataframe (df, index='nmdc_record_id', columns='attribute', value='value', - splice=['nmdc_record_id', 'attribute', 'value']): +def unpivot_dataframe( + df, + index="nmdc_record_id", + columns="attribute", + value="value", + splice=["nmdc_record_id", "attribute", "value"], +): ## reshape eav structure to row-column structure ## see: https://www.journaldev.com/33398/pandas-melt-unmelt-pivot-function if len(splice) > 0: df = df[splice].pivot(index=index, columns=columns) else: df = df.pivot(index=index, columns=columns) - - if len(df) > 0: df = df[value].reset_index() # drop value hierarchical index - if len(df) > 0: df = df.where(pds.notnull(df), None) # replace an NaN values with None + + if len(df) > 0: + df = df[value].reset_index() # drop value hierarchical index + if len(df) > 0: + df = df.where(pds.notnull(df), None) # replace an NaN values with None df.columns.name = None # remove column name attribute - + return df -def extract_table (merged_df, table_name): +def extract_table(merged_df, table_name): df = unpivot_dataframe(merged_df[merged_df.nmdc_data_source == table_name]) return df @@ -530,14 +615,19 @@ def make_gold_to_mixs_list(source_list, dataframe, table_name): ## check for special condition fo using dicts for linking objects if type({}) != type(item): ## check for gold to mixs mapping - target_item = \ - get_mapped_term(dataframe, database='gold', table_name=table_name, source_field=item, target_field='mixs_term') - + target_item = get_mapped_term( + dataframe, + database="gold", + table_name=table_name, + source_field=item, + target_field="mixs_term", + ) + if len(target_item) > 0: target_list.append(target_item) else: target_list.append(item) - + return target_list @@ -547,36 +637,44 @@ def make_gold_to_mixs_map(source_list, dataframe, table_name): ## check for special condition fo using dicts for linking objects if type({}) != type(item): ## check for gold to mixs mapping - mapped_item = \ - get_mapped_term(dataframe, database='gold', table_name=table_name, source_field=item, target_field='mixs_term') - + mapped_item = get_mapped_term( + dataframe, + database="gold", + table_name=table_name, + source_field=item, + target_field="mixs_term", + ) + if len(mapped_item) > 0: gold_to_mixs[item] = mapped_item - + return gold_to_mixs def get_gold_biosample_mixs_term(dataframe, source_field): - return get_mapped_term(dataframe, 'gold', 'biosample',source_field, 'mixs_term') + return get_mapped_term(dataframe, "gold", "biosample", source_field, "mixs_term") def get_mapped_term(dataframe, database, table_name, source_field, target_field): - return_val = \ - dataframe[(dataframe['database'].str.lower() == database.lower()) - & (dataframe['table'].str.lower() == table_name.lower()) - & (dataframe['field'].str.lower() == source_field.lower())] - + return_val = dataframe[ + (dataframe["database"].str.lower() == database.lower()) + & (dataframe["table"].str.lower() == table_name.lower()) + & (dataframe["field"].str.lower() == source_field.lower()) + ] + if len(return_val) > 0: - return return_val[target_field].values[0] # if more than one match is found, only the first is returned + return return_val[target_field].values[ + 0 + ] # if more than one match is found, only the first is returned else: return "" - - + + def make_collection_date(year_val, month_val, day_val, hour_val="", minute_val=""): def pad_value(val, pad_len=2): s = str(val) return s.zfill(pad_len) - + return_val = "" year_val = year_val.strip() month_val = month_val.strip() @@ -584,16 +682,16 @@ def pad_value(val, pad_len=2): hour_val = hour_val.strip() minute_val = minute_val.strip() return_val = "" - + ## if a year isn't provided simply return the empty string if len(year_val) < 1: return "" else: return_val = pad_value(year_val, 4) - + if len(month_val) > 0: return_val = return_val + "-" + pad_value(month_val) - + ## we only days that have months assocated with them if (len(month_val) > 0) and (len(day_val) > 0): return_val = return_val + "-" + pad_value(day_val) @@ -603,171 +701,238 @@ def pad_value(val, pad_len=2): if (len(hour_val) > 0) and (len(minute_val) > 0): return_val = return_val + "T" + pad_value(hour_val) + ":" + minute_val elif len(hour_val) > 0: - return_val = return_val + "T" + pad_value(hour_val) + "00" # case for when no minute val is given - + return_val = ( + return_val + "T" + pad_value(hour_val) + "00" + ) # case for when no minute val is given + return return_val -def make_study_dataframe (study_table, contact_table, proposals_table, result_cols=[]): +def make_study_dataframe(study_table, contact_table, proposals_table, result_cols=[]): ## subset dataframes - contact_table_splice = contact_table[['contact_id', 'principal_investigator_name']].copy() - proposals_table_splice = proposals_table[['gold_study', 'doi']].copy() - + contact_table_splice = contact_table[ + ["contact_id", "principal_investigator_name"] + ].copy() + proposals_table_splice = proposals_table[["gold_study", "doi"]].copy() + ## left join data from contact - temp1_df = pds.merge(study_table.copy(), contact_table_splice, how='left', on='contact_id') - + temp1_df = pds.merge( + study_table.copy(), contact_table_splice, how="left", on="contact_id" + ) + ## left join data from proposals - temp2_df = pds.merge(temp1_df, proposals_table_splice, how='left', left_on='gold_id', right_on='gold_study') - + temp2_df = pds.merge( + temp1_df, + proposals_table_splice, + how="left", + left_on="gold_id", + right_on="gold_study", + ) + ## add prefix temp2_df.gold_id = "gold:" + temp2_df.gold_id temp2_df.gold_study = "gold:" + temp2_df.gold_study - + if len(result_cols) > 0: return temp2_df[result_cols] else: return temp2_df -def make_project_dataframe (project_table, - study_table, - contact_table, - data_object_table=None, - project_biosample_table=None, - biosample_table=None, - result_cols=[]): +def make_project_dataframe( + project_table, + study_table, + contact_table, + data_object_table=None, + project_biosample_table=None, + biosample_table=None, + result_cols=[], +): ## subset data - study_table_splice = study_table[['study_id', 'gold_id']].copy() - contact_table_splice = contact_table[['contact_id', 'principal_investigator_name']].copy() - + study_table_splice = study_table[["study_id", "gold_id"]].copy() + contact_table_splice = contact_table[ + ["contact_id", "principal_investigator_name"] + ].copy() + ####### HACK ########### ## remove "Whole Genome Sequencing" records ## - project_table = project_table[project_table['omics_type'] != "Whole Genome Sequencing"] + project_table = project_table[ + project_table["omics_type"] != "Whole Genome Sequencing" + ] ######################### ## rename study.gold_id to study_gold_id - study_table_splice.rename(columns={'gold_id': 'study_gold_id'}, inplace=True) - + study_table_splice.rename(columns={"gold_id": "study_gold_id"}, inplace=True) + ## inner join on study (project must be part of study) - temp1_df = pds.merge(project_table, study_table_splice, how='inner', left_on='master_study_id', right_on='study_id') - + temp1_df = pds.merge( + project_table, + study_table_splice, + how="inner", + left_on="master_study_id", + right_on="study_id", + ) + ## left join contact data - temp2_df = pds.merge(temp1_df, contact_table_splice, how='left', left_on='pi_id', right_on='contact_id') - + temp2_df = pds.merge( + temp1_df, + contact_table_splice, + how="left", + left_on="pi_id", + right_on="contact_id", + ) + ## add prefix temp2_df.gold_id = "gold:" + temp2_df.gold_id temp2_df.study_gold_id = "gold:" + temp2_df.study_gold_id ## if present join data objects as output of project - if (not (data_object_table is None)): + if not (data_object_table is None): ## make copy and add prefix data_object_table = data_object_table.copy() - data_object_table.gold_project_id = \ - data_object_table.gold_project_id.map(lambda x: x if 'gold:' == x[0:5] else 'gold:' + x) - + data_object_table.gold_project_id = data_object_table.gold_project_id.map( + lambda x: x if "gold:" == x[0:5] else "gold:" + x + ) + ## create a group concat for all file ids in the data objects - groups = data_object_table.groupby('gold_project_id')['file_id'] - output_files = \ - pds.DataFrame(groups.apply(lambda x: ','.join(filter(None, x)))).drop_duplicates().reset_index() - output_files.rename(columns={'file_id': 'output_file_ids'}, inplace=True) - output_files['output_file_ids'] = output_files['output_file_ids'].astype(str) - + groups = data_object_table.groupby("gold_project_id")["file_id"] + output_files = ( + pds.DataFrame(groups.apply(lambda x: ",".join(filter(None, x)))) + .drop_duplicates() + .reset_index() + ) + output_files.rename(columns={"file_id": "output_file_ids"}, inplace=True) + output_files["output_file_ids"] = output_files["output_file_ids"].astype(str) + ## join project and output files - temp2_df = pds.merge(temp2_df, output_files, how='left', left_on='gold_id', right_on='gold_project_id') - + temp2_df = pds.merge( + temp2_df, + output_files, + how="left", + left_on="gold_id", + right_on="gold_project_id", + ) + ## if present join biosamples as inputs to project if (not (project_biosample_table is None)) and (not (biosample_table is None)): ## make local copies & rename column project_biosample_table = project_biosample_table.copy() - biosample_table = biosample_table[['biosample_id', 'gold_id']].copy() - biosample_table.rename(columns={'gold_id': 'biosample_gold_id'}, inplace=True) + biosample_table = biosample_table[["biosample_id", "gold_id"]].copy() + biosample_table.rename(columns={"gold_id": "biosample_gold_id"}, inplace=True) ## add prefix - biosample_table['biosample_gold_id'] = \ - biosample_table['biosample_gold_id'].map(lambda x: x if 'gold:' == x[0:5] else 'gold:' + x) - + biosample_table["biosample_gold_id"] = biosample_table["biosample_gold_id"].map( + lambda x: x if "gold:" == x[0:5] else "gold:" + x + ) + ## join project biosamples to biosamples - input_samples = pds.merge(project_biosample_table, biosample_table, how='inner', on='biosample_id') - temp2_df = pds.merge(temp2_df, input_samples, how='left', on='project_id') - + input_samples = pds.merge( + project_biosample_table, biosample_table, how="inner", on="biosample_id" + ) + temp2_df = pds.merge(temp2_df, input_samples, how="left", on="project_id") + if len(result_cols) > 0: return temp2_df[result_cols] else: return temp2_df -def make_biosample_dataframe (biosample_table, project_biosample_table, project_table, result_cols=[]): +def make_biosample_dataframe( + biosample_table, project_biosample_table, project_table, result_cols=[] +): def make_collection_date_from_row(row): def _format_date_part_value(val): - if pds.isnull(val): return "" - + if pds.isnull(val): + return "" + if type("") == type(val): - if '.' in val: - return val[0: val.find('.')].strip() + if "." in val: + return val[0 : val.find(".")].strip() else: return val.strip() else: return str(int(val)).strip() - - year_val = _format_date_part_value(row['sample_collection_year']) - month_val = _format_date_part_value(row['sample_collection_month']) - day_val = _format_date_part_value(row['sample_collection_day']) - hour_val = _format_date_part_value(row['sample_collection_hour']) - minute_val = _format_date_part_value(row['sample_collection_minute']) - + + year_val = _format_date_part_value(row["sample_collection_year"]) + month_val = _format_date_part_value(row["sample_collection_month"]) + day_val = _format_date_part_value(row["sample_collection_day"]) + hour_val = _format_date_part_value(row["sample_collection_hour"]) + minute_val = _format_date_part_value(row["sample_collection_minute"]) + return make_collection_date(year_val, month_val, day_val, hour_val, minute_val) - + ## subset data - project_biosample_table_splice = project_biosample_table[['biosample_id', 'project_id']].copy() - project_table_splice = project_table[['project_id', 'gold_id']].copy() - + project_biosample_table_splice = project_biosample_table[ + ["biosample_id", "project_id"] + ].copy() + project_table_splice = project_table[["project_id", "gold_id"]].copy() + ## add prefix project_table_splice.gold_id = "gold:" + project_table_splice.gold_id - + ## rename columns - project_table_splice.rename(columns={'gold_id': 'project_gold_id'}, inplace=True) - + project_table_splice.rename(columns={"gold_id": "project_gold_id"}, inplace=True) + ## inner join on project_biosample and project; i.e., biosamples must be linked to project - temp1_df = pds.merge(biosample_table, project_biosample_table_splice, how='inner', on='biosample_id') - temp2_df = pds.merge(temp1_df, project_table_splice, how='inner', on='project_id') - + temp1_df = pds.merge( + biosample_table, project_biosample_table_splice, how="inner", on="biosample_id" + ) + temp2_df = pds.merge(temp1_df, project_table_splice, how="inner", on="project_id") + ## add collection date and lat_lon columns - temp2_df['collection_date'] = temp2_df.apply(lambda row: make_collection_date_from_row(row), axis=1) - temp2_df['lat_lon'] = temp2_df.apply(lambda row: make_lat_lon(row.latitude, row.longitude), axis=1) - + temp2_df["collection_date"] = temp2_df.apply( + lambda row: make_collection_date_from_row(row), axis=1 + ) + temp2_df["lat_lon"] = temp2_df.apply( + lambda row: make_lat_lon(row.latitude, row.longitude), axis=1 + ) + ## convert latitude and longitute columns to floats - temp2_df['latitude'] = temp2_df['latitude'].map(lambda x: None if pds.isnull(x) else float(x)) - temp2_df['longitude'] = temp2_df['longitude'].map(lambda x: None if pds.isnull(x) else float(x)) - + temp2_df["latitude"] = temp2_df["latitude"].map( + lambda x: None if pds.isnull(x) else float(x) + ) + temp2_df["longitude"] = temp2_df["longitude"].map( + lambda x: None if pds.isnull(x) else float(x) + ) + ## add gold prefix - temp2_df['gold_id'] = 'gold:' + temp2_df['gold_id'] - + temp2_df["gold_id"] = "gold:" + temp2_df["gold_id"] + ## biosample might belong to more than one project; so do the equivalent of a group_cat ## see: https://queirozf.com/entries/pandas-dataframe-groupby-examples ## see: https://stackoverflow.com/questions/18138693/replicating-group-concat-for-pandas-dataframe - groups = \ - temp2_df.groupby('biosample_id')['project_gold_id'].apply(lambda pid:','.join(filter(None, pid))).reset_index() - groups.rename(columns={'project_gold_id':'project_gold_ids'}, inplace=True) - + groups = ( + temp2_df.groupby("biosample_id")["project_gold_id"] + .apply(lambda pid: ",".join(filter(None, pid))) + .reset_index() + ) + groups.rename(columns={"project_gold_id": "project_gold_ids"}, inplace=True) + # join concat groups to dataframe - temp3_df = pds.merge(temp2_df, groups, how='left', on='biosample_id') - + temp3_df = pds.merge(temp2_df, groups, how="left", on="biosample_id") + ## remove uneeded columns & drop dups - temp3_df.drop(columns=['project_gold_id'], inplace=True) + temp3_df.drop(columns=["project_gold_id"], inplace=True) temp3_df.drop_duplicates(inplace=True) - + if len(result_cols) > 0: return temp3_df[result_cols] else: return temp3_df -def make_jgi_emsl_dataframe(jgi_emsl_table, study_table, result_cols=[]): +def make_jgi_emsl_dataframe(jgi_emsl_table, study_table, result_cols=[]): ## subset data - study_table_splice = study_table[['study_id', 'gold_id']].copy() - + study_table_splice = study_table[["study_id", "gold_id"]].copy() + ## inner join jgi-emsl data on study (must be part of study) - temp1_df = pds.merge(jgi_emsl_table, study_table_splice, how='inner', left_on='gold_study_id', right_on='gold_id') + temp1_df = pds.merge( + jgi_emsl_table, + study_table_splice, + how="inner", + left_on="gold_study_id", + right_on="gold_id", + ) ## add prefix temp1_df.gold_id = "gold:" + temp1_df.gold_id @@ -779,36 +944,60 @@ def make_jgi_emsl_dataframe(jgi_emsl_table, study_table, result_cols=[]): return temp1_df -def make_emsl_dataframe (emsl_table, jgi_emsl_table, study_table, emsl_biosample_table, result_cols=[]): +def make_emsl_dataframe( + emsl_table, jgi_emsl_table, study_table, emsl_biosample_table, result_cols=[] +): ## subset data - study_table_splice = study_table[['study_id', 'gold_id']].copy() - jgi_emsl_table_splice = jgi_emsl_table[['gold_study_id', 'emsl_proposal_id']].copy() - biosample_slice = emsl_biosample_table[['dataset_id', 'biosample_gold_id']].copy() - biosample_slice['biosample_gold_id'] = 'gold:' + biosample_slice['biosample_gold_id'] # add prefix - + study_table_splice = study_table[["study_id", "gold_id"]].copy() + jgi_emsl_table_splice = jgi_emsl_table[["gold_study_id", "emsl_proposal_id"]].copy() + biosample_slice = emsl_biosample_table[["dataset_id", "biosample_gold_id"]].copy() + biosample_slice["biosample_gold_id"] = ( + "gold:" + biosample_slice["biosample_gold_id"] + ) # add prefix + ## inner join jgi-emsl data on study (must be part of study) - temp1_df = \ - pds.merge(jgi_emsl_table_splice, study_table_splice, how='inner', left_on='gold_study_id', right_on='gold_id') - + temp1_df = pds.merge( + jgi_emsl_table_splice, + study_table_splice, + how="inner", + left_on="gold_study_id", + right_on="gold_id", + ) + ## inner join emsl data on jgi-emsl proposal ids - temp2_df = pds.merge(emsl_table, temp1_df, how='inner', on='emsl_proposal_id') - + temp2_df = pds.merge(emsl_table, temp1_df, how="inner", on="emsl_proposal_id") + ## add data obect id column temp2_df["data_object_id"] = "output_" - temp2_df["data_object_id"] = temp2_df["data_object_id"] + temp2_df["dataset_id"].map(str) # build data object id - + temp2_df["data_object_id"] = temp2_df["data_object_id"] + temp2_df[ + "dataset_id" + ].map( + str + ) # build data object id + ## add data object name column temp2_df["data_object_name"] = "output: " - temp2_df["data_object_name"] = temp2_df["data_object_name"] + temp2_df["dataset_name"].map(str) # build data object id + temp2_df["data_object_name"] = temp2_df["data_object_name"] + temp2_df[ + "dataset_name" + ].map( + str + ) # build data object id ## group concat & join the biosample ids that are inputs to the omics process - groups = biosample_slice.groupby('dataset_id')['biosample_gold_id'] - input_biosamples = \ - pds.DataFrame(groups.apply(lambda x: ','.join(filter(None, x)))).drop_duplicates().reset_index() - input_biosamples.rename(columns={'biosample_gold_id': 'biosample_gold_ids'}, inplace=True) - input_biosamples['biosample_gold_ids'] = input_biosamples['biosample_gold_ids'].astype(str) - temp2_df = pds.merge(temp2_df, input_biosamples, how='left', on='dataset_id') - + groups = biosample_slice.groupby("dataset_id")["biosample_gold_id"] + input_biosamples = ( + pds.DataFrame(groups.apply(lambda x: ",".join(filter(None, x)))) + .drop_duplicates() + .reset_index() + ) + input_biosamples.rename( + columns={"biosample_gold_id": "biosample_gold_ids"}, inplace=True + ) + input_biosamples["biosample_gold_ids"] = input_biosamples[ + "biosample_gold_ids" + ].astype(str) + temp2_df = pds.merge(temp2_df, input_biosamples, how="left", on="dataset_id") + ## add prefix temp2_df.gold_id = "gold:" + temp2_df.gold_id temp2_df.gold_study_id = "gold:" + temp2_df.gold_study_id @@ -817,62 +1006,74 @@ def make_emsl_dataframe (emsl_table, jgi_emsl_table, study_table, emsl_biosample ## drop duplicates temp2_df.drop_duplicates(inplace=True) - + if len(result_cols) > 0: return temp2_df[result_cols] else: return temp2_df -def make_data_objects_dataframe (faa_table, fna_table, fastq_table, project_table, result_cols=[]): +def make_data_objects_dataframe( + faa_table, fna_table, fastq_table, project_table, result_cols=[] +): ## subset data - project_table_splice = project_table[['gold_id']].copy() - + project_table_splice = project_table[["gold_id"]].copy() + ## copy tables faa_df = faa_table.copy() fna_df = fna_table.copy() fastq_df = fastq_table.copy() ## add prefixes for faa, fna, and fastq files - faa_df.file_id = "nmdc:" + faa_df.file_id - fna_df.file_id = "nmdc:" + fna_df.file_id + faa_df.file_id = "nmdc:" + faa_df.file_id + fna_df.file_id = "nmdc:" + fna_df.file_id fastq_df.file_id = "jgi:" + fastq_df.file_id ## merge tables data_objects = pds.concat([faa_df, fna_df, fastq_df], axis=0) - + ## inner joing data objects (e.g., faa, fna, fasq) to projects - temp1_df = \ - pds.merge(data_objects, project_table_splice, how='inner', left_on='gold_project_id', right_on='gold_id') - + temp1_df = pds.merge( + data_objects, + project_table_splice, + how="inner", + left_on="gold_project_id", + right_on="gold_id", + ) + ## add prefix for gold temp1_df.gold_project_id = "gold:" + temp1_df.gold_project_id temp1_df.gold_id = "gold:" + temp1_df.gold_id - + if len(result_cols) > 0: return temp1_df[result_cols] else: return temp1_df[data_objects.columns] -def make_jgi_fastq_dataframe (fastq_table, project_table, result_cols=[]): +def make_jgi_fastq_dataframe(fastq_table, project_table, result_cols=[]): ## subset data - project_table_splice = project_table[['gold_id']].copy() - + project_table_splice = project_table[["gold_id"]].copy() + ## copy tables fastq_df = fastq_table.copy() ## add prefixes for fastq file id fastq_df.file_id = "jgi:" + fastq_df.file_id - + ## inner join to projects - temp1_df = \ - pds.merge(fastq_df, project_table_splice, how='inner', left_on='gold_project_id', right_on='gold_id') - + temp1_df = pds.merge( + fastq_df, + project_table_splice, + how="inner", + left_on="gold_project_id", + right_on="gold_id", + ) + ## add prefix for gold temp1_df.gold_project_id = "gold:" + temp1_df.gold_project_id temp1_df.gold_id = "gold:" + temp1_df.gold_id - + if len(result_cols) > 0: return temp1_df[result_cols] else: diff --git a/metadata-translation/src/bin/mandatory_mixs_terms.py b/metadata-translation/src/bin/mandatory_mixs_terms.py index 24688721..d8b6149c 100755 --- a/metadata-translation/src/bin/mandatory_mixs_terms.py +++ b/metadata-translation/src/bin/mandatory_mixs_terms.py @@ -16,6 +16,7 @@ import data_operations as dop from pandasql import sqldf + ## helper for pandasql def pysqldf(q): return sqldf(q, globals()) diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py index 97f5710f..09c5d512 100644 --- a/nmdc_runtime/site/graphs.py +++ b/nmdc_runtime/site/graphs.py @@ -87,7 +87,6 @@ def hello_mongo(): mongo_stats() - @graph def housekeeping(): delete_operations(list_operations(filter_ops_undone_expired())) diff --git a/nmdc_runtime/site/resources.py b/nmdc_runtime/site/resources.py index ed5e8102..0ba01f42 100644 --- a/nmdc_runtime/site/resources.py +++ b/nmdc_runtime/site/resources.py @@ -510,4 +510,4 @@ def get_mongo(run_config: frozendict): ) ) ) - return mongo_resource(resource_context) \ No newline at end of file + return mongo_resource(resource_context) diff --git a/setup.py b/setup.py index b00d7b62..5da4e771 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ """blah.""" + from setuptools import find_packages, setup # type: ignore with open("README.md") as f: diff --git a/tests/e2e/test_minter_api.py b/tests/e2e/test_minter_api.py index d4bf5f04..c3dc5e4e 100644 --- a/tests/e2e/test_minter_api.py +++ b/tests/e2e/test_minter_api.py @@ -15,7 +15,9 @@ def _get_client(): return RuntimeApiSiteClient(base_url=os.getenv("API_HOST"), **rs["site_client"]) -@pytest.mark.xfail(reason="Expect 422 Client Error: Unprocessable Entity for url: http://fastapi:8000/pids/mint") +@pytest.mark.xfail( + reason="Expect 422 Client Error: Unprocessable Entity for url: http://fastapi:8000/pids/mint" +) def test_minter_api_mint(): client = _get_client() rv = client.request( @@ -24,7 +26,9 @@ def test_minter_api_mint(): assert len(rv) == 1 and rv[0].startswith("nmdc:") -@pytest.mark.xfail(reason="Expect 422 Client Error: Unprocessable Entity for url: http://fastapi:8000/pids/mint") +@pytest.mark.xfail( + reason="Expect 422 Client Error: Unprocessable Entity for url: http://fastapi:8000/pids/mint" +) def test_minter_api_resolve(): client = _get_client() [id_name] = client.request( @@ -34,7 +38,9 @@ def test_minter_api_resolve(): assert rv["id"] == id_name and rv["status"] == "draft" -@pytest.mark.xfail(reason="Expect 422 Client Error: Unprocessable Entity for url: http://fastapi:8000/pids/mint") +@pytest.mark.xfail( + reason="Expect 422 Client Error: Unprocessable Entity for url: http://fastapi:8000/pids/mint" +) def test_minter_api_bind(): client = _get_client() [id_name] = client.request( @@ -52,8 +58,9 @@ def test_minter_api_bind(): ) -@pytest.mark.xfail(reason="Expect 422 Client Error: Unprocessable Entity for url: http://fastapi:8000/pids/mint") - +@pytest.mark.xfail( + reason="Expect 422 Client Error: Unprocessable Entity for url: http://fastapi:8000/pids/mint" +) def test_minter_api_delete(): client = _get_client() [id_name] = client.request( diff --git a/tests/test_data/test_neon_benthic_data_translator.py b/tests/test_data/test_neon_benthic_data_translator.py index fb286da1..bb9840b2 100644 --- a/tests/test_data/test_neon_benthic_data_translator.py +++ b/tests/test_data/test_neon_benthic_data_translator.py @@ -128,6 +128,7 @@ ), } + def neon_envo_mappings_file(): tsv_data = """neon_nlcd_value\tmrlc_edomvd_before_hyphen\tmrlc_edomv\tenvo_alt_id\tenvo_id\tenvo_label\tenv_local_scale\tsubCLassOf and part of path to biome\tother justification\tbiome_label\tbiome_id\tenv_broad_scale deciduousForest\tDeciduous Forest\t41\tNLCD:41\tENVO:01000816\tarea of deciduous forest\tarea of deciduous forest [ENVO:01000816]\t --subCLassOf-->terretrial environmental zone--part of-->\t\tterrestrial biome\tENVO:00000448\tterrestrial biome [ENVO:00000448]""" @@ -150,14 +151,17 @@ def site_code_mapping(): class TestNeonBenthicDataTranslator: @pytest.fixture def translator(self, test_minter): - return NeonBenthicDataTranslator(benthic_data=benthic_data, - site_code_mapping=site_code_mapping(), - neon_envo_mappings_file=neon_envo_mappings_file(), - neon_raw_data_file_mappings_file=neon_raw_data_file_mappings_file(), - id_minter=test_minter - ) - - @pytest.mark.xfail(reason="AttributeError: module 'nmdc_schema.nmdc' has no attribute 'QualityControlReport'") + return NeonBenthicDataTranslator( + benthic_data=benthic_data, + site_code_mapping=site_code_mapping(), + neon_envo_mappings_file=neon_envo_mappings_file(), + neon_raw_data_file_mappings_file=neon_raw_data_file_mappings_file(), + id_minter=test_minter, + ) + + @pytest.mark.xfail( + reason="AttributeError: module 'nmdc_schema.nmdc' has no attribute 'QualityControlReport'" + ) def test_get_database(self, translator): database = translator.get_database() diff --git a/tests/test_data/test_neon_soil_data_translator.py b/tests/test_data/test_neon_soil_data_translator.py index 251ab3af..0746b27d 100644 --- a/tests/test_data/test_neon_soil_data_translator.py +++ b/tests/test_data/test_neon_soil_data_translator.py @@ -778,6 +778,7 @@ ), } + def neon_envo_mappings_file(): tsv_data = """neon_nlcd_value\tmrlc_edomvd_before_hyphen\tmrlc_edomv\tenvo_alt_id\tenvo_id\tenvo_label\tenv_local_scale\tsubCLassOf and part of path to biome\tother justification\tbiome_label\tbiome_id\tenv_broad_scale deciduousForest\tDeciduous Forest\t41\tNLCD:41\tENVO:01000816\tarea of deciduous forest\tarea of deciduous forest [ENVO:01000816]\t --subCLassOf-->terretrial environmental zone--part of-->\t\tterrestrial biome\tENVO:00000448\tterrestrial biome [ENVO:00000448]""" @@ -796,24 +797,37 @@ def neon_raw_data_file_mappings_file(): class TestNeonDataTranslator: @pytest.fixture def translator(self, test_minter): - return NeonSoilDataTranslator(mms_data=mms_data, - sls_data=sls_data, - neon_envo_mappings_file=neon_envo_mappings_file(), - neon_raw_data_file_mappings_file=neon_raw_data_file_mappings_file(), - id_minter=test_minter - ) + return NeonSoilDataTranslator( + mms_data=mms_data, + sls_data=sls_data, + neon_envo_mappings_file=neon_envo_mappings_file(), + neon_raw_data_file_mappings_file=neon_raw_data_file_mappings_file(), + id_minter=test_minter, + ) def test_missing_mms_table(self, test_minter): # Test behavior when mms data is missing a table with pytest.raises( ValueError, match="missing one of the metagenomic microbe soil tables" ): - NeonSoilDataTranslator({}, sls_data, neon_envo_mappings_file(), neon_raw_data_file_mappings_file(), id_minter=test_minter) + NeonSoilDataTranslator( + {}, + sls_data, + neon_envo_mappings_file(), + neon_raw_data_file_mappings_file(), + id_minter=test_minter, + ) def test_missing_sls_table(self, test_minter): # Test behavior when sls data is missing a table with pytest.raises(ValueError, match="missing one of the soil periodic tables"): - NeonSoilDataTranslator(mms_data, {}, neon_envo_mappings_file(), neon_raw_data_file_mappings_file(), id_minter=test_minter) + NeonSoilDataTranslator( + mms_data, + {}, + neon_envo_mappings_file(), + neon_raw_data_file_mappings_file(), + id_minter=test_minter, + ) def test_get_value_or_none(self): # use one biosample record to test this method @@ -860,7 +874,9 @@ def test_create_timestamp_value_with_valid_args(self): collect_date = _create_timestamp_value("2020-07-13T14:34Z") assert collect_date.has_raw_value == "2020-07-13T14:34Z" - @pytest.mark.xfail(reason="AttributeError: module 'nmdc_schema.nmdc' has no attribute 'QualityControlReport'") + @pytest.mark.xfail( + reason="AttributeError: module 'nmdc_schema.nmdc' has no attribute 'QualityControlReport'" + ) def test_get_database(self, translator): database = translator.get_database() diff --git a/tests/test_graphs/test_submission_portal_graphs.py b/tests/test_graphs/test_submission_portal_graphs.py index a2d257ee..e1b4d595 100644 --- a/tests/test_graphs/test_submission_portal_graphs.py +++ b/tests/test_graphs/test_submission_portal_graphs.py @@ -70,7 +70,9 @@ } -@pytest.mark.xfail(reason="DagsterInvalidConfigError: Error in config for job translate_metadata_submission_to_nmdc_schema_database") +@pytest.mark.xfail( + reason="DagsterInvalidConfigError: Error in config for job translate_metadata_submission_to_nmdc_schema_database" +) def test_translate_metadata_submission_to_nmdc_schema_database(): """Smoke test for translate_metadata_submission_to_nmdc_schema_database job"""