From 4c77eeae0cf1a1f540097b647f503fd1a65c8342 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Tue, 7 May 2024 17:02:23 -0700 Subject: [PATCH 01/27] dagster harness for NMDC-to-NCBI export code --- nmdc_runtime/site/export/ncbi_xml.py | 126 ++++++++++++++++++++ nmdc_runtime/site/export/nmdc_api_client.py | 34 ++++++ nmdc_runtime/site/graphs.py | 10 ++ nmdc_runtime/site/ops.py | 35 ++++++ nmdc_runtime/site/repository.py | 13 ++ nmdc_runtime/site/workspace.yaml | 3 + 6 files changed, 221 insertions(+) create mode 100644 nmdc_runtime/site/export/ncbi_xml.py create mode 100644 nmdc_runtime/site/export/nmdc_api_client.py diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py new file mode 100644 index 00000000..7b88aa62 --- /dev/null +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -0,0 +1,126 @@ +import datetime +import xml.etree.ElementTree as ET +import xml.dom.minidom + + +class NCBISubmissionXML: + def __init__( + self, study_id: str, org="National Microbiome Data Collaborative (NMDC)" + ): + self.root = ET.Element("Submission") + self.study_id = study_id + self.org = org + + def set_element(self, tag, text="", attrib=None, children=None): + attrib = attrib or {} + children = children or [] + element = ET.Element(tag, attrib=attrib) + element.text = text + for child in children: + element.append(child) + return element + + def set_description( + self, email="aclum@lbl.gov", user="NMDC", first="Alicia", last="Clum", date=None + ): + date = date or datetime.datetime.now().strftime("%Y-%m-%d") + description = self.set_element( + "Description", + children=[ + self.set_element("Comment", f"NMDC Submission for {self.study_id}"), + self.set_element("Submitter", attrib={"user_name": user}), + self.set_element( + "Organization", + attrib={"role": "owner", "type": "center"}, + children=[ + self.set_element("Name", self.org), + self.set_element( + "Contact", + attrib={"email": email}, + children=[ + self.set_element( + "Name", + children=[ + self.set_element("First", first), + self.set_element("Last", last), + ], + ) + ], + ), + ], + ), + self.set_element("Hold", attrib={"release_date": date}), + ], + ) + self.root.append(description) + + def set_biosample(self, title, spuid, sid, name, pkg, attributes=None): + attributes = attributes or {} + biosample = self.set_element( + "BioSample", + attrib={"schema_version": "2.0"}, + children=[ + self.set_element( + "SampleId", + children=[ + self.set_element("SPUID", sid, {"spuid_namespace": self.org}) + ], + ), + self.set_element( + "Descriptor", + children=[ + self.set_element("Title", title), + self.set_element( + "Description", children=[self.set_element("p", spuid)] + ), + ], + ), + self.set_element( + "Organism", children=[self.set_element("OrganismName", name)] + ), + self.set_element("Package", pkg), + self.set_element( + "Attributes", + children=[ + self.set_element( + "Attribute", attributes[key], {"attribute_name": key} + ) + for key in sorted(attributes) + ], + ), + ], + ) + action = self.set_element( + "Action", + children=[ + self.set_element( + "AddData", + attrib={"target_db": "BioSample"}, + children=[ + self.set_element( + "Data", + attrib={"content_type": "XML"}, + children=[ + self.set_element("XmlContent", children=[biosample]) + ], + ), + self.set_element( + "Identifier", + children=[ + self.set_element( + "SPUID", sid, {"spuid_namespace": self.org} + ) + ], + ), + ], + ) + ], + ) + self.root.append(action) + + def get_submission_xml(self): + self.set_description() + + rough_string = ET.tostring(self.root, "unicode") + reparsed = xml.dom.minidom.parseString(rough_string) + return reparsed.toprettyxml(indent=" ", newl="\n") diff --git a/nmdc_runtime/site/export/nmdc_api_client.py b/nmdc_runtime/site/export/nmdc_api_client.py new file mode 100644 index 00000000..6d7938e9 --- /dev/null +++ b/nmdc_runtime/site/export/nmdc_api_client.py @@ -0,0 +1,34 @@ +import requests + + +class NMDCApiClient: + def __init__(self, api_base_url): + if not api_base_url.endswith("/"): + api_base_url += "/" + self.base_url = api_base_url + self.headers = { + "accept": "application/json", + "Content-Type": "application/json", + } + + def get_biosamples_part_of_study(self, study_id: str) -> list[dict]: + """ + Get the biosamples that are part of a study. + """ + biosample_records = [] + params = { + "filter": '{"part_of": "' + study_id + '"}', + "max_page_size": "1000", + } + url = self.base_url + "nmdcschema/biosample_set" + response = requests.get(url, params=params, headers=self.headers) + response.raise_for_status() + biosample_records.extend(response.json()["resources"]) + # Get the next page of results, if any + while response.json().get("next_page_token") is not None: + params["page_token"] = response.json()["next_page_token"] + response = requests.get(url, params=params, headers=self.headers) + response.raise_for_status() + biosample_records.extend(response.json()["resources"]) + + return biosample_records diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py index 076eb498..fccefe9a 100644 --- a/nmdc_runtime/site/graphs.py +++ b/nmdc_runtime/site/graphs.py @@ -49,6 +49,9 @@ get_neon_pipeline_inputs, get_df_from_url, site_code_mapping, + get_ncbi_export_pipeline_inputs, + ncbi_submission_xml_from_nmdc_study, + ncbi_submission_xml_asset, ) @@ -381,3 +384,10 @@ def ingest_neon_surface_water_metadata(): ) run_id = submit_metadata_to_db(database) poll_for_run_completion(run_id) + + +@graph +def nmdc_study_to_ncbi_submission_export(): + study_id = get_ncbi_export_pipeline_inputs() + xml_data = ncbi_submission_xml_from_nmdc_study(study_id) + ncbi_submission_xml_asset(xml_data) diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py index 59c45fd6..749cb4a9 100644 --- a/nmdc_runtime/site/ops.py +++ b/nmdc_runtime/site/ops.py @@ -9,6 +9,7 @@ from io import BytesIO, StringIO from typing import Tuple from zipfile import ZipFile +# import xml.etree.ElementTree as ET import pandas as pd import requests @@ -55,6 +56,7 @@ _add_run_complete_event, ) from nmdc_runtime.api.models.util import ResultT +from nmdc_runtime.site.export.ncbi_xml import NCBISubmissionXML from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict from nmdc_runtime.site.resources import ( NmdcPortalApiClient, @@ -768,6 +770,24 @@ def export_json_to_drs( return ["/objects/" + drs_object["id"]] +@op( + description="NCBI Submission XML file rendered in a Dagster Asset", + out=Out(description="XML content rendered through Dagit UI") +) +def ncbi_submission_xml_asset(context: OpExecutionContext, data: str): + context.log_event( + AssetMaterialization( + asset_key="ncbi_submission_xml", + description="NCBI Submission XML Data", + metadata={ + "xml": MetadataValue.text(data) + } + ) + ) + + return Output(data) + + def unique_field_values(docs: List[Dict[str, Any]], field: str): return {doc[field] for doc in docs if field in doc} @@ -977,3 +997,18 @@ def site_code_mapping() -> dict: raise Exception( f"Failed to fetch site data from {endpoint}. Status code: {response.status_code}, Content: {response.content}" ) + + +@op(config_schema={"study_id": str}) +def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str: + return context.op_config["study_id"] + + +@op +def ncbi_submission_xml_from_nmdc_study( + context: OpExecutionContext, + study_id: str, + ) -> str: + ncbi_exporter = NCBISubmissionXML(study_id) + ncbi_xml = ncbi_exporter.get_submission_xml() + return ncbi_xml diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py index fada8da1..c716a0a9 100644 --- a/nmdc_runtime/site/repository.py +++ b/nmdc_runtime/site/repository.py @@ -42,6 +42,7 @@ ingest_neon_soil_metadata, ingest_neon_benthic_metadata, ingest_neon_surface_water_metadata, + nmdc_study_to_ncbi_submission_export, ) from nmdc_runtime.site.resources import ( get_mongo, @@ -852,6 +853,18 @@ def biosample_submission_ingest(): ] +@repository +def biosample_export(): + return [ + nmdc_study_to_ncbi_submission_export.to_job( + config={ + "ops": { + "get_ncbi_export_pipeline_inputs": {"config": {"study_id": ""}}, + }, + }, + ), + ] + # @repository # def validation(): # graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job] diff --git a/nmdc_runtime/site/workspace.yaml b/nmdc_runtime/site/workspace.yaml index e594197e..5da09ab9 100644 --- a/nmdc_runtime/site/workspace.yaml +++ b/nmdc_runtime/site/workspace.yaml @@ -11,6 +11,9 @@ load_from: - python_package: package_name: nmdc_runtime.site.repository attribute: biosample_submission_ingest + - python_package: + package_name: nmdc_runtime.site.repository + attribute: biosample_export # - python_package: # package_name: nmdc_runtime.site.repository # attribute: validation From 43f6baf5dc285e8a572e3816a0afc411bed5c4ea Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 8 May 2024 21:39:44 +0000 Subject: [PATCH 02/27] style: reformat --- nmdc_runtime/site/ops.py | 11 +++++------ nmdc_runtime/site/repository.py | 1 + 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py index 749cb4a9..0d19ffaa 100644 --- a/nmdc_runtime/site/ops.py +++ b/nmdc_runtime/site/ops.py @@ -9,6 +9,7 @@ from io import BytesIO, StringIO from typing import Tuple from zipfile import ZipFile + # import xml.etree.ElementTree as ET import pandas as pd import requests @@ -772,16 +773,14 @@ def export_json_to_drs( @op( description="NCBI Submission XML file rendered in a Dagster Asset", - out=Out(description="XML content rendered through Dagit UI") + out=Out(description="XML content rendered through Dagit UI"), ) def ncbi_submission_xml_asset(context: OpExecutionContext, data: str): context.log_event( AssetMaterialization( asset_key="ncbi_submission_xml", description="NCBI Submission XML Data", - metadata={ - "xml": MetadataValue.text(data) - } + metadata={"xml": MetadataValue.text(data)}, ) ) @@ -997,7 +996,7 @@ def site_code_mapping() -> dict: raise Exception( f"Failed to fetch site data from {endpoint}. Status code: {response.status_code}, Content: {response.content}" ) - + @op(config_schema={"study_id": str}) def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str: @@ -1008,7 +1007,7 @@ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str: def ncbi_submission_xml_from_nmdc_study( context: OpExecutionContext, study_id: str, - ) -> str: +) -> str: ncbi_exporter = NCBISubmissionXML(study_id) ncbi_xml = ncbi_exporter.get_submission_xml() return ncbi_xml diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py index c716a0a9..9503d9b6 100644 --- a/nmdc_runtime/site/repository.py +++ b/nmdc_runtime/site/repository.py @@ -865,6 +865,7 @@ def biosample_export(): ), ] + # @repository # def validation(): # graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job] From 2653223684291c4677422f8c9cda617147356829 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Wed, 8 May 2024 15:54:04 -0700 Subject: [PATCH 03/27] type handlers and capability to parse information out from nested NMDCslot structure --- nmdc_runtime/site/export/ncbi_xml.py | 133 +++++++++++++++------ nmdc_runtime/site/export/ncbi_xml_utils.py | 95 +++++++++++++++ 2 files changed, 192 insertions(+), 36 deletions(-) create mode 100644 nmdc_runtime/site/export/ncbi_xml_utils.py diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py index 7b88aa62..2ff527a7 100644 --- a/nmdc_runtime/site/export/ncbi_xml.py +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -1,3 +1,14 @@ +from nmdc_runtime.site.export.ncbi_xml_utils import ( + handle_controlled_identified_term_value, + handle_controlled_term_value, + handle_geolocation_value, + handle_quantity_value, + handle_text_value, + handle_timestamp_value, + handle_float_value, + handle_string_value, + load_mappings, +) import datetime import xml.etree.ElementTree as ET import xml.dom.minidom @@ -11,6 +22,19 @@ def __init__( self.study_id = study_id self.org = org + # dispatcher dictionary capturing handlers for NMDC object to NCBI flat Attribute + # type handlers + self.type_handlers = { + "QuantityValue": handle_quantity_value, + "TextValue": handle_text_value, + "TimestampValue": handle_timestamp_value, + "ControlledTermValue": handle_controlled_term_value, + "ControlledIdentifiedTermValue": handle_controlled_identified_term_value, + "GeolocationValue": handle_geolocation_value, + "float": handle_float_value, + "string": handle_string_value, + } + def set_element(self, tag, text="", attrib=None, children=None): attrib = attrib or {} children = children or [] @@ -54,42 +78,67 @@ def set_description( ) self.root.append(description) - def set_biosample(self, title, spuid, sid, name, pkg, attributes=None): - attributes = attributes or {} - biosample = self.set_element( - "BioSample", - attrib={"schema_version": "2.0"}, - children=[ - self.set_element( - "SampleId", - children=[ - self.set_element("SPUID", sid, {"spuid_namespace": self.org}) - ], - ), - self.set_element( - "Descriptor", - children=[ - self.set_element("Title", title), - self.set_element( - "Description", children=[self.set_element("p", spuid)] - ), - ], - ), - self.set_element( - "Organism", children=[self.set_element("OrganismName", name)] - ), - self.set_element("Package", pkg), - self.set_element( - "Attributes", - children=[ - self.set_element( - "Attribute", attributes[key], {"attribute_name": key} - ) - for key in sorted(attributes) - ], - ), - ], + def set_biosample( + self, + title, + spuid, + sid, + name, + pkg, + nmdc_biosample, + ): + attribute_mappings, slot_range_mappings = load_mappings( + "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/issue-1940/assets/ncbi_mappings/ncbi_attribute_mappings_filled.tsv" ) + + attributes = {} + for json_key, value in nmdc_biosample.items(): + if isinstance(value, list): + continue + + xml_key = attribute_mappings.get(json_key, json_key) + value_type = slot_range_mappings.get( + json_key, "string" + ) + handler = self.type_handlers.get( + value_type, handle_string_value + ) + + formatted_value = handler(value) + attributes[xml_key] = formatted_value + + # Create the BioSample XML block with these attributes + biosample_elements = [ + self.set_element( + "SampleId", + children=[ + self.set_element("SPUID", sid, {"spuid_namespace": self.org}) + ], + ), + self.set_element( + "Descriptor", + children=[ + self.set_element("Title", title), + self.set_element( + "Description", children=[self.set_element("p", spuid)] + ), + ], + ), + self.set_element( + "Organism", children=[self.set_element("OrganismName", name)] + ), + self.set_element("Package", pkg), + self.set_element( + "Attributes", + children=[ + self.set_element( + "Attribute", attributes[key], {"attribute_name": key} + ) + for key in sorted(attributes) + ], + ), + ] + action = self.set_element( "Action", children=[ @@ -101,7 +150,16 @@ def set_biosample(self, title, spuid, sid, name, pkg, attributes=None): "Data", attrib={"content_type": "XML"}, children=[ - self.set_element("XmlContent", children=[biosample]) + self.set_element( + "XmlContent", + children=[ + self.set_element( + "BioSample", + attrib={"schema_version": "2.0"}, + children=biosample_elements, + ) + ], + ) ], ), self.set_element( @@ -121,6 +179,9 @@ def set_biosample(self, title, spuid, sid, name, pkg, attributes=None): def get_submission_xml(self): self.set_description() + # TODO: iterate over all biosamples in the study + # make call to self.set_biosample() here + rough_string = ET.tostring(self.root, "unicode") reparsed = xml.dom.minidom.parseString(rough_string) return reparsed.toprettyxml(indent=" ", newl="\n") diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py new file mode 100644 index 00000000..e34cae6d --- /dev/null +++ b/nmdc_runtime/site/export/ncbi_xml_utils.py @@ -0,0 +1,95 @@ +from io import StringIO +import csv +import requests + + +def handle_quantity_value(slot_value): + if "has_numeric_value" in slot_value and "has_unit" in slot_value: + return f"{slot_value['has_numeric_value']} {slot_value['has_unit']}" + elif ( + "has_maximum_numeric_value" in slot_value + and "has_minimum_numeric_value" in slot_value + and "has_unit" in slot_value + ): + range_value = ( + slot_value["has_maximum_numeric_value"] + - slot_value["has_minimum_numeric_value"] + ) + return f"({range_value}) {slot_value['has_unit']}" + elif "has_raw_value" in slot_value: + return slot_value["has_raw_value"] + return "Unknown format" + + +def handle_text_value(slot_value): + return slot_value.get("has_raw_value", "Unknown format") + + +def handle_timestamp_value(slot_value): + return slot_value.get("has_raw_value", "Unknown format") + + +def handle_controlled_term_value(slot_value): + if "term" in slot_value: + term = slot_value["term"] + if "name" in term and "id" in term: + return f"{term['name']} [{term['id']}]" + elif "id" in term: + return term["id"] + elif "name" in term: + return term["name"] + elif "has_raw_value" in slot_value: + return slot_value["has_raw_value"] + return "Unknown format" + + +def handle_controlled_identified_term_value(slot_value): + if "term" in slot_value: + term = slot_value["term"] + if "name" in term and "id" in term: + return f"{term['name']} [{term['id']}]" + elif "has_raw_value" in slot_value: + return slot_value["has_raw_value"] + return "Unknown format" + + +def handle_geolocation_value(slot_value): + if "latitude" in slot_value and "longitude" in slot_value: + return f"{slot_value['latitude']} {slot_value['longitude']}" + elif "has_raw_value" in slot_value: + return slot_value["has_raw_value"] + return "Unknown format" + + +def handle_float_value(slot_value): + return f"{slot_value:.2f}" + + +def handle_string_value(slot_value): + return f"{slot_value}" + + +def load_mappings(url): + response = requests.get(url) + response.raise_for_status() + file_content = response.text + + attribute_mappings = {} + slot_range_mappings = {} + reader = csv.DictReader(StringIO(file_content), delimiter="\t") + for row in reader: + if row["ignore"].strip(): + continue + + json_key = row["nmdc_schema_slot"] + # attribute mappings + xml_attribute_name = row["ncbi_biosample_attribute_name"] + attribute_mappings[json_key] = ( + xml_attribute_name if xml_attribute_name else json_key + ) + + # slot range mappings + data_type = row["nmdc_schema_slot_range"] + slot_range_mappings[json_key] = data_type if data_type else "default" + + return attribute_mappings, slot_range_mappings From 7b714a5c0e86c639fa27b7496867265c4821f555 Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 8 May 2024 22:56:06 +0000 Subject: [PATCH 04/27] style: reformat --- nmdc_runtime/site/export/ncbi_xml.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py index 2ff527a7..da3577d4 100644 --- a/nmdc_runtime/site/export/ncbi_xml.py +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -97,12 +97,8 @@ def set_biosample( continue xml_key = attribute_mappings.get(json_key, json_key) - value_type = slot_range_mappings.get( - json_key, "string" - ) - handler = self.type_handlers.get( - value_type, handle_string_value - ) + value_type = slot_range_mappings.get(json_key, "string") + handler = self.type_handlers.get(value_type, handle_string_value) formatted_value = handler(value) attributes[xml_key] = formatted_value From d992f1f6b4089fb1ce0a1f12711736b3094c8edf Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Thu, 9 May 2024 11:20:15 -0700 Subject: [PATCH 05/27] implement set_bioproject() method to create block for NCBI BioProject --- nmdc_runtime/site/export/ncbi_xml.py | 68 ++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py index da3577d4..c32fa0ed 100644 --- a/nmdc_runtime/site/export/ncbi_xml.py +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -1,3 +1,8 @@ +import json +import datetime +import xml.etree.ElementTree as ET +import xml.dom.minidom + from nmdc_runtime.site.export.ncbi_xml_utils import ( handle_controlled_identified_term_value, handle_controlled_term_value, @@ -9,9 +14,6 @@ handle_string_value, load_mappings, ) -import datetime -import xml.etree.ElementTree as ET -import xml.dom.minidom class NCBISubmissionXML: @@ -78,6 +80,64 @@ def set_description( ) self.root.append(description) + def set_descriptor(self, title, description, url): + descriptor_elements = [] + descriptor_elements.append(self.set_element("Title", title)) + descriptor_elements.append( + self.set_element( + "Description", children=[self.set_element("p", description)] + ) + ) + + external_resources = json.loads(url) + for label, link in external_resources.items(): + external_link = self.set_element("ExternalLink", attrib={"label": label}) + url_element = self.set_element("URL", link) + external_link.append(url_element) + descriptor_elements.append(external_link) + + return descriptor_elements + + def set_bioproject(self, title, project_id, description, data_type, url): + action = self.set_element("Action") + add_data = self.set_element("AddData", attrib={"target_db": "BioProject"}) + + data_element = self.set_element("Data", attrib={"content_type": "XML"}) + xml_content = self.set_element("XmlContent") + project = self.set_element("Project", attrib={"schema_version": "2.0"}) + + project_id_element = self.set_element("ProjectID") + spuid = self.set_element("SPUID", project_id, {"spuid_namespace": self.org}) + project_id_element.append(spuid) + + descriptor = self.set_descriptor(title, description, url) + project_type = self.set_element("ProjectType") + project_type_submission = self.set_element( + "ProjectTypeSubmission", attrib={"sample_scope": "eEnvironment"} + ) + intended_data_type_set = self.set_element("IntendedDataTypeSet") + data_type_element = self.set_element("DataType", data_type) + + intended_data_type_set.append(data_type_element) + project_type_submission.append(intended_data_type_set) + project_type.append(project_type_submission) + + project.extend([project_id_element] + descriptor + [project_type]) + + xml_content.append(project) + data_element.append(xml_content) + add_data.append(data_element) + + identifier = self.set_element("Identifier") + spuid_identifier = self.set_element( + "SPUID", project_id, {"spuid_namespace": self.org} + ) + identifier.append(spuid_identifier) + add_data.append(identifier) + + action.append(add_data) + self.root.append(action) + def set_biosample( self, title, @@ -175,6 +235,8 @@ def set_biosample( def get_submission_xml(self): self.set_description() + # initialize/make call to self.set_bioproject() here + # TODO: iterate over all biosamples in the study # make call to self.set_biosample() here From 71be54f242f46a9099a39ed520d41b503bc42660 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Thu, 9 May 2024 17:06:01 -0700 Subject: [PATCH 06/27] capture submission non Attribute metadata through Dagit repo interface --- nmdc_runtime/site/export/ncbi_xml.py | 63 +++++++++++++++++----------- nmdc_runtime/site/graphs.py | 4 +- nmdc_runtime/site/ops.py | 62 +++++++++++++++++++++++++-- nmdc_runtime/site/repository.py | 26 +++++++++++- 4 files changed, 124 insertions(+), 31 deletions(-) diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py index c32fa0ed..f868174c 100644 --- a/nmdc_runtime/site/export/ncbi_xml.py +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -18,11 +18,13 @@ class NCBISubmissionXML: def __init__( - self, study_id: str, org="National Microbiome Data Collaborative (NMDC)" + self, ncbi_submission_fields: dict ): self.root = ET.Element("Submission") - self.study_id = study_id - self.org = org + self.nmdc_study_id = ncbi_submission_fields.get("nmdc_study_id") + self.ncbi_submission_metadata = ncbi_submission_fields.get("ncbi_submission_metadata", {}) + self.ncbi_bioproject_metadata = ncbi_submission_fields.get("ncbi_bioproject_metadata", {}) + self.ncbi_biosample_metadata = ncbi_submission_fields.get("ncbi_biosample_metadata", {}) # dispatcher dictionary capturing handlers for NMDC object to NCBI flat Attribute # type handlers @@ -47,19 +49,19 @@ def set_element(self, tag, text="", attrib=None, children=None): return element def set_description( - self, email="aclum@lbl.gov", user="NMDC", first="Alicia", last="Clum", date=None + self, email, user, first, last, org, date=None ): date = date or datetime.datetime.now().strftime("%Y-%m-%d") description = self.set_element( "Description", children=[ - self.set_element("Comment", f"NMDC Submission for {self.study_id}"), + self.set_element("Comment", f"NMDC Submission for {self.nmdc_study_id}"), self.set_element("Submitter", attrib={"user_name": user}), self.set_element( "Organization", attrib={"role": "owner", "type": "center"}, children=[ - self.set_element("Name", self.org), + self.set_element("Name", org), self.set_element( "Contact", attrib={"email": email}, @@ -80,7 +82,7 @@ def set_description( ) self.root.append(description) - def set_descriptor(self, title, description, url): + def set_descriptor(self, title, description): descriptor_elements = [] descriptor_elements.append(self.set_element("Title", title)) descriptor_elements.append( @@ -89,16 +91,9 @@ def set_descriptor(self, title, description, url): ) ) - external_resources = json.loads(url) - for label, link in external_resources.items(): - external_link = self.set_element("ExternalLink", attrib={"label": label}) - url_element = self.set_element("URL", link) - external_link.append(url_element) - descriptor_elements.append(external_link) - return descriptor_elements - def set_bioproject(self, title, project_id, description, data_type, url): + def set_bioproject(self, title, project_id, description, data_type, org): action = self.set_element("Action") add_data = self.set_element("AddData", attrib={"target_db": "BioProject"}) @@ -107,10 +102,10 @@ def set_bioproject(self, title, project_id, description, data_type, url): project = self.set_element("Project", attrib={"schema_version": "2.0"}) project_id_element = self.set_element("ProjectID") - spuid = self.set_element("SPUID", project_id, {"spuid_namespace": self.org}) + spuid = self.set_element("SPUID", project_id, {"spuid_namespace": org}) project_id_element.append(spuid) - descriptor = self.set_descriptor(title, description, url) + descriptor = self.set_descriptor(title, description) project_type = self.set_element("ProjectType") project_type_submission = self.set_element( "ProjectTypeSubmission", attrib={"sample_scope": "eEnvironment"} @@ -130,7 +125,7 @@ def set_bioproject(self, title, project_id, description, data_type, url): identifier = self.set_element("Identifier") spuid_identifier = self.set_element( - "SPUID", project_id, {"spuid_namespace": self.org} + "SPUID", project_id, {"spuid_namespace": org} ) identifier.append(spuid_identifier) add_data.append(identifier) @@ -145,6 +140,7 @@ def set_biosample( sid, name, pkg, + org, nmdc_biosample, ): attribute_mappings, slot_range_mappings = load_mappings( @@ -168,7 +164,7 @@ def set_biosample( self.set_element( "SampleId", children=[ - self.set_element("SPUID", sid, {"spuid_namespace": self.org}) + self.set_element("SPUID", sid, {"spuid_namespace": org}) ], ), self.set_element( @@ -222,7 +218,7 @@ def set_biosample( "Identifier", children=[ self.set_element( - "SPUID", sid, {"spuid_namespace": self.org} + "SPUID", sid, {"spuid_namespace": org} ) ], ), @@ -233,12 +229,31 @@ def set_biosample( self.root.append(action) def get_submission_xml(self): - self.set_description() + self.set_description( + email=self.ncbi_submission_metadata.get("email", ""), + user=self.ncbi_submission_metadata.get("user", ""), + first=self.ncbi_submission_metadata.get("first", ""), + last=self.ncbi_submission_metadata.get("last", ""), + org=self.ncbi_submission_metadata.get("organization", ""), + ) - # initialize/make call to self.set_bioproject() here + self.set_bioproject( + title=self.ncbi_bioproject_metadata.get("title", ""), + project_id=self.ncbi_bioproject_metadata.get("project_id", ""), + description=self.ncbi_bioproject_metadata.get("description", ""), + data_type=self.ncbi_bioproject_metadata.get("data_type", ""), + org=self.ncbi_submission_metadata.get("organization", ""), + ) - # TODO: iterate over all biosamples in the study - # make call to self.set_biosample() here + self.set_biosample( + title=self.ncbi_biosample_metadata.get("title", ""), + spuid=self.ncbi_biosample_metadata.get("spuid", ""), + sid=self.ncbi_biosample_metadata.get("sid", ""), + name=self.ncbi_biosample_metadata.get("name", ""), + pkg=self.ncbi_biosample_metadata.get("pkg", ""), + org=self.ncbi_submission_metadata.get("organization", ""), + nmdc_biosample={} + ) rough_string = ET.tostring(self.root, "unicode") reparsed = xml.dom.minidom.parseString(rough_string) diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py index fccefe9a..f1b755d6 100644 --- a/nmdc_runtime/site/graphs.py +++ b/nmdc_runtime/site/graphs.py @@ -388,6 +388,6 @@ def ingest_neon_surface_water_metadata(): @graph def nmdc_study_to_ncbi_submission_export(): - study_id = get_ncbi_export_pipeline_inputs() - xml_data = ncbi_submission_xml_from_nmdc_study(study_id) + ncbi_submission_fields = get_ncbi_export_pipeline_inputs() + xml_data = ncbi_submission_xml_from_nmdc_study(ncbi_submission_fields) ncbi_submission_xml_asset(xml_data) diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py index 0d19ffaa..58352640 100644 --- a/nmdc_runtime/site/ops.py +++ b/nmdc_runtime/site/ops.py @@ -31,6 +31,8 @@ String, op, Optional, + Field, + Permissive, ) from gridfs import GridFS from linkml_runtime.dumpers import json_dumper @@ -998,16 +1000,68 @@ def site_code_mapping() -> dict: ) -@op(config_schema={"study_id": str}) +@op( + config_schema={ + "nmdc_study_id": str, + "ncbi_submission_metadata": Field( + Permissive( + { + "email": String, + "first": String, + "last": String, + "user": String, + } + ), + is_required=True, + description="General metadata about the NCBI submission.", + ), + "ncbi_bioproject_metadata": Field( + Permissive( + { + "title": String, + "project_id": String, + "description": String, + "data_type": String, + } + ), + is_required=True, + description="Metadata for NCBI BioProject in the Submission.", + ), + "ncbi_biosample_metadata": Field( + Permissive( + { + "title": String, + "spuid": String, + "sid": String, + "name": String, + "pkg": String, + } + ), + is_required=True, + description="Metadata for one or many NCBI BioSample in the Submission.", + ), + }, + out=Out(Dict), +) def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str: - return context.op_config["study_id"] + nmdc_study_id = context.op_config["nmdc_study_id"] + ncbi_submission_metadata = context.op_config.get("ncbi_submission_metadata", {}) + ncbi_bioproject_metadata = context.op_config.get("ncbi_bioproject_metadata", {}) + ncbi_biosample_metadata = context.op_config.get("ncbi_biosample_metadata", {}) + + return { + "nmdc_study_id": nmdc_study_id, + "ncbi_submission_metadata": ncbi_submission_metadata, + "ncbi_bioproject_metadata": ncbi_bioproject_metadata, + "ncbi_biosample_metadata": ncbi_biosample_metadata, + } @op def ncbi_submission_xml_from_nmdc_study( context: OpExecutionContext, - study_id: str, + ncbi_exporter_metadata: dict, ) -> str: - ncbi_exporter = NCBISubmissionXML(study_id) + ncbi_exporter = NCBISubmissionXML(ncbi_exporter_metadata) ncbi_xml = ncbi_exporter.get_submission_xml() return ncbi_xml diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py index 9503d9b6..90651210 100644 --- a/nmdc_runtime/site/repository.py +++ b/nmdc_runtime/site/repository.py @@ -859,7 +859,31 @@ def biosample_export(): nmdc_study_to_ncbi_submission_export.to_job( config={ "ops": { - "get_ncbi_export_pipeline_inputs": {"config": {"study_id": ""}}, + "get_ncbi_export_pipeline_inputs": { + "config": { + "nmdc_study_id": "", + "ncbi_submission_metadata": { + "email": "", + "first": "", + "last": "", + "user": "", + "organization": "", + }, + "ncbi_bioproject_metadata": { + "title": "", + "project_id": "", + "description": "", + "data_type": "", + }, + "ncbi_biosample_metadata": { + "title": "", + "spuid": "", + "sid": "", + "name": "", + "pkg": "", + }, + } + }, }, }, ), From 7fb364f997e7d52b6a693fe2b4031cc077afe3ef Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 10 May 2024 00:06:43 +0000 Subject: [PATCH 07/27] style: reformat --- nmdc_runtime/site/export/ncbi_xml.py | 34 ++++++++++++++-------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py index f868174c..314baa75 100644 --- a/nmdc_runtime/site/export/ncbi_xml.py +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -17,14 +17,18 @@ class NCBISubmissionXML: - def __init__( - self, ncbi_submission_fields: dict - ): + def __init__(self, ncbi_submission_fields: dict): self.root = ET.Element("Submission") self.nmdc_study_id = ncbi_submission_fields.get("nmdc_study_id") - self.ncbi_submission_metadata = ncbi_submission_fields.get("ncbi_submission_metadata", {}) - self.ncbi_bioproject_metadata = ncbi_submission_fields.get("ncbi_bioproject_metadata", {}) - self.ncbi_biosample_metadata = ncbi_submission_fields.get("ncbi_biosample_metadata", {}) + self.ncbi_submission_metadata = ncbi_submission_fields.get( + "ncbi_submission_metadata", {} + ) + self.ncbi_bioproject_metadata = ncbi_submission_fields.get( + "ncbi_bioproject_metadata", {} + ) + self.ncbi_biosample_metadata = ncbi_submission_fields.get( + "ncbi_biosample_metadata", {} + ) # dispatcher dictionary capturing handlers for NMDC object to NCBI flat Attribute # type handlers @@ -48,14 +52,14 @@ def set_element(self, tag, text="", attrib=None, children=None): element.append(child) return element - def set_description( - self, email, user, first, last, org, date=None - ): + def set_description(self, email, user, first, last, org, date=None): date = date or datetime.datetime.now().strftime("%Y-%m-%d") description = self.set_element( "Description", children=[ - self.set_element("Comment", f"NMDC Submission for {self.nmdc_study_id}"), + self.set_element( + "Comment", f"NMDC Submission for {self.nmdc_study_id}" + ), self.set_element("Submitter", attrib={"user_name": user}), self.set_element( "Organization", @@ -163,9 +167,7 @@ def set_biosample( biosample_elements = [ self.set_element( "SampleId", - children=[ - self.set_element("SPUID", sid, {"spuid_namespace": org}) - ], + children=[self.set_element("SPUID", sid, {"spuid_namespace": org})], ), self.set_element( "Descriptor", @@ -217,9 +219,7 @@ def set_biosample( self.set_element( "Identifier", children=[ - self.set_element( - "SPUID", sid, {"spuid_namespace": org} - ) + self.set_element("SPUID", sid, {"spuid_namespace": org}) ], ), ], @@ -252,7 +252,7 @@ def get_submission_xml(self): name=self.ncbi_biosample_metadata.get("name", ""), pkg=self.ncbi_biosample_metadata.get("pkg", ""), org=self.ncbi_submission_metadata.get("organization", ""), - nmdc_biosample={} + nmdc_biosample={}, ) rough_string = ET.tostring(self.root, "unicode") From ae062d16e44e858bdd072932392d6d6398780fea Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Fri, 10 May 2024 12:51:22 -0700 Subject: [PATCH 08/27] process all biosamples from a given NMDC study for NCBI XML translation --- nmdc_runtime/site/export/ncbi_xml.py | 157 +++++++++++--------- nmdc_runtime/site/export/nmdc_api_client.py | 37 +++-- nmdc_runtime/site/ops.py | 5 +- nmdc_runtime/site/repository.py | 4 +- 4 files changed, 110 insertions(+), 93 deletions(-) diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py index 314baa75..03092930 100644 --- a/nmdc_runtime/site/export/ncbi_xml.py +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -1,4 +1,3 @@ -import json import datetime import xml.etree.ElementTree as ET import xml.dom.minidom @@ -14,6 +13,7 @@ handle_string_value, load_mappings, ) +from nmdc_runtime.site.export.nmdc_api_client import NMDCApiClient class NCBISubmissionXML: @@ -29,6 +29,7 @@ def __init__(self, ncbi_submission_fields: dict): self.ncbi_biosample_metadata = ncbi_submission_fields.get( "ncbi_biosample_metadata", {} ) + self.nmdc_api_client = NMDCApiClient() # dispatcher dictionary capturing handlers for NMDC object to NCBI flat Attribute # type handlers @@ -142,91 +143,95 @@ def set_biosample( title, spuid, sid, - name, - pkg, + organism_name, + package, org, - nmdc_biosample, + nmdc_biosamples, ): attribute_mappings, slot_range_mappings = load_mappings( "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/issue-1940/assets/ncbi_mappings/ncbi_attribute_mappings_filled.tsv" ) - attributes = {} - for json_key, value in nmdc_biosample.items(): - if isinstance(value, list): - continue + for biosample in nmdc_biosamples: + attributes = {} + for json_key, value in biosample.items(): + if isinstance(value, list): + continue # Skip processing for list values - xml_key = attribute_mappings.get(json_key, json_key) - value_type = slot_range_mappings.get(json_key, "string") - handler = self.type_handlers.get(value_type, handle_string_value) + xml_key = attribute_mappings.get(json_key, json_key) + value_type = slot_range_mappings.get(json_key, "string") + handler = self.type_handlers.get(value_type, handle_string_value) - formatted_value = handler(value) - attributes[xml_key] = formatted_value + formatted_value = handler(value) + attributes[xml_key] = formatted_value - # Create the BioSample XML block with these attributes - biosample_elements = [ - self.set_element( - "SampleId", - children=[self.set_element("SPUID", sid, {"spuid_namespace": org})], - ), - self.set_element( - "Descriptor", - children=[ - self.set_element("Title", title), - self.set_element( - "Description", children=[self.set_element("p", spuid)] - ), - ], - ), - self.set_element( - "Organism", children=[self.set_element("OrganismName", name)] - ), - self.set_element("Package", pkg), - self.set_element( - "Attributes", - children=[ - self.set_element( - "Attribute", attributes[key], {"attribute_name": key} - ) - for key in sorted(attributes) - ], - ), - ] - - action = self.set_element( - "Action", - children=[ + # Create the BioSample XML block with these attributes for each biosample + biosample_elements = [ self.set_element( - "AddData", - attrib={"target_db": "BioSample"}, + "SampleId", + children=[self.set_element("SPUID", sid, {"spuid_namespace": org})], + ), + self.set_element( + "Descriptor", children=[ + self.set_element("Title", title), self.set_element( - "Data", - attrib={"content_type": "XML"}, - children=[ - self.set_element( - "XmlContent", - children=[ - self.set_element( - "BioSample", - attrib={"schema_version": "2.0"}, - children=biosample_elements, - ) - ], - ) - ], + "Description", children=[self.set_element("p", spuid)] ), + ], + ), + self.set_element( + "Organism", + children=[self.set_element("OrganismName", organism_name)], + ), + self.set_element("Package", package), + self.set_element( + "Attributes", + children=[ self.set_element( - "Identifier", - children=[ - self.set_element("SPUID", sid, {"spuid_namespace": org}) - ], - ), + "Attribute", attributes[key], {"attribute_name": key} + ) + for key in sorted(attributes) ], - ) - ], - ) - self.root.append(action) + ), + ] + + action = self.set_element( + "Action", + children=[ + self.set_element( + "AddData", + attrib={"target_db": "BioSample"}, + children=[ + self.set_element( + "Data", + attrib={"content_type": "XML"}, + children=[ + self.set_element( + "XmlContent", + children=[ + self.set_element( + "BioSample", + attrib={"schema_version": "2.0"}, + children=biosample_elements, + ), + ], + ), + ], + ), + self.set_element( + "Identifier", + children=[ + self.set_element( + "SPUID", sid, {"spuid_namespace": org} + ), + ], + ), + ], + ), + ], + ) + self.root.append(action) def get_submission_xml(self): self.set_description( @@ -245,14 +250,18 @@ def get_submission_xml(self): org=self.ncbi_submission_metadata.get("organization", ""), ) + biosamples_list = self.nmdc_api_client.get_biosamples_part_of_study( + self.nmdc_study_id + ) + self.set_biosample( title=self.ncbi_biosample_metadata.get("title", ""), spuid=self.ncbi_biosample_metadata.get("spuid", ""), sid=self.ncbi_biosample_metadata.get("sid", ""), - name=self.ncbi_biosample_metadata.get("name", ""), - pkg=self.ncbi_biosample_metadata.get("pkg", ""), + organism_name=self.ncbi_biosample_metadata.get("organism_name", ""), + package=self.ncbi_biosample_metadata.get("package", ""), org=self.ncbi_submission_metadata.get("organization", ""), - nmdc_biosample={}, + nmdc_biosamples=biosamples_list, ) rough_string = ET.tostring(self.root, "unicode") diff --git a/nmdc_runtime/site/export/nmdc_api_client.py b/nmdc_runtime/site/export/nmdc_api_client.py index 6d7938e9..b4dd38d7 100644 --- a/nmdc_runtime/site/export/nmdc_api_client.py +++ b/nmdc_runtime/site/export/nmdc_api_client.py @@ -1,11 +1,18 @@ +import os +import json import requests +from dotenv import load_dotenv + class NMDCApiClient: - def __init__(self, api_base_url): - if not api_base_url.endswith("/"): - api_base_url += "/" - self.base_url = api_base_url + def __init__(self, api_base_url=None): + load_dotenv() + self.base_url = api_base_url or os.getenv("API_HOST") + if not self.base_url: + raise ValueError("API base URL for runtime environment is required.") + if not self.base_url.endswith("/"): + self.base_url += "/" self.headers = { "accept": "application/json", "Content-Type": "application/json", @@ -16,19 +23,19 @@ def get_biosamples_part_of_study(self, study_id: str) -> list[dict]: Get the biosamples that are part of a study. """ biosample_records = [] - params = { - "filter": '{"part_of": "' + study_id + '"}', - "max_page_size": "1000", - } + params = {"filter": json.dumps({"part_of": study_id}), "max_page_size": "1000"} url = self.base_url + "nmdcschema/biosample_set" - response = requests.get(url, params=params, headers=self.headers) - response.raise_for_status() - biosample_records.extend(response.json()["resources"]) - # Get the next page of results, if any - while response.json().get("next_page_token") is not None: - params["page_token"] = response.json()["next_page_token"] + + while True: response = requests.get(url, params=params, headers=self.headers) response.raise_for_status() - biosample_records.extend(response.json()["resources"]) + data = response.json() + biosample_records.extend(data["resources"]) + + # Check if there's a next page + next_page_token = data.get("next_page_token") + if not next_page_token: + break + params["page_token"] = next_page_token return biosample_records diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py index 58352640..c327ee9d 100644 --- a/nmdc_runtime/site/ops.py +++ b/nmdc_runtime/site/ops.py @@ -1010,6 +1010,7 @@ def site_code_mapping() -> dict: "first": String, "last": String, "user": String, + "organization": String, } ), is_required=True, @@ -1033,8 +1034,8 @@ def site_code_mapping() -> dict: "title": String, "spuid": String, "sid": String, - "name": String, - "pkg": String, + "organism_name": String, + "package": String, } ), is_required=True, diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py index 90651210..1d7d4691 100644 --- a/nmdc_runtime/site/repository.py +++ b/nmdc_runtime/site/repository.py @@ -879,8 +879,8 @@ def biosample_export(): "title": "", "spuid": "", "sid": "", - "name": "", - "pkg": "", + "organism_name": "", + "package": "", }, } }, From 24e8ad97b024d18f446f14f257b16e23c905b586 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Fri, 10 May 2024 14:15:01 -0700 Subject: [PATCH 09/27] allow users to pass in mapping file through Dagit interface --- nmdc_runtime/site/export/ncbi_xml.py | 5 ++++- nmdc_runtime/site/ops.py | 5 +++++ nmdc_runtime/site/repository.py | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py index 03092930..ab40f3d9 100644 --- a/nmdc_runtime/site/export/ncbi_xml.py +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -20,6 +20,9 @@ class NCBISubmissionXML: def __init__(self, ncbi_submission_fields: dict): self.root = ET.Element("Submission") self.nmdc_study_id = ncbi_submission_fields.get("nmdc_study_id") + self.nmdc_ncbi_attribute_mapping_file_url = ncbi_submission_fields.get( + "nmdc_ncbi_attribute_mapping_file_url" + ) self.ncbi_submission_metadata = ncbi_submission_fields.get( "ncbi_submission_metadata", {} ) @@ -149,7 +152,7 @@ def set_biosample( nmdc_biosamples, ): attribute_mappings, slot_range_mappings = load_mappings( - "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/issue-1940/assets/ncbi_mappings/ncbi_attribute_mappings_filled.tsv" + self.nmdc_ncbi_attribute_mapping_file_url ) for biosample in nmdc_biosamples: diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py index c327ee9d..29545e78 100644 --- a/nmdc_runtime/site/ops.py +++ b/nmdc_runtime/site/ops.py @@ -1003,6 +1003,7 @@ def site_code_mapping() -> dict: @op( config_schema={ "nmdc_study_id": str, + "nmdc_ncbi_attribute_mapping_file_url": str, "ncbi_submission_metadata": Field( Permissive( { @@ -1046,12 +1047,16 @@ def site_code_mapping() -> dict: ) def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str: nmdc_study_id = context.op_config["nmdc_study_id"] + nmdc_ncbi_attribute_mapping_file_url = context.op_config[ + "nmdc_ncbi_attribute_mapping_file_url" + ] ncbi_submission_metadata = context.op_config.get("ncbi_submission_metadata", {}) ncbi_bioproject_metadata = context.op_config.get("ncbi_bioproject_metadata", {}) ncbi_biosample_metadata = context.op_config.get("ncbi_biosample_metadata", {}) return { "nmdc_study_id": nmdc_study_id, + "nmdc_ncbi_attribute_mapping_file_url": nmdc_ncbi_attribute_mapping_file_url, "ncbi_submission_metadata": ncbi_submission_metadata, "ncbi_bioproject_metadata": ncbi_bioproject_metadata, "ncbi_biosample_metadata": ncbi_biosample_metadata, diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py index 1d7d4691..8075551d 100644 --- a/nmdc_runtime/site/repository.py +++ b/nmdc_runtime/site/repository.py @@ -862,6 +862,7 @@ def biosample_export(): "get_ncbi_export_pipeline_inputs": { "config": { "nmdc_study_id": "", + "nmdc_ncbi_attribute_mapping_file_url": "", "ncbi_submission_metadata": { "email": "", "first": "", From 48e7be347c41a4101d01c447ec48305a0609e093 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Mon, 13 May 2024 12:13:11 -0700 Subject: [PATCH 10/27] remove spuid, sid and title NCBI BioSample configuration parameters --- nmdc_runtime/site/export/ncbi_xml.py | 27 +++++++++++++++++---------- nmdc_runtime/site/ops.py | 3 --- nmdc_runtime/site/repository.py | 3 --- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py index ab40f3d9..311b057a 100644 --- a/nmdc_runtime/site/export/ncbi_xml.py +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -143,9 +143,6 @@ def set_bioproject(self, title, project_id, description, data_type, org): def set_biosample( self, - title, - spuid, - sid, organism_name, package, org, @@ -157,10 +154,17 @@ def set_biosample( for biosample in nmdc_biosamples: attributes = {} + sample_id_value = None + for json_key, value in biosample.items(): if isinstance(value, list): continue # Skip processing for list values + # Special handling for NMDC Biosample "id" + if json_key == "id": + sample_id_value = value + continue + xml_key = attribute_mappings.get(json_key, json_key) value_type = slot_range_mappings.get(json_key, "string") handler = self.type_handlers.get(value_type, handle_string_value) @@ -172,14 +176,18 @@ def set_biosample( biosample_elements = [ self.set_element( "SampleId", - children=[self.set_element("SPUID", sid, {"spuid_namespace": org})], + children=[ + self.set_element( + "SPUID", sample_id_value, {"spuid_namespace": org} + ) + ], ), self.set_element( "Descriptor", children=[ - self.set_element("Title", title), self.set_element( - "Description", children=[self.set_element("p", spuid)] + "Title", + f"NMDC Biosample {sample_id_value} from {organism_name} part of {self.nmdc_study_id} study", ), ], ), @@ -226,7 +234,9 @@ def set_biosample( "Identifier", children=[ self.set_element( - "SPUID", sid, {"spuid_namespace": org} + "SPUID", + sample_id_value, + {"spuid_namespace": org}, ), ], ), @@ -258,9 +268,6 @@ def get_submission_xml(self): ) self.set_biosample( - title=self.ncbi_biosample_metadata.get("title", ""), - spuid=self.ncbi_biosample_metadata.get("spuid", ""), - sid=self.ncbi_biosample_metadata.get("sid", ""), organism_name=self.ncbi_biosample_metadata.get("organism_name", ""), package=self.ncbi_biosample_metadata.get("package", ""), org=self.ncbi_submission_metadata.get("organization", ""), diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py index 29545e78..df6b07a1 100644 --- a/nmdc_runtime/site/ops.py +++ b/nmdc_runtime/site/ops.py @@ -1032,9 +1032,6 @@ def site_code_mapping() -> dict: "ncbi_biosample_metadata": Field( Permissive( { - "title": String, - "spuid": String, - "sid": String, "organism_name": String, "package": String, } diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py index 8075551d..3e68892e 100644 --- a/nmdc_runtime/site/repository.py +++ b/nmdc_runtime/site/repository.py @@ -877,9 +877,6 @@ def biosample_export(): "data_type": "", }, "ncbi_biosample_metadata": { - "title": "", - "spuid": "", - "sid": "", "organism_name": "", "package": "", }, From b7a8000a66449cf7127b46366b34ba818cdbec88 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Mon, 13 May 2024 12:24:03 -0700 Subject: [PATCH 11/27] update handle_quantity_value() in ncbi_xml_utils.py --- nmdc_runtime/site/export/ncbi_xml_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py index e34cae6d..e8e66047 100644 --- a/nmdc_runtime/site/export/ncbi_xml_utils.py +++ b/nmdc_runtime/site/export/ncbi_xml_utils.py @@ -15,7 +15,7 @@ def handle_quantity_value(slot_value): slot_value["has_maximum_numeric_value"] - slot_value["has_minimum_numeric_value"] ) - return f"({range_value}) {slot_value['has_unit']}" + return f"{range_value} {slot_value['has_unit']}" elif "has_raw_value" in slot_value: return slot_value["has_raw_value"] return "Unknown format" From dae0d13045fdb34c06b01f9c35aeeae9331ad88c Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Mon, 13 May 2024 18:09:07 -0700 Subject: [PATCH 12/27] if an NMDC biosample key is not in mapping file, ignore it --- nmdc_runtime/site/export/ncbi_xml.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py index 311b057a..9fd3ba80 100644 --- a/nmdc_runtime/site/export/ncbi_xml.py +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -165,7 +165,10 @@ def set_biosample( sample_id_value = value continue - xml_key = attribute_mappings.get(json_key, json_key) + if json_key not in attribute_mappings: + continue + + xml_key = attribute_mappings[json_key] value_type = slot_range_mappings.get(json_key, "string") handler = self.type_handlers.get(value_type, handle_string_value) From dfe5f4161585156a7f0f043ea948105c64758394 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Wed, 15 May 2024 17:31:25 -0700 Subject: [PATCH 13/27] comprehensive test suite for NMDC-to-NCBI export pipeline --- nmdc_runtime/site/export/ncbi_xml_utils.py | 2 + requirements/dev.in | 3 +- requirements/main.in | 1 + tests/test_data/test_ncbi_xml.py | 434 +++++++++++++++++++++ 4 files changed, 439 insertions(+), 1 deletion(-) create mode 100644 tests/test_data/test_ncbi_xml.py diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py index e8e66047..1ad34aca 100644 --- a/nmdc_runtime/site/export/ncbi_xml_utils.py +++ b/nmdc_runtime/site/export/ncbi_xml_utils.py @@ -48,6 +48,8 @@ def handle_controlled_identified_term_value(slot_value): term = slot_value["term"] if "name" in term and "id" in term: return f"{term['name']} [{term['id']}]" + elif "id" in term: + return term["id"] elif "has_raw_value" in slot_value: return slot_value["has_raw_value"] return "Unknown format" diff --git a/requirements/dev.in b/requirements/dev.in index dbe7b8e9..601370de 100644 --- a/requirements/dev.in +++ b/requirements/dev.in @@ -11,4 +11,5 @@ pytest-cov requests-mock setuptools twine -requests-cache \ No newline at end of file +requests-cache +pytest-mock \ No newline at end of file diff --git a/requirements/main.in b/requirements/main.in index 45998c84..4f58b6e3 100644 --- a/requirements/main.in +++ b/requirements/main.in @@ -30,6 +30,7 @@ pandas passlib[bcrypt] pymongo pydantic[email]>=1.10.0 +pytest-mock python-dotenv python-jose[cryptography] python-multipart diff --git a/tests/test_data/test_ncbi_xml.py b/tests/test_data/test_ncbi_xml.py new file mode 100644 index 00000000..3cd2913c --- /dev/null +++ b/tests/test_data/test_ncbi_xml.py @@ -0,0 +1,434 @@ +from unittest.mock import MagicMock +import pytest +from requests.exceptions import HTTPError +import xml.etree.ElementTree as ET + +from nmdc_runtime.site.export.ncbi_xml import NCBISubmissionXML +from nmdc_runtime.site.export.ncbi_xml_utils import ( + load_mappings, + handle_quantity_value, + handle_text_value, + handle_timestamp_value, + handle_controlled_term_value, + handle_controlled_identified_term_value, + handle_geolocation_value, + handle_float_value, + handle_string_value, +) +from nmdc_runtime.site.export.nmdc_api_client import NMDCApiClient + +MOCK_SUBMISSION_FIELDS = { + "nmdc_study_id": "nmdc:sty-11-12345", + "nmdc_ncbi_attribute_mapping_file_url": "http://example.com/mappings.tsv", + "ncbi_submission_metadata": { + "email": "user@example.com", + "user": "testuser", + "first": "Test", + "last": "User", + "organization": "Test Org", + }, + "ncbi_bioproject_metadata": { + "title": "Test Project", + "project_id": "PRJNA12345", + "description": "A test project", + "data_type": "metagenome", + }, + "ncbi_biosample_metadata": { + "title": "Test Sample", + "organism_name": "E. coli", + "package": "Test Package", + }, +} + + +@pytest.fixture +def ncbi_submission_client(): + return NCBISubmissionXML(ncbi_submission_fields=MOCK_SUBMISSION_FIELDS) + + +@pytest.fixture +def nmdc_api_client(): + return NMDCApiClient(api_base_url="http://fakeapi.com/") + + +@pytest.fixture +def nmdc_biosample(): + return [ + { + "analysis_type": ["metagenomics"], + "biosample_categories": ["NEON"], + "collection_date": {"has_raw_value": "2014-08-05T18:40Z"}, + "conduc": {"has_numeric_value": 567, "has_unit": "uS/cm"}, + "elev": 1178.7, + "env_broad_scale": { + "term": {"id": "ENVO:03605008", "name": "freshwater stream biome"} + }, + "env_local_scale": { + "term": {"id": "ENVO:03605007", "name": "freshwater stream"} + }, + "env_medium": {"term": {"id": "ENVO:03605006", "name": "stream water"}}, + "env_package": {"has_raw_value": "water"}, + "geo_loc_name": {"has_raw_value": "USA: Colorado, Arikaree River"}, + "id": "nmdc:bsm-12-gnfpt483", + "lat_lon": {"latitude": 39.758359, "longitude": -102.448595}, + "name": "ARIK.SS.20140805", + "part_of": ["nmdc:sty-11-hht5sb92"], + "samp_collec_device": "Grab", + "temp": {"has_numeric_value": 20.1, "has_unit": "Cel"}, + "type": "nmdc:Biosample", + } + ] + + +class TestNCBISubmissionXML: + def test_set_element(self, ncbi_submission_client): + element = ncbi_submission_client.set_element("Test", "Hello", {"attr": "value"}) + assert element.tag == "Test" + assert element.text == "Hello" + assert element.attrib == {"attr": "value"} + + def test_set_description(self, ncbi_submission_client): + ncbi_submission_client.set_description( + MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["email"], + MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["user"], + MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["first"], + MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["last"], + MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["organization"], + ) + description = ET.tostring( + ncbi_submission_client.root.find("Description"), "unicode" + ) + + root = ET.fromstring(description) + comment = root.find("Comment").text + submitter = root.find("Submitter").attrib["user_name"] + org_name = root.find("Organization/Name").text + contact_email = root.find("Organization/Contact").attrib["email"] + contact_first = root.find("Organization/Contact/Name/First").text + contact_last = root.find("Organization/Contact/Name/Last").text + + assert comment == "NMDC Submission for nmdc:sty-11-12345" + assert submitter == "testuser" + assert org_name == "Test Org" + assert contact_email == "user@example.com" + assert contact_first == "Test" + assert contact_last == "User" + + def test_set_bioproject(self, ncbi_submission_client): + ncbi_submission_client.set_bioproject( + title=MOCK_SUBMISSION_FIELDS["ncbi_bioproject_metadata"]["title"], + project_id=MOCK_SUBMISSION_FIELDS["ncbi_bioproject_metadata"]["project_id"], + description=MOCK_SUBMISSION_FIELDS["ncbi_bioproject_metadata"][ + "description" + ], + data_type=MOCK_SUBMISSION_FIELDS["ncbi_bioproject_metadata"]["data_type"], + org=MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["organization"], + ) + bioproject_xml = ET.tostring( + ncbi_submission_client.root.find(".//Project"), "unicode" + ) + assert "Test Project" in bioproject_xml + assert "PRJNA12345" in bioproject_xml + assert "A test project" in bioproject_xml + assert "metagenome" in bioproject_xml + assert "Test Org" in bioproject_xml + + def test_set_biosample(self, ncbi_submission_client, nmdc_biosample, mocker): + mocker.patch( + "nmdc_runtime.site.export.ncbi_xml.load_mappings", + return_value=( + { + "analysis_type": "", + "biosample_categories": "", + "collection_date": "collection_date", + "conduc": "conduc", + "elev": "elev", + "env_broad_scale": "env_broad_scale", + "env_local_scale": "env_local_scale", + "env_medium": "env_medium", + "env_package": "env_package", + "geo_loc_name": "geo_loc_name", + "id": "", + "lat_lon": "lat_lon", + "name": "sample_name", + "part_of": "", + "samp_collec_device": "samp_collect_device", + "temp": "temp", + "type": "", + }, + { + "analysis_type": "AnalysisTypeEnum", + "biosample_categories": "BiosampleCategoryEnum", + "collection_date": "TimestampValue", + "conduc": "QuantityValue", + "elev": "float", + "env_broad_scale": "ControlledIdentifiedTermValue", + "env_local_scale": "ControlledIdentifiedTermValue", + "env_medium": "ControlledIdentifiedTermValue", + "env_package": "TextValue", + "geo_loc_name": "TextValue", + "id": "uriorcurie", + "lat_lon": "GeolocationValue", + "name": "string", + "part_of": "Study", + "samp_collec_device": "string", + "temp": "QuantityValue", + "type": "string", + }, + ), + ) + ncbi_submission_client.set_biosample( + organism_name=MOCK_SUBMISSION_FIELDS["ncbi_biosample_metadata"][ + "organism_name" + ], + package=MOCK_SUBMISSION_FIELDS["ncbi_biosample_metadata"]["package"], + org=MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["organization"], + nmdc_biosamples=nmdc_biosample, + ) + biosample_xml = ET.tostring( + ncbi_submission_client.root.find(".//BioSample"), "unicode" + ) + assert "E. coli" in biosample_xml + assert "Test Package" in biosample_xml + assert "Test Org" in biosample_xml + + def test_get_submission_xml(self, mocker, ncbi_submission_client, nmdc_biosample): + mocker.patch( + "nmdc_runtime.site.export.ncbi_xml.load_mappings", + return_value=( + { + "analysis_type": "", + "biosample_categories": "", + "collection_date": "collection_date", + "conduc": "conduc", + "elev": "elev", + "env_broad_scale": "env_broad_scale", + "env_local_scale": "env_local_scale", + "env_medium": "env_medium", + "env_package": "env_package", + "geo_loc_name": "geo_loc_name", + "id": "", + "lat_lon": "lat_lon", + "name": "sample_name", + "part_of": "", + "samp_collec_device": "samp_collect_device", + "temp": "temp", + "type": "", + }, + { + "analysis_type": "AnalysisTypeEnum", + "biosample_categories": "BiosampleCategoryEnum", + "collection_date": "TimestampValue", + "conduc": "QuantityValue", + "elev": "float", + "env_broad_scale": "ControlledIdentifiedTermValue", + "env_local_scale": "ControlledIdentifiedTermValue", + "env_medium": "ControlledIdentifiedTermValue", + "env_package": "TextValue", + "geo_loc_name": "TextValue", + "id": "uriorcurie", + "lat_lon": "GeolocationValue", + "name": "string", + "part_of": "Study", + "samp_collec_device": "string", + "temp": "QuantityValue", + "type": "string", + }, + ), + ) + + mocker.patch.object( + NMDCApiClient, "get_biosamples_part_of_study", return_value=nmdc_biosample + ) + + submission_xml = ncbi_submission_client.get_submission_xml() + + assert "nmdc:bsm-12-gnfpt483" in submission_xml + assert "E. coli" in submission_xml + assert "stream water" in submission_xml + assert "USA: Colorado, Arikaree River" in submission_xml + assert "2014-08-05T18:40Z" in submission_xml + assert "testuser" in submission_xml + assert "Test Project" in submission_xml + + +class TestNMDCApiClient: + def test_get_biosamples_part_of_study_success(self, mocker, nmdc_api_client): + mock_response = mocker.MagicMock() + mock_response.json.return_value = { + "resources": [ + {"id": "nmdc:bsm-12-gnfpt483", "part_of": ["nmdc:sty-11-hht5sb92"]} + ], + "next_page_token": None, + } + mocker.patch("requests.get", return_value=mock_response) + result = nmdc_api_client.get_biosamples_part_of_study("nmdc:sty-11-hht5sb92") + assert result == [ + {"id": "nmdc:bsm-12-gnfpt483", "part_of": ["nmdc:sty-11-hht5sb92"]} + ] + + def test_get_biosamples_part_of_study_failure(self, mocker, nmdc_api_client): + mocker.patch("requests.get", side_effect=HTTPError("API Error")) + with pytest.raises(HTTPError): + nmdc_api_client.get_biosamples_part_of_study("nmdc:sty-11-hht5sb92") + + +class TestNCBIXMLUtils: + def test_handle_quantity_value(self): + assert ( + handle_quantity_value({"has_numeric_value": 10, "has_unit": "mg"}) + == "10 mg" + ) + assert ( + handle_quantity_value( + { + "has_maximum_numeric_value": 15, + "has_minimum_numeric_value": 5, + "has_unit": "kg", + } + ) + == "10 kg" + ) + assert handle_quantity_value({"has_raw_value": "20 units"}) == "20 units" + assert handle_quantity_value({}) == "Unknown format" + + def test_handle_text_value(self): + assert handle_text_value({"has_raw_value": "Sample Text"}) == "Sample Text" + assert handle_text_value({}) == "Unknown format" + + def test_handle_timestamp_value(self): + assert handle_timestamp_value({"has_raw_value": "2021-01-01"}) == "2021-01-01" + assert handle_timestamp_value({}) == "Unknown format" + + def test_handle_controlled_term_value(self): + term_data = {"term": {"name": "Homo sapiens", "id": "NCBITaxon:9606"}} + assert ( + handle_controlled_term_value(term_data) == "Homo sapiens [NCBITaxon:9606]" + ) + assert ( + handle_controlled_term_value({"term": {"id": "NCBITaxon:9606"}}) + == "NCBITaxon:9606" + ) + assert ( + handle_controlled_term_value({"term": {"name": "Homo sapiens"}}) + == "Homo sapiens" + ) + assert ( + handle_controlled_term_value( + {"has_raw_value": "Homo sapiens [NCBITaxon:9606]"} + ) + == "Homo sapiens [NCBITaxon:9606]" + ) + assert handle_controlled_term_value({}) == "Unknown format" + + def test_handle_controlled_identified_term_value(self): + term_data = {"term": {"name": "Homo sapiens", "id": "NCBITaxon:9606"}} + assert ( + handle_controlled_identified_term_value(term_data) + == "Homo sapiens [NCBITaxon:9606]" + ) + assert ( + handle_controlled_identified_term_value({"term": {"id": "NCBITaxon:9606"}}) + == "NCBITaxon:9606" + ) + assert ( + handle_controlled_identified_term_value({"term": {"name": "Homo sapiens"}}) + == "Unknown format" + ) + assert ( + handle_controlled_identified_term_value( + {"has_raw_value": "Homo sapiens [NCBITaxon:9606]"} + ) + == "Homo sapiens [NCBITaxon:9606]" + ) + assert handle_controlled_identified_term_value({}) == "Unknown format" + + def test_handle_geolocation_value(self): + assert ( + handle_geolocation_value({"latitude": 34.05, "longitude": -118.25}) + == "34.05 -118.25" + ) + assert ( + handle_geolocation_value({"has_raw_value": "34.05, -118.25"}) + == "34.05, -118.25" + ) + assert handle_geolocation_value({}) == "Unknown format" + + def test_handle_float_value(self): + assert handle_float_value(10.1234) == "10.12" + + def test_handle_string_value(self): + assert handle_string_value("Foo") == "Foo" + + def test_load_mappings(self, mocker): + mock_tsv_content = ( + "nmdc_schema_class\tnmdc_schema_slot\tnmdc_schema_slot_range\tncbi_biosample_attribute_name\tstatic_value\tignore\n" + "Biosample\tanalysis_type\tAnalysisTypeEnum\t\t\t\n" + "Biosample\tbiosample_categories\tBiosampleCategoryEnum\t\t\t\n" + "Biosample\tcollection_date\tTimestampValue\tcollection_date\t\t\n" + "Biosample\tconduc\tQuantityValue\tconduc\t\t\n" + "Biosample\telev\tfloat\telev\t\t\n" + "Biosample\tenv_broad_scale\tControlledIdentifiedTermValue\tenv_broad_scale\t\t\n" + "Biosample\tenv_local_scale\tControlledIdentifiedTermValue\tenv_local_scale\t\t\n" + "Biosample\tenv_medium\tControlledIdentifiedTermValue\tenv_medium\t\t\n" + "Biosample\tenv_package\tTextValue\tenv_package\t\t\n" + "Biosample\tgeo_loc_name\tQuantityValue\tgeo_loc_name\t\t\n" + "Biosample\tid\turiorcurie\t\t\t\n" + "Biosample\tlat_lon\tGeolocationValue\tlat_lon\t\t\n" + "Biosample\tname\tstring\tsample_name\t\t\n" + "Biosample\tpart_of\tStudy\t\t\t\n" + "Biosample\tsamp_collec_device\tstring\tsamp_collect_device\t\t\n" + "Biosample\ttemp\tQuantityValue\ttemp\t\t\n" + "Biosample\ttype\tstring\t\t\t\n" + ) + + mock_response = MagicMock() + mock_response.text = mock_tsv_content + mocker.patch("requests.get", return_value=mock_response) + + attribute_mappings, slot_range_mappings = load_mappings( + "http://example.com/mappings.tsv" + ) + + expected_attribute_mappings = { + "analysis_type": "analysis_type", + "biosample_categories": "biosample_categories", + "collection_date": "collection_date", + "conduc": "conduc", + "elev": "elev", + "env_broad_scale": "env_broad_scale", + "env_local_scale": "env_local_scale", + "env_medium": "env_medium", + "env_package": "env_package", + "geo_loc_name": "geo_loc_name", + "id": "id", + "lat_lon": "lat_lon", + "name": "sample_name", + "part_of": "part_of", + "samp_collec_device": "samp_collect_device", + "temp": "temp", + "type": "type", + } + + expected_slot_range_mappings = { + "analysis_type": "AnalysisTypeEnum", + "biosample_categories": "BiosampleCategoryEnum", + "collection_date": "TimestampValue", + "conduc": "QuantityValue", + "elev": "float", + "env_broad_scale": "ControlledIdentifiedTermValue", + "env_local_scale": "ControlledIdentifiedTermValue", + "env_medium": "ControlledIdentifiedTermValue", + "env_package": "TextValue", + "geo_loc_name": "QuantityValue", + "id": "uriorcurie", + "lat_lon": "GeolocationValue", + "name": "string", + "part_of": "Study", + "samp_collec_device": "string", + "temp": "QuantityValue", + "type": "string", + } + + assert attribute_mappings == expected_attribute_mappings + assert slot_range_mappings == expected_slot_range_mappings From 4ceb10a8221adc84c21ab7a6fb43911a6e3612c8 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Wed, 15 May 2024 17:42:04 -0700 Subject: [PATCH 14/27] update dev.txt and main.txt in requirements folder --- requirements/dev.txt | 92 +++++++------- requirements/main.txt | 280 +++++++++++++++++++++++------------------- 2 files changed, 195 insertions(+), 177 deletions(-) diff --git a/requirements/dev.txt b/requirements/dev.txt index f0238446..8eb8d791 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -9,11 +9,13 @@ attrs==23.2.0 # -c requirements/main.txt # cattrs # requests-cache -black==24.2.0 +backports-tarfile==1.1.1 + # via jaraco-context +black==24.4.2 # via # -c requirements/main.txt # -r requirements/dev.in -build==1.1.1 +build==1.2.1 # via pip-tools cattrs==23.2.3 # via @@ -23,10 +25,6 @@ certifi==2024.2.2 # via # -c requirements/main.txt # requests -cffi==1.16.0 - # via - # -c requirements/main.txt - # cryptography charset-normalizer==3.3.2 # via # -c requirements/main.txt @@ -36,31 +34,28 @@ click==8.1.7 # -c requirements/main.txt # black # pip-tools -coverage==7.4.3 +coverage==7.5.1 # via # -r requirements/dev.in # pytest-cov -cryptography==42.0.5 - # via - # -c requirements/main.txt - # secretstorage -docutils==0.20.1 +docutils==0.21.2 # via # -c requirements/main.txt # readme-renderer -exceptiongroup==1.2.0 +exceptiongroup==1.2.1 # via # -c requirements/main.txt # cattrs # pytest flake8==7.0.0 # via -r requirements/dev.in -idna==3.6 +idna==3.7 # via # -c requirements/main.txt # requests -importlib-metadata==7.0.1 +importlib-metadata==7.1.0 # via + # build # keyring # twine iniconfig==2.0.0 @@ -69,13 +64,13 @@ iniconfig==2.0.0 # pytest invoke==2.2.0 # via -r requirements/dev.in -jaraco-classes==3.3.1 +jaraco-classes==3.4.0 # via keyring -jeepney==0.8.0 - # via - # keyring - # secretstorage -keyring==24.3.1 +jaraco-context==5.3.0 + # via keyring +jaraco-functools==4.0.1 + # via keyring +keyring==25.2.1 # via twine markdown-it-py==3.0.0 # via @@ -88,14 +83,16 @@ mdurl==0.1.2 # -c requirements/main.txt # markdown-it-py more-itertools==10.2.0 - # via jaraco-classes + # via + # jaraco-classes + # jaraco-functools mypy-extensions==1.0.0 # via # -c requirements/main.txt # black -nh3==0.2.15 +nh3==0.2.17 # via readme-renderer -packaging==23.2 +packaging==24.0 # via # -c requirements/main.txt # black @@ -105,48 +102,47 @@ pathspec==0.12.1 # via # -c requirements/main.txt # black -pip-tools==7.4.0 +pip-tools==7.4.1 # via -r requirements/dev.in -pkginfo==1.9.6 +pkginfo==1.10.0 # via twine -platformdirs==4.2.0 +platformdirs==4.2.2 # via # -c requirements/main.txt # black # requests-cache -pluggy==1.4.0 +pluggy==1.5.0 # via # -c requirements/main.txt # pytest pycodestyle==2.11.1 # via flake8 -pycparser==2.21 - # via - # -c requirements/main.txt - # cffi pyflakes==3.2.0 - # via - # -c requirements/main.txt - # flake8 -pygments==2.17.2 + # via flake8 +pygments==2.18.0 # via # -c requirements/main.txt # readme-renderer # rich -pyproject-hooks==1.0.0 +pyproject-hooks==1.1.0 # via # build # pip-tools -pytest==8.0.2 +pytest==8.2.0 # via # -c requirements/main.txt # -r requirements/dev.in # pytest-asyncio # pytest-cov -pytest-asyncio==0.23.5 + # pytest-mock +pytest-asyncio==0.23.6 # via -r requirements/dev.in -pytest-cov==4.1.0 +pytest-cov==5.0.0 # via -r requirements/dev.in +pytest-mock==3.14.0 + # via + # -c requirements/main.txt + # -r requirements/dev.in readme-renderer==43.0 # via twine requests==2.31.0 @@ -160,7 +156,7 @@ requests-cache==1.2.0 # via # -c requirements/main.txt # -r requirements/dev.in -requests-mock==1.11.0 +requests-mock==1.12.1 # via -r requirements/dev.in requests-toolbelt==1.0.0 # via @@ -172,12 +168,9 @@ rich==13.7.1 # via # -c requirements/main.txt # twine -secretstorage==3.3.3 - # via keyring six==1.16.0 # via # -c requirements/main.txt - # requests-mock # url-normalize tomli==2.0.1 # via @@ -186,11 +179,10 @@ tomli==2.0.1 # build # coverage # pip-tools - # pyproject-hooks # pytest twine==5.0.0 # via -r requirements/dev.in -typing-extensions==4.10.0 +typing-extensions==4.11.0 # via # -c requirements/main.txt # black @@ -199,15 +191,15 @@ url-normalize==1.4.3 # via # -c requirements/main.txt # requests-cache -urllib3==2.0.7 +urllib3==2.2.1 # via # -c requirements/main.txt # requests # requests-cache # twine -wheel==0.42.0 +wheel==0.43.0 # via pip-tools -zipp==3.17.0 +zipp==3.18.1 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: @@ -215,7 +207,7 @@ pip==24.0 # via # -r requirements/dev.in # pip-tools -setuptools==69.1.1 +setuptools==69.5.1 # via # -c requirements/main.txt # -r requirements/dev.in diff --git a/requirements/main.txt b/requirements/main.txt index fd18a174..3548c168 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -42,9 +42,7 @@ attrs==23.2.0 # jsonschema # referencing # requests-cache -autoflake==2.3.0 - # via shed -babel==2.14.0 +babel==2.15.0 # via # jupyterlab-server # mkdocs-material @@ -53,9 +51,9 @@ backoff==2.2.1 # via gql base32-lib==1.0.2 # via -r requirements/main.in -bcrypt==4.1.2 +bcrypt==4.1.3 # via passlib -beanie==1.25.0 +beanie==1.26.0 # via -r requirements/main.in beautifulsoup4==4.12.3 # via @@ -64,13 +62,13 @@ beautifulsoup4==4.12.3 # nbconvert bioregistry==0.10.158 # via nmdc-schema -black==24.2.0 +black==24.4.2 # via shed bleach==6.1.0 # via nbconvert -boto3==1.34.54 +boto3==1.34.105 # via -r requirements/main.in -botocore==1.34.54 +botocore==1.34.105 # via # boto3 # s3transfer @@ -109,6 +107,7 @@ click==8.1.7 # prefixcommons # pystow # terminusdb-client + # typer # uvicorn colorama==0.4.6 # via mkdocs-material @@ -116,36 +115,36 @@ coloredlogs==14.0 # via dagster com2ann==0.3.0 # via shed -comm==0.2.1 +comm==0.2.2 # via # ipykernel # ipywidgets -croniter==2.0.2 +croniter==2.0.5 # via dagster -cryptography==42.0.5 +cryptography==42.0.7 # via python-jose -curies==0.7.7 +curies==0.7.9 # via # bioregistry # linkml-runtime # prefixmaps -dagit==1.6.8 +dagit==1.7.5 # via -r requirements/main.in -dagster==1.6.8 +dagster==1.7.5 # via # -r requirements/main.in # dagster-graphql # dagster-postgres # dagster-webserver -dagster-graphql==1.6.8 +dagster-graphql==1.7.5 # via # -r requirements/main.in # dagster-webserver -dagster-pipes==1.6.8 +dagster-pipes==1.7.5 # via dagster -dagster-postgres==0.22.8 +dagster-postgres==0.23.5 # via -r requirements/main.in -dagster-webserver==1.6.8 +dagster-webserver==1.7.5 # via dagit debugpy==1.8.1 # via ipykernel @@ -161,21 +160,23 @@ dnspython==2.6.1 # via # email-validator # pymongo -docstring-parser==0.15 +docstring-parser==0.16 # via dagster -docutils==0.20.1 +docutils==0.21.2 # via sphinx dotted-dict==1.1.3 # via -r requirements/main.in -ecdsa==0.18.0 +ecdsa==0.19.0 # via python-jose editorconfig==0.12.4 # via jsbeautifier email-validator==2.1.1 - # via pydantic + # via + # fastapi + # pydantic et-xmlfile==1.1.0 # via openpyxl -exceptiongroup==1.2.0 +exceptiongroup==1.2.1 # via # anyio # cattrs @@ -183,19 +184,25 @@ exceptiongroup==1.2.0 # pytest executing==2.0.1 # via stack-data -fastapi==0.110.0 - # via -r requirements/main.in +fastapi==0.111.0 + # via + # -r requirements/main.in + # fastapi-cli +fastapi-cli==0.0.3 + # via fastapi fastjsonschema==2.19.1 # via # -r requirements/main.in # nbformat +filelock==3.14.0 + # via dagster fnc==0.5.3 # via -r requirements/main.in fqdn==1.5.1 # via jsonschema -frozendict==2.4.0 +frozendict==2.4.4 # via -r requirements/main.in -fsspec==2024.2.0 +fsspec==2024.3.1 # via universal-pathlib ghp-import==2.1.0 # via mkdocs @@ -212,13 +219,15 @@ graphql-core==3.2.3 # graphql-relay graphql-relay==3.2.0 # via graphene -graphviz==0.20.1 +graphviz==0.20.3 # via linkml -grpcio==1.62.0 +greenlet==3.0.3 + # via sqlalchemy +grpcio==1.63.0 # via # dagster # grpcio-health-checking -grpcio-health-checking==1.62.0 +grpcio-health-checking==1.62.2 # via dagster h11==0.14.0 # via @@ -229,15 +238,17 @@ hbreader==0.9.1 # jsonasobj2 # linkml # linkml-runtime -httpcore==1.0.4 +httpcore==1.0.5 # via httpx httptools==0.6.1 # via uvicorn httpx==0.27.0 - # via jupyterlab + # via + # fastapi + # jupyterlab humanfriendly==10.0 # via coloredlogs -idna==3.6 +idna==3.7 # via # anyio # email-validator @@ -249,14 +260,14 @@ imagesize==1.4.1 # via sphinx iniconfig==2.0.0 # via pytest -ipykernel==6.29.3 +ipykernel==6.29.4 # via # jupyter # jupyter-console # jupyterlab # mkdocs-jupyter # qtconsole -ipython==8.22.1 +ipython==8.24.0 # via # ipykernel # ipywidgets @@ -269,13 +280,12 @@ isodate==0.6.1 # rdflib isoduration==20.11.0 # via jsonschema -isort==5.13.2 - # via shed jedi==0.19.1 # via ipython -jinja2==3.1.3 +jinja2==3.1.4 # via # dagster + # fastapi # jupyter-server # jupyterlab # jupyterlab-server @@ -284,19 +294,18 @@ jinja2==3.1.3 # mkdocs # mkdocs-material # nbconvert - # numpydoc # sphinx jmespath==1.0.1 # via # boto3 # botocore -jq==1.6.0 +jq==1.7.0 # via -r requirements/main.in jsbeautifier==1.15.1 # via mkdocs-mermaid2-plugin json-flattener==0.1.9 # via linkml-runtime -json5==0.9.18 +json5==0.9.25 # via jupyterlab-server jsonasobj==1.3.1 # via @@ -315,7 +324,7 @@ jsonpointer==2.4 # via # jsonpatch # jsonschema -jsonschema==4.21.1 +jsonschema==4.22.0 # via # jupyter-events # jupyterlab-server @@ -326,7 +335,7 @@ jsonschema-specifications==2023.12.1 # via jsonschema jupyter==1.0.0 # via -r requirements/main.in -jupyter-client==8.6.0 +jupyter-client==8.6.1 # via # ipykernel # jupyter-console @@ -335,7 +344,7 @@ jupyter-client==8.6.0 # qtconsole jupyter-console==6.6.3 # via jupyter -jupyter-core==5.7.1 +jupyter-core==5.7.2 # via # ipykernel # jupyter-client @@ -346,52 +355,52 @@ jupyter-core==5.7.1 # nbconvert # nbformat # qtconsole -jupyter-events==0.9.0 +jupyter-events==0.10.0 # via jupyter-server -jupyter-lsp==2.2.3 +jupyter-lsp==2.2.5 # via jupyterlab -jupyter-server==2.12.5 +jupyter-server==2.14.0 # via # jupyter-lsp # jupyterlab # jupyterlab-server # notebook # notebook-shim -jupyter-server-terminals==0.5.2 +jupyter-server-terminals==0.5.3 # via jupyter-server -jupyterlab==4.1.2 +jupyterlab==4.1.8 # via # -r requirements/main.in # notebook jupyterlab-pygments==0.3.0 # via nbconvert -jupyterlab-server==2.25.3 +jupyterlab-server==2.27.1 # via # jupyterlab # notebook jupyterlab-widgets==3.0.10 # via ipywidgets -jupytext==1.16.1 +jupytext==1.16.2 # via mkdocs-jupyter lazy-model==0.2.0 # via beanie -libcst==1.2.0 +libcst==1.3.1 # via shed -linkml==1.7.5 +linkml==1.7.10 # via # -r requirements/main.in # nmdc-schema linkml-dataops==0.1.0 # via linkml -linkml-runtime==1.7.2 +linkml-runtime==1.7.5 # via # -r requirements/main.in # linkml # linkml-dataops # nmdc-schema -mako==1.3.2 +mako==1.3.5 # via alembic -markdown==3.5.2 +markdown==3.6 # via # mkdocs # mkdocs-material @@ -407,28 +416,32 @@ markupsafe==2.1.5 # mako # mkdocs # nbconvert -matplotlib-inline==0.1.6 +matplotlib-inline==0.1.7 # via # ipykernel # ipython -mdit-py-plugins==0.4.0 +mdit-py-plugins==0.4.1 # via jupytext mdurl==0.1.2 # via markdown-it-py mergedeep==1.3.4 - # via mkdocs + # via + # mkdocs + # mkdocs-get-deps mistune==3.0.2 # via nbconvert -mkdocs==1.5.3 +mkdocs==1.6.0 # via # mkdocs-jupyter # mkdocs-material # mkdocs-mermaid2-plugin # mkdocs-redirects # nmdc-schema -mkdocs-jupyter==0.24.6 +mkdocs-get-deps==0.2.0 + # via mkdocs +mkdocs-jupyter==0.24.7 # via -r requirements/main.in -mkdocs-material==9.5.12 +mkdocs-material==9.5.22 # via # -r requirements/main.in # mkdocs-jupyter @@ -443,24 +456,22 @@ mkdocs-redirects==1.2.1 # via nmdc-schema more-click==0.1.2 # via bioregistry -motor==3.3.2 +motor==3.4.0 # via # -r requirements/main.in # beanie multidict==6.0.5 # via yarl mypy-extensions==1.0.0 - # via - # black - # typing-inspect -nbclient==0.9.0 + # via black +nbclient==0.10.0 # via nbconvert -nbconvert==7.16.1 +nbconvert==7.16.4 # via # jupyter # jupyter-server # mkdocs-jupyter -nbformat==5.9.2 +nbformat==5.10.4 # via # jupyter-server # jupytext @@ -470,7 +481,7 @@ nest-asyncio==1.6.0 # via ipykernel nmdc-schema==10.2.0 # via -r requirements/main.in -notebook==7.1.1 +notebook==7.1.3 # via jupyter notebook-shim==0.2.4 # via @@ -480,15 +491,17 @@ numpy==1.26.4 # via # pandas # terminusdb-client -numpydoc==1.6.0 +numpydoc==1.7.0 # via terminusdb-client openpyxl==3.1.2 # via # -r requirements/main.in # linkml +orjson==3.10.3 + # via fastapi overrides==7.7.0 # via jupyter-server -packaging==23.2 +packaging==24.0 # via # black # dagster @@ -506,7 +519,7 @@ packaging==23.2 # sphinx paginate==0.5.6 # via mkdocs-material -pandas==2.2.1 +pandas==2.2.2 # via # -r requirements/main.in # terminusdb-client @@ -514,7 +527,7 @@ pandocfilters==1.5.1 # via nbconvert parse==1.20.1 # via linkml -parso==0.8.3 +parso==0.8.4 # via jedi passlib==1.7.4 # via -r requirements/main.in @@ -526,13 +539,13 @@ pendulum==3.0.0 # via dagster pexpect==4.9.0 # via ipython -platformdirs==4.2.0 +platformdirs==4.2.2 # via # black # jupyter-core - # mkdocs + # mkdocs-get-deps # requests-cache -pluggy==1.4.0 +pluggy==1.5.0 # via pytest ply==3.11 # via jsonpath-ng @@ -540,7 +553,7 @@ prefixcommons==0.1.12 # via # linkml # linkml-runtime -prefixmaps==0.2.2 +prefixmaps==0.2.4 # via # linkml # linkml-runtime @@ -564,13 +577,13 @@ ptyprocess==0.7.0 # terminado pure-eval==0.2.2 # via stack-data -pyasn1==0.5.1 +pyasn1==0.6.0 # via # python-jose # rsa -pycparser==2.21 +pycparser==2.22 # via cffi -pydantic==2.6.3 +pydantic==2.7.1 # via # -r requirements/main.in # beanie @@ -581,11 +594,9 @@ pydantic==2.6.3 # lazy-model # linkml # linkml-runtime -pydantic-core==2.16.3 +pydantic-core==2.18.2 # via pydantic -pyflakes==3.2.0 - # via autoflake -pygments==2.17.2 +pygments==2.18.0 # via # ipython # jupyter-console @@ -600,16 +611,16 @@ pyjsg==0.11.10 # linkml # pyshexc # shexjsg -pymdown-extensions==10.7 +pymdown-extensions==10.8.1 # via # mkdocs-material # mkdocs-mermaid2-plugin -pymongo==4.6.2 +pymongo==4.7.2 # via # -r requirements/main.in # motor # nmdc-schema -pyparsing==3.1.1 +pyparsing==3.1.2 # via rdflib pyshex==0.8.1 # via linkml @@ -619,10 +630,14 @@ pyshexc==0.9.1 # pyshex pystow==0.5.4 # via bioregistry -pytest==8.0.2 - # via pytest-logging +pytest==8.2.0 + # via + # pytest-logging + # pytest-mock pytest-logging==2015.11.4 # via prefixcommons +pytest-mock==3.14.0 + # via -r requirements/main.in python-dateutil==2.9.0.post0 # via # arrow @@ -645,7 +660,9 @@ python-jose==3.3.0 python-json-logger==2.0.7 # via jupyter-events python-multipart==0.0.9 - # via -r requirements/main.in + # via + # -r requirements/main.in + # fastapi pytrie==0.4.0 # via curies pytz==2024.1 @@ -653,7 +670,7 @@ pytz==2024.1 # croniter # dagster # pandas -pyupgrade==3.15.1 +pyupgrade==3.15.2 # via shed pyyaml==6.0.1 # via @@ -666,6 +683,7 @@ pyyaml==6.0.1 # linkml # linkml-runtime # mkdocs + # mkdocs-get-deps # mkdocs-mermaid2-plugin # prefixcommons # prefixmaps @@ -674,14 +692,14 @@ pyyaml==6.0.1 # uvicorn pyyaml-env-tag==0.1 # via mkdocs -pyzmq==25.1.2 +pyzmq==26.0.3 # via # ipykernel # jupyter-client # jupyter-console # jupyter-server # qtconsole -qtconsole==5.5.1 +qtconsole==5.5.2 # via jupyter qtpy==2.4.1 # via qtconsole @@ -701,12 +719,12 @@ rdflib-shim==1.0.3 # pyshex # pyshexc # sparqlslurper -referencing==0.33.0 +referencing==0.35.1 # via # jsonschema # jsonschema-specifications # jupyter-events -regex==2023.12.25 +regex==2024.5.10 # via mkdocs-material requests==2.31.0 # via @@ -743,8 +761,10 @@ rfc3986-validator==0.1.1 rfc3987==1.3.8 # via jsonschema rich==13.7.1 - # via dagster -rpds-py==0.18.0 + # via + # dagster + # typer +rpds-py==0.18.1 # via # jsonschema # referencing @@ -754,16 +774,20 @@ ruamel-yaml==0.18.6 # via linkml-dataops ruamel-yaml-clib==0.2.8 # via ruamel-yaml -s3transfer==0.10.0 +ruff==0.4.4 + # via shed +s3transfer==0.10.1 # via boto3 semver==3.0.2 # via -r requirements/main.in -send2trash==1.8.2 +send2trash==1.8.3 # via jupyter-server -setuptools-scm==8.0.4 +setuptools-scm==8.1.0 # via -r requirements/main.in -shed==2024.1.1 +shed==2024.3.1 # via terminusdb-client +shellingham==1.5.4 + # via typer shexjsg==0.8.2 # via # pyshex @@ -796,7 +820,7 @@ sparqlwrapper==2.0.0 # via # pyshex # sparqlslurper -sphinx==7.2.6 +sphinx==7.3.7 # via numpydoc sphinxcontrib-applehelp==1.0.8 # via sphinx @@ -810,14 +834,14 @@ sphinxcontrib-qthelp==1.0.7 # via sphinx sphinxcontrib-serializinghtml==1.1.10 # via sphinx -sqlalchemy==2.0.27 +sqlalchemy==2.0.30 # via # alembic # dagster # linkml stack-data==0.6.3 # via ipython -starlette==0.36.3 +starlette==0.37.2 # via # dagster-graphql # dagster-webserver @@ -828,33 +852,32 @@ tabulate==0.9.0 # via # dagster # numpydoc -tenacity==8.2.3 +tenacity==8.3.0 # via -r requirements/main.in -terminado==0.18.0 +terminado==0.18.1 # via # jupyter-server # jupyter-server-terminals terminusdb-client==10.2.6 # via -r requirements/main.in -time-machine==2.13.0 +time-machine==2.14.1 # via pendulum -tinycss2==1.2.1 +tinycss2==1.3.0 # via nbconvert tokenize-rt==5.2.0 # via pyupgrade toml==0.10.2 - # via - # beanie - # jupytext + # via beanie tomli==2.0.1 # via - # autoflake # black # dagster # jupyterlab + # jupytext # numpydoc # pytest # setuptools-scm + # sphinx toolz==0.12.1 # via -r requirements/main.in toposort==1.10 @@ -867,14 +890,14 @@ tornado==6.4 # jupyterlab # notebook # terminado -tqdm==4.66.2 +tqdm==4.66.4 # via # -r requirements/main.in # bioregistry # dagster # pystow # terminusdb-client -traitlets==5.14.1 +traitlets==5.14.3 # via # comm # ipykernel @@ -893,9 +916,11 @@ traitlets==5.14.1 # qtconsole typeguard==2.13.3 # via terminusdb-client -types-python-dateutil==2.8.19.20240106 +typer==0.12.3 + # via fastapi-cli +types-python-dateutil==2.9.0.20240316 # via arrow -typing-extensions==4.10.0 +typing-extensions==4.11.0 # via # alembic # anyio @@ -905,35 +930,36 @@ typing-extensions==4.10.0 # cattrs # dagster # fastapi - # libcst + # ipython # pydantic # pydantic-core - # setuptools-scm # sqlalchemy - # typing-inspect + # typer # uvicorn -typing-inspect==0.9.0 - # via libcst tzdata==2024.1 # via # pandas # pendulum -universal-pathlib==0.2.1 +ujson==5.10.0 + # via fastapi +universal-pathlib==0.2.2 # via dagster uri-template==1.3.0 # via jsonschema url-normalize==1.4.3 # via requests-cache -urllib3==2.0.7 +urllib3==2.2.1 # via # botocore # pyshex # requests # requests-cache -uvicorn==0.27.1 +uvicorn==0.29.0 # via # -r requirements/main.in # dagster-webserver + # fastapi + # fastapi-cli uvloop==0.19.0 # via uvicorn watchdog==4.0.0 @@ -951,7 +977,7 @@ webencodings==0.5.1 # via # bleach # tinycss2 -websocket-client==1.7.0 +websocket-client==1.8.0 # via jupyter-server websockets==12.0 # via uvicorn @@ -967,7 +993,7 @@ yarl==1.9.4 # via gql # The following packages are considered to be unsafe in a requirements file: -setuptools==69.1.1 +setuptools==69.5.1 # via # dagster # mkdocs-mermaid2-plugin From c7d3da9be48951bacbefb858158789eb18a0fa62 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Thu, 16 May 2024 13:52:24 -0700 Subject: [PATCH 15/27] logic for validating generated XML against XSD schemas --- nmdc_runtime/site/export/ncbi_xml.py | 16 ++++++++++++++- nmdc_runtime/site/export/ncbi_xml_utils.py | 23 +++++++++++++++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py index 9fd3ba80..78b3bc51 100644 --- a/nmdc_runtime/site/export/ncbi_xml.py +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -12,6 +12,7 @@ handle_float_value, handle_string_value, load_mappings, + validate_xml, ) from nmdc_runtime.site.export.nmdc_api_client import NMDCApiClient @@ -277,6 +278,19 @@ def get_submission_xml(self): nmdc_biosamples=biosamples_list, ) + rough_string = ET.tostring(self.root, "unicode") reparsed = xml.dom.minidom.parseString(rough_string) - return reparsed.toprettyxml(indent=" ", newl="\n") + submission_xml = reparsed.toprettyxml(indent=" ", newl="\n") + + # ============= Uncomment the following code to validate the XML against NCBI XSDs ============ # + # submission_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/submission.xsd?view=co" + # submission_xsd_validation = validate_xml(submission_xml, submission_xsd_url) + + # bioproject_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/bioproject.xsd?view=co" + # bioproject_xsd_validation = validate_xml(submission_xml, bioproject_xsd_url) + + # biosample_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/biosample.xsd?view=co" + # biosample_xsd_validation = validate_xml(submission_xml, biosample_xsd_url) + + return submission_xml \ No newline at end of file diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py index 1ad34aca..59b99b56 100644 --- a/nmdc_runtime/site/export/ncbi_xml_utils.py +++ b/nmdc_runtime/site/export/ncbi_xml_utils.py @@ -1,4 +1,5 @@ -from io import StringIO +from lxml import etree +from io import BytesIO, StringIO import csv import requests @@ -95,3 +96,23 @@ def load_mappings(url): slot_range_mappings[json_key] = data_type if data_type else "default" return attribute_mappings, slot_range_mappings + + +def validate_xml(xml, xsd_url): + response = requests.get(xsd_url) + response.raise_for_status() + xsd_content = response.text + + xml_schema_doc = etree.parse(BytesIO(xsd_content.encode('utf-8'))) + xml_schema = etree.XMLSchema(xml_schema_doc) + + if ' Date: Thu, 16 May 2024 13:54:52 -0700 Subject: [PATCH 16/27] black formatting NCBI XML related files --- nmdc_runtime/site/export/ncbi_xml.py | 7 +++---- nmdc_runtime/site/export/ncbi_xml_utils.py | 6 +++--- requirements/dev.in | 3 ++- requirements/dev.txt | 4 ++++ requirements/main.in | 1 + requirements/main.txt | 12 +++++++----- 6 files changed, 20 insertions(+), 13 deletions(-) diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py index 78b3bc51..a9b433a4 100644 --- a/nmdc_runtime/site/export/ncbi_xml.py +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -278,7 +278,6 @@ def get_submission_xml(self): nmdc_biosamples=biosamples_list, ) - rough_string = ET.tostring(self.root, "unicode") reparsed = xml.dom.minidom.parseString(rough_string) submission_xml = reparsed.toprettyxml(indent=" ", newl="\n") @@ -286,11 +285,11 @@ def get_submission_xml(self): # ============= Uncomment the following code to validate the XML against NCBI XSDs ============ # # submission_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/submission.xsd?view=co" # submission_xsd_validation = validate_xml(submission_xml, submission_xsd_url) - + # bioproject_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/bioproject.xsd?view=co" # bioproject_xsd_validation = validate_xml(submission_xml, bioproject_xsd_url) - + # biosample_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/biosample.xsd?view=co" # biosample_xsd_validation = validate_xml(submission_xml, biosample_xsd_url) - return submission_xml \ No newline at end of file + return submission_xml diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py index 59b99b56..64183202 100644 --- a/nmdc_runtime/site/export/ncbi_xml_utils.py +++ b/nmdc_runtime/site/export/ncbi_xml_utils.py @@ -103,11 +103,11 @@ def validate_xml(xml, xsd_url): response.raise_for_status() xsd_content = response.text - xml_schema_doc = etree.parse(BytesIO(xsd_content.encode('utf-8'))) + xml_schema_doc = etree.parse(BytesIO(xsd_content.encode("utf-8"))) xml_schema = etree.XMLSchema(xml_schema_doc) - if ' Date: Wed, 29 May 2024 12:35:39 -0700 Subject: [PATCH 17/27] use RuntimeApiSiteClient instead of defining new NmdcApiClient class --- nmdc_runtime/site/export/ncbi_xml.py | 22 +++--- nmdc_runtime/site/export/nmdc_api_client.py | 41 ----------- nmdc_runtime/site/export/study_metadata.py | 29 ++++++-- nmdc_runtime/site/graphs.py | 10 ++- nmdc_runtime/site/ops.py | 14 ++-- nmdc_runtime/site/repository.py | 21 +++++- tests/test_data/test_ncbi_xml.py | 77 ++++++++------------- 7 files changed, 100 insertions(+), 114 deletions(-) delete mode 100644 nmdc_runtime/site/export/nmdc_api_client.py diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py index a9b433a4..3eba0c44 100644 --- a/nmdc_runtime/site/export/ncbi_xml.py +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -14,26 +14,26 @@ load_mappings, validate_xml, ) -from nmdc_runtime.site.export.nmdc_api_client import NMDCApiClient class NCBISubmissionXML: - def __init__(self, ncbi_submission_fields: dict): + def __init__(self, nmdc_study_id: str, ncbi_submission_metadata: dict): self.root = ET.Element("Submission") - self.nmdc_study_id = ncbi_submission_fields.get("nmdc_study_id") - self.nmdc_ncbi_attribute_mapping_file_url = ncbi_submission_fields.get( + + self.nmdc_study_id = nmdc_study_id + + self.nmdc_ncbi_attribute_mapping_file_url = ncbi_submission_metadata.get( "nmdc_ncbi_attribute_mapping_file_url" ) - self.ncbi_submission_metadata = ncbi_submission_fields.get( + self.ncbi_submission_metadata = ncbi_submission_metadata.get( "ncbi_submission_metadata", {} ) - self.ncbi_bioproject_metadata = ncbi_submission_fields.get( + self.ncbi_bioproject_metadata = ncbi_submission_metadata.get( "ncbi_bioproject_metadata", {} ) - self.ncbi_biosample_metadata = ncbi_submission_fields.get( + self.ncbi_biosample_metadata = ncbi_submission_metadata.get( "ncbi_biosample_metadata", {} ) - self.nmdc_api_client = NMDCApiClient() # dispatcher dictionary capturing handlers for NMDC object to NCBI flat Attribute # type handlers @@ -250,7 +250,7 @@ def set_biosample( ) self.root.append(action) - def get_submission_xml(self): + def get_submission_xml(self, biosamples_list: list): self.set_description( email=self.ncbi_submission_metadata.get("email", ""), user=self.ncbi_submission_metadata.get("user", ""), @@ -267,10 +267,6 @@ def get_submission_xml(self): org=self.ncbi_submission_metadata.get("organization", ""), ) - biosamples_list = self.nmdc_api_client.get_biosamples_part_of_study( - self.nmdc_study_id - ) - self.set_biosample( organism_name=self.ncbi_biosample_metadata.get("organism_name", ""), package=self.ncbi_biosample_metadata.get("package", ""), diff --git a/nmdc_runtime/site/export/nmdc_api_client.py b/nmdc_runtime/site/export/nmdc_api_client.py deleted file mode 100644 index b4dd38d7..00000000 --- a/nmdc_runtime/site/export/nmdc_api_client.py +++ /dev/null @@ -1,41 +0,0 @@ -import os -import json -import requests - -from dotenv import load_dotenv - - -class NMDCApiClient: - def __init__(self, api_base_url=None): - load_dotenv() - self.base_url = api_base_url or os.getenv("API_HOST") - if not self.base_url: - raise ValueError("API base URL for runtime environment is required.") - if not self.base_url.endswith("/"): - self.base_url += "/" - self.headers = { - "accept": "application/json", - "Content-Type": "application/json", - } - - def get_biosamples_part_of_study(self, study_id: str) -> list[dict]: - """ - Get the biosamples that are part of a study. - """ - biosample_records = [] - params = {"filter": json.dumps({"part_of": study_id}), "max_page_size": "1000"} - url = self.base_url + "nmdcschema/biosample_set" - - while True: - response = requests.get(url, params=params, headers=self.headers) - response.raise_for_status() - data = response.json() - biosample_records.extend(data["resources"]) - - # Check if there's a next page - next_page_token = data.get("next_page_token") - if not next_page_token: - break - params["page_token"] = next_page_token - - return biosample_records diff --git a/nmdc_runtime/site/export/study_metadata.py b/nmdc_runtime/site/export/study_metadata.py index cdcfef8e..626ce01b 100644 --- a/nmdc_runtime/site/export/study_metadata.py +++ b/nmdc_runtime/site/export/study_metadata.py @@ -5,7 +5,6 @@ import csv from io import StringIO -import requests from dagster import ( op, get_dagster_logger, @@ -26,13 +25,27 @@ def get_all_docs(client, collection, filter_): per_page = 200 url_base = f"/{collection}?filter={filter_}&per_page={per_page}" results = [] - rv = client.request("GET", url_base).json() + response = client.request("GET", url_base) + if response.status_code != 200: + raise Exception( + f"Runtime API request failed with status {response.status_code}." + f" Check URL: {url_base}" + ) + rv = response.json() results.extend(rv.get("results", [])) page, count = rv["meta"]["page"], rv["meta"]["count"] assert count <= 10_000 while page * per_page < count: - rv = requests.get(url_base + f"&page={page + 1}").json() - results.extend(rv["results"]) + page += 1 + url = f"{url_base}&page={page}" + response = client.request("GET", url) + if response.status_code != 200: + raise Exception( + f"Runtime API request failed with status {response.status_code}." + f" Check URL: {url}" + ) + rv = response.json() + results.extend(rv.get("results", [])) return results @@ -115,3 +128,11 @@ def export_study_biosamples_as_csv(context: OpExecutionContext, study_export_inf def export_study_biosamples_metadata(): outputs = export_study_biosamples_as_csv(get_study_biosamples_metadata()) add_output_run_event(outputs) + + +@op(required_resource_keys={"runtime_api_site_client"}) +def get_biosamples_by_study_id(context: OpExecutionContext, nmdc_study_id: str): + # nmdc_study_id = context.op_config["nmdc_study_id"] + client: RuntimeApiSiteClient = context.resources.runtime_api_site_client + biosamples = get_all_docs(client, "biosamples", f"part_of:{nmdc_study_id}") + return biosamples diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py index f1b755d6..c5b485a6 100644 --- a/nmdc_runtime/site/graphs.py +++ b/nmdc_runtime/site/graphs.py @@ -49,10 +49,12 @@ get_neon_pipeline_inputs, get_df_from_url, site_code_mapping, + get_ncbi_export_pipeline_study_id, get_ncbi_export_pipeline_inputs, ncbi_submission_xml_from_nmdc_study, ncbi_submission_xml_asset, ) +from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id @graph @@ -388,6 +390,10 @@ def ingest_neon_surface_water_metadata(): @graph def nmdc_study_to_ncbi_submission_export(): - ncbi_submission_fields = get_ncbi_export_pipeline_inputs() - xml_data = ncbi_submission_xml_from_nmdc_study(ncbi_submission_fields) + nmdc_study_id = get_ncbi_export_pipeline_study_id() + biosamples = get_biosamples_by_study_id(nmdc_study_id) + ncbi_submission_metadata = get_ncbi_export_pipeline_inputs() + xml_data = ncbi_submission_xml_from_nmdc_study( + nmdc_study_id, ncbi_submission_metadata, biosamples + ) ncbi_submission_xml_asset(xml_data) diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py index df6b07a1..ecd4f864 100644 --- a/nmdc_runtime/site/ops.py +++ b/nmdc_runtime/site/ops.py @@ -1000,9 +1000,13 @@ def site_code_mapping() -> dict: ) +@op(config_schema={"nmdc_study_id": str}) +def get_ncbi_export_pipeline_study_id(context: OpExecutionContext) -> str: + return context.op_config["nmdc_study_id"] + + @op( config_schema={ - "nmdc_study_id": str, "nmdc_ncbi_attribute_mapping_file_url": str, "ncbi_submission_metadata": Field( Permissive( @@ -1043,7 +1047,6 @@ def site_code_mapping() -> dict: out=Out(Dict), ) def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str: - nmdc_study_id = context.op_config["nmdc_study_id"] nmdc_ncbi_attribute_mapping_file_url = context.op_config[ "nmdc_ncbi_attribute_mapping_file_url" ] @@ -1052,7 +1055,6 @@ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str: ncbi_biosample_metadata = context.op_config.get("ncbi_biosample_metadata", {}) return { - "nmdc_study_id": nmdc_study_id, "nmdc_ncbi_attribute_mapping_file_url": nmdc_ncbi_attribute_mapping_file_url, "ncbi_submission_metadata": ncbi_submission_metadata, "ncbi_bioproject_metadata": ncbi_bioproject_metadata, @@ -1063,8 +1065,10 @@ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str: @op def ncbi_submission_xml_from_nmdc_study( context: OpExecutionContext, + nmdc_study_id: str, ncbi_exporter_metadata: dict, + biosamples: list, ) -> str: - ncbi_exporter = NCBISubmissionXML(ncbi_exporter_metadata) - ncbi_xml = ncbi_exporter.get_submission_xml() + ncbi_exporter = NCBISubmissionXML(nmdc_study_id, ncbi_exporter_metadata) + ncbi_xml = ncbi_exporter.get_submission_xml(biosamples) return ncbi_xml diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py index 3e68892e..9719cede 100644 --- a/nmdc_runtime/site/repository.py +++ b/nmdc_runtime/site/repository.py @@ -855,13 +855,32 @@ def biosample_submission_ingest(): @repository def biosample_export(): + normal_resources = run_config_frozen__normal_env["resources"] return [ nmdc_study_to_ncbi_submission_export.to_job( + resource_defs=resource_defs, config={ + "resources": merge( + unfreeze(normal_resources), + { + "runtime_api_site_client": { + "config": { + "base_url": {"env": "API_HOST"}, + "client_id": {"env": "API_SITE_CLIENT_ID"}, + "client_secret": {"env": "API_SITE_CLIENT_SECRET"}, + "site_id": {"env": "API_SITE_ID"}, + }, + }, + }, + ), "ops": { - "get_ncbi_export_pipeline_inputs": { + "get_ncbi_export_pipeline_study_id": { "config": { "nmdc_study_id": "", + } + }, + "get_ncbi_export_pipeline_inputs": { + "config": { "nmdc_ncbi_attribute_mapping_file_url": "", "ncbi_submission_metadata": { "email": "", diff --git a/tests/test_data/test_ncbi_xml.py b/tests/test_data/test_ncbi_xml.py index 3cd2913c..f8974f68 100644 --- a/tests/test_data/test_ncbi_xml.py +++ b/tests/test_data/test_ncbi_xml.py @@ -15,10 +15,10 @@ handle_float_value, handle_string_value, ) -from nmdc_runtime.site.export.nmdc_api_client import NMDCApiClient -MOCK_SUBMISSION_FIELDS = { - "nmdc_study_id": "nmdc:sty-11-12345", +MOCK_NCBI_NMDC_STUDY_ID = "nmdc:sty-11-12345" + +MOCK_NCBI_SUBMISSION_METADATA = { "nmdc_ncbi_attribute_mapping_file_url": "http://example.com/mappings.tsv", "ncbi_submission_metadata": { "email": "user@example.com", @@ -43,12 +43,10 @@ @pytest.fixture def ncbi_submission_client(): - return NCBISubmissionXML(ncbi_submission_fields=MOCK_SUBMISSION_FIELDS) - - -@pytest.fixture -def nmdc_api_client(): - return NMDCApiClient(api_base_url="http://fakeapi.com/") + return NCBISubmissionXML( + nmdc_study_id=MOCK_NCBI_NMDC_STUDY_ID, + ncbi_submission_metadata=MOCK_NCBI_SUBMISSION_METADATA, + ) @pytest.fixture @@ -89,11 +87,11 @@ def test_set_element(self, ncbi_submission_client): def test_set_description(self, ncbi_submission_client): ncbi_submission_client.set_description( - MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["email"], - MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["user"], - MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["first"], - MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["last"], - MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["organization"], + MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"]["email"], + MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"]["user"], + MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"]["first"], + MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"]["last"], + MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"]["organization"], ) description = ET.tostring( ncbi_submission_client.root.find("Description"), "unicode" @@ -116,13 +114,19 @@ def test_set_description(self, ncbi_submission_client): def test_set_bioproject(self, ncbi_submission_client): ncbi_submission_client.set_bioproject( - title=MOCK_SUBMISSION_FIELDS["ncbi_bioproject_metadata"]["title"], - project_id=MOCK_SUBMISSION_FIELDS["ncbi_bioproject_metadata"]["project_id"], - description=MOCK_SUBMISSION_FIELDS["ncbi_bioproject_metadata"][ + title=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"]["title"], + project_id=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][ + "project_id" + ], + description=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][ "description" ], - data_type=MOCK_SUBMISSION_FIELDS["ncbi_bioproject_metadata"]["data_type"], - org=MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["organization"], + data_type=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][ + "data_type" + ], + org=MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"][ + "organization" + ], ) bioproject_xml = ET.tostring( ncbi_submission_client.root.find(".//Project"), "unicode" @@ -178,11 +182,13 @@ def test_set_biosample(self, ncbi_submission_client, nmdc_biosample, mocker): ), ) ncbi_submission_client.set_biosample( - organism_name=MOCK_SUBMISSION_FIELDS["ncbi_biosample_metadata"][ + organism_name=MOCK_NCBI_SUBMISSION_METADATA["ncbi_biosample_metadata"][ "organism_name" ], - package=MOCK_SUBMISSION_FIELDS["ncbi_biosample_metadata"]["package"], - org=MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["organization"], + package=MOCK_NCBI_SUBMISSION_METADATA["ncbi_biosample_metadata"]["package"], + org=MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"][ + "organization" + ], nmdc_biosamples=nmdc_biosample, ) biosample_xml = ET.tostring( @@ -237,11 +243,7 @@ def test_get_submission_xml(self, mocker, ncbi_submission_client, nmdc_biosample ), ) - mocker.patch.object( - NMDCApiClient, "get_biosamples_part_of_study", return_value=nmdc_biosample - ) - - submission_xml = ncbi_submission_client.get_submission_xml() + submission_xml = ncbi_submission_client.get_submission_xml(nmdc_biosample) assert "nmdc:bsm-12-gnfpt483" in submission_xml assert "E. coli" in submission_xml @@ -252,27 +254,6 @@ def test_get_submission_xml(self, mocker, ncbi_submission_client, nmdc_biosample assert "Test Project" in submission_xml -class TestNMDCApiClient: - def test_get_biosamples_part_of_study_success(self, mocker, nmdc_api_client): - mock_response = mocker.MagicMock() - mock_response.json.return_value = { - "resources": [ - {"id": "nmdc:bsm-12-gnfpt483", "part_of": ["nmdc:sty-11-hht5sb92"]} - ], - "next_page_token": None, - } - mocker.patch("requests.get", return_value=mock_response) - result = nmdc_api_client.get_biosamples_part_of_study("nmdc:sty-11-hht5sb92") - assert result == [ - {"id": "nmdc:bsm-12-gnfpt483", "part_of": ["nmdc:sty-11-hht5sb92"]} - ] - - def test_get_biosamples_part_of_study_failure(self, mocker, nmdc_api_client): - mocker.patch("requests.get", side_effect=HTTPError("API Error")) - with pytest.raises(HTTPError): - nmdc_api_client.get_biosamples_part_of_study("nmdc:sty-11-hht5sb92") - - class TestNCBIXMLUtils: def test_handle_quantity_value(self): assert ( From 0ab92b3913dea8505f92c8bbf8f08de8ef57652c Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Mon, 3 Jun 2024 19:34:12 -0700 Subject: [PATCH 18/27] add logic for autogenerating block for SRA db --- nmdc_runtime/site/export/ncbi_xml.py | 97 +++++++++++++++++++++- nmdc_runtime/site/export/ncbi_xml_utils.py | 53 ++++++++++++ nmdc_runtime/site/export/study_metadata.py | 1 - nmdc_runtime/site/graphs.py | 6 +- nmdc_runtime/site/ops.py | 15 +++- nmdc_runtime/site/repository.py | 8 ++ 6 files changed, 174 insertions(+), 6 deletions(-) diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py index 3eba0c44..23346264 100644 --- a/nmdc_runtime/site/export/ncbi_xml.py +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -176,7 +176,6 @@ def set_biosample( formatted_value = handler(value) attributes[xml_key] = formatted_value - # Create the BioSample XML block with these attributes for each biosample biosample_elements = [ self.set_element( "SampleId", @@ -250,7 +249,95 @@ def set_biosample( ) self.root.append(action) - def get_submission_xml(self, biosamples_list: list): + def set_fastq( + self, + biosample_data_objects: list, + bioproject_id: str, + org: str, + ): + fastq_files = [] + biosample_ids = [] + + for entry in biosample_data_objects: + for biosample_id, data_objects in entry.items(): + biosample_ids.append(biosample_id) + for data_object in data_objects: + if "url" in data_object: + fastq_files.append(data_object["url"]) + + if fastq_files: + files_elements = [ + self.set_element( + "File", + "", + {"file_path": f}, + [self.set_element("DataType", "generic-data")], + ) + for f in fastq_files + ] + + attribute_elements = [ + self.set_element( + "AttributeRefId", + attrib={"name": "BioProject"}, + children=[ + self.set_element( + "RefId", + children=[ + self.set_element( + "SPUID", + bioproject_id, + {"spuid_namespace": org}, + ) + ], + ) + ], + ) + ] + + for biosample_id in biosample_ids: + attribute_elements.append( + self.set_element( + "AttributeRefId", + attrib={"name": "BioSample"}, + children=[ + self.set_element( + "RefId", + children=[ + self.set_element( + "SPUID", + biosample_id, + {"spuid_namespace": org}, + ) + ], + ) + ], + ) + ) + + identifier_element = self.set_element( + "Identifier", + children=[ + self.set_element("SPUID", bioproject_id, {"spuid_namespace": org}) + ], + ) + + action = self.set_element( + "Action", + children=[ + self.set_element( + "AddFiles", + attrib={"target_db": "SRA"}, + children=files_elements + + attribute_elements + + [identifier_element], + ), + ], + ) + + self.root.append(action) + + def get_submission_xml(self, biosamples_list: list, data_objects_list: list): self.set_description( email=self.ncbi_submission_metadata.get("email", ""), user=self.ncbi_submission_metadata.get("user", ""), @@ -274,6 +361,12 @@ def get_submission_xml(self, biosamples_list: list): nmdc_biosamples=biosamples_list, ) + self.set_fastq( + biosample_data_objects=data_objects_list, + bioproject_id=self.ncbi_bioproject_metadata.get("project_id", ""), + org=self.ncbi_submission_metadata.get("organization", ""), + ) + rough_string = ET.tostring(self.root, "unicode") reparsed = xml.dom.minidom.parseString(rough_string) submission_xml = reparsed.toprettyxml(indent=" ", newl="\n") diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py index 64183202..bf3d285e 100644 --- a/nmdc_runtime/site/export/ncbi_xml_utils.py +++ b/nmdc_runtime/site/export/ncbi_xml_utils.py @@ -4,6 +4,59 @@ import requests +# TODO: do not hardcode this mapping +def get_classname_from_typecode(doc_id): + typecode = doc_id.split(":")[1].split("-")[0] + class_map = { + "bsm": "Biosample", + "extr": "Extraction", + "pool": "Pooling", + "libprep": "LibraryPreparation", + "procsm": "ProcessedSample", + "omprc": "OmicsProcessing", + "dobj": "DataObject", + } + return class_map.get(typecode) + + +def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list): + biosample_data_objects = [] + + for biosample in biosamples_list: + current_ids = [biosample["id"]] + collected_data_objects = [] + + while current_ids: + new_current_ids = [] + for current_id in current_ids: + query = {"has_input": current_id} + document = all_docs_collection.find_one(query) + + if not document: + continue + + has_output = document.get("has_output") + if not has_output: + continue + + for output_id in has_output: + if get_classname_from_typecode(output_id) == "DataObject": + data_object_doc = all_docs_collection.find_one( + {"id": output_id} + ) + if data_object_doc: + collected_data_objects.append(data_object_doc) + else: + new_current_ids.append(output_id) + + current_ids = new_current_ids + + if collected_data_objects: + biosample_data_objects.append({biosample["id"]: collected_data_objects}) + + return biosample_data_objects + + def handle_quantity_value(slot_value): if "has_numeric_value" in slot_value and "has_unit" in slot_value: return f"{slot_value['has_numeric_value']} {slot_value['has_unit']}" diff --git a/nmdc_runtime/site/export/study_metadata.py b/nmdc_runtime/site/export/study_metadata.py index 626ce01b..3cf9bc6d 100644 --- a/nmdc_runtime/site/export/study_metadata.py +++ b/nmdc_runtime/site/export/study_metadata.py @@ -132,7 +132,6 @@ def export_study_biosamples_metadata(): @op(required_resource_keys={"runtime_api_site_client"}) def get_biosamples_by_study_id(context: OpExecutionContext, nmdc_study_id: str): - # nmdc_study_id = context.op_config["nmdc_study_id"] client: RuntimeApiSiteClient = context.resources.runtime_api_site_client biosamples = get_all_docs(client, "biosamples", f"part_of:{nmdc_study_id}") return biosamples diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py index c5b485a6..700ff6d7 100644 --- a/nmdc_runtime/site/graphs.py +++ b/nmdc_runtime/site/graphs.py @@ -50,6 +50,7 @@ get_df_from_url, site_code_mapping, get_ncbi_export_pipeline_study_id, + get_data_objects_from_biosamples, get_ncbi_export_pipeline_inputs, ncbi_submission_xml_from_nmdc_study, ncbi_submission_xml_asset, @@ -391,9 +392,10 @@ def ingest_neon_surface_water_metadata(): @graph def nmdc_study_to_ncbi_submission_export(): nmdc_study_id = get_ncbi_export_pipeline_study_id() - biosamples = get_biosamples_by_study_id(nmdc_study_id) ncbi_submission_metadata = get_ncbi_export_pipeline_inputs() + biosamples = get_biosamples_by_study_id(nmdc_study_id) + data_objects = get_data_objects_from_biosamples(biosamples) xml_data = ncbi_submission_xml_from_nmdc_study( - nmdc_study_id, ncbi_submission_metadata, biosamples + nmdc_study_id, ncbi_submission_metadata, biosamples, data_objects ) ncbi_submission_xml_asset(xml_data) diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py index ecd4f864..330b056d 100644 --- a/nmdc_runtime/site/ops.py +++ b/nmdc_runtime/site/ops.py @@ -37,6 +37,7 @@ from gridfs import GridFS from linkml_runtime.dumpers import json_dumper from linkml_runtime.utils.yamlutils import YAMLRoot +from nmdc_runtime.api.db.mongo import get_mongo_db from nmdc_runtime.api.core.idgen import generate_one_id from nmdc_runtime.api.core.metadata import ( _validate_changesheet, @@ -60,6 +61,7 @@ ) from nmdc_runtime.api.models.util import ResultT from nmdc_runtime.site.export.ncbi_xml import NCBISubmissionXML +from nmdc_runtime.site.export.ncbi_xml_utils import fetch_data_objects_from_biosamples from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict from nmdc_runtime.site.resources import ( NmdcPortalApiClient, @@ -1062,13 +1064,24 @@ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str: } +@op(required_resource_keys={"mongo"}) +def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: list): + mdb = context.resources.mongo.db + alldocs_collection = mdb["alldocs"] + biosample_data_objects = fetch_data_objects_from_biosamples( + alldocs_collection, biosamples + ) + return biosample_data_objects + + @op def ncbi_submission_xml_from_nmdc_study( context: OpExecutionContext, nmdc_study_id: str, ncbi_exporter_metadata: dict, biosamples: list, + data_objects: list, ) -> str: ncbi_exporter = NCBISubmissionXML(nmdc_study_id, ncbi_exporter_metadata) - ncbi_xml = ncbi_exporter.get_submission_xml(biosamples) + ncbi_xml = ncbi_exporter.get_submission_xml(biosamples, data_objects) return ncbi_xml diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py index 9719cede..6d62b1cf 100644 --- a/nmdc_runtime/site/repository.py +++ b/nmdc_runtime/site/repository.py @@ -863,6 +863,14 @@ def biosample_export(): "resources": merge( unfreeze(normal_resources), { + "mongo": { + "config": { + "host": {"env": "MONGO_HOST"}, + "username": {"env": "MONGO_USERNAME"}, + "password": {"env": "MONGO_PASSWORD"}, + "dbname": {"env": "MONGO_DBNAME"}, + }, + }, "runtime_api_site_client": { "config": { "base_url": {"env": "API_HOST"}, From 151de042f0fd1ccf6e89db1af30cf4d215aa6267 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Mon, 3 Jun 2024 19:57:30 -0700 Subject: [PATCH 19/27] update tests for new method set_fastq() in ncbi_xml.py --- tests/test_data/test_ncbi_xml.py | 104 +++++++++++++++++++++++++------ 1 file changed, 84 insertions(+), 20 deletions(-) diff --git a/tests/test_data/test_ncbi_xml.py b/tests/test_data/test_ncbi_xml.py index f8974f68..0af9ddc5 100644 --- a/tests/test_data/test_ncbi_xml.py +++ b/tests/test_data/test_ncbi_xml.py @@ -55,29 +55,52 @@ def nmdc_biosample(): { "analysis_type": ["metagenomics"], "biosample_categories": ["NEON"], - "collection_date": {"has_raw_value": "2014-08-05T18:40Z"}, - "conduc": {"has_numeric_value": 567, "has_unit": "uS/cm"}, - "elev": 1178.7, - "env_broad_scale": { - "term": {"id": "ENVO:03605008", "name": "freshwater stream biome"} + "collection_date": {"has_raw_value": "2015-07-21T18:00Z"}, + "depth": { + "has_maximum_numeric_value": 1, + "has_minimum_numeric_value": 0, + "has_unit": "meters", }, - "env_local_scale": { - "term": {"id": "ENVO:03605007", "name": "freshwater stream"} + "elev": 1179.5, + "env_broad_scale": { + "term": {"id": "ENVO:01000253", "name": "freshwater river biome"} }, - "env_medium": {"term": {"id": "ENVO:03605006", "name": "stream water"}}, - "env_package": {"has_raw_value": "water"}, + "env_local_scale": {"term": {"id": "ENVO:03600094", "name": "stream pool"}}, + "env_medium": {"term": {"id": "ENVO:00002007", "name": "sediment"}}, "geo_loc_name": {"has_raw_value": "USA: Colorado, Arikaree River"}, - "id": "nmdc:bsm-12-gnfpt483", - "lat_lon": {"latitude": 39.758359, "longitude": -102.448595}, - "name": "ARIK.SS.20140805", - "part_of": ["nmdc:sty-11-hht5sb92"], - "samp_collec_device": "Grab", - "temp": {"has_numeric_value": 20.1, "has_unit": "Cel"}, + "id": "nmdc:bsm-12-p9q5v236", + "lat_lon": {"latitude": 39.758206, "longitude": -102.447148}, + "name": "ARIK.20150721.AMC.EPIPSAMMON.3", + "part_of": ["nmdc:sty-11-34xj1150"], "type": "nmdc:Biosample", } ] +@pytest.fixture +def data_objects_list(): + return [ + { + "data_object_type": "Metagenome Raw Read 1", + "description": "sequencing results for BMI_HVKNKBGX5_Tube347_R1", + "id": "nmdc:dobj-12-b3ft7a80", + "md5_checksum": "cae0a9342d786e731ae71f6f37b76120", + "name": "BMI_HVKNKBGX5_Tube347_R1.fastq.gz", + "type": "nmdc:DataObject", + "url": "https://storage.neonscience.org/neon-microbial-raw-seq-files/2023/BMI_HVKNKBGX5_mms_R1/BMI_HVKNKBGX5_Tube347_R1.fastq.gz", + }, + { + "data_object_type": "Metagenome Raw Read 2", + "description": "sequencing results for BMI_HVKNKBGX5_Tube347_R2", + "id": "nmdc:dobj-12-1zv4q961", + "md5_checksum": "7340fe25644183a4f56d36ce52389d83", + "name": "BMI_HVKNKBGX5_Tube347_R2.fastq.gz", + "type": "nmdc:DataObject", + "url": "https://storage.neonscience.org/neon-microbial-raw-seq-files/2023/BMI_HVKNKBGX5_mms_R2/BMI_HVKNKBGX5_Tube347_R2.fastq.gz", + }, + ] + + class TestNCBISubmissionXML: def test_set_element(self, ncbi_submission_client): element = ncbi_submission_client.set_element("Test", "Hello", {"attr": "value"}) @@ -198,7 +221,32 @@ def test_set_biosample(self, ncbi_submission_client, nmdc_biosample, mocker): assert "Test Package" in biosample_xml assert "Test Org" in biosample_xml - def test_get_submission_xml(self, mocker, ncbi_submission_client, nmdc_biosample): + def test_set_fastq(self, ncbi_submission_client, data_objects_list, nmdc_biosample): + biosample_data_objects = [ + {biosample["id"]: data_objects_list} for biosample in nmdc_biosample + ] + + ncbi_submission_client.set_fastq( + biosample_data_objects=biosample_data_objects, + bioproject_id=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][ + "project_id" + ], + org=MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"][ + "organization" + ], + ) + + action_xml = ET.tostring( + ncbi_submission_client.root.find(".//Action"), "unicode" + ) + assert "BMI_HVKNKBGX5_Tube347_R2.fastq.gz" in action_xml + assert "PRJNA12345" in action_xml + assert "nmdc:bsm-12-p9q5v236" in action_xml + assert "Test Org" in action_xml + + def test_get_submission_xml( + self, mocker, ncbi_submission_client, nmdc_biosample, data_objects_list + ): mocker.patch( "nmdc_runtime.site.export.ncbi_xml.load_mappings", return_value=( @@ -243,13 +291,29 @@ def test_get_submission_xml(self, mocker, ncbi_submission_client, nmdc_biosample ), ) - submission_xml = ncbi_submission_client.get_submission_xml(nmdc_biosample) + biosample_data_objects = [ + {biosample["id"]: data_objects_list} for biosample in nmdc_biosample + ] + + ncbi_submission_client.set_fastq( + biosample_data_objects=biosample_data_objects, + bioproject_id=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][ + "project_id" + ], + org=MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"][ + "organization" + ], + ) + + submission_xml = ncbi_submission_client.get_submission_xml( + nmdc_biosample, biosample_data_objects + ) - assert "nmdc:bsm-12-gnfpt483" in submission_xml + assert "nmdc:bsm-12-p9q5v236" in submission_xml assert "E. coli" in submission_xml - assert "stream water" in submission_xml + assert "sediment" in submission_xml assert "USA: Colorado, Arikaree River" in submission_xml - assert "2014-08-05T18:40Z" in submission_xml + assert "2015-07-21T18:00Z" in submission_xml assert "testuser" in submission_xml assert "Test Project" in submission_xml From c9832374f2bd7e18aed5d48798702897d2541577 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Tue, 4 Jun 2024 16:30:36 -0700 Subject: [PATCH 20/27] visual pass code review: fix structure of generated XML --- nmdc_runtime/site/export/ncbi_xml.py | 148 +++++++++++++++------------ nmdc_runtime/site/ops.py | 2 + nmdc_runtime/site/repository.py | 1 + tests/test_data/test_ncbi_xml.py | 31 ++++-- 4 files changed, 107 insertions(+), 75 deletions(-) diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py index 23346264..0280a007 100644 --- a/nmdc_runtime/site/export/ncbi_xml.py +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -1,7 +1,9 @@ +import os import datetime import xml.etree.ElementTree as ET import xml.dom.minidom +from urllib.parse import urlparse from nmdc_runtime.site.export.ncbi_xml_utils import ( handle_controlled_identified_term_value, handle_controlled_term_value, @@ -147,6 +149,7 @@ def set_biosample( organism_name, package, org, + bioproject_id, nmdc_biosamples, ): attribute_mappings, slot_range_mappings = load_mappings( @@ -198,6 +201,14 @@ def set_biosample( "Organism", children=[self.set_element("OrganismName", organism_name)], ), + self.set_element( + "BioProject", + children=[ + self.set_element( + "PrimaryId", bioproject_id, {"db": "BioProject"} + ) + ], + ), self.set_element("Package", package), self.set_element( "Attributes", @@ -255,87 +266,94 @@ def set_fastq( bioproject_id: str, org: str, ): - fastq_files = [] - biosample_ids = [] - for entry in biosample_data_objects: + fastq_files = [] + biosample_ids = [] + for biosample_id, data_objects in entry.items(): biosample_ids.append(biosample_id) for data_object in data_objects: if "url" in data_object: - fastq_files.append(data_object["url"]) - - if fastq_files: - files_elements = [ - self.set_element( - "File", - "", - {"file_path": f}, - [self.set_element("DataType", "generic-data")], - ) - for f in fastq_files - ] - - attribute_elements = [ - self.set_element( - "AttributeRefId", - attrib={"name": "BioProject"}, - children=[ - self.set_element( - "RefId", - children=[ - self.set_element( - "SPUID", - bioproject_id, - {"spuid_namespace": org}, - ) - ], + url = urlparse(data_object["url"]) + file_path = os.path.join( + os.path.basename(os.path.dirname(url.path)), + os.path.basename(url.path), ) - ], - ) - ] + fastq_files.append(file_path) - for biosample_id in biosample_ids: - attribute_elements.append( + if fastq_files: + files_elements = [ + self.set_element( + "File", + "", + {"file_path": f}, + [self.set_element("DataType", "generic-data")], + ) + for f in fastq_files + ] + + attribute_elements = [ self.set_element( "AttributeRefId", - attrib={"name": "BioSample"}, + attrib={"name": "BioProject"}, children=[ self.set_element( "RefId", children=[ self.set_element( "SPUID", - biosample_id, + bioproject_id, {"spuid_namespace": org}, ) ], ) ], ) - ) + ] - identifier_element = self.set_element( - "Identifier", - children=[ - self.set_element("SPUID", bioproject_id, {"spuid_namespace": org}) - ], - ) + for biosample_id in biosample_ids: + attribute_elements.append( + self.set_element( + "AttributeRefId", + attrib={"name": "BioSample"}, + children=[ + self.set_element( + "RefId", + children=[ + self.set_element( + "SPUID", + biosample_id, + {"spuid_namespace": org}, + ) + ], + ) + ], + ) + ) - action = self.set_element( - "Action", - children=[ - self.set_element( - "AddFiles", - attrib={"target_db": "SRA"}, - children=files_elements - + attribute_elements - + [identifier_element], - ), - ], - ) + identifier_element = self.set_element( + "Identifier", + children=[ + self.set_element( + "SPUID", bioproject_id, {"spuid_namespace": org} + ) + ], + ) - self.root.append(action) + action = self.set_element( + "Action", + children=[ + self.set_element( + "AddFiles", + attrib={"target_db": "SRA"}, + children=files_elements + + attribute_elements + + [identifier_element], + ), + ], + ) + + self.root.append(action) def get_submission_xml(self, biosamples_list: list, data_objects_list: list): self.set_description( @@ -346,18 +364,20 @@ def get_submission_xml(self, biosamples_list: list, data_objects_list: list): org=self.ncbi_submission_metadata.get("organization", ""), ) - self.set_bioproject( - title=self.ncbi_bioproject_metadata.get("title", ""), - project_id=self.ncbi_bioproject_metadata.get("project_id", ""), - description=self.ncbi_bioproject_metadata.get("description", ""), - data_type=self.ncbi_bioproject_metadata.get("data_type", ""), - org=self.ncbi_submission_metadata.get("organization", ""), - ) + if not self.ncbi_bioproject_metadata.get("exists"): + self.set_bioproject( + title=self.ncbi_bioproject_metadata.get("title", ""), + project_id=self.ncbi_bioproject_metadata.get("project_id", ""), + description=self.ncbi_bioproject_metadata.get("description", ""), + data_type=self.ncbi_bioproject_metadata.get("data_type", ""), + org=self.ncbi_submission_metadata.get("organization", ""), + ) self.set_biosample( organism_name=self.ncbi_biosample_metadata.get("organism_name", ""), package=self.ncbi_biosample_metadata.get("package", ""), org=self.ncbi_submission_metadata.get("organization", ""), + bioproject_id=self.ncbi_bioproject_metadata.get("project_id", ""), nmdc_biosamples=biosamples_list, ) diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py index 330b056d..43a251ce 100644 --- a/nmdc_runtime/site/ops.py +++ b/nmdc_runtime/site/ops.py @@ -33,6 +33,7 @@ Optional, Field, Permissive, + Bool, ) from gridfs import GridFS from linkml_runtime.dumpers import json_dumper @@ -1030,6 +1031,7 @@ def get_ncbi_export_pipeline_study_id(context: OpExecutionContext) -> str: "project_id": String, "description": String, "data_type": String, + "exists": Bool, } ), is_required=True, diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py index 6d62b1cf..79919b2a 100644 --- a/nmdc_runtime/site/repository.py +++ b/nmdc_runtime/site/repository.py @@ -902,6 +902,7 @@ def biosample_export(): "project_id": "", "description": "", "data_type": "", + "exists": False, }, "ncbi_biosample_metadata": { "organism_name": "", diff --git a/tests/test_data/test_ncbi_xml.py b/tests/test_data/test_ncbi_xml.py index 0af9ddc5..0816adbd 100644 --- a/tests/test_data/test_ncbi_xml.py +++ b/tests/test_data/test_ncbi_xml.py @@ -1,6 +1,5 @@ from unittest.mock import MagicMock import pytest -from requests.exceptions import HTTPError import xml.etree.ElementTree as ET from nmdc_runtime.site.export.ncbi_xml import NCBISubmissionXML @@ -32,6 +31,7 @@ "project_id": "PRJNA12345", "description": "A test project", "data_type": "metagenome", + "exists": False, }, "ncbi_biosample_metadata": { "title": "Test Sample", @@ -213,6 +213,9 @@ def test_set_biosample(self, ncbi_submission_client, nmdc_biosample, mocker): "organization" ], nmdc_biosamples=nmdc_biosample, + bioproject_id=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][ + "project_id" + ], ) biosample_xml = ET.tostring( ncbi_submission_client.root.find(".//BioSample"), "unicode" @@ -220,6 +223,7 @@ def test_set_biosample(self, ncbi_submission_client, nmdc_biosample, mocker): assert "E. coli" in biosample_xml assert "Test Package" in biosample_xml assert "Test Org" in biosample_xml + assert "PRJNA12345" in biosample_xml def test_set_fastq(self, ncbi_submission_client, data_objects_list, nmdc_biosample): biosample_data_objects = [ @@ -236,13 +240,18 @@ def test_set_fastq(self, ncbi_submission_client, data_objects_list, nmdc_biosamp ], ) - action_xml = ET.tostring( - ncbi_submission_client.root.find(".//Action"), "unicode" - ) - assert "BMI_HVKNKBGX5_Tube347_R2.fastq.gz" in action_xml - assert "PRJNA12345" in action_xml - assert "nmdc:bsm-12-p9q5v236" in action_xml - assert "Test Org" in action_xml + action_elements = ncbi_submission_client.root.findall(".//Action") + assert len(action_elements) == len(biosample_data_objects) + + for action_element in action_elements: + action_xml = ET.tostring(action_element, "unicode") + assert ( + "BMI_HVKNKBGX5_Tube347_R1.fastq.gz" in action_xml + or "BMI_HVKNKBGX5_Tube347_R2.fastq.gz" in action_xml + ) + assert "PRJNA12345" in action_xml + assert "nmdc:bsm-12-p9q5v236" in action_xml + assert "Test Org" in action_xml def test_get_submission_xml( self, mocker, ncbi_submission_client, nmdc_biosample, data_objects_list @@ -306,7 +315,7 @@ def test_get_submission_xml( ) submission_xml = ncbi_submission_client.get_submission_xml( - nmdc_biosample, biosample_data_objects + nmdc_biosample, data_objects_list ) assert "nmdc:bsm-12-p9q5v236" in submission_xml @@ -417,7 +426,7 @@ def test_load_mappings(self, mocker): "Biosample\tenv_local_scale\tControlledIdentifiedTermValue\tenv_local_scale\t\t\n" "Biosample\tenv_medium\tControlledIdentifiedTermValue\tenv_medium\t\t\n" "Biosample\tenv_package\tTextValue\tenv_package\t\t\n" - "Biosample\tgeo_loc_name\tQuantityValue\tgeo_loc_name\t\t\n" + "Biosample\tgeo_loc_name\tTextValue\tgeo_loc_name\t\t\n" "Biosample\tid\turiorcurie\t\t\t\n" "Biosample\tlat_lon\tGeolocationValue\tlat_lon\t\t\n" "Biosample\tname\tstring\tsample_name\t\t\n" @@ -465,7 +474,7 @@ def test_load_mappings(self, mocker): "env_local_scale": "ControlledIdentifiedTermValue", "env_medium": "ControlledIdentifiedTermValue", "env_package": "TextValue", - "geo_loc_name": "QuantityValue", + "geo_loc_name": "TextValue", "id": "uriorcurie", "lat_lon": "GeolocationValue", "name": "string", From e5421a357be39b35006db249eef194caa8bcb7fa Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Wed, 5 Jun 2024 15:18:55 -0700 Subject: [PATCH 21/27] typecode class map inference from schema --- nmdc_runtime/site/export/ncbi_xml_utils.py | 23 +++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py index bf3d285e..f7293795 100644 --- a/nmdc_runtime/site/export/ncbi_xml_utils.py +++ b/nmdc_runtime/site/export/ncbi_xml_utils.py @@ -1,21 +1,22 @@ -from lxml import etree from io import BytesIO, StringIO +from nmdc_runtime.minter.config import typecodes +from lxml import etree + import csv import requests -# TODO: do not hardcode this mapping +def _build_class_map(class_map_data): + return { + entry["name"]: entry["schema_class"].split(":")[1] for entry in class_map_data + } + + def get_classname_from_typecode(doc_id): + class_map_data = typecodes() + class_map = _build_class_map(class_map_data) + typecode = doc_id.split(":")[1].split("-")[0] - class_map = { - "bsm": "Biosample", - "extr": "Extraction", - "pool": "Pooling", - "libprep": "LibraryPreparation", - "procsm": "ProcessedSample", - "omprc": "OmicsProcessing", - "dobj": "DataObject", - } return class_map.get(typecode) From 80c4339245c2b4855747c908804f477bbd77101e Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Thu, 6 Jun 2024 15:08:22 -0700 Subject: [PATCH 22/27] allow copying of XML file contents from dagit UI --- nmdc_runtime/site/ops.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py index 43a251ce..d3bbbeb7 100644 --- a/nmdc_runtime/site/ops.py +++ b/nmdc_runtime/site/ops.py @@ -781,11 +781,22 @@ def export_json_to_drs( out=Out(description="XML content rendered through Dagit UI"), ) def ncbi_submission_xml_asset(context: OpExecutionContext, data: str): + filename = "ncbi_submission.xml" + file_path = os.path.join(context.instance.storage_directory(), filename) + + os.makedirs(os.path.dirname(file_path), exist_ok=True) + + with open(file_path, "w") as f: + f.write(data) + context.log_event( AssetMaterialization( asset_key="ncbi_submission_xml", description="NCBI Submission XML Data", - metadata={"xml": MetadataValue.text(data)}, + metadata={ + "file_path": MetadataValue.path(file_path), + "xml": MetadataValue.text(data), + }, ) ) From adcd8e621b3db79bb2677693b9c9ec2295c9669d Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Fri, 14 Jun 2024 11:06:04 -0700 Subject: [PATCH 23/27] inference of some Submission and BioProject fields from existing metadata --- nmdc_runtime/site/export/ncbi_xml.py | 26 +++++++++++++++------- nmdc_runtime/site/export/study_metadata.py | 4 ++-- nmdc_runtime/site/graphs.py | 8 +++---- nmdc_runtime/site/ops.py | 20 ++++++++--------- nmdc_runtime/site/repository.py | 8 +------ 5 files changed, 34 insertions(+), 32 deletions(-) diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py index 0280a007..2e27472e 100644 --- a/nmdc_runtime/site/export/ncbi_xml.py +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -3,6 +3,7 @@ import xml.etree.ElementTree as ET import xml.dom.minidom +from typing import Any from urllib.parse import urlparse from nmdc_runtime.site.export.ncbi_xml_utils import ( handle_controlled_identified_term_value, @@ -19,10 +20,19 @@ class NCBISubmissionXML: - def __init__(self, nmdc_study_id: str, ncbi_submission_metadata: dict): + def __init__(self, nmdc_study: Any, ncbi_submission_metadata: dict): self.root = ET.Element("Submission") - self.nmdc_study_id = nmdc_study_id + self.nmdc_study_id = nmdc_study.get("id") + self.nmdc_study_title = nmdc_study.get("title") + self.nmdc_study_description = nmdc_study.get("description") + self.ncbi_bioproject_id = nmdc_study.get("insdc_bioproject_identifiers") + self.nmdc_pi_email = nmdc_study.get("principal_investigator", {}).get("email") + nmdc_study_pi_name = ( + nmdc_study.get("principal_investigator", {}).get("name").split() + ) + self.first_name = nmdc_study_pi_name[0] + self.last_name = nmdc_study_pi_name[1] if len(nmdc_study_pi_name) > 1 else None self.nmdc_ncbi_attribute_mapping_file_url = ncbi_submission_metadata.get( "nmdc_ncbi_attribute_mapping_file_url" @@ -357,18 +367,18 @@ def set_fastq( def get_submission_xml(self, biosamples_list: list, data_objects_list: list): self.set_description( - email=self.ncbi_submission_metadata.get("email", ""), - user=self.ncbi_submission_metadata.get("user", ""), - first=self.ncbi_submission_metadata.get("first", ""), - last=self.ncbi_submission_metadata.get("last", ""), + email=self.nmdc_pi_email, + user="National Microbiome Data Collaborative (NMDC)", + first=self.first_name, + last=self.last_name, org=self.ncbi_submission_metadata.get("organization", ""), ) if not self.ncbi_bioproject_metadata.get("exists"): self.set_bioproject( - title=self.ncbi_bioproject_metadata.get("title", ""), + title=self.nmdc_study_title, project_id=self.ncbi_bioproject_metadata.get("project_id", ""), - description=self.ncbi_bioproject_metadata.get("description", ""), + description=self.nmdc_study_description, data_type=self.ncbi_bioproject_metadata.get("data_type", ""), org=self.ncbi_submission_metadata.get("organization", ""), ) diff --git a/nmdc_runtime/site/export/study_metadata.py b/nmdc_runtime/site/export/study_metadata.py index 3cf9bc6d..d9bb2a97 100644 --- a/nmdc_runtime/site/export/study_metadata.py +++ b/nmdc_runtime/site/export/study_metadata.py @@ -131,7 +131,7 @@ def export_study_biosamples_metadata(): @op(required_resource_keys={"runtime_api_site_client"}) -def get_biosamples_by_study_id(context: OpExecutionContext, nmdc_study_id: str): +def get_biosamples_by_study_id(context: OpExecutionContext, nmdc_study: dict): client: RuntimeApiSiteClient = context.resources.runtime_api_site_client - biosamples = get_all_docs(client, "biosamples", f"part_of:{nmdc_study_id}") + biosamples = get_all_docs(client, "biosamples", f"part_of:{nmdc_study['id']}") return biosamples diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py index 700ff6d7..2798bccb 100644 --- a/nmdc_runtime/site/graphs.py +++ b/nmdc_runtime/site/graphs.py @@ -49,7 +49,7 @@ get_neon_pipeline_inputs, get_df_from_url, site_code_mapping, - get_ncbi_export_pipeline_study_id, + get_ncbi_export_pipeline_study, get_data_objects_from_biosamples, get_ncbi_export_pipeline_inputs, ncbi_submission_xml_from_nmdc_study, @@ -391,11 +391,11 @@ def ingest_neon_surface_water_metadata(): @graph def nmdc_study_to_ncbi_submission_export(): - nmdc_study_id = get_ncbi_export_pipeline_study_id() + nmdc_study = get_ncbi_export_pipeline_study() ncbi_submission_metadata = get_ncbi_export_pipeline_inputs() - biosamples = get_biosamples_by_study_id(nmdc_study_id) + biosamples = get_biosamples_by_study_id(nmdc_study) data_objects = get_data_objects_from_biosamples(biosamples) xml_data = ncbi_submission_xml_from_nmdc_study( - nmdc_study_id, ncbi_submission_metadata, biosamples, data_objects + nmdc_study, ncbi_submission_metadata, biosamples, data_objects ) ncbi_submission_xml_asset(xml_data) diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py index d3bbbeb7..51d8c22d 100644 --- a/nmdc_runtime/site/ops.py +++ b/nmdc_runtime/site/ops.py @@ -48,6 +48,7 @@ ) from nmdc_runtime.api.core.util import dotted_path_for, hash_from_str, json_clean, now from nmdc_runtime.api.endpoints.util import persist_content_and_get_drs_object +from nmdc_runtime.api.endpoints.find import find_study_by_id from nmdc_runtime.api.models.job import Job, JobOperationMetadata from nmdc_runtime.api.models.metadata import ChangesheetIn from nmdc_runtime.api.models.operation import ( @@ -1014,9 +1015,12 @@ def site_code_mapping() -> dict: ) -@op(config_schema={"nmdc_study_id": str}) -def get_ncbi_export_pipeline_study_id(context: OpExecutionContext) -> str: - return context.op_config["nmdc_study_id"] +@op(config_schema={"nmdc_study_id": str}, required_resource_keys={"mongo"}) +def get_ncbi_export_pipeline_study(context: OpExecutionContext) -> Any: + nmdc_study = find_study_by_id( + context.op_config["nmdc_study_id"], context.resources.mongo.db + ) + return nmdc_study @op( @@ -1025,10 +1029,6 @@ def get_ncbi_export_pipeline_study_id(context: OpExecutionContext) -> str: "ncbi_submission_metadata": Field( Permissive( { - "email": String, - "first": String, - "last": String, - "user": String, "organization": String, } ), @@ -1038,9 +1038,7 @@ def get_ncbi_export_pipeline_study_id(context: OpExecutionContext) -> str: "ncbi_bioproject_metadata": Field( Permissive( { - "title": String, "project_id": String, - "description": String, "data_type": String, "exists": Bool, } @@ -1090,11 +1088,11 @@ def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: li @op def ncbi_submission_xml_from_nmdc_study( context: OpExecutionContext, - nmdc_study_id: str, + nmdc_study: Any, ncbi_exporter_metadata: dict, biosamples: list, data_objects: list, ) -> str: - ncbi_exporter = NCBISubmissionXML(nmdc_study_id, ncbi_exporter_metadata) + ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata) ncbi_xml = ncbi_exporter.get_submission_xml(biosamples, data_objects) return ncbi_xml diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py index 79919b2a..c6788459 100644 --- a/nmdc_runtime/site/repository.py +++ b/nmdc_runtime/site/repository.py @@ -882,7 +882,7 @@ def biosample_export(): }, ), "ops": { - "get_ncbi_export_pipeline_study_id": { + "get_ncbi_export_pipeline_study": { "config": { "nmdc_study_id": "", } @@ -891,16 +891,10 @@ def biosample_export(): "config": { "nmdc_ncbi_attribute_mapping_file_url": "", "ncbi_submission_metadata": { - "email": "", - "first": "", - "last": "", - "user": "", "organization": "", }, "ncbi_bioproject_metadata": { - "title": "", "project_id": "", - "description": "", "data_type": "", "exists": False, }, From 5a9e7498e8e7560cb617a846bcdb87d88ffa880c Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Fri, 21 Jun 2024 11:29:55 -0700 Subject: [PATCH 24/27] reduce manual metadata entry through Dagit UI fields --- nmdc_runtime/site/export/ncbi_xml.py | 49 ++++-- nmdc_runtime/site/export/ncbi_xml_utils.py | 37 +++++ nmdc_runtime/site/graphs.py | 8 +- nmdc_runtime/site/ops.py | 34 ++-- nmdc_runtime/site/repository.py | 6 - tests/test_data/test_ncbi_xml.py | 176 +++++++++++++-------- 6 files changed, 209 insertions(+), 101 deletions(-) diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py index 2e27472e..0e82e24a 100644 --- a/nmdc_runtime/site/export/ncbi_xml.py +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -40,9 +40,6 @@ def __init__(self, nmdc_study: Any, ncbi_submission_metadata: dict): self.ncbi_submission_metadata = ncbi_submission_metadata.get( "ncbi_submission_metadata", {} ) - self.ncbi_bioproject_metadata = ncbi_submission_metadata.get( - "ncbi_bioproject_metadata", {} - ) self.ncbi_biosample_metadata = ncbi_submission_metadata.get( "ncbi_biosample_metadata", {} ) @@ -157,10 +154,10 @@ def set_bioproject(self, title, project_id, description, data_type, org): def set_biosample( self, organism_name, - package, org, bioproject_id, nmdc_biosamples, + nmdc_omics_processing, ): attribute_mappings, slot_range_mappings = load_mappings( self.nmdc_ncbi_attribute_mapping_file_url @@ -169,11 +166,15 @@ def set_biosample( for biosample in nmdc_biosamples: attributes = {} sample_id_value = None + env_package = None for json_key, value in biosample.items(): if isinstance(value, list): continue # Skip processing for list values + if json_key == "env_package": + env_package = f"MIMS.me.{handle_text_value(value)}.6.0" + # Special handling for NMDC Biosample "id" if json_key == "id": sample_id_value = value @@ -219,7 +220,7 @@ def set_biosample( ) ], ), - self.set_element("Package", package), + self.set_element("Package", env_package), self.set_element( "Attributes", children=[ @@ -365,7 +366,23 @@ def set_fastq( self.root.append(action) - def get_submission_xml(self, biosamples_list: list, data_objects_list: list): + def get_submission_xml( + self, + biosamples_list: list, + biosample_omics_processing_list: list, + biosample_data_objects_list: list, + ): + data_type = None + ncbi_project_id = None + for bsm_omprc in biosample_omics_processing_list: + for _, omprc_list in bsm_omprc.items(): + for omprc in omprc_list: + if "omics_type" in omprc: + data_type = handle_text_value(omprc["omics_type"]).capitalize() + + if "ncbi_project_name" in omprc: + ncbi_project_id = omprc["ncbi_project_name"] + self.set_description( email=self.nmdc_pi_email, user="National Microbiome Data Collaborative (NMDC)", @@ -374,26 +391,26 @@ def get_submission_xml(self, biosamples_list: list, data_objects_list: list): org=self.ncbi_submission_metadata.get("organization", ""), ) - if not self.ncbi_bioproject_metadata.get("exists"): + if not ncbi_project_id: self.set_bioproject( title=self.nmdc_study_title, - project_id=self.ncbi_bioproject_metadata.get("project_id", ""), + project_id=ncbi_project_id, description=self.nmdc_study_description, - data_type=self.ncbi_bioproject_metadata.get("data_type", ""), + data_type=data_type, org=self.ncbi_submission_metadata.get("organization", ""), ) self.set_biosample( organism_name=self.ncbi_biosample_metadata.get("organism_name", ""), - package=self.ncbi_biosample_metadata.get("package", ""), org=self.ncbi_submission_metadata.get("organization", ""), - bioproject_id=self.ncbi_bioproject_metadata.get("project_id", ""), + bioproject_id=ncbi_project_id, nmdc_biosamples=biosamples_list, + nmdc_omics_processing=biosample_omics_processing_list, ) self.set_fastq( - biosample_data_objects=data_objects_list, - bioproject_id=self.ncbi_bioproject_metadata.get("project_id", ""), + biosample_data_objects=biosample_data_objects_list, + bioproject_id=ncbi_project_id, org=self.ncbi_submission_metadata.get("organization", ""), ) @@ -403,12 +420,12 @@ def get_submission_xml(self, biosamples_list: list, data_objects_list: list): # ============= Uncomment the following code to validate the XML against NCBI XSDs ============ # # submission_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/submission.xsd?view=co" - # submission_xsd_validation = validate_xml(submission_xml, submission_xsd_url) + # validate_xml(submission_xml, submission_xsd_url) # bioproject_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/bioproject.xsd?view=co" - # bioproject_xsd_validation = validate_xml(submission_xml, bioproject_xsd_url) + # validate_xml(submission_xml, bioproject_xsd_url) # biosample_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/biosample.xsd?view=co" - # biosample_xsd_validation = validate_xml(submission_xml, biosample_xsd_url) + # validate_xml(submission_xml, biosample_xsd_url) return submission_xml diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py index f7293795..84ee95d6 100644 --- a/nmdc_runtime/site/export/ncbi_xml_utils.py +++ b/nmdc_runtime/site/export/ncbi_xml_utils.py @@ -57,6 +57,43 @@ def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list): return biosample_data_objects +def fetch_omics_processing_from_biosamples(all_docs_collection, biosamples_list): + biosample_data_objects = [] + + for biosample in biosamples_list: + current_ids = [biosample["id"]] + collected_data_objects = [] + + while current_ids: + new_current_ids = [] + for current_id in current_ids: + query = {"has_input": current_id} + document = all_docs_collection.find_one(query) + + if not document: + continue + + has_output = document.get("has_output") + if not has_output: + continue + + for output_id in has_output: + if get_classname_from_typecode(output_id) == "DataObject": + omics_processing_doc = all_docs_collection.find_one( + {"id": document["id"]} + ) + if omics_processing_doc: + collected_data_objects.append(omics_processing_doc) + else: + new_current_ids.append(output_id) + + current_ids = new_current_ids + + if collected_data_objects: + biosample_data_objects.append({biosample["id"]: collected_data_objects}) + + return biosample_data_objects + def handle_quantity_value(slot_value): if "has_numeric_value" in slot_value and "has_unit" in slot_value: diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py index 2798bccb..a3d2aebd 100644 --- a/nmdc_runtime/site/graphs.py +++ b/nmdc_runtime/site/graphs.py @@ -51,6 +51,7 @@ site_code_mapping, get_ncbi_export_pipeline_study, get_data_objects_from_biosamples, + get_omics_processing_from_biosamples, get_ncbi_export_pipeline_inputs, ncbi_submission_xml_from_nmdc_study, ncbi_submission_xml_asset, @@ -394,8 +395,13 @@ def nmdc_study_to_ncbi_submission_export(): nmdc_study = get_ncbi_export_pipeline_study() ncbi_submission_metadata = get_ncbi_export_pipeline_inputs() biosamples = get_biosamples_by_study_id(nmdc_study) + omics_processing_records = get_omics_processing_from_biosamples(biosamples) data_objects = get_data_objects_from_biosamples(biosamples) xml_data = ncbi_submission_xml_from_nmdc_study( - nmdc_study, ncbi_submission_metadata, biosamples, data_objects + nmdc_study, + ncbi_submission_metadata, + biosamples, + omics_processing_records, + data_objects, ) ncbi_submission_xml_asset(xml_data) diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py index 51d8c22d..6d8d6ebf 100644 --- a/nmdc_runtime/site/ops.py +++ b/nmdc_runtime/site/ops.py @@ -63,7 +63,10 @@ ) from nmdc_runtime.api.models.util import ResultT from nmdc_runtime.site.export.ncbi_xml import NCBISubmissionXML -from nmdc_runtime.site.export.ncbi_xml_utils import fetch_data_objects_from_biosamples +from nmdc_runtime.site.export.ncbi_xml_utils import ( + fetch_data_objects_from_biosamples, + fetch_omics_processing_from_biosamples, +) from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict from nmdc_runtime.site.resources import ( NmdcPortalApiClient, @@ -1035,22 +1038,10 @@ def get_ncbi_export_pipeline_study(context: OpExecutionContext) -> Any: is_required=True, description="General metadata about the NCBI submission.", ), - "ncbi_bioproject_metadata": Field( - Permissive( - { - "project_id": String, - "data_type": String, - "exists": Bool, - } - ), - is_required=True, - description="Metadata for NCBI BioProject in the Submission.", - ), "ncbi_biosample_metadata": Field( Permissive( { "organism_name": String, - "package": String, } ), is_required=True, @@ -1064,13 +1055,11 @@ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str: "nmdc_ncbi_attribute_mapping_file_url" ] ncbi_submission_metadata = context.op_config.get("ncbi_submission_metadata", {}) - ncbi_bioproject_metadata = context.op_config.get("ncbi_bioproject_metadata", {}) ncbi_biosample_metadata = context.op_config.get("ncbi_biosample_metadata", {}) return { "nmdc_ncbi_attribute_mapping_file_url": nmdc_ncbi_attribute_mapping_file_url, "ncbi_submission_metadata": ncbi_submission_metadata, - "ncbi_bioproject_metadata": ncbi_bioproject_metadata, "ncbi_biosample_metadata": ncbi_biosample_metadata, } @@ -1085,14 +1074,27 @@ def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: li return biosample_data_objects +@op(required_resource_keys={"mongo"}) +def get_omics_processing_from_biosamples(context: OpExecutionContext, biosamples: list): + mdb = context.resources.mongo.db + alldocs_collection = mdb["alldocs"] + biosample_omics_processing = fetch_omics_processing_from_biosamples( + alldocs_collection, biosamples + ) + return biosample_omics_processing + + @op def ncbi_submission_xml_from_nmdc_study( context: OpExecutionContext, nmdc_study: Any, ncbi_exporter_metadata: dict, biosamples: list, + omics_processing_records: list, data_objects: list, ) -> str: ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata) - ncbi_xml = ncbi_exporter.get_submission_xml(biosamples, data_objects) + ncbi_xml = ncbi_exporter.get_submission_xml( + biosamples, omics_processing_records, data_objects + ) return ncbi_xml diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py index c6788459..7a0fad22 100644 --- a/nmdc_runtime/site/repository.py +++ b/nmdc_runtime/site/repository.py @@ -893,14 +893,8 @@ def biosample_export(): "ncbi_submission_metadata": { "organization": "", }, - "ncbi_bioproject_metadata": { - "project_id": "", - "data_type": "", - "exists": False, - }, "ncbi_biosample_metadata": { "organism_name": "", - "package": "", }, } }, diff --git a/tests/test_data/test_ncbi_xml.py b/tests/test_data/test_ncbi_xml.py index 0816adbd..7996b4d1 100644 --- a/tests/test_data/test_ncbi_xml.py +++ b/tests/test_data/test_ncbi_xml.py @@ -15,28 +15,72 @@ handle_string_value, ) -MOCK_NCBI_NMDC_STUDY_ID = "nmdc:sty-11-12345" +MOCK_NMDC_STUDY = { + "id": "nmdc:sty-11-34xj1150", + "name": "National Ecological Observatory Network: soil metagenomes (DP1.10107.001)", + "description": "This study contains the quality-controlled laboratory metadata and minimally processed sequence data from NEON's soil microbial shotgun metagenomics sequencing. Typically, measurements are done on plot-level composite samples and represent up to three randomly selected sampling locations within a plot.", + "gold_study_identifiers": ["gold:Gs0144570", "gold:Gs0161344"], + "principal_investigator": { + "has_raw_value": "Kate Thibault", + "email": "kthibault@battelleecology.org", + "name": "Kate Thibault", + "orcid": "orcid:0000-0003-3477-6424", + "profile_image_url": "https://portal.nersc.gov/project/m3408/profile_images/thibault_katy.jpg", + }, + "title": "National Ecological Observatory Network: soil metagenomes (DP1.10107.001)", + "type": "nmdc:Study", + "websites": [ + "https://data.neonscience.org/data-products/DP1.10107.001", + "https://data.neonscience.org/api/v0/documents/NEON.DOC.014048vO", + "https://data.neonscience.org/api/v0/documents/NEON_metagenomes_userGuide_vE.pdf", + ], + "study_image": [ + { + "url": "https://portal.nersc.gov/project/m3408/profile_images/nmdc_sty-11-34xj1150.jpg" + } + ], + "funding_sources": [ + "NSF#1724433 National Ecological Observatory Network: Operations Activities" + ], + "has_credit_associations": [ + { + "applies_to_person": { + "name": "Hugh Cross", + "email": "crossh@battelleecology.org", + "orcid": "orcid:0000-0002-6745-9479", + }, + "applied_roles": ["Methodology", "Data curation"], + }, + { + "applies_to_person": { + "name": "Samantha Weintraub-Leff", + "email": "sweintraub@battelleecology.org", + "orcid": "orcid:0000-0003-4789-5086", + }, + "applied_roles": ["Methodology", "Data curation"], + }, + { + "applies_to_person": { + "name": "Kate Thibault", + "email": "kthibault@battelleecology.org", + "orcid": "orcid:0000-0003-3477-6424", + }, + "applied_roles": ["Principal Investigator"], + }, + ], + "part_of": ["nmdc:sty-11-nxrz9m96"], + "study_category": "consortium", + "insdc_bioproject_identifiers": ["bioproject:PRJNA1029061"], + "homepage_website": ["https://www.neonscience.org/"], +} MOCK_NCBI_SUBMISSION_METADATA = { "nmdc_ncbi_attribute_mapping_file_url": "http://example.com/mappings.tsv", "ncbi_submission_metadata": { - "email": "user@example.com", - "user": "testuser", - "first": "Test", - "last": "User", "organization": "Test Org", }, - "ncbi_bioproject_metadata": { - "title": "Test Project", - "project_id": "PRJNA12345", - "description": "A test project", - "data_type": "metagenome", - "exists": False, - }, "ncbi_biosample_metadata": { - "title": "Test Sample", "organism_name": "E. coli", - "package": "Test Package", }, } @@ -44,7 +88,7 @@ @pytest.fixture def ncbi_submission_client(): return NCBISubmissionXML( - nmdc_study_id=MOCK_NCBI_NMDC_STUDY_ID, + nmdc_study=MOCK_NMDC_STUDY, ncbi_submission_metadata=MOCK_NCBI_SUBMISSION_METADATA, ) @@ -77,6 +121,24 @@ def nmdc_biosample(): ] +@pytest.fixture +def omics_processing_list(): + return [ + { + "has_input": ["nmdc:procsm-12-ehktny16"], + "has_output": ["nmdc:dobj-12-1zv4q961", "nmdc:dobj-12-b3ft7a80"], + "id": "nmdc:omprc-12-zqm9p096", + "instrument_name": "Illumina NextSeq550", + "name": "Terrestrial soil microbial communities - ARIK.20150721.AMC.EPIPSAMMON.3-DNA1", + "ncbi_project_name": "PRJNA406976", + "omics_type": {"has_raw_value": "metagenome"}, + "part_of": ["nmdc:sty-11-34xj1150"], + "processing_institution": "Battelle", + "type": "nmdc:OmicsProcessing", + } + ] + + @pytest.fixture def data_objects_list(): return [ @@ -110,11 +172,11 @@ def test_set_element(self, ncbi_submission_client): def test_set_description(self, ncbi_submission_client): ncbi_submission_client.set_description( - MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"]["email"], - MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"]["user"], - MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"]["first"], - MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"]["last"], - MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"]["organization"], + ncbi_submission_client.nmdc_pi_email, + "testuser", + "Kate", + "Thibault", + "Test Org", ) description = ET.tostring( ncbi_submission_client.root.find("Description"), "unicode" @@ -128,35 +190,33 @@ def test_set_description(self, ncbi_submission_client): contact_first = root.find("Organization/Contact/Name/First").text contact_last = root.find("Organization/Contact/Name/Last").text - assert comment == "NMDC Submission for nmdc:sty-11-12345" + assert comment == f"NMDC Submission for {MOCK_NMDC_STUDY['id']}" assert submitter == "testuser" assert org_name == "Test Org" - assert contact_email == "user@example.com" - assert contact_first == "Test" - assert contact_last == "User" + assert contact_email == "kthibault@battelleecology.org" + assert contact_first == "Kate" + assert contact_last == "Thibault" def test_set_bioproject(self, ncbi_submission_client): ncbi_submission_client.set_bioproject( - title=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"]["title"], - project_id=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][ - "project_id" - ], - description=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][ - "description" - ], - data_type=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][ - "data_type" - ], - org=MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"][ - "organization" - ], + title=MOCK_NMDC_STUDY["title"], + project_id=MOCK_NMDC_STUDY["insdc_bioproject_identifiers"][0], + description=MOCK_NMDC_STUDY["description"], + data_type="metagenome", + org="Test Org", ) bioproject_xml = ET.tostring( ncbi_submission_client.root.find(".//Project"), "unicode" ) - assert "Test Project" in bioproject_xml - assert "PRJNA12345" in bioproject_xml - assert "A test project" in bioproject_xml + assert ( + "National Ecological Observatory Network: soil metagenomes (DP1.10107.001)" + in bioproject_xml + ) + assert "bioproject:PRJNA1029061" in bioproject_xml + assert ( + "This study contains the quality-controlled laboratory metadata and minimally processed sequence data from NEON's soil microbial shotgun metagenomics sequencing." + in bioproject_xml + ) assert "metagenome" in bioproject_xml assert "Test Org" in bioproject_xml @@ -208,22 +268,19 @@ def test_set_biosample(self, ncbi_submission_client, nmdc_biosample, mocker): organism_name=MOCK_NCBI_SUBMISSION_METADATA["ncbi_biosample_metadata"][ "organism_name" ], - package=MOCK_NCBI_SUBMISSION_METADATA["ncbi_biosample_metadata"]["package"], org=MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"][ "organization" ], + bioproject_id=MOCK_NMDC_STUDY["insdc_bioproject_identifiers"][0], nmdc_biosamples=nmdc_biosample, - bioproject_id=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][ - "project_id" - ], + nmdc_omics_processing=[], ) biosample_xml = ET.tostring( ncbi_submission_client.root.find(".//BioSample"), "unicode" ) assert "E. coli" in biosample_xml - assert "Test Package" in biosample_xml assert "Test Org" in biosample_xml - assert "PRJNA12345" in biosample_xml + assert "PRJNA1029061" in biosample_xml def test_set_fastq(self, ncbi_submission_client, data_objects_list, nmdc_biosample): biosample_data_objects = [ @@ -232,12 +289,8 @@ def test_set_fastq(self, ncbi_submission_client, data_objects_list, nmdc_biosamp ncbi_submission_client.set_fastq( biosample_data_objects=biosample_data_objects, - bioproject_id=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][ - "project_id" - ], - org=MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"][ - "organization" - ], + bioproject_id=MOCK_NMDC_STUDY["insdc_bioproject_identifiers"][0], + org="Test Org", ) action_elements = ncbi_submission_client.root.findall(".//Action") @@ -249,7 +302,7 @@ def test_set_fastq(self, ncbi_submission_client, data_objects_list, nmdc_biosamp "BMI_HVKNKBGX5_Tube347_R1.fastq.gz" in action_xml or "BMI_HVKNKBGX5_Tube347_R2.fastq.gz" in action_xml ) - assert "PRJNA12345" in action_xml + assert "PRJNA1029061" in action_xml assert "nmdc:bsm-12-p9q5v236" in action_xml assert "Test Org" in action_xml @@ -306,16 +359,12 @@ def test_get_submission_xml( ncbi_submission_client.set_fastq( biosample_data_objects=biosample_data_objects, - bioproject_id=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][ - "project_id" - ], - org=MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"][ - "organization" - ], + bioproject_id=MOCK_NMDC_STUDY["insdc_bioproject_identifiers"][0], + org="Test Org", ) submission_xml = ncbi_submission_client.get_submission_xml( - nmdc_biosample, data_objects_list + nmdc_biosample, [], biosample_data_objects ) assert "nmdc:bsm-12-p9q5v236" in submission_xml @@ -323,8 +372,11 @@ def test_get_submission_xml( assert "sediment" in submission_xml assert "USA: Colorado, Arikaree River" in submission_xml assert "2015-07-21T18:00Z" in submission_xml - assert "testuser" in submission_xml - assert "Test Project" in submission_xml + assert "National Microbiome Data Collaborative (NMDC)" in submission_xml + assert ( + "National Ecological Observatory Network: soil metagenomes (DP1.10107.001)" + in submission_xml + ) class TestNCBIXMLUtils: From 433e3174393f8a3f74048e84116b563c5587f314 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Fri, 21 Jun 2024 11:47:23 -0700 Subject: [PATCH 25/27] black format nmdc_runtime/site/export/ncbi_xml_utils.py --- nmdc_runtime/site/export/ncbi_xml_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py index 84ee95d6..ac710f93 100644 --- a/nmdc_runtime/site/export/ncbi_xml_utils.py +++ b/nmdc_runtime/site/export/ncbi_xml_utils.py @@ -57,6 +57,7 @@ def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list): return biosample_data_objects + def fetch_omics_processing_from_biosamples(all_docs_collection, biosamples_list): biosample_data_objects = [] From e82780f6dacb159a83d9dd429b3d9f43d6f02827 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Fri, 21 Jun 2024 12:21:28 -0700 Subject: [PATCH 26/27] fix code review comments --- nmdc_runtime/site/export/ncbi_xml_utils.py | 8 ++------ nmdc_runtime/site/ops.py | 1 - requirements/dev.in | 3 +-- requirements/main.in | 1 - 4 files changed, 3 insertions(+), 10 deletions(-) diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py index ac710f93..e527245e 100644 --- a/nmdc_runtime/site/export/ncbi_xml_utils.py +++ b/nmdc_runtime/site/export/ncbi_xml_utils.py @@ -198,13 +198,9 @@ def validate_xml(xml, xsd_url): xml_schema_doc = etree.parse(BytesIO(xsd_content.encode("utf-8"))) xml_schema = etree.XMLSchema(xml_schema_doc) - if "=1.10.0 -pytest-mock python-dotenv python-jose[cryptography] python-multipart From b9c6d3809876d71b92df059dd6b8cbd3ac5758a9 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Mon, 24 Jun 2024 13:15:58 -0700 Subject: [PATCH 27/27] clarify definition of harcoded text 'eEnvironment' --- nmdc_runtime/site/export/ncbi_xml.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py index 0e82e24a..27a4371d 100644 --- a/nmdc_runtime/site/export/ncbi_xml.py +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -125,6 +125,8 @@ def set_bioproject(self, title, project_id, description, data_type, org): descriptor = self.set_descriptor(title, description) project_type = self.set_element("ProjectType") + # "sample_scope" is a enumeration feild. Docs: https://www.ncbi.nlm.nih.gov/data_specs/schema/other/bioproject/Core.xsd + # scope is "eEnvironment" when "Content of species in a sample is not known, i.e. microbiome,metagenome, etc.." project_type_submission = self.set_element( "ProjectTypeSubmission", attrib={"sample_scope": "eEnvironment"} ) @@ -422,10 +424,10 @@ def get_submission_xml( # submission_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/submission.xsd?view=co" # validate_xml(submission_xml, submission_xsd_url) - # bioproject_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/bioproject.xsd?view=co" + # bioproject_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/bioproject/bioproject.xsd?view=co" # validate_xml(submission_xml, bioproject_xsd_url) - # biosample_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/biosample.xsd?view=co" + # biosample_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/biosample/biosample.xsd?view=co" # validate_xml(submission_xml, biosample_xsd_url) return submission_xml