diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py new file mode 100644 index 00000000..27a4371d --- /dev/null +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -0,0 +1,433 @@ +import os +import datetime +import xml.etree.ElementTree as ET +import xml.dom.minidom + +from typing import Any +from urllib.parse import urlparse +from nmdc_runtime.site.export.ncbi_xml_utils import ( + handle_controlled_identified_term_value, + handle_controlled_term_value, + handle_geolocation_value, + handle_quantity_value, + handle_text_value, + handle_timestamp_value, + handle_float_value, + handle_string_value, + load_mappings, + validate_xml, +) + + +class NCBISubmissionXML: + def __init__(self, nmdc_study: Any, ncbi_submission_metadata: dict): + self.root = ET.Element("Submission") + + self.nmdc_study_id = nmdc_study.get("id") + self.nmdc_study_title = nmdc_study.get("title") + self.nmdc_study_description = nmdc_study.get("description") + self.ncbi_bioproject_id = nmdc_study.get("insdc_bioproject_identifiers") + self.nmdc_pi_email = nmdc_study.get("principal_investigator", {}).get("email") + nmdc_study_pi_name = ( + nmdc_study.get("principal_investigator", {}).get("name").split() + ) + self.first_name = nmdc_study_pi_name[0] + self.last_name = nmdc_study_pi_name[1] if len(nmdc_study_pi_name) > 1 else None + + self.nmdc_ncbi_attribute_mapping_file_url = ncbi_submission_metadata.get( + "nmdc_ncbi_attribute_mapping_file_url" + ) + self.ncbi_submission_metadata = ncbi_submission_metadata.get( + "ncbi_submission_metadata", {} + ) + self.ncbi_biosample_metadata = ncbi_submission_metadata.get( + "ncbi_biosample_metadata", {} + ) + + # dispatcher dictionary capturing handlers for NMDC object to NCBI flat Attribute + # type handlers + self.type_handlers = { + "QuantityValue": handle_quantity_value, + "TextValue": handle_text_value, + "TimestampValue": handle_timestamp_value, + "ControlledTermValue": handle_controlled_term_value, + "ControlledIdentifiedTermValue": handle_controlled_identified_term_value, + "GeolocationValue": handle_geolocation_value, + "float": handle_float_value, + "string": handle_string_value, + } + + def set_element(self, tag, text="", attrib=None, children=None): + attrib = attrib or {} + children = children or [] + element = ET.Element(tag, attrib=attrib) + element.text = text + for child in children: + element.append(child) + return element + + def set_description(self, email, user, first, last, org, date=None): + date = date or datetime.datetime.now().strftime("%Y-%m-%d") + description = self.set_element( + "Description", + children=[ + self.set_element( + "Comment", f"NMDC Submission for {self.nmdc_study_id}" + ), + self.set_element("Submitter", attrib={"user_name": user}), + self.set_element( + "Organization", + attrib={"role": "owner", "type": "center"}, + children=[ + self.set_element("Name", org), + self.set_element( + "Contact", + attrib={"email": email}, + children=[ + self.set_element( + "Name", + children=[ + self.set_element("First", first), + self.set_element("Last", last), + ], + ) + ], + ), + ], + ), + self.set_element("Hold", attrib={"release_date": date}), + ], + ) + self.root.append(description) + + def set_descriptor(self, title, description): + descriptor_elements = [] + descriptor_elements.append(self.set_element("Title", title)) + descriptor_elements.append( + self.set_element( + "Description", children=[self.set_element("p", description)] + ) + ) + + return descriptor_elements + + def set_bioproject(self, title, project_id, description, 
data_type, org): + action = self.set_element("Action") + add_data = self.set_element("AddData", attrib={"target_db": "BioProject"}) + + data_element = self.set_element("Data", attrib={"content_type": "XML"}) + xml_content = self.set_element("XmlContent") + project = self.set_element("Project", attrib={"schema_version": "2.0"}) + + project_id_element = self.set_element("ProjectID") + spuid = self.set_element("SPUID", project_id, {"spuid_namespace": org}) + project_id_element.append(spuid) + + descriptor = self.set_descriptor(title, description) + project_type = self.set_element("ProjectType") + # "sample_scope" is an enumeration field. Docs: https://www.ncbi.nlm.nih.gov/data_specs/schema/other/bioproject/Core.xsd + # scope is "eEnvironment" when "Content of species in a sample is not known, i.e. microbiome, metagenome, etc." + project_type_submission = self.set_element( + "ProjectTypeSubmission", attrib={"sample_scope": "eEnvironment"} + ) + intended_data_type_set = self.set_element("IntendedDataTypeSet") + data_type_element = self.set_element("DataType", data_type) + + intended_data_type_set.append(data_type_element) + project_type_submission.append(intended_data_type_set) + project_type.append(project_type_submission) + + project.extend([project_id_element] + descriptor + [project_type]) + + xml_content.append(project) + data_element.append(xml_content) + add_data.append(data_element) + + identifier = self.set_element("Identifier") + spuid_identifier = self.set_element( + "SPUID", project_id, {"spuid_namespace": org} + ) + identifier.append(spuid_identifier) + add_data.append(identifier) + + action.append(add_data) + self.root.append(action) + + def set_biosample( + self, + organism_name, + org, + bioproject_id, + nmdc_biosamples, + nmdc_omics_processing, + ): + attribute_mappings, slot_range_mappings = load_mappings( + self.nmdc_ncbi_attribute_mapping_file_url + ) + + for biosample in nmdc_biosamples: + attributes = {} + sample_id_value = None + env_package = None + + for json_key, value in biosample.items(): + if isinstance(value, list): + continue # Skip processing for list values + + if json_key == "env_package": + env_package = f"MIMS.me.{handle_text_value(value)}.6.0" + + # Special handling for NMDC Biosample "id" + if json_key == "id": + sample_id_value = value + continue + + if json_key not in attribute_mappings: + continue + + xml_key = attribute_mappings[json_key] + value_type = slot_range_mappings.get(json_key, "string") + handler = self.type_handlers.get(value_type, handle_string_value) + + formatted_value = handler(value) + attributes[xml_key] = formatted_value + + biosample_elements = [ + self.set_element( + "SampleId", + children=[ + self.set_element( + "SPUID", sample_id_value, {"spuid_namespace": org} + ) + ], + ), + self.set_element( + "Descriptor", + children=[ + self.set_element( + "Title", + f"NMDC Biosample {sample_id_value} from {organism_name} part of {self.nmdc_study_id} study", + ), + ], + ), + self.set_element( + "Organism", + children=[self.set_element("OrganismName", organism_name)], + ), + self.set_element( + "BioProject", + children=[ + self.set_element( + "PrimaryId", bioproject_id, {"db": "BioProject"} + ) + ], + ), + self.set_element("Package", env_package), + self.set_element( + "Attributes", + children=[ + self.set_element( + "Attribute", attributes[key], {"attribute_name": key} + ) + for key in sorted(attributes) + ], + ), + ] + + action = self.set_element( + "Action", + children=[ + self.set_element( + "AddData", + attrib={"target_db": "BioSample"}, +
children=[ + self.set_element( + "Data", + attrib={"content_type": "XML"}, + children=[ + self.set_element( + "XmlContent", + children=[ + self.set_element( + "BioSample", + attrib={"schema_version": "2.0"}, + children=biosample_elements, + ), + ], + ), + ], + ), + self.set_element( + "Identifier", + children=[ + self.set_element( + "SPUID", + sample_id_value, + {"spuid_namespace": org}, + ), + ], + ), + ], + ), + ], + ) + self.root.append(action) + + def set_fastq( + self, + biosample_data_objects: list, + bioproject_id: str, + org: str, + ): + for entry in biosample_data_objects: + fastq_files = [] + biosample_ids = [] + + for biosample_id, data_objects in entry.items(): + biosample_ids.append(biosample_id) + for data_object in data_objects: + if "url" in data_object: + url = urlparse(data_object["url"]) + file_path = os.path.join( + os.path.basename(os.path.dirname(url.path)), + os.path.basename(url.path), + ) + fastq_files.append(file_path) + + if fastq_files: + files_elements = [ + self.set_element( + "File", + "", + {"file_path": f}, + [self.set_element("DataType", "generic-data")], + ) + for f in fastq_files + ] + + attribute_elements = [ + self.set_element( + "AttributeRefId", + attrib={"name": "BioProject"}, + children=[ + self.set_element( + "RefId", + children=[ + self.set_element( + "SPUID", + bioproject_id, + {"spuid_namespace": org}, + ) + ], + ) + ], + ) + ] + + for biosample_id in biosample_ids: + attribute_elements.append( + self.set_element( + "AttributeRefId", + attrib={"name": "BioSample"}, + children=[ + self.set_element( + "RefId", + children=[ + self.set_element( + "SPUID", + biosample_id, + {"spuid_namespace": org}, + ) + ], + ) + ], + ) + ) + + identifier_element = self.set_element( + "Identifier", + children=[ + self.set_element( + "SPUID", bioproject_id, {"spuid_namespace": org} + ) + ], + ) + + action = self.set_element( + "Action", + children=[ + self.set_element( + "AddFiles", + attrib={"target_db": "SRA"}, + children=files_elements + + attribute_elements + + [identifier_element], + ), + ], + ) + + self.root.append(action) + + def get_submission_xml( + self, + biosamples_list: list, + biosample_omics_processing_list: list, + biosample_data_objects_list: list, + ): + data_type = None + ncbi_project_id = None + for bsm_omprc in biosample_omics_processing_list: + for _, omprc_list in bsm_omprc.items(): + for omprc in omprc_list: + if "omics_type" in omprc: + data_type = handle_text_value(omprc["omics_type"]).capitalize() + + if "ncbi_project_name" in omprc: + ncbi_project_id = omprc["ncbi_project_name"] + + self.set_description( + email=self.nmdc_pi_email, + user="National Microbiome Data Collaborative (NMDC)", + first=self.first_name, + last=self.last_name, + org=self.ncbi_submission_metadata.get("organization", ""), + ) + + if not ncbi_project_id: + self.set_bioproject( + title=self.nmdc_study_title, + project_id=ncbi_project_id, + description=self.nmdc_study_description, + data_type=data_type, + org=self.ncbi_submission_metadata.get("organization", ""), + ) + + self.set_biosample( + organism_name=self.ncbi_biosample_metadata.get("organism_name", ""), + org=self.ncbi_submission_metadata.get("organization", ""), + bioproject_id=ncbi_project_id, + nmdc_biosamples=biosamples_list, + nmdc_omics_processing=biosample_omics_processing_list, + ) + + self.set_fastq( + biosample_data_objects=biosample_data_objects_list, + bioproject_id=ncbi_project_id, + org=self.ncbi_submission_metadata.get("organization", ""), + ) + + rough_string = ET.tostring(self.root, "unicode") + 
reparsed = xml.dom.minidom.parseString(rough_string) + submission_xml = reparsed.toprettyxml(indent=" ", newl="\n") + + # ============= Uncomment the following code to validate the XML against NCBI XSDs ============ # + # submission_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/submission.xsd?view=co" + # validate_xml(submission_xml, submission_xsd_url) + + # bioproject_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/bioproject/bioproject.xsd?view=co" + # validate_xml(submission_xml, bioproject_xsd_url) + + # biosample_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/biosample/biosample.xsd?view=co" + # validate_xml(submission_xml, biosample_xsd_url) + + return submission_xml diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py new file mode 100644 index 00000000..e527245e --- /dev/null +++ b/nmdc_runtime/site/export/ncbi_xml_utils.py @@ -0,0 +1,206 @@ +from io import BytesIO, StringIO +from nmdc_runtime.minter.config import typecodes +from lxml import etree + +import csv +import requests + + +def _build_class_map(class_map_data): + return { + entry["name"]: entry["schema_class"].split(":")[1] for entry in class_map_data + } + + +def get_classname_from_typecode(doc_id): + class_map_data = typecodes() + class_map = _build_class_map(class_map_data) + + typecode = doc_id.split(":")[1].split("-")[0] + return class_map.get(typecode) + + +def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list): + biosample_data_objects = [] + + for biosample in biosamples_list: + current_ids = [biosample["id"]] + collected_data_objects = [] + + while current_ids: + new_current_ids = [] + for current_id in current_ids: + query = {"has_input": current_id} + document = all_docs_collection.find_one(query) + + if not document: + continue + + has_output = document.get("has_output") + if not has_output: + continue + + for output_id in has_output: + if get_classname_from_typecode(output_id) == "DataObject": + data_object_doc = all_docs_collection.find_one( + {"id": output_id} + ) + if data_object_doc: + collected_data_objects.append(data_object_doc) + else: + new_current_ids.append(output_id) + + current_ids = new_current_ids + + if collected_data_objects: + biosample_data_objects.append({biosample["id"]: collected_data_objects}) + + return biosample_data_objects + + +def fetch_omics_processing_from_biosamples(all_docs_collection, biosamples_list): + biosample_data_objects = [] + + for biosample in biosamples_list: + current_ids = [biosample["id"]] + collected_data_objects = [] + + while current_ids: + new_current_ids = [] + for current_id in current_ids: + query = {"has_input": current_id} + document = all_docs_collection.find_one(query) + + if not document: + continue + + has_output = document.get("has_output") + if not has_output: + continue + + for output_id in has_output: + if get_classname_from_typecode(output_id) == "DataObject": + omics_processing_doc = all_docs_collection.find_one( + {"id": document["id"]} + ) + if omics_processing_doc: + collected_data_objects.append(omics_processing_doc) + else: + new_current_ids.append(output_id) + + current_ids = new_current_ids + + if collected_data_objects: + biosample_data_objects.append({biosample["id"]: collected_data_objects}) + + return biosample_data_objects + + +def handle_quantity_value(slot_value): + if "has_numeric_value" in slot_value and "has_unit" in slot_value: + return 
f"{slot_value['has_numeric_value']} {slot_value['has_unit']}" + elif ( + "has_maximum_numeric_value" in slot_value + and "has_minimum_numeric_value" in slot_value + and "has_unit" in slot_value + ): + range_value = ( + slot_value["has_maximum_numeric_value"] + - slot_value["has_minimum_numeric_value"] + ) + return f"{range_value} {slot_value['has_unit']}" + elif "has_raw_value" in slot_value: + return slot_value["has_raw_value"] + return "Unknown format" + + +def handle_text_value(slot_value): + return slot_value.get("has_raw_value", "Unknown format") + + +def handle_timestamp_value(slot_value): + return slot_value.get("has_raw_value", "Unknown format") + + +def handle_controlled_term_value(slot_value): + if "term" in slot_value: + term = slot_value["term"] + if "name" in term and "id" in term: + return f"{term['name']} [{term['id']}]" + elif "id" in term: + return term["id"] + elif "name" in term: + return term["name"] + elif "has_raw_value" in slot_value: + return slot_value["has_raw_value"] + return "Unknown format" + + +def handle_controlled_identified_term_value(slot_value): + if "term" in slot_value: + term = slot_value["term"] + if "name" in term and "id" in term: + return f"{term['name']} [{term['id']}]" + elif "id" in term: + return term["id"] + elif "has_raw_value" in slot_value: + return slot_value["has_raw_value"] + return "Unknown format" + + +def handle_geolocation_value(slot_value): + if "latitude" in slot_value and "longitude" in slot_value: + return f"{slot_value['latitude']} {slot_value['longitude']}" + elif "has_raw_value" in slot_value: + return slot_value["has_raw_value"] + return "Unknown format" + + +def handle_float_value(slot_value): + return f"{slot_value:.2f}" + + +def handle_string_value(slot_value): + return f"{slot_value}" + + +def load_mappings(url): + response = requests.get(url) + response.raise_for_status() + file_content = response.text + + attribute_mappings = {} + slot_range_mappings = {} + reader = csv.DictReader(StringIO(file_content), delimiter="\t") + for row in reader: + if row["ignore"].strip(): + continue + + json_key = row["nmdc_schema_slot"] + # attribute mappings + xml_attribute_name = row["ncbi_biosample_attribute_name"] + attribute_mappings[json_key] = ( + xml_attribute_name if xml_attribute_name else json_key + ) + + # slot range mappings + data_type = row["nmdc_schema_slot_range"] + slot_range_mappings[json_key] = data_type if data_type else "default" + + return attribute_mappings, slot_range_mappings + + +def validate_xml(xml, xsd_url): + response = requests.get(xsd_url) + response.raise_for_status() + xsd_content = response.text + + xml_schema_doc = etree.parse(BytesIO(xsd_content.encode("utf-8"))) + xml_schema = etree.XMLSchema(xml_schema_doc) + + xml_doc = etree.parse(BytesIO(xml.encode("utf-8"))) + + if not xml_schema.validate(xml_doc): + raise ValueError(f"There were errors while validating against: {xsd_url}") + + return True diff --git a/nmdc_runtime/site/export/study_metadata.py b/nmdc_runtime/site/export/study_metadata.py index cdcfef8e..d9bb2a97 100644 --- a/nmdc_runtime/site/export/study_metadata.py +++ b/nmdc_runtime/site/export/study_metadata.py @@ -5,7 +5,6 @@ import csv from io import StringIO -import requests from dagster import ( op, get_dagster_logger, @@ -26,13 +25,27 @@ def get_all_docs(client, collection, filter_): per_page = 200 url_base = f"/{collection}?filter={filter_}&per_page={per_page}" results = [] - rv = client.request("GET", url_base).json() + response = client.request("GET", url_base) + if 
response.status_code != 200: + raise Exception( + f"Runtime API request failed with status {response.status_code}." + f" Check URL: {url_base}" + ) + rv = response.json() results.extend(rv.get("results", [])) page, count = rv["meta"]["page"], rv["meta"]["count"] assert count <= 10_000 while page * per_page < count: - rv = requests.get(url_base + f"&page={page + 1}").json() - results.extend(rv["results"]) + page += 1 + url = f"{url_base}&page={page}" + response = client.request("GET", url) + if response.status_code != 200: + raise Exception( + f"Runtime API request failed with status {response.status_code}." + f" Check URL: {url}" + ) + rv = response.json() + results.extend(rv.get("results", [])) return results @@ -115,3 +128,10 @@ def export_study_biosamples_as_csv(context: OpExecutionContext, study_export_inf def export_study_biosamples_metadata(): outputs = export_study_biosamples_as_csv(get_study_biosamples_metadata()) add_output_run_event(outputs) + + +@op(required_resource_keys={"runtime_api_site_client"}) +def get_biosamples_by_study_id(context: OpExecutionContext, nmdc_study: dict): + client: RuntimeApiSiteClient = context.resources.runtime_api_site_client + biosamples = get_all_docs(client, "biosamples", f"part_of:{nmdc_study['id']}") + return biosamples diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py index 5b21ac01..ae3ccd76 100644 --- a/nmdc_runtime/site/graphs.py +++ b/nmdc_runtime/site/graphs.py @@ -48,7 +48,14 @@ get_neon_pipeline_inputs, get_df_from_url, site_code_mapping, + get_ncbi_export_pipeline_study, + get_data_objects_from_biosamples, + get_omics_processing_from_biosamples, + get_ncbi_export_pipeline_inputs, + ncbi_submission_xml_from_nmdc_study, + ncbi_submission_xml_asset, ) +from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id @graph @@ -369,3 +376,20 @@ def ingest_neon_surface_water_metadata(): ) run_id = submit_metadata_to_db(database) poll_for_run_completion(run_id) + + +@graph +def nmdc_study_to_ncbi_submission_export(): + nmdc_study = get_ncbi_export_pipeline_study() + ncbi_submission_metadata = get_ncbi_export_pipeline_inputs() + biosamples = get_biosamples_by_study_id(nmdc_study) + omics_processing_records = get_omics_processing_from_biosamples(biosamples) + data_objects = get_data_objects_from_biosamples(biosamples) + xml_data = ncbi_submission_xml_from_nmdc_study( + nmdc_study, + ncbi_submission_metadata, + biosamples, + omics_processing_records, + data_objects, + ) + ncbi_submission_xml_asset(xml_data) diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py index f4ff3577..4bd00f3b 100644 --- a/nmdc_runtime/site/ops.py +++ b/nmdc_runtime/site/ops.py @@ -9,6 +9,7 @@ from io import BytesIO, StringIO from typing import Tuple from zipfile import ZipFile + import pandas as pd import requests @@ -29,10 +30,14 @@ String, op, Optional, + Field, + Permissive, + Bool, ) from gridfs import GridFS from linkml_runtime.dumpers import json_dumper from linkml_runtime.utils.yamlutils import YAMLRoot +from nmdc_runtime.api.db.mongo import get_mongo_db from nmdc_runtime.api.core.idgen import generate_one_id from nmdc_runtime.api.core.metadata import ( _validate_changesheet, @@ -42,6 +47,7 @@ ) from nmdc_runtime.api.core.util import dotted_path_for, hash_from_str, json_clean, now from nmdc_runtime.api.endpoints.util import persist_content_and_get_drs_object +from nmdc_runtime.api.endpoints.find import find_study_by_id from nmdc_runtime.api.models.job import Job, JobOperationMetadata from 
nmdc_runtime.api.models.metadata import ChangesheetIn + from nmdc_runtime.api.models.operation import ( @@ -55,6 +61,11 @@ _add_run_complete_event, ) from nmdc_runtime.api.models.util import ResultT +from nmdc_runtime.site.export.ncbi_xml import NCBISubmissionXML +from nmdc_runtime.site.export.ncbi_xml_utils import ( + fetch_data_objects_from_biosamples, + fetch_omics_processing_from_biosamples, +) from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict from nmdc_runtime.site.resources import ( NmdcPortalApiClient, @@ -724,6 +735,33 @@ def export_json_to_drs( return ["/objects/" + drs_object["id"]] +@op( + description="NCBI Submission XML file rendered in a Dagster Asset", + out=Out(description="XML content rendered through Dagit UI"), +) +def ncbi_submission_xml_asset(context: OpExecutionContext, data: str): + filename = "ncbi_submission.xml" + file_path = os.path.join(context.instance.storage_directory(), filename) + + os.makedirs(os.path.dirname(file_path), exist_ok=True) + + with open(file_path, "w") as f: + f.write(data) + + context.log_event( + AssetMaterialization( + asset_key="ncbi_submission_xml", + description="NCBI Submission XML Data", + metadata={ + "file_path": MetadataValue.path(file_path), + "xml": MetadataValue.text(data), + }, + ) + ) + + return Output(data) + + def unique_field_values(docs: List[Dict[str, Any]], field: str): return {doc[field] for doc in docs if field in doc} @@ -933,3 +971,85 @@ def site_code_mapping() -> dict: raise Exception( f"Failed to fetch site data from {endpoint}. Status code: {response.status_code}, Content: {response.content}" ) + + +@op(config_schema={"nmdc_study_id": str}, required_resource_keys={"mongo"}) +def get_ncbi_export_pipeline_study(context: OpExecutionContext) -> Any: + nmdc_study = find_study_by_id( + context.op_config["nmdc_study_id"], context.resources.mongo.db + ) + return nmdc_study + + +@op( + config_schema={ + "nmdc_ncbi_attribute_mapping_file_url": str, + "ncbi_submission_metadata": Field( + Permissive( + { + "organization": String, + } + ), + is_required=True, + description="General metadata about the NCBI submission.", + ), + "ncbi_biosample_metadata": Field( + Permissive( + { + "organism_name": String, + } + ), + is_required=True, + description="Metadata for one or more NCBI BioSamples in the Submission.", + ), + }, + out=Out(Dict), +) +def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> dict: + nmdc_ncbi_attribute_mapping_file_url = context.op_config[ + "nmdc_ncbi_attribute_mapping_file_url" + ] + ncbi_submission_metadata = context.op_config.get("ncbi_submission_metadata", {}) + ncbi_biosample_metadata = context.op_config.get("ncbi_biosample_metadata", {}) + + return { + "nmdc_ncbi_attribute_mapping_file_url": nmdc_ncbi_attribute_mapping_file_url, + "ncbi_submission_metadata": ncbi_submission_metadata, + "ncbi_biosample_metadata": ncbi_biosample_metadata, + } + + +@op(required_resource_keys={"mongo"}) +def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: list): + mdb = context.resources.mongo.db + alldocs_collection = mdb["alldocs"] + biosample_data_objects = fetch_data_objects_from_biosamples( + alldocs_collection, biosamples + ) + return biosample_data_objects + + +@op(required_resource_keys={"mongo"}) +def get_omics_processing_from_biosamples(context: OpExecutionContext, biosamples: list): + mdb = context.resources.mongo.db + alldocs_collection = mdb["alldocs"] + biosample_omics_processing = fetch_omics_processing_from_biosamples( + alldocs_collection,
biosamples + ) + return biosample_omics_processing + + +@op +def ncbi_submission_xml_from_nmdc_study( + context: OpExecutionContext, + nmdc_study: Any, + ncbi_exporter_metadata: dict, + biosamples: list, + omics_processing_records: list, + data_objects: list, +) -> str: + ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata) + ncbi_xml = ncbi_exporter.get_submission_xml( + biosamples, omics_processing_records, data_objects + ) + return ncbi_xml diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py index c0de4366..98b3dedb 100644 --- a/nmdc_runtime/site/repository.py +++ b/nmdc_runtime/site/repository.py @@ -42,6 +42,7 @@ ingest_neon_soil_metadata, ingest_neon_benthic_metadata, ingest_neon_surface_water_metadata, + nmdc_study_to_ncbi_submission_export, ) from nmdc_runtime.site.resources import ( get_mongo, @@ -850,6 +851,57 @@ def biosample_submission_ingest(): ] +@repository +def biosample_export(): + normal_resources = run_config_frozen__normal_env["resources"] + return [ + nmdc_study_to_ncbi_submission_export.to_job( + resource_defs=resource_defs, + config={ + "resources": merge( + unfreeze(normal_resources), + { + "mongo": { + "config": { + "host": {"env": "MONGO_HOST"}, + "username": {"env": "MONGO_USERNAME"}, + "password": {"env": "MONGO_PASSWORD"}, + "dbname": {"env": "MONGO_DBNAME"}, + }, + }, + "runtime_api_site_client": { + "config": { + "base_url": {"env": "API_HOST"}, + "client_id": {"env": "API_SITE_CLIENT_ID"}, + "client_secret": {"env": "API_SITE_CLIENT_SECRET"}, + "site_id": {"env": "API_SITE_ID"}, + }, + }, + }, + ), + "ops": { + "get_ncbi_export_pipeline_study": { + "config": { + "nmdc_study_id": "", + } + }, + "get_ncbi_export_pipeline_inputs": { + "config": { + "nmdc_ncbi_attribute_mapping_file_url": "", + "ncbi_submission_metadata": { + "organization": "", + }, + "ncbi_biosample_metadata": { + "organism_name": "", + }, + } + }, + }, + }, + ), + ] + + # @repository # def validation(): # graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job] diff --git a/nmdc_runtime/site/workspace.yaml b/nmdc_runtime/site/workspace.yaml index e594197e..5da09ab9 100644 --- a/nmdc_runtime/site/workspace.yaml +++ b/nmdc_runtime/site/workspace.yaml @@ -11,6 +11,9 @@ load_from: - python_package: package_name: nmdc_runtime.site.repository attribute: biosample_submission_ingest + - python_package: + package_name: nmdc_runtime.site.repository + attribute: biosample_export # - python_package: # package_name: nmdc_runtime.site.repository # attribute: validation diff --git a/requirements/dev.in b/requirements/dev.in index dbe7b8e9..601370de 100644 --- a/requirements/dev.in +++ b/requirements/dev.in @@ -11,4 +11,5 @@ pytest-cov requests-mock setuptools twine -requests-cache \ No newline at end of file +requests-cache +pytest-mock \ No newline at end of file diff --git a/requirements/dev.txt b/requirements/dev.txt index 0065d7ae..86cbbaea 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -45,12 +45,13 @@ exceptiongroup==1.2.0 # pytest flake8==7.1.0 # via -r requirements/dev.in -idna==3.6 +idna==3.7 # via # -c requirements/main.txt # requests importlib-metadata==7.1.0 # via + # build # keyring # twine iniconfig==2.0.0 @@ -67,6 +68,10 @@ jaraco-functools==4.0.1 # via keyring keyring==25.2.1 # via twine +lxml==5.2.2 + # via + # -c requirements/main.txt + # -r requirements/dev.in markdown-it-py==3.0.0 # via # -c requirements/main.txt @@ -85,7 +90,7 @@ mypy-extensions==1.0.0 # via black nh3==0.2.17 # via readme-renderer -packaging==23.2 
+packaging==24.0 # via # -c requirements/main.txt # black @@ -99,12 +104,12 @@ pip-tools==7.4.1 # via -r requirements/dev.in pkginfo==1.11.1 # via twine -platformdirs==4.2.0 +platformdirs==4.2.2 # via # -c requirements/main.txt # black # requests-cache -pluggy==1.4.0 +pluggy==1.5.0 # via # -c requirements/main.txt # pytest @@ -121,7 +126,7 @@ pyproject-hooks==1.1.0 # via # build # pip-tools -pytest==8.0.2 +pytest==8.2.0 # via # -c requirements/main.txt # -r requirements/dev.in @@ -131,6 +136,10 @@ pytest-asyncio==0.23.7 # via -r requirements/dev.in pytest-cov==5.0.0 # via -r requirements/dev.in +pytest-mock==3.14.0 + # via + # -c requirements/main.txt + # -r requirements/dev.in readme-renderer==43.0 # via twine requests==2.31.0 @@ -170,7 +179,7 @@ tomli==2.0.1 # pytest twine==5.1.0 # via -r requirements/dev.in -typing-extensions==4.10.0 +typing-extensions==4.11.0 # via # -c requirements/main.txt # black @@ -179,7 +188,7 @@ url-normalize==1.4.3 # via # -c requirements/main.txt # requests-cache -urllib3==2.0.7 +urllib3==2.2.1 # via # -c requirements/main.txt # requests @@ -195,7 +204,7 @@ pip==24.0 # via # -r requirements/dev.in # pip-tools -setuptools==69.1.1 +setuptools==69.5.1 # via # -c requirements/main.txt # -r requirements/dev.in diff --git a/requirements/main.in b/requirements/main.in index e491c543..2ae359d7 100644 --- a/requirements/main.in +++ b/requirements/main.in @@ -20,6 +20,7 @@ jupyter jupyterlab linkml linkml-runtime +lxml mkdocs-jupyter mkdocs-material mkdocs-mermaid2-plugin diff --git a/requirements/main.txt b/requirements/main.txt index eb8eafbe..9facde66 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -48,9 +48,9 @@ backoff==2.2.1 # via gql base32-lib==1.0.2 # via -r requirements/main.in -bcrypt==4.1.2 +bcrypt==4.1.3 # via passlib -beanie==1.25.0 +beanie==1.26.0 # via -r requirements/main.in beautifulsoup4==4.12.3 # via @@ -59,9 +59,9 @@ beautifulsoup4==4.12.3 # nbconvert bleach==6.1.0 # via nbconvert -boto3==1.34.54 +boto3==1.34.106 # via -r requirements/main.in -botocore==1.34.54 +botocore==1.34.106 # via # boto3 # s3transfer @@ -104,31 +104,31 @@ comm==0.2.1 # via # ipykernel # ipywidgets -croniter==2.0.2 +croniter==2.0.5 # via dagster -cryptography==42.0.5 +cryptography==42.0.7 # via python-jose -curies==0.7.7 +curies==0.7.9 # via # linkml-runtime # prefixmaps -dagit==1.6.8 +dagit==1.7.5 # via -r requirements/main.in -dagster==1.6.8 +dagster==1.7.5 # via # -r requirements/main.in # dagster-graphql # dagster-postgres # dagster-webserver -dagster-graphql==1.6.8 +dagster-graphql==1.7.5 # via # -r requirements/main.in # dagster-webserver -dagster-pipes==1.6.8 +dagster-pipes==1.7.5 # via dagster -dagster-postgres==0.22.8 +dagster-postgres==0.23.5 # via -r requirements/main.in -dagster-webserver==1.6.8 +dagster-webserver==1.7.5 # via dagit debugpy==1.8.1 # via ipykernel @@ -144,19 +144,21 @@ dnspython==2.6.1 # via # email-validator # pymongo -docstring-parser==0.15 +docstring-parser==0.16 # via dagster dotted-dict==1.1.3 # via -r requirements/main.in -ecdsa==0.18.0 +ecdsa==0.19.0 # via python-jose editorconfig==0.12.4 # via jsbeautifier email-validator==2.1.1 - # via pydantic + # via + # fastapi + # pydantic et-xmlfile==1.1.0 # via openpyxl -exceptiongroup==1.2.0 +exceptiongroup==1.2.1 # via # anyio # cattrs @@ -164,19 +166,25 @@ exceptiongroup==1.2.0 # pytest executing==2.0.1 # via stack-data -fastapi==0.110.0 - # via -r requirements/main.in +fastapi==0.111.0 + # via + # -r requirements/main.in + # fastapi-cli +fastapi-cli==0.0.3 + # via fastapi 
fastjsonschema==2.19.1 # via # -r requirements/main.in # nbformat +filelock==3.14.0 + # via dagster fnc==0.5.3 # via -r requirements/main.in fqdn==1.5.1 # via jsonschema -frozendict==2.4.0 +frozendict==2.4.4 # via -r requirements/main.in -fsspec==2024.2.0 +fsspec==2024.5.0 # via universal-pathlib ghp-import==2.1.0 # via mkdocs @@ -193,13 +201,15 @@ graphql-core==3.2.3 # graphql-relay graphql-relay==3.2.0 # via graphene -graphviz==0.20.1 +graphviz==0.20.3 # via linkml -grpcio==1.62.0 +greenlet==3.0.3 + # via sqlalchemy +grpcio==1.63.0 # via # dagster # grpcio-health-checking -grpcio-health-checking==1.62.0 +grpcio-health-checking==1.62.2 # via dagster h11==0.14.0 # via @@ -210,15 +220,17 @@ hbreader==0.9.1 # jsonasobj2 # linkml # linkml-runtime -httpcore==1.0.4 +httpcore==1.0.5 # via httpx httptools==0.6.1 # via uvicorn httpx==0.27.0 - # via jupyterlab + # via + # fastapi + # jupyterlab humanfriendly==10.0 # via coloredlogs -idna==3.6 +idna==3.7 # via # anyio # email-validator @@ -228,14 +240,14 @@ idna==3.6 # yarl iniconfig==2.0.0 # via pytest -ipykernel==6.29.3 +ipykernel==6.29.4 # via # jupyter # jupyter-console # jupyterlab # mkdocs-jupyter # qtconsole -ipython==8.22.1 +ipython==8.24.0 # via # ipykernel # ipywidgets @@ -250,9 +262,10 @@ isoduration==20.11.0 # via jsonschema jedi==0.19.1 # via ipython -jinja2==3.1.3 +jinja2==3.1.4 # via # dagster + # fastapi # jupyter-server # jupyterlab # jupyterlab-server @@ -265,13 +278,13 @@ jmespath==1.0.1 # via # boto3 # botocore -jq==1.6.0 +jq==1.7.0 # via -r requirements/main.in jsbeautifier==1.15.1 # via mkdocs-mermaid2-plugin json-flattener==0.1.9 # via linkml-runtime -json5==0.9.18 +json5==0.9.25 # via jupyterlab-server jsonasobj==1.3.1 # via @@ -290,7 +303,7 @@ jsonpointer==2.4 # via # jsonpatch # jsonschema -jsonschema==4.21.1 +jsonschema==4.22.0 # via # jupyter-events # jupyterlab-server @@ -301,7 +314,7 @@ jsonschema-specifications==2023.12.1 # via jsonschema jupyter==1.0.0 # via -r requirements/main.in -jupyter-client==8.6.0 +jupyter-client==8.6.1 # via # ipykernel # jupyter-console @@ -310,7 +323,7 @@ jupyter-client==8.6.0 # qtconsole jupyter-console==6.6.3 # via jupyter -jupyter-core==5.7.1 +jupyter-core==5.7.2 # via # ipykernel # jupyter-client @@ -321,32 +334,32 @@ jupyter-core==5.7.1 # nbconvert # nbformat # qtconsole -jupyter-events==0.9.0 +jupyter-events==0.10.0 # via jupyter-server -jupyter-lsp==2.2.3 +jupyter-lsp==2.2.5 # via jupyterlab -jupyter-server==2.12.5 +jupyter-server==2.14.0 # via # jupyter-lsp # jupyterlab # jupyterlab-server # notebook # notebook-shim -jupyter-server-terminals==0.5.2 +jupyter-server-terminals==0.5.3 # via jupyter-server -jupyterlab==4.1.2 +jupyterlab==4.1.8 # via # -r requirements/main.in # notebook jupyterlab-pygments==0.3.0 # via nbconvert -jupyterlab-server==2.25.3 +jupyterlab-server==2.27.1 # via # jupyterlab # notebook jupyterlab-widgets==3.0.10 # via ipywidgets -jupytext==1.16.1 +jupytext==1.16.2 # via mkdocs-jupyter lazy-model==0.2.0 # via beanie @@ -356,15 +369,17 @@ linkml==1.7.5 # nmdc-schema linkml-dataops==0.1.0 # via linkml -linkml-runtime==1.7.2 +linkml-runtime==1.7.5 # via # -r requirements/main.in # linkml # linkml-dataops # nmdc-schema -mako==1.3.2 +lxml==5.2.2 + # via -r requirements/main.in +mako==1.3.5 # via alembic -markdown==3.5.2 +markdown==3.6 # via # mkdocs # mkdocs-material @@ -380,28 +395,32 @@ markupsafe==2.1.5 # mako # mkdocs # nbconvert -matplotlib-inline==0.1.6 +matplotlib-inline==0.1.7 # via # ipykernel # ipython -mdit-py-plugins==0.4.0 +mdit-py-plugins==0.4.1 # via 
jupytext mdurl==0.1.2 # via markdown-it-py mergedeep==1.3.4 - # via mkdocs + # via + # mkdocs + # mkdocs-get-deps mistune==3.0.2 # via nbconvert -mkdocs==1.5.3 +mkdocs==1.6.0 # via # mkdocs-jupyter # mkdocs-material # mkdocs-mermaid2-plugin # mkdocs-redirects # nmdc-schema -mkdocs-jupyter==0.24.6 +mkdocs-get-deps==0.2.0 + # via mkdocs +mkdocs-jupyter==0.24.7 # via -r requirements/main.in -mkdocs-material==9.5.12 +mkdocs-material==9.5.23 # via # -r requirements/main.in # mkdocs-jupyter @@ -414,7 +433,9 @@ mkdocs-mermaid2-plugin==0.6.0 # nmdc-schema mkdocs-redirects==1.2.1 # via nmdc-schema -motor==3.3.2 +more-click==0.1.2 + # via bioregistry +motor==3.4.0 # via # -r requirements/main.in # beanie @@ -422,12 +443,12 @@ multidict==6.0.5 # via yarl nbclient==0.9.0 # via nbconvert -nbconvert==7.16.1 +nbconvert==7.16.4 # via # jupyter # jupyter-server # mkdocs-jupyter -nbformat==5.9.2 +nbformat==5.10.4 # via # jupyter-server # jupytext @@ -437,7 +458,7 @@ nest-asyncio==1.6.0 # via ipykernel nmdc-schema==10.5.5 # via -r requirements/main.in -notebook==7.1.1 +notebook==7.1.3 # via jupyter notebook-shim==0.2.4 # via @@ -449,9 +470,11 @@ openpyxl==3.1.2 # via # -r requirements/main.in # linkml +orjson==3.10.3 + # via fastapi overrides==7.7.0 # via jupyter-server -packaging==23.2 +packaging==24.0 # via # dagster # ipykernel @@ -473,7 +496,7 @@ pandocfilters==1.5.1 # via nbconvert parse==1.20.1 # via linkml -parso==0.8.3 +parso==0.8.4 # via jedi passlib==1.7.4 # via -r requirements/main.in @@ -483,12 +506,12 @@ pendulum==3.0.0 # via dagster pexpect==4.9.0 # via ipython -platformdirs==4.2.0 +platformdirs==4.2.2 # via # jupyter-core - # mkdocs + # mkdocs-get-deps # requests-cache -pluggy==1.4.0 +pluggy==1.5.0 # via pytest ply==3.11 # via jsonpath-ng @@ -496,7 +519,7 @@ prefixcommons==0.1.12 # via # linkml # linkml-runtime -prefixmaps==0.2.2 +prefixmaps==0.2.4 # via # linkml # linkml-runtime @@ -520,13 +543,13 @@ ptyprocess==0.7.0 # terminado pure-eval==0.2.2 # via stack-data -pyasn1==0.5.1 +pyasn1==0.6.0 # via # python-jose # rsa -pycparser==2.21 +pycparser==2.22 # via cffi -pydantic==2.6.3 +pydantic==2.7.1 # via # -r requirements/main.in # beanie @@ -536,7 +559,7 @@ pydantic==2.6.3 # lazy-model # linkml # linkml-runtime -pydantic-core==2.16.3 +pydantic-core==2.18.2 # via pydantic pygments==2.17.2 # via @@ -552,7 +575,7 @@ pyjsg==0.11.10 # linkml # pyshexc # shexjsg -pymdown-extensions==10.7 +pymdown-extensions==10.8.1 # via # mkdocs-material # mkdocs-mermaid2-plugin @@ -561,7 +584,7 @@ pymongo==4.7.3 # -r requirements/main.in # motor # nmdc-schema -pyparsing==3.1.1 +pyparsing==3.1.2 # via rdflib pyshex==0.8.1 # via linkml @@ -569,10 +592,16 @@ pyshexc==0.9.1 # via # linkml # pyshex -pytest==8.0.2 - # via pytest-logging +pystow==0.5.4 + # via bioregistry +pytest==8.2.0 + # via + # pytest-logging + # pytest-mock pytest-logging==2015.11.4 # via prefixcommons +pytest-mock==3.14.0 + # via -r requirements/main.in python-dateutil==2.9.0.post0 # via # arrow @@ -595,7 +624,9 @@ python-jose==3.3.0 python-json-logger==2.0.7 # via jupyter-events python-multipart==0.0.9 - # via -r requirements/main.in + # via + # -r requirements/main.in + # fastapi pytrie==0.4.0 # via curies pytz==2024.1 @@ -613,6 +644,7 @@ pyyaml==6.0.1 # linkml # linkml-runtime # mkdocs + # mkdocs-get-deps # mkdocs-mermaid2-plugin # prefixcommons # prefixmaps @@ -621,14 +653,14 @@ pyyaml==6.0.1 # uvicorn pyyaml-env-tag==0.1 # via mkdocs -pyzmq==25.1.2 +pyzmq==26.0.3 # via # ipykernel # jupyter-client # jupyter-console # jupyter-server # qtconsole 
-qtconsole==5.5.1 +qtconsole==5.5.2 # via jupyter qtpy==2.4.1 # via qtconsole @@ -648,12 +680,12 @@ rdflib-shim==1.0.3 # pyshex # pyshexc # sparqlslurper -referencing==0.33.0 +referencing==0.35.1 # via # jsonschema # jsonschema-specifications # jupyter-events -regex==2023.12.25 +regex==2024.5.15 # via mkdocs-material requests==2.31.0 # via @@ -686,8 +718,10 @@ rfc3986-validator==0.1.1 rfc3987==1.3.8 # via jsonschema rich==13.7.1 - # via dagster -rpds-py==0.18.0 + # via + # dagster + # typer +rpds-py==0.18.1 # via # jsonschema # referencing @@ -697,13 +731,15 @@ ruamel-yaml==0.18.6 # via linkml-dataops ruamel-yaml-clib==0.2.8 # via ruamel-yaml -s3transfer==0.10.0 +ruff==0.4.4 + # via shed +s3transfer==0.10.1 # via boto3 semver==3.0.2 # via -r requirements/main.in -send2trash==1.8.2 +send2trash==1.8.3 # via jupyter-server -setuptools-scm==8.0.4 +setuptools-scm==8.1.0 # via -r requirements/main.in shexjsg==0.8.2 # via @@ -742,7 +778,7 @@ sqlalchemy==2.0.27 # linkml stack-data==0.6.3 # via ipython -starlette==0.36.3 +starlette==0.37.2 # via # dagster-graphql # dagster-webserver @@ -753,24 +789,23 @@ tabulate==0.9.0 # via dagster tenacity==8.2.3 # via -r requirements/main.in -terminado==0.18.0 +terminado==0.18.1 # via # jupyter-server # jupyter-server-terminals time-machine==2.13.0 # via pendulum -tinycss2==1.2.1 +tinycss2==1.3.0 # via nbconvert toml==0.10.2 - # via - # beanie - # jupytext + # via beanie tomli==2.0.1 # via # dagster # jupyterlab # pytest # setuptools-scm + # sphinx toolz==0.12.1 # via -r requirements/main.in toposort==1.10 @@ -783,7 +818,7 @@ tornado==6.4 # jupyterlab # notebook # terminado -tqdm==4.66.2 +tqdm==4.66.4 # via # -r requirements/main.in # dagster @@ -806,7 +841,7 @@ traitlets==5.14.1 # qtconsole types-python-dateutil==2.8.19.20240106 # via arrow -typing-extensions==4.10.0 +typing-extensions==4.11.0 # via # alembic # anyio @@ -817,29 +852,32 @@ typing-extensions==4.10.0 # fastapi # pydantic # pydantic-core - # setuptools-scm # sqlalchemy # uvicorn tzdata==2024.1 # via # pandas # pendulum -universal-pathlib==0.2.1 +ujson==5.10.0 + # via fastapi +universal-pathlib==0.2.2 # via dagster uri-template==1.3.0 # via jsonschema url-normalize==1.4.3 # via requests-cache -urllib3==2.0.7 +urllib3==2.2.1 # via # botocore # pyshex # requests # requests-cache -uvicorn==0.27.1 +uvicorn==0.29.0 # via # -r requirements/main.in # dagster-webserver + # fastapi + # fastapi-cli uvloop==0.19.0 # via uvicorn watchdog==4.0.0 @@ -857,7 +895,7 @@ webencodings==0.5.1 # via # bleach # tinycss2 -websocket-client==1.7.0 +websocket-client==1.8.0 # via jupyter-server websockets==12.0 # via uvicorn @@ -873,7 +911,7 @@ yarl==1.9.4 # via gql # The following packages are considered to be unsafe in a requirements file: -setuptools==69.1.1 +setuptools==69.5.1 # via # dagster # mkdocs-mermaid2-plugin diff --git a/tests/test_data/test_ncbi_xml.py b/tests/test_data/test_ncbi_xml.py new file mode 100644 index 00000000..7996b4d1 --- /dev/null +++ b/tests/test_data/test_ncbi_xml.py @@ -0,0 +1,540 @@ +from unittest.mock import MagicMock +import pytest +import xml.etree.ElementTree as ET + +from nmdc_runtime.site.export.ncbi_xml import NCBISubmissionXML +from nmdc_runtime.site.export.ncbi_xml_utils import ( + load_mappings, + handle_quantity_value, + handle_text_value, + handle_timestamp_value, + handle_controlled_term_value, + handle_controlled_identified_term_value, + handle_geolocation_value, + handle_float_value, + handle_string_value, +) + +MOCK_NMDC_STUDY = { + "id": "nmdc:sty-11-34xj1150", + "name": 
"National Ecological Observatory Network: soil metagenomes (DP1.10107.001)", + "description": "This study contains the quality-controlled laboratory metadata and minimally processed sequence data from NEON's soil microbial shotgun metagenomics sequencing. Typically, measurements are done on plot-level composite samples and represent up to three randomly selected sampling locations within a plot.", + "gold_study_identifiers": ["gold:Gs0144570", "gold:Gs0161344"], + "principal_investigator": { + "has_raw_value": "Kate Thibault", + "email": "kthibault@battelleecology.org", + "name": "Kate Thibault", + "orcid": "orcid:0000-0003-3477-6424", + "profile_image_url": "https://portal.nersc.gov/project/m3408/profile_images/thibault_katy.jpg", + }, + "title": "National Ecological Observatory Network: soil metagenomes (DP1.10107.001)", + "type": "nmdc:Study", + "websites": [ + "https://data.neonscience.org/data-products/DP1.10107.001", + "https://data.neonscience.org/api/v0/documents/NEON.DOC.014048vO", + "https://data.neonscience.org/api/v0/documents/NEON_metagenomes_userGuide_vE.pdf", + ], + "study_image": [ + { + "url": "https://portal.nersc.gov/project/m3408/profile_images/nmdc_sty-11-34xj1150.jpg" + } + ], + "funding_sources": [ + "NSF#1724433 National Ecological Observatory Network: Operations Activities" + ], + "has_credit_associations": [ + { + "applies_to_person": { + "name": "Hugh Cross", + "email": "crossh@battelleecology.org", + "orcid": "orcid:0000-0002-6745-9479", + }, + "applied_roles": ["Methodology", "Data curation"], + }, + { + "applies_to_person": { + "name": "Samantha Weintraub-Leff", + "email": "sweintraub@battelleecology.org", + "orcid": "orcid:0000-0003-4789-5086", + }, + "applied_roles": ["Methodology", "Data curation"], + }, + { + "applies_to_person": { + "name": "Kate Thibault", + "email": "kthibault@battelleecology.org", + "orcid": "orcid:0000-0003-3477-6424", + }, + "applied_roles": ["Principal Investigator"], + }, + ], + "part_of": ["nmdc:sty-11-nxrz9m96"], + "study_category": "consortium", + "insdc_bioproject_identifiers": ["bioproject:PRJNA1029061"], + "homepage_website": ["https://www.neonscience.org/"], +} + +MOCK_NCBI_SUBMISSION_METADATA = { + "nmdc_ncbi_attribute_mapping_file_url": "http://example.com/mappings.tsv", + "ncbi_submission_metadata": { + "organization": "Test Org", + }, + "ncbi_biosample_metadata": { + "organism_name": "E. 
coli", + }, +} + + +@pytest.fixture +def ncbi_submission_client(): + return NCBISubmissionXML( + nmdc_study=MOCK_NMDC_STUDY, + ncbi_submission_metadata=MOCK_NCBI_SUBMISSION_METADATA, + ) + + +@pytest.fixture +def nmdc_biosample(): + return [ + { + "analysis_type": ["metagenomics"], + "biosample_categories": ["NEON"], + "collection_date": {"has_raw_value": "2015-07-21T18:00Z"}, + "depth": { + "has_maximum_numeric_value": 1, + "has_minimum_numeric_value": 0, + "has_unit": "meters", + }, + "elev": 1179.5, + "env_broad_scale": { + "term": {"id": "ENVO:01000253", "name": "freshwater river biome"} + }, + "env_local_scale": {"term": {"id": "ENVO:03600094", "name": "stream pool"}}, + "env_medium": {"term": {"id": "ENVO:00002007", "name": "sediment"}}, + "geo_loc_name": {"has_raw_value": "USA: Colorado, Arikaree River"}, + "id": "nmdc:bsm-12-p9q5v236", + "lat_lon": {"latitude": 39.758206, "longitude": -102.447148}, + "name": "ARIK.20150721.AMC.EPIPSAMMON.3", + "part_of": ["nmdc:sty-11-34xj1150"], + "type": "nmdc:Biosample", + } + ] + + +@pytest.fixture +def omics_processing_list(): + return [ + { + "has_input": ["nmdc:procsm-12-ehktny16"], + "has_output": ["nmdc:dobj-12-1zv4q961", "nmdc:dobj-12-b3ft7a80"], + "id": "nmdc:omprc-12-zqm9p096", + "instrument_name": "Illumina NextSeq550", + "name": "Terrestrial soil microbial communities - ARIK.20150721.AMC.EPIPSAMMON.3-DNA1", + "ncbi_project_name": "PRJNA406976", + "omics_type": {"has_raw_value": "metagenome"}, + "part_of": ["nmdc:sty-11-34xj1150"], + "processing_institution": "Battelle", + "type": "nmdc:OmicsProcessing", + } + ] + + +@pytest.fixture +def data_objects_list(): + return [ + { + "data_object_type": "Metagenome Raw Read 1", + "description": "sequencing results for BMI_HVKNKBGX5_Tube347_R1", + "id": "nmdc:dobj-12-b3ft7a80", + "md5_checksum": "cae0a9342d786e731ae71f6f37b76120", + "name": "BMI_HVKNKBGX5_Tube347_R1.fastq.gz", + "type": "nmdc:DataObject", + "url": "https://storage.neonscience.org/neon-microbial-raw-seq-files/2023/BMI_HVKNKBGX5_mms_R1/BMI_HVKNKBGX5_Tube347_R1.fastq.gz", + }, + { + "data_object_type": "Metagenome Raw Read 2", + "description": "sequencing results for BMI_HVKNKBGX5_Tube347_R2", + "id": "nmdc:dobj-12-1zv4q961", + "md5_checksum": "7340fe25644183a4f56d36ce52389d83", + "name": "BMI_HVKNKBGX5_Tube347_R2.fastq.gz", + "type": "nmdc:DataObject", + "url": "https://storage.neonscience.org/neon-microbial-raw-seq-files/2023/BMI_HVKNKBGX5_mms_R2/BMI_HVKNKBGX5_Tube347_R2.fastq.gz", + }, + ] + + +class TestNCBISubmissionXML: + def test_set_element(self, ncbi_submission_client): + element = ncbi_submission_client.set_element("Test", "Hello", {"attr": "value"}) + assert element.tag == "Test" + assert element.text == "Hello" + assert element.attrib == {"attr": "value"} + + def test_set_description(self, ncbi_submission_client): + ncbi_submission_client.set_description( + ncbi_submission_client.nmdc_pi_email, + "testuser", + "Kate", + "Thibault", + "Test Org", + ) + description = ET.tostring( + ncbi_submission_client.root.find("Description"), "unicode" + ) + + root = ET.fromstring(description) + comment = root.find("Comment").text + submitter = root.find("Submitter").attrib["user_name"] + org_name = root.find("Organization/Name").text + contact_email = root.find("Organization/Contact").attrib["email"] + contact_first = root.find("Organization/Contact/Name/First").text + contact_last = root.find("Organization/Contact/Name/Last").text + + assert comment == f"NMDC Submission for {MOCK_NMDC_STUDY['id']}" + assert submitter == "testuser" + 
assert org_name == "Test Org" + assert contact_email == "kthibault@battelleecology.org" + assert contact_first == "Kate" + assert contact_last == "Thibault" + + def test_set_bioproject(self, ncbi_submission_client): + ncbi_submission_client.set_bioproject( + title=MOCK_NMDC_STUDY["title"], + project_id=MOCK_NMDC_STUDY["insdc_bioproject_identifiers"][0], + description=MOCK_NMDC_STUDY["description"], + data_type="metagenome", + org="Test Org", + ) + bioproject_xml = ET.tostring( + ncbi_submission_client.root.find(".//Project"), "unicode" + ) + assert ( + "National Ecological Observatory Network: soil metagenomes (DP1.10107.001)" + in bioproject_xml + ) + assert "bioproject:PRJNA1029061" in bioproject_xml + assert ( + "This study contains the quality-controlled laboratory metadata and minimally processed sequence data from NEON's soil microbial shotgun metagenomics sequencing." + in bioproject_xml + ) + assert "metagenome" in bioproject_xml + assert "Test Org" in bioproject_xml + + def test_set_biosample(self, ncbi_submission_client, nmdc_biosample, mocker): + mocker.patch( + "nmdc_runtime.site.export.ncbi_xml.load_mappings", + return_value=( + { + "analysis_type": "", + "biosample_categories": "", + "collection_date": "collection_date", + "conduc": "conduc", + "elev": "elev", + "env_broad_scale": "env_broad_scale", + "env_local_scale": "env_local_scale", + "env_medium": "env_medium", + "env_package": "env_package", + "geo_loc_name": "geo_loc_name", + "id": "", + "lat_lon": "lat_lon", + "name": "sample_name", + "part_of": "", + "samp_collec_device": "samp_collect_device", + "temp": "temp", + "type": "", + }, + { + "analysis_type": "AnalysisTypeEnum", + "biosample_categories": "BiosampleCategoryEnum", + "collection_date": "TimestampValue", + "conduc": "QuantityValue", + "elev": "float", + "env_broad_scale": "ControlledIdentifiedTermValue", + "env_local_scale": "ControlledIdentifiedTermValue", + "env_medium": "ControlledIdentifiedTermValue", + "env_package": "TextValue", + "geo_loc_name": "TextValue", + "id": "uriorcurie", + "lat_lon": "GeolocationValue", + "name": "string", + "part_of": "Study", + "samp_collec_device": "string", + "temp": "QuantityValue", + "type": "string", + }, + ), + ) + ncbi_submission_client.set_biosample( + organism_name=MOCK_NCBI_SUBMISSION_METADATA["ncbi_biosample_metadata"][ + "organism_name" + ], + org=MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"][ + "organization" + ], + bioproject_id=MOCK_NMDC_STUDY["insdc_bioproject_identifiers"][0], + nmdc_biosamples=nmdc_biosample, + nmdc_omics_processing=[], + ) + biosample_xml = ET.tostring( + ncbi_submission_client.root.find(".//BioSample"), "unicode" + ) + assert "E. 
coli" in biosample_xml + assert "Test Org" in biosample_xml + assert "PRJNA1029061" in biosample_xml + + def test_set_fastq(self, ncbi_submission_client, data_objects_list, nmdc_biosample): + biosample_data_objects = [ + {biosample["id"]: data_objects_list} for biosample in nmdc_biosample + ] + + ncbi_submission_client.set_fastq( + biosample_data_objects=biosample_data_objects, + bioproject_id=MOCK_NMDC_STUDY["insdc_bioproject_identifiers"][0], + org="Test Org", + ) + + action_elements = ncbi_submission_client.root.findall(".//Action") + assert len(action_elements) == len(biosample_data_objects) + + for action_element in action_elements: + action_xml = ET.tostring(action_element, "unicode") + assert ( + "BMI_HVKNKBGX5_Tube347_R1.fastq.gz" in action_xml + or "BMI_HVKNKBGX5_Tube347_R2.fastq.gz" in action_xml + ) + assert "PRJNA1029061" in action_xml + assert "nmdc:bsm-12-p9q5v236" in action_xml + assert "Test Org" in action_xml + + def test_get_submission_xml( + self, mocker, ncbi_submission_client, nmdc_biosample, data_objects_list + ): + mocker.patch( + "nmdc_runtime.site.export.ncbi_xml.load_mappings", + return_value=( + { + "analysis_type": "", + "biosample_categories": "", + "collection_date": "collection_date", + "conduc": "conduc", + "elev": "elev", + "env_broad_scale": "env_broad_scale", + "env_local_scale": "env_local_scale", + "env_medium": "env_medium", + "env_package": "env_package", + "geo_loc_name": "geo_loc_name", + "id": "", + "lat_lon": "lat_lon", + "name": "sample_name", + "part_of": "", + "samp_collec_device": "samp_collect_device", + "temp": "temp", + "type": "", + }, + { + "analysis_type": "AnalysisTypeEnum", + "biosample_categories": "BiosampleCategoryEnum", + "collection_date": "TimestampValue", + "conduc": "QuantityValue", + "elev": "float", + "env_broad_scale": "ControlledIdentifiedTermValue", + "env_local_scale": "ControlledIdentifiedTermValue", + "env_medium": "ControlledIdentifiedTermValue", + "env_package": "TextValue", + "geo_loc_name": "TextValue", + "id": "uriorcurie", + "lat_lon": "GeolocationValue", + "name": "string", + "part_of": "Study", + "samp_collec_device": "string", + "temp": "QuantityValue", + "type": "string", + }, + ), + ) + + biosample_data_objects = [ + {biosample["id"]: data_objects_list} for biosample in nmdc_biosample + ] + + ncbi_submission_client.set_fastq( + biosample_data_objects=biosample_data_objects, + bioproject_id=MOCK_NMDC_STUDY["insdc_bioproject_identifiers"][0], + org="Test Org", + ) + + submission_xml = ncbi_submission_client.get_submission_xml( + nmdc_biosample, [], biosample_data_objects + ) + + assert "nmdc:bsm-12-p9q5v236" in submission_xml + assert "E. 
coli" in submission_xml + assert "sediment" in submission_xml + assert "USA: Colorado, Arikaree River" in submission_xml + assert "2015-07-21T18:00Z" in submission_xml + assert "National Microbiome Data Collaborative (NMDC)" in submission_xml + assert ( + "National Ecological Observatory Network: soil metagenomes (DP1.10107.001)" + in submission_xml + ) + + +class TestNCBIXMLUtils: + def test_handle_quantity_value(self): + assert ( + handle_quantity_value({"has_numeric_value": 10, "has_unit": "mg"}) + == "10 mg" + ) + assert ( + handle_quantity_value( + { + "has_maximum_numeric_value": 15, + "has_minimum_numeric_value": 5, + "has_unit": "kg", + } + ) + == "10 kg" + ) + assert handle_quantity_value({"has_raw_value": "20 units"}) == "20 units" + assert handle_quantity_value({}) == "Unknown format" + + def test_handle_text_value(self): + assert handle_text_value({"has_raw_value": "Sample Text"}) == "Sample Text" + assert handle_text_value({}) == "Unknown format" + + def test_handle_timestamp_value(self): + assert handle_timestamp_value({"has_raw_value": "2021-01-01"}) == "2021-01-01" + assert handle_timestamp_value({}) == "Unknown format" + + def test_handle_controlled_term_value(self): + term_data = {"term": {"name": "Homo sapiens", "id": "NCBITaxon:9606"}} + assert ( + handle_controlled_term_value(term_data) == "Homo sapiens [NCBITaxon:9606]" + ) + assert ( + handle_controlled_term_value({"term": {"id": "NCBITaxon:9606"}}) + == "NCBITaxon:9606" + ) + assert ( + handle_controlled_term_value({"term": {"name": "Homo sapiens"}}) + == "Homo sapiens" + ) + assert ( + handle_controlled_term_value( + {"has_raw_value": "Homo sapiens [NCBITaxon:9606]"} + ) + == "Homo sapiens [NCBITaxon:9606]" + ) + assert handle_controlled_term_value({}) == "Unknown format" + + def test_handle_controlled_identified_term_value(self): + term_data = {"term": {"name": "Homo sapiens", "id": "NCBITaxon:9606"}} + assert ( + handle_controlled_identified_term_value(term_data) + == "Homo sapiens [NCBITaxon:9606]" + ) + assert ( + handle_controlled_identified_term_value({"term": {"id": "NCBITaxon:9606"}}) + == "NCBITaxon:9606" + ) + assert ( + handle_controlled_identified_term_value({"term": {"name": "Homo sapiens"}}) + == "Unknown format" + ) + assert ( + handle_controlled_identified_term_value( + {"has_raw_value": "Homo sapiens [NCBITaxon:9606]"} + ) + == "Homo sapiens [NCBITaxon:9606]" + ) + assert handle_controlled_identified_term_value({}) == "Unknown format" + + def test_handle_geolocation_value(self): + assert ( + handle_geolocation_value({"latitude": 34.05, "longitude": -118.25}) + == "34.05 -118.25" + ) + assert ( + handle_geolocation_value({"has_raw_value": "34.05, -118.25"}) + == "34.05, -118.25" + ) + assert handle_geolocation_value({}) == "Unknown format" + + def test_handle_float_value(self): + assert handle_float_value(10.1234) == "10.12" + + def test_handle_string_value(self): + assert handle_string_value("Foo") == "Foo" + + def test_load_mappings(self, mocker): + mock_tsv_content = ( + "nmdc_schema_class\tnmdc_schema_slot\tnmdc_schema_slot_range\tncbi_biosample_attribute_name\tstatic_value\tignore\n" + "Biosample\tanalysis_type\tAnalysisTypeEnum\t\t\t\n" + "Biosample\tbiosample_categories\tBiosampleCategoryEnum\t\t\t\n" + "Biosample\tcollection_date\tTimestampValue\tcollection_date\t\t\n" + "Biosample\tconduc\tQuantityValue\tconduc\t\t\n" + "Biosample\telev\tfloat\telev\t\t\n" + "Biosample\tenv_broad_scale\tControlledIdentifiedTermValue\tenv_broad_scale\t\t\n" + 
"Biosample\tenv_local_scale\tControlledIdentifiedTermValue\tenv_local_scale\t\t\n" + "Biosample\tenv_medium\tControlledIdentifiedTermValue\tenv_medium\t\t\n" + "Biosample\tenv_package\tTextValue\tenv_package\t\t\n" + "Biosample\tgeo_loc_name\tTextValue\tgeo_loc_name\t\t\n" + "Biosample\tid\turiorcurie\t\t\t\n" + "Biosample\tlat_lon\tGeolocationValue\tlat_lon\t\t\n" + "Biosample\tname\tstring\tsample_name\t\t\n" + "Biosample\tpart_of\tStudy\t\t\t\n" + "Biosample\tsamp_collec_device\tstring\tsamp_collect_device\t\t\n" + "Biosample\ttemp\tQuantityValue\ttemp\t\t\n" + "Biosample\ttype\tstring\t\t\t\n" + ) + + mock_response = MagicMock() + mock_response.text = mock_tsv_content + mocker.patch("requests.get", return_value=mock_response) + + attribute_mappings, slot_range_mappings = load_mappings( + "http://example.com/mappings.tsv" + ) + + expected_attribute_mappings = { + "analysis_type": "analysis_type", + "biosample_categories": "biosample_categories", + "collection_date": "collection_date", + "conduc": "conduc", + "elev": "elev", + "env_broad_scale": "env_broad_scale", + "env_local_scale": "env_local_scale", + "env_medium": "env_medium", + "env_package": "env_package", + "geo_loc_name": "geo_loc_name", + "id": "id", + "lat_lon": "lat_lon", + "name": "sample_name", + "part_of": "part_of", + "samp_collec_device": "samp_collect_device", + "temp": "temp", + "type": "type", + } + + expected_slot_range_mappings = { + "analysis_type": "AnalysisTypeEnum", + "biosample_categories": "BiosampleCategoryEnum", + "collection_date": "TimestampValue", + "conduc": "QuantityValue", + "elev": "float", + "env_broad_scale": "ControlledIdentifiedTermValue", + "env_local_scale": "ControlledIdentifiedTermValue", + "env_medium": "ControlledIdentifiedTermValue", + "env_package": "TextValue", + "geo_loc_name": "TextValue", + "id": "uriorcurie", + "lat_lon": "GeolocationValue", + "name": "string", + "part_of": "Study", + "samp_collec_device": "string", + "temp": "QuantityValue", + "type": "string", + } + + assert attribute_mappings == expected_attribute_mappings + assert slot_range_mappings == expected_slot_range_mappings