From e863061d2e1edf1f1a1d873b13134f52068d360a Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Tue, 3 Dec 2024 11:46:38 -0800 Subject: [PATCH 1/3] add special handling for elev and host_taxid slots in NCBI export pipeline --- nmdc_runtime/site/export/ncbi_xml.py | 53 ++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py index da87d5df..b3432e05 100644 --- a/nmdc_runtime/site/export/ncbi_xml.py +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -1,4 +1,5 @@ import os +import re import datetime import xml.etree.ElementTree as ET import xml.dom.minidom @@ -170,7 +171,39 @@ def set_biosample( for json_key, value in biosample.items(): if isinstance(value, list): - continue # Skip processing for list values + for item in value: + if json_key not in attribute_mappings: + continue + + xml_key = attribute_mappings[json_key] + value_type = slot_range_mappings.get(json_key, "string") + handler = self.type_handlers.get( + value_type, handle_string_value + ) + + # Special handling for "elev" key + if json_key == "elev": + value = f"{float(value)} m" # Convert to float if possible + attributes[xml_key] = value + continue # Skip applying the handler to this key + + # Special handling for "host_taxid" + if json_key == "host_taxid" and isinstance(value, dict): + if "term" in value and "id" in value["term"]: + value = re.findall( + r"\d+", value["term"]["id"].split(":")[1] + )[0] + attributes[xml_key] = value + continue # Skip applying the handler to this key + + formatted_value = handler(item) + + # Combine multiple values with a separator for list elements + if xml_key in attributes: + attributes[xml_key] += f"| {formatted_value}" + else: + attributes[xml_key] = formatted_value + continue if json_key == "env_package": env_package = f"MIMS.me.{handle_text_value(value)}.6.0" @@ -187,6 +220,20 @@ def set_biosample( value_type = slot_range_mappings.get(json_key, "string") handler = self.type_handlers.get(value_type, handle_string_value) + # Special handling for "elev" key + if json_key == "elev": + value = f"{float(value)} m" # Convert to float if possible + attributes[xml_key] = value + continue # Skip applying the handler to this key + + # Special handling for "host_taxid" + if json_key == "host_taxid" and isinstance(value, dict): + if "term" in value and "id" in value["term"]: + value = re.findall(r"\d+", value["term"]["id"].split(":")[1])[0] + attributes[xml_key] = value + continue # Skip applying the handler to this key + + # Default processing for other keys formatted_value = handler(value) attributes[xml_key] = formatted_value @@ -353,9 +400,9 @@ def set_fastq( "RefId", children=[ self.set_element( - "SPUID", + "PrimaryId", bioproject_id, - {"spuid_namespace": org}, + {"db": "BioProject"}, ) ], ) From c3d5154f0018209f228d89561b5bc0f5dcbb5ca0 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Tue, 3 Dec 2024 15:22:37 -0800 Subject: [PATCH 2/3] platform and instrument_model term inference in XML --- nmdc_runtime/site/export/ncbi_xml.py | 17 +++++++++++---- nmdc_runtime/site/export/ncbi_xml_utils.py | 25 ++++++++++++++++++++++ nmdc_runtime/site/graphs.py | 3 +++ nmdc_runtime/site/ops.py | 11 ++++++++++ 4 files changed, 52 insertions(+), 4 deletions(-) diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py index b3432e05..896bd093 100644 --- a/nmdc_runtime/site/export/ncbi_xml.py +++ b/nmdc_runtime/site/export/ncbi_xml.py @@ -7,6 +7,7 @@ from typing import Any from urllib.parse import urlparse from nmdc_runtime.site.export.ncbi_xml_utils import ( + get_instruments, handle_controlled_identified_term_value, handle_controlled_term_value, handle_geolocation_value, @@ -333,6 +334,7 @@ def set_fastq( nmdc_nucleotide_sequencing: list, nmdc_biosamples: list, nmdc_library_preparation: list, + all_instruments: dict, ): bsm_id_name_dict = { biosample["id"]: biosample["name"] for biosample in nmdc_biosamples @@ -343,9 +345,10 @@ def set_fastq( biosample_ids = [] nucleotide_sequencing_ids = {} lib_prep_protocol_names = {} - instrument_name = "" analyte_category = "" library_name = "" + instrument_vendor = "" + instrument_model = "" for biosample_id, data_objects in entry.items(): biosample_ids.append(biosample_id) @@ -363,7 +366,11 @@ def set_fastq( ) # Currently, we are making the assumption that only one instrument # is used to sequence a Biosample - instrument_name = ntseq.get("instrument_used", "")[0] + instrument_id = ntseq.get("instrument_used", "")[0] + instrument = all_instruments.get(instrument_id, {}) + instrument_vendor = instrument.get("vendor", "") + instrument_model = instrument.get("model", "") + analyte_category = ntseq.get("analyte_category", "") library_name = bsm_id_name_dict.get(biosample_id, "") @@ -431,11 +438,11 @@ def set_fastq( ) sra_attributes = [] - if instrument_name.lower().startswith("illumina"): + if instrument_vendor == "illumina": sra_attributes.append( self.set_element("Attribute", "ILLUMINA", {"name": "platform"}) ) - if "nextseq550" in instrument_name.lower(): + if instrument_model == "nextseq_550": sra_attributes.append( self.set_element( "Attribute", "NextSeq 550", {"name": "instrument_model"} @@ -548,6 +555,7 @@ def get_submission_xml( biosample_nucleotide_sequencing_list: list, biosample_data_objects_list: list, biosample_library_preparation_list: list, + instruments_dict: dict, ): data_type = None ncbi_project_id = None @@ -592,6 +600,7 @@ def get_submission_xml( nmdc_nucleotide_sequencing=biosample_nucleotide_sequencing_list, nmdc_biosamples=biosamples_list, nmdc_library_preparation=biosample_library_preparation_list, + all_instruments=instruments_dict, ) rough_string = ET.tostring(self.root, "unicode") diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py index a7247ca8..5feb1e45 100644 --- a/nmdc_runtime/site/export/ncbi_xml_utils.py +++ b/nmdc_runtime/site/export/ncbi_xml_utils.py @@ -20,6 +20,31 @@ def get_classname_from_typecode(doc_id): return class_map.get(typecode) +def get_instruments(instrument_set_collection): + # dictionary to capture a list of all instruments + # Structure of dict: + # {"instrument_id": {"vendor": "vendor_name", "model": "model_name"}} + all_instruments = {} + + try: + query = {"type": "nmdc:Instrument"} + cursor = instrument_set_collection.find(query) + + for document in cursor: + instrument_id = document.get("id") + vendor = document.get("vendor") + model = document.get("model") + + if not instrument_id or not vendor or not model: + continue + + all_instruments[instrument_id] = {"vendor": vendor, "model": model} + + return all_instruments + except Exception as e: + raise RuntimeError(f"An error occurred while fetching instrument data: {e}") + + def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list): biosample_data_objects = [] diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py index 1cb1f1e3..bb69ab5a 100644 --- a/nmdc_runtime/site/graphs.py +++ b/nmdc_runtime/site/graphs.py @@ -53,6 +53,7 @@ get_data_objects_from_biosamples, get_nucleotide_sequencing_from_biosamples, get_library_preparation_from_biosamples, + get_all_instruments, get_ncbi_export_pipeline_inputs, ncbi_submission_xml_from_nmdc_study, ncbi_submission_xml_asset, @@ -449,6 +450,7 @@ def nmdc_study_to_ncbi_submission_export(): ) data_object_records = get_data_objects_from_biosamples(biosamples) library_preparation_records = get_library_preparation_from_biosamples(biosamples) + all_instruments = get_all_instruments() xml_data = ncbi_submission_xml_from_nmdc_study( nmdc_study, ncbi_submission_metadata, @@ -456,5 +458,6 @@ def nmdc_study_to_ncbi_submission_export(): nucleotide_sequencing_records, data_object_records, library_preparation_records, + all_instruments, ) ncbi_submission_xml_asset(xml_data) diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py index c8317fae..058592d0 100644 --- a/nmdc_runtime/site/ops.py +++ b/nmdc_runtime/site/ops.py @@ -68,6 +68,7 @@ fetch_data_objects_from_biosamples, fetch_nucleotide_sequencing_from_biosamples, fetch_library_preparation_from_biosamples, + get_instruments, ) from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict from nmdc_runtime.site.resources import ( @@ -1221,6 +1222,14 @@ def get_library_preparation_from_biosamples( return biosample_lib_prep +@op(required_resource_keys={"mongo"}) +def get_all_instruments(context: OpExecutionContext): + mdb = context.resources.mongo.db + instrument_set_collection = mdb["instrument_set"] + all_instruments = get_instruments(instrument_set_collection) + return all_instruments + + @op def ncbi_submission_xml_from_nmdc_study( context: OpExecutionContext, @@ -1230,6 +1239,7 @@ def ncbi_submission_xml_from_nmdc_study( omics_processing_records: list, data_object_records: list, library_preparation_records: list, + all_instruments: dict, ) -> str: ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata) ncbi_xml = ncbi_exporter.get_submission_xml( @@ -1237,5 +1247,6 @@ def ncbi_submission_xml_from_nmdc_study( omics_processing_records, data_object_records, library_preparation_records, + all_instruments, ) return ncbi_xml From d1cca89fa8c70076fea5e6e3b7788b1cd63e1109 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Wed, 4 Dec 2024 09:59:35 -0800 Subject: [PATCH 3/3] modify tests for NCBI Submission XML utils --- tests/test_data/test_ncbi_xml.py | 158 ++++++++++++++++++++++++------- 1 file changed, 122 insertions(+), 36 deletions(-) diff --git a/tests/test_data/test_ncbi_xml.py b/tests/test_data/test_ncbi_xml.py index 8e34fff7..f824be8f 100644 --- a/tests/test_data/test_ncbi_xml.py +++ b/tests/test_data/test_ncbi_xml.py @@ -16,6 +16,7 @@ handle_geolocation_value, handle_float_value, handle_string_value, + get_instruments, ) MOCK_NMDC_STUDY = { @@ -102,24 +103,54 @@ def nmdc_biosample(): { "analysis_type": ["metagenomics"], "biosample_categories": ["NEON"], - "collection_date": {"has_raw_value": "2015-07-21T18:00Z"}, + "collection_date": { + "has_raw_value": "2015-07-21T18:00Z", + "type": "nmdc:TimestampValue", + }, "depth": { "has_maximum_numeric_value": 1, "has_minimum_numeric_value": 0, - "has_unit": "meters", + "has_unit": "m", + "type": "nmdc:QuantityValue", }, "elev": 1179.5, "env_broad_scale": { - "term": {"id": "ENVO:01000253", "name": "freshwater river biome"} + "term": { + "id": "ENVO:01000253", + "name": "freshwater river biome", + "type": "nmdc:OntologyClass", + }, + "type": "nmdc:ControlledIdentifiedTermValue", + }, + "env_local_scale": { + "term": { + "id": "ENVO:03600094", + "name": "stream pool", + "type": "nmdc:OntologyClass", + }, + "type": "nmdc:ControlledIdentifiedTermValue", + }, + "env_medium": { + "term": { + "id": "ENVO:03605004", + "name": "epipsammon", + "type": "nmdc:OntologyClass", + }, + "type": "nmdc:ControlledIdentifiedTermValue", + }, + "geo_loc_name": { + "has_raw_value": "USA: Colorado, Arikaree River", + "type": "nmdc:TextValue", }, - "env_local_scale": {"term": {"id": "ENVO:03600094", "name": "stream pool"}}, - "env_medium": {"term": {"id": "ENVO:00002007", "name": "sediment"}}, - "geo_loc_name": {"has_raw_value": "USA: Colorado, Arikaree River"}, "id": "nmdc:bsm-12-p9q5v236", - "lat_lon": {"latitude": 39.758206, "longitude": -102.447148}, + "lat_lon": { + "latitude": 39.758206, + "longitude": -102.447148, + "type": "nmdc:GeolocationValue", + }, "name": "ARIK.20150721.AMC.EPIPSAMMON.3", - "part_of": ["nmdc:sty-11-34xj1150"], "type": "nmdc:Biosample", + "associated_studies": ["nmdc:sty-11-pzmd0x14"], } ] @@ -128,21 +159,16 @@ def nmdc_biosample(): def nucleotide_sequencing_list(): return [ { + "id": "nmdc:dgns-11-e01w1f21", + "type": "nmdc:NucleotideSequencing", + "name": "Benthic microbial communities - ARIK.20150721.AMC.EPIPSAMMON.3-DNA1", "has_input": ["nmdc:procsm-12-ehktny16"], - "has_output": ["nmdc:dobj-12-1zv4q961", "nmdc:dobj-12-b3ft7a80"], - "id": "nmdc:omprc-12-zqm9p096", - "instrument_used": ["Illumina NextSeq550"], - "name": "Terrestrial soil microbial communities - ARIK.20150721.AMC.EPIPSAMMON.3-DNA1", - "ncbi_project_name": "PRJNA406976", - "associated_studies": ["nmdc:sty-11-34xj1150"], + "has_output": ["nmdc:dobj-11-8wjdvj33", "nmdc:dobj-11-0y3amn94"], "processing_institution": "Battelle", "analyte_category": "metagenome", - "type": [ - "NucleotideSequencing", - "DataGeneration", - "PlannedProcess", - "NamedThing", - ], + "associated_studies": ["nmdc:sty-11-pzmd0x14"], + "instrument_used": ["nmdc:inst-14-xz5tb342"], + "ncbi_project_name": "PRJNA406976", } ] @@ -151,22 +177,22 @@ def nucleotide_sequencing_list(): def data_objects_list(): return [ { - "data_object_type": "Metagenome Raw Read 1", - "description": "sequencing results for BMI_HVKNKBGX5_Tube347_R1", - "id": "nmdc:dobj-12-b3ft7a80", - "md5_checksum": "cae0a9342d786e731ae71f6f37b76120", - "name": "BMI_HVKNKBGX5_Tube347_R1.fastq.gz", + "id": "nmdc:dobj-11-8wjdvj33", "type": "nmdc:DataObject", - "url": "https://storage.neonscience.org/neon-microbial-raw-seq-files/2023/BMI_HVKNKBGX5_mms_R1/BMI_HVKNKBGX5_Tube347_R1.fastq.gz", + "name": "BMI_HVKNKBGX5_Tube347_srt_R1.fastq.gz", + "description": "sequencing results for BMI_HVKNKBGX5_Tube347_srt_R1", + "data_object_type": "Metagenome Raw Read 1", + "md5_checksum": "98017c587ef4e6a8a54f8daa0925e4e1", + "url": "https://storage.neonscience.org/neon-microbial-raw-seq-files/2023/BMI_HVKNKBGX5_srt_R1/BMI_HVKNKBGX5_Tube347_srt_R1.fastq.gz", }, { - "data_object_type": "Metagenome Raw Read 2", - "description": "sequencing results for BMI_HVKNKBGX5_Tube347_R2", - "id": "nmdc:dobj-12-1zv4q961", - "md5_checksum": "7340fe25644183a4f56d36ce52389d83", - "name": "BMI_HVKNKBGX5_Tube347_R2.fastq.gz", + "id": "nmdc:dobj-11-0y3amn94", "type": "nmdc:DataObject", - "url": "https://storage.neonscience.org/neon-microbial-raw-seq-files/2023/BMI_HVKNKBGX5_mms_R2/BMI_HVKNKBGX5_Tube347_R2.fastq.gz", + "name": "BMI_HVKNKBGX5_Tube347_srt_R2.fastq.gz", + "description": "sequencing results for BMI_HVKNKBGX5_Tube347_srt_R2", + "data_object_type": "Metagenome Raw Read 2", + "md5_checksum": "5358ce1da32bfad7c358c484cbf5075b", + "url": "https://storage.neonscience.org/neon-microbial-raw-seq-files/2023/BMI_HVKNKBGX5_srt_R2/BMI_HVKNKBGX5_Tube347_srt_R2.fastq.gz", }, ] @@ -184,6 +210,26 @@ def library_preparation_dict(): } +@pytest.fixture +def mocked_instruments(): + return [ + { + "id": "nmdc:inst-14-xz5tb342", + "model": "nextseq_550", + "name": "Illumina NextSeq 550", + "vendor": "illumina", + "type": "nmdc:Instrument", + }, + { + "id": "nmdc:inst-14-79zxap02", + "model": "hiseq", + "name": "Illumina HiSeq", + "vendor": "illumina", + "type": "nmdc:Instrument", + }, + ] + + class TestNCBISubmissionXML: def test_set_element(self, ncbi_submission_client: NCBISubmissionXML): element = ncbi_submission_client.set_element("Test", "Hello", {"attr": "value"}) @@ -311,7 +357,16 @@ def test_set_fastq( data_objects_list: list[dict[str, str]], nucleotide_sequencing_list: list[dict[str, Any]], library_preparation_dict: dict[str, Any], + mocked_instruments: list[dict[str, Any]], ): + all_instruments = { + instrument["id"]: { + "vendor": instrument["vendor"], + "model": instrument["model"], + } + for instrument in mocked_instruments + } + biosample_data_objects = [ {biosample["id"]: data_objects_list} for biosample in nmdc_biosample ] @@ -332,6 +387,7 @@ def test_set_fastq( nmdc_nucleotide_sequencing=biosample_nucleotide_sequencing, nmdc_biosamples=nmdc_biosample, nmdc_library_preparation=biosample_library_preparation, + all_instruments=all_instruments, ) action_elements = ncbi_submission_client.root.findall(".//Action") @@ -340,8 +396,8 @@ def test_set_fastq( for action_element in action_elements: action_xml = ET.tostring(action_element, "unicode") assert ( - "BMI_HVKNKBGX5_Tube347_R1.fastq.gz" in action_xml - or "BMI_HVKNKBGX5_Tube347_R2.fastq.gz" in action_xml + "BMI_HVKNKBGX5_Tube347_srt_R1.fastq.gz" in action_xml + or "BMI_HVKNKBGX5_Tube347_srt_R2.fastq.gz" in action_xml ) assert "PRJNA1029061" in action_xml assert "nmdc:bsm-12-p9q5v236" in action_xml @@ -364,6 +420,7 @@ def test_get_submission_xml( data_objects_list: list[dict[str, str]], nucleotide_sequencing_list: list[dict[str, Any]], library_preparation_dict: dict[str, Any], + mocked_instruments: list[dict[str, Any]], ): mocker.patch( "nmdc_runtime.site.export.ncbi_xml.load_mappings", @@ -409,6 +466,14 @@ def test_get_submission_xml( ), ) + all_instruments = { + instrument["id"]: { + "vendor": instrument["vendor"], + "model": instrument["model"], + } + for instrument in mocked_instruments + } + biosample_data_objects = [ {biosample["id"]: data_objects_list} for biosample in nmdc_biosample ] @@ -429,15 +494,19 @@ def test_get_submission_xml( nmdc_nucleotide_sequencing=biosample_nucleotide_sequencing, nmdc_biosamples=nmdc_biosample, nmdc_library_preparation=biosample_library_preparation, + all_instruments=all_instruments, ) submission_xml = ncbi_submission_client.get_submission_xml( - nmdc_biosample, [], biosample_data_objects, biosample_library_preparation + nmdc_biosample, + [], + biosample_data_objects, + biosample_library_preparation, + all_instruments, ) assert "nmdc:bsm-12-p9q5v236" in submission_xml assert "E. coli" in submission_xml - assert "sediment" in submission_xml assert "USA: Colorado, Arikaree River" in submission_xml assert "2015-07-21T18:00Z" in submission_xml assert "National Microbiome Data Collaborative" in submission_xml @@ -608,3 +677,20 @@ def test_load_mappings( assert attribute_mappings == expected_attribute_mappings assert slot_range_mappings == expected_slot_range_mappings + + def test_get_instruments( + self, + mocker: Callable[..., Generator[MockerFixture, None, None]], + mocked_instruments: list[dict[str, Any]], + ): + mock_instrument_set_collection = mocker.Mock() + mock_instrument_set_collection.find.return_value = iter(mocked_instruments) + + actual_instruments = get_instruments(mock_instrument_set_collection) + + expected_instruments = { + "nmdc:inst-14-xz5tb342": {"vendor": "illumina", "model": "nextseq_550"}, + "nmdc:inst-14-79zxap02": {"vendor": "illumina", "model": "hiseq"}, + } + + assert actual_instruments == expected_instruments