From e863061d2e1edf1f1a1d873b13134f52068d360a Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Tue, 3 Dec 2024 11:46:38 -0800
Subject: [PATCH 1/3] add special handling for elev and host_taxid slots in
 NCBI export pipeline

---
 nmdc_runtime/site/export/ncbi_xml.py | 53 ++++++++++++++++++++++++++--
 1 file changed, 50 insertions(+), 3 deletions(-)

diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
index da87d5df..b3432e05 100644
--- a/nmdc_runtime/site/export/ncbi_xml.py
+++ b/nmdc_runtime/site/export/ncbi_xml.py
@@ -1,4 +1,5 @@
 import os
+import re
 import datetime
 import xml.etree.ElementTree as ET
 import xml.dom.minidom
@@ -170,7 +171,39 @@ def set_biosample(
 
             for json_key, value in biosample.items():
                 if isinstance(value, list):
-                    continue  # Skip processing for list values
+                    for item in value:
+                        if json_key not in attribute_mappings:
+                            continue
+
+                        xml_key = attribute_mappings[json_key]
+                        value_type = slot_range_mappings.get(json_key, "string")
+                        handler = self.type_handlers.get(
+                            value_type, handle_string_value
+                        )
+
+                        # "elev": format this item; float() of the whole list raises
+                        if json_key == "elev":
+                            attributes[xml_key] = f"{float(item)} m"
+                            continue  # Skip applying the handler to this key
+
+                        # "host_taxid": keep the digits after ":" of the term id.
+                        # Test the item, not the list, or this branch never runs.
+                        if json_key == "host_taxid" and isinstance(item, dict):
+                            if "term" in item and "id" in item["term"]:
+                                item = re.findall(
+                                    r"\d+", item["term"]["id"].split(":")[1]
+                                )[0]
+                            attributes[xml_key] = item
+                            continue  # Skip applying the handler to this key
+
+                        formatted_value = handler(item)
+
+                        # Combine multiple values with a separator for list elements
+                        if xml_key in attributes:
+                            attributes[xml_key] += f"| {formatted_value}"
+                        else:
+                            attributes[xml_key] = formatted_value
+                    continue
 
                 if json_key == "env_package":
                     env_package = f"MIMS.me.{handle_text_value(value)}.6.0"
@@ -187,6 +220,20 @@ def set_biosample(
                 value_type = slot_range_mappings.get(json_key, "string")
                 handler = self.type_handlers.get(value_type, handle_string_value)
 
+                # Special handling for "elev" key
+                if json_key == "elev":
+                    value = f"{float(value)} m"  # Convert to float if possible
+                    attributes[xml_key] = value
+                    continue  # Skip applying the handler to this key
+
+                # Special handling for "host_taxid"
+                if json_key == "host_taxid" and isinstance(value, dict):
+                    if "term" in value and "id" in value["term"]:
+                        value = re.findall(r"\d+", value["term"]["id"].split(":")[1])[0]
+                    attributes[xml_key] = value
+                    continue  # Skip applying the handler to this key
+
+                # Default processing for other keys
                 formatted_value = handler(value)
                 attributes[xml_key] = formatted_value
 
@@ -353,9 +400,9 @@ def set_fastq(
                                 "RefId",
                                 children=[
                                     self.set_element(
-                                        "SPUID",
+                                        "PrimaryId",
                                         bioproject_id,
-                                        {"spuid_namespace": org},
+                                        {"db": "BioProject"},
                                     )
                                 ],
                             )

From c3d5154f0018209f228d89561b5bc0f5dcbb5ca0 Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Tue, 3 Dec 2024 15:22:37 -0800
Subject: [PATCH 2/3] platform and instrument_model term inference in XML

---
 nmdc_runtime/site/export/ncbi_xml.py       | 17 +++++++++++----
 nmdc_runtime/site/export/ncbi_xml_utils.py | 25 ++++++++++++++++++++++
 nmdc_runtime/site/graphs.py                |  3 +++
 nmdc_runtime/site/ops.py                   | 11 ++++++++++
 4 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
index b3432e05..896bd093 100644
--- a/nmdc_runtime/site/export/ncbi_xml.py
+++ b/nmdc_runtime/site/export/ncbi_xml.py
@@ -7,6 +7,7 @@
 from typing import Any
 from urllib.parse import urlparse
 from nmdc_runtime.site.export.ncbi_xml_utils import (
+    get_instruments,
     handle_controlled_identified_term_value,
     handle_controlled_term_value,
     handle_geolocation_value,
@@ -333,6 +334,7 @@ def set_fastq(
         nmdc_nucleotide_sequencing: list,
         nmdc_biosamples: list,
         nmdc_library_preparation: list,
+        all_instruments: dict,
     ):
         bsm_id_name_dict = {
             biosample["id"]: biosample["name"] for biosample in nmdc_biosamples
@@ -343,9 +345,10 @@ def set_fastq(
             biosample_ids = []
             nucleotide_sequencing_ids = {}
             lib_prep_protocol_names = {}
-            instrument_name = ""
             analyte_category = ""
             library_name = ""
+            instrument_vendor = ""
+            instrument_model = ""
 
             for biosample_id, data_objects in entry.items():
                 biosample_ids.append(biosample_id)
@@ -363,7 +366,11 @@ def set_fastq(
                             )
                             # Currently, we are making the assumption that only one instrument
                             # is used to sequence a Biosample
-                            instrument_name = ntseq.get("instrument_used", "")[0]
+                            instrument_id = ntseq.get("instrument_used", "")[0]
+                            instrument = all_instruments.get(instrument_id, {})
+                            instrument_vendor = instrument.get("vendor", "")
+                            instrument_model = instrument.get("model", "")
+
                             analyte_category = ntseq.get("analyte_category", "")
                             library_name = bsm_id_name_dict.get(biosample_id, "")
 
@@ -431,11 +438,11 @@ def set_fastq(
                     )
 
                 sra_attributes = []
-                if instrument_name.lower().startswith("illumina"):
+                if instrument_vendor == "illumina":
                     sra_attributes.append(
                         self.set_element("Attribute", "ILLUMINA", {"name": "platform"})
                     )
-                    if "nextseq550" in instrument_name.lower():
+                    if instrument_model == "nextseq_550":
                         sra_attributes.append(
                             self.set_element(
                                 "Attribute", "NextSeq 550", {"name": "instrument_model"}
@@ -548,6 +555,7 @@ def get_submission_xml(
         biosample_nucleotide_sequencing_list: list,
         biosample_data_objects_list: list,
         biosample_library_preparation_list: list,
+        instruments_dict: dict,
     ):
         data_type = None
         ncbi_project_id = None
@@ -592,6 +600,7 @@ def get_submission_xml(
             nmdc_nucleotide_sequencing=biosample_nucleotide_sequencing_list,
             nmdc_biosamples=biosamples_list,
             nmdc_library_preparation=biosample_library_preparation_list,
+            all_instruments=instruments_dict,
         )
 
         rough_string = ET.tostring(self.root, "unicode")
diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py
index a7247ca8..5feb1e45 100644
--- a/nmdc_runtime/site/export/ncbi_xml_utils.py
+++ b/nmdc_runtime/site/export/ncbi_xml_utils.py
@@ -20,6 +20,31 @@ def get_classname_from_typecode(doc_id):
     return class_map.get(typecode)
 
 
+def get_instruments(instrument_set_collection):
+    """Map each ``nmdc:Instrument`` id to ``{"vendor": ..., "model": ...}``.
+    Documents lacking an id, vendor or model are skipped; any error raised
+    during the lookup is re-raised as RuntimeError chained to the original."""
+    all_instruments = {}
+
+    try:
+        cursor = instrument_set_collection.find({"type": "nmdc:Instrument"})
+        for document in cursor:
+            instrument_id = document.get("id")
+            vendor = document.get("vendor")
+            model = document.get("model")
+
+            # Skip records missing any of the three fields.
+            if not (instrument_id and vendor and model):
+                continue
+
+            all_instruments[instrument_id] = {"vendor": vendor, "model": model}
+    except Exception as e:
+        raise RuntimeError(
+            f"An error occurred while fetching instrument data: {e}"
+        ) from e
+    return all_instruments
+
+
 def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
     biosample_data_objects = []
 
diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py
index 1cb1f1e3..bb69ab5a 100644
--- a/nmdc_runtime/site/graphs.py
+++ b/nmdc_runtime/site/graphs.py
@@ -53,6 +53,7 @@
     get_data_objects_from_biosamples,
     get_nucleotide_sequencing_from_biosamples,
     get_library_preparation_from_biosamples,
+    get_all_instruments,
     get_ncbi_export_pipeline_inputs,
     ncbi_submission_xml_from_nmdc_study,
     ncbi_submission_xml_asset,
@@ -449,6 +450,7 @@ def nmdc_study_to_ncbi_submission_export():
     )
     data_object_records = get_data_objects_from_biosamples(biosamples)
     library_preparation_records = get_library_preparation_from_biosamples(biosamples)
+    all_instruments = get_all_instruments()
     xml_data = ncbi_submission_xml_from_nmdc_study(
         nmdc_study,
         ncbi_submission_metadata,
@@ -456,5 +458,6 @@ def nmdc_study_to_ncbi_submission_export():
         nucleotide_sequencing_records,
         data_object_records,
         library_preparation_records,
+        all_instruments,
     )
     ncbi_submission_xml_asset(xml_data)
diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index c8317fae..058592d0 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -68,6 +68,7 @@
     fetch_data_objects_from_biosamples,
     fetch_nucleotide_sequencing_from_biosamples,
     fetch_library_preparation_from_biosamples,
+    get_instruments,
 )
 from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
 from nmdc_runtime.site.resources import (
@@ -1221,6 +1222,14 @@ def get_library_preparation_from_biosamples(
     return biosample_lib_prep
 
 
+@op(required_resource_keys={"mongo"})
+def get_all_instruments(context: OpExecutionContext):
+    # Build the instrument id -> {"vendor", "model"} lookup for the NCBI export
+    # straight from the `instrument_set` collection of the runtime's Mongo database.
+    instrument_set = context.resources.mongo.db["instrument_set"]
+    return get_instruments(instrument_set)
+
+
 @op
 def ncbi_submission_xml_from_nmdc_study(
     context: OpExecutionContext,
@@ -1230,6 +1239,7 @@ def ncbi_submission_xml_from_nmdc_study(
     omics_processing_records: list,
     data_object_records: list,
     library_preparation_records: list,
+    all_instruments: dict,
 ) -> str:
     ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata)
     ncbi_xml = ncbi_exporter.get_submission_xml(
@@ -1237,5 +1247,6 @@ def ncbi_submission_xml_from_nmdc_study(
         omics_processing_records,
         data_object_records,
         library_preparation_records,
+        all_instruments,
     )
     return ncbi_xml

From d1cca89fa8c70076fea5e6e3b7788b1cd63e1109 Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Wed, 4 Dec 2024 09:59:35 -0800
Subject: [PATCH 3/3] modify tests for NCBI Submission XML utils

---
 tests/test_data/test_ncbi_xml.py | 158 ++++++++++++++++++++++++-------
 1 file changed, 122 insertions(+), 36 deletions(-)

diff --git a/tests/test_data/test_ncbi_xml.py b/tests/test_data/test_ncbi_xml.py
index 8e34fff7..f824be8f 100644
--- a/tests/test_data/test_ncbi_xml.py
+++ b/tests/test_data/test_ncbi_xml.py
@@ -16,6 +16,7 @@
     handle_geolocation_value,
     handle_float_value,
     handle_string_value,
+    get_instruments,
 )
 
 MOCK_NMDC_STUDY = {
@@ -102,24 +103,54 @@ def nmdc_biosample():
         {
             "analysis_type": ["metagenomics"],
             "biosample_categories": ["NEON"],
-            "collection_date": {"has_raw_value": "2015-07-21T18:00Z"},
+            "collection_date": {
+                "has_raw_value": "2015-07-21T18:00Z",
+                "type": "nmdc:TimestampValue",
+            },
             "depth": {
                 "has_maximum_numeric_value": 1,
                 "has_minimum_numeric_value": 0,
-                "has_unit": "meters",
+                "has_unit": "m",
+                "type": "nmdc:QuantityValue",
             },
             "elev": 1179.5,
             "env_broad_scale": {
-                "term": {"id": "ENVO:01000253", "name": "freshwater river biome"}
+                "term": {
+                    "id": "ENVO:01000253",
+                    "name": "freshwater river biome",
+                    "type": "nmdc:OntologyClass",
+                },
+                "type": "nmdc:ControlledIdentifiedTermValue",
+            },
+            "env_local_scale": {
+                "term": {
+                    "id": "ENVO:03600094",
+                    "name": "stream pool",
+                    "type": "nmdc:OntologyClass",
+                },
+                "type": "nmdc:ControlledIdentifiedTermValue",
+            },
+            "env_medium": {
+                "term": {
+                    "id": "ENVO:03605004",
+                    "name": "epipsammon",
+                    "type": "nmdc:OntologyClass",
+                },
+                "type": "nmdc:ControlledIdentifiedTermValue",
+            },
+            "geo_loc_name": {
+                "has_raw_value": "USA: Colorado, Arikaree River",
+                "type": "nmdc:TextValue",
             },
-            "env_local_scale": {"term": {"id": "ENVO:03600094", "name": "stream pool"}},
-            "env_medium": {"term": {"id": "ENVO:00002007", "name": "sediment"}},
-            "geo_loc_name": {"has_raw_value": "USA: Colorado, Arikaree River"},
             "id": "nmdc:bsm-12-p9q5v236",
-            "lat_lon": {"latitude": 39.758206, "longitude": -102.447148},
+            "lat_lon": {
+                "latitude": 39.758206,
+                "longitude": -102.447148,
+                "type": "nmdc:GeolocationValue",
+            },
             "name": "ARIK.20150721.AMC.EPIPSAMMON.3",
-            "part_of": ["nmdc:sty-11-34xj1150"],
             "type": "nmdc:Biosample",
+            "associated_studies": ["nmdc:sty-11-pzmd0x14"],
         }
     ]
 
@@ -128,21 +159,16 @@ def nmdc_biosample():
 def nucleotide_sequencing_list():
     return [
         {
+            "id": "nmdc:dgns-11-e01w1f21",
+            "type": "nmdc:NucleotideSequencing",
+            "name": "Benthic microbial communities - ARIK.20150721.AMC.EPIPSAMMON.3-DNA1",
             "has_input": ["nmdc:procsm-12-ehktny16"],
-            "has_output": ["nmdc:dobj-12-1zv4q961", "nmdc:dobj-12-b3ft7a80"],
-            "id": "nmdc:omprc-12-zqm9p096",
-            "instrument_used": ["Illumina NextSeq550"],
-            "name": "Terrestrial soil microbial communities - ARIK.20150721.AMC.EPIPSAMMON.3-DNA1",
-            "ncbi_project_name": "PRJNA406976",
-            "associated_studies": ["nmdc:sty-11-34xj1150"],
+            "has_output": ["nmdc:dobj-11-8wjdvj33", "nmdc:dobj-11-0y3amn94"],
             "processing_institution": "Battelle",
             "analyte_category": "metagenome",
-            "type": [
-                "NucleotideSequencing",
-                "DataGeneration",
-                "PlannedProcess",
-                "NamedThing",
-            ],
+            "associated_studies": ["nmdc:sty-11-pzmd0x14"],
+            "instrument_used": ["nmdc:inst-14-xz5tb342"],
+            "ncbi_project_name": "PRJNA406976",
         }
     ]
 
@@ -151,22 +177,22 @@ def nucleotide_sequencing_list():
 def data_objects_list():
     return [
         {
-            "data_object_type": "Metagenome Raw Read 1",
-            "description": "sequencing results for BMI_HVKNKBGX5_Tube347_R1",
-            "id": "nmdc:dobj-12-b3ft7a80",
-            "md5_checksum": "cae0a9342d786e731ae71f6f37b76120",
-            "name": "BMI_HVKNKBGX5_Tube347_R1.fastq.gz",
+            "id": "nmdc:dobj-11-8wjdvj33",
             "type": "nmdc:DataObject",
-            "url": "https://storage.neonscience.org/neon-microbial-raw-seq-files/2023/BMI_HVKNKBGX5_mms_R1/BMI_HVKNKBGX5_Tube347_R1.fastq.gz",
+            "name": "BMI_HVKNKBGX5_Tube347_srt_R1.fastq.gz",
+            "description": "sequencing results for BMI_HVKNKBGX5_Tube347_srt_R1",
+            "data_object_type": "Metagenome Raw Read 1",
+            "md5_checksum": "98017c587ef4e6a8a54f8daa0925e4e1",
+            "url": "https://storage.neonscience.org/neon-microbial-raw-seq-files/2023/BMI_HVKNKBGX5_srt_R1/BMI_HVKNKBGX5_Tube347_srt_R1.fastq.gz",
         },
         {
-            "data_object_type": "Metagenome Raw Read 2",
-            "description": "sequencing results for BMI_HVKNKBGX5_Tube347_R2",
-            "id": "nmdc:dobj-12-1zv4q961",
-            "md5_checksum": "7340fe25644183a4f56d36ce52389d83",
-            "name": "BMI_HVKNKBGX5_Tube347_R2.fastq.gz",
+            "id": "nmdc:dobj-11-0y3amn94",
             "type": "nmdc:DataObject",
-            "url": "https://storage.neonscience.org/neon-microbial-raw-seq-files/2023/BMI_HVKNKBGX5_mms_R2/BMI_HVKNKBGX5_Tube347_R2.fastq.gz",
+            "name": "BMI_HVKNKBGX5_Tube347_srt_R2.fastq.gz",
+            "description": "sequencing results for BMI_HVKNKBGX5_Tube347_srt_R2",
+            "data_object_type": "Metagenome Raw Read 2",
+            "md5_checksum": "5358ce1da32bfad7c358c484cbf5075b",
+            "url": "https://storage.neonscience.org/neon-microbial-raw-seq-files/2023/BMI_HVKNKBGX5_srt_R2/BMI_HVKNKBGX5_Tube347_srt_R2.fastq.gz",
         },
     ]
 
@@ -184,6 +210,26 @@ def library_preparation_dict():
     }
 
 
+@pytest.fixture
+def mocked_instruments():
+    return [  # first id is referenced by nucleotide_sequencing_list.instrument_used
+        {
+            "id": "nmdc:inst-14-xz5tb342",
+            "model": "nextseq_550",
+            "name": "Illumina NextSeq 550",
+            "vendor": "illumina",
+            "type": "nmdc:Instrument",
+        },
+        {
+            "id": "nmdc:inst-14-79zxap02",
+            "model": "hiseq",
+            "name": "Illumina HiSeq",
+            "vendor": "illumina",
+            "type": "nmdc:Instrument",
+        },
+    ]
+
+
 class TestNCBISubmissionXML:
     def test_set_element(self, ncbi_submission_client: NCBISubmissionXML):
         element = ncbi_submission_client.set_element("Test", "Hello", {"attr": "value"})
@@ -311,7 +357,16 @@ def test_set_fastq(
         data_objects_list: list[dict[str, str]],
         nucleotide_sequencing_list: list[dict[str, Any]],
         library_preparation_dict: dict[str, Any],
+        mocked_instruments: list[dict[str, Any]],
     ):
+        all_instruments = {
+            instrument["id"]: {
+                "vendor": instrument["vendor"],
+                "model": instrument["model"],
+            }
+            for instrument in mocked_instruments
+        }
+
         biosample_data_objects = [
             {biosample["id"]: data_objects_list} for biosample in nmdc_biosample
         ]
@@ -332,6 +387,7 @@ def test_set_fastq(
             nmdc_nucleotide_sequencing=biosample_nucleotide_sequencing,
             nmdc_biosamples=nmdc_biosample,
             nmdc_library_preparation=biosample_library_preparation,
+            all_instruments=all_instruments,
         )
 
         action_elements = ncbi_submission_client.root.findall(".//Action")
@@ -340,8 +396,8 @@ def test_set_fastq(
         for action_element in action_elements:
             action_xml = ET.tostring(action_element, "unicode")
             assert (
-                "BMI_HVKNKBGX5_Tube347_R1.fastq.gz" in action_xml
-                or "BMI_HVKNKBGX5_Tube347_R2.fastq.gz" in action_xml
+                "BMI_HVKNKBGX5_Tube347_srt_R1.fastq.gz" in action_xml
+                or "BMI_HVKNKBGX5_Tube347_srt_R2.fastq.gz" in action_xml
             )
             assert "PRJNA1029061" in action_xml
             assert "nmdc:bsm-12-p9q5v236" in action_xml
@@ -364,6 +420,7 @@ def test_get_submission_xml(
         data_objects_list: list[dict[str, str]],
         nucleotide_sequencing_list: list[dict[str, Any]],
         library_preparation_dict: dict[str, Any],
+        mocked_instruments: list[dict[str, Any]],
     ):
         mocker.patch(
             "nmdc_runtime.site.export.ncbi_xml.load_mappings",
@@ -409,6 +466,14 @@ def test_get_submission_xml(
             ),
         )
 
+        all_instruments = {
+            instrument["id"]: {
+                "vendor": instrument["vendor"],
+                "model": instrument["model"],
+            }
+            for instrument in mocked_instruments
+        }
+
         biosample_data_objects = [
             {biosample["id"]: data_objects_list} for biosample in nmdc_biosample
         ]
@@ -429,15 +494,19 @@ def test_get_submission_xml(
             nmdc_nucleotide_sequencing=biosample_nucleotide_sequencing,
             nmdc_biosamples=nmdc_biosample,
             nmdc_library_preparation=biosample_library_preparation,
+            all_instruments=all_instruments,
         )
 
         submission_xml = ncbi_submission_client.get_submission_xml(
-            nmdc_biosample, [], biosample_data_objects, biosample_library_preparation
+            nmdc_biosample,
+            [],
+            biosample_data_objects,
+            biosample_library_preparation,
+            all_instruments,
         )
 
         assert "nmdc:bsm-12-p9q5v236" in submission_xml
         assert "E. coli" in submission_xml
-        assert "sediment" in submission_xml
         assert "USA: Colorado, Arikaree River" in submission_xml
         assert "2015-07-21T18:00Z" in submission_xml
         assert "National Microbiome Data Collaborative" in submission_xml
@@ -608,3 +677,20 @@ def test_load_mappings(
 
         assert attribute_mappings == expected_attribute_mappings
         assert slot_range_mappings == expected_slot_range_mappings
+
+    def test_get_instruments(
+        self,
+        mocker: Callable[..., Generator[MockerFixture, None, None]],
+        mocked_instruments: list[dict[str, Any]],
+    ):
+        # Stub collection whose find() yields the fixture documents.
+        collection = mocker.Mock()
+        collection.find.return_value = iter(mocked_instruments)
+
+        # Only id, vendor and model are kept; name and type are dropped.
+        expected = {
+            "nmdc:inst-14-xz5tb342": {"vendor": "illumina", "model": "nextseq_550"},
+            "nmdc:inst-14-79zxap02": {"vendor": "illumina", "model": "hiseq"},
+        }
+
+        assert get_instruments(collection) == expected