Merge pull request #807 from microbiomedata/post-berk-ncbi-export-pipeline-updates

Post berkeley schema NCBI export pipeline updates
microbiomedata · Dec 5, 2024 · 92caec1 · 92caec1
2 parents da435c1 + d1cca89
commit 92caec1
Show file tree

Hide file tree

Showing 5 changed files with 224 additions and 43 deletions.
diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
@@ -1,11 +1,13 @@
 import os
+import re
 import datetime
 import xml.etree.ElementTree as ET
 import xml.dom.minidom
 
 from typing import Any
 from urllib.parse import urlparse
 from nmdc_runtime.site.export.ncbi_xml_utils import (
+    get_instruments,
     handle_controlled_identified_term_value,
     handle_controlled_term_value,
     handle_geolocation_value,
@@ -170,7 +172,39 @@ def set_biosample(
 
             for json_key, value in biosample.items():
                 if isinstance(value, list):
-                    continue  # Skip processing for list values
+                    for item in value:
+                        if json_key not in attribute_mappings:
+                            continue
+
+                        xml_key = attribute_mappings[json_key]
+                        value_type = slot_range_mappings.get(json_key, "string")
+                        handler = self.type_handlers.get(
+                            value_type, handle_string_value
+                        )
+
+                        # Special handling for "elev" key
+                        if json_key == "elev":
+                            value = f"{float(value)} m"  # Convert to float if possible
+                            attributes[xml_key] = value
+                            continue  # Skip applying the handler to this key
+
+                        # Special handling for "host_taxid"
+                        if json_key == "host_taxid" and isinstance(value, dict):
+                            if "term" in value and "id" in value["term"]:
+                                value = re.findall(
+                                    r"\d+", value["term"]["id"].split(":")[1]
+                                )[0]
+                            attributes[xml_key] = value
+                            continue  # Skip applying the handler to this key
+
+                        formatted_value = handler(item)
+
+                        # Combine multiple values with a separator for list elements
+                        if xml_key in attributes:
+                            attributes[xml_key] += f"| {formatted_value}"
+                        else:
+                            attributes[xml_key] = formatted_value
+                    continue
 
                 if json_key == "env_package":
                     env_package = f"MIMS.me.{handle_text_value(value)}.6.0"
@@ -187,6 +221,20 @@ def set_biosample(
                 value_type = slot_range_mappings.get(json_key, "string")
                 handler = self.type_handlers.get(value_type, handle_string_value)
 
+                # Special handling for "elev" key
+                if json_key == "elev":
+                    value = f"{float(value)} m"  # Convert to float if possible
+                    attributes[xml_key] = value
+                    continue  # Skip applying the handler to this key
+
+                # Special handling for "host_taxid"
+                if json_key == "host_taxid" and isinstance(value, dict):
+                    if "term" in value and "id" in value["term"]:
+                        value = re.findall(r"\d+", value["term"]["id"].split(":")[1])[0]
+                    attributes[xml_key] = value
+                    continue  # Skip applying the handler to this key
+
+                # Default processing for other keys
                 formatted_value = handler(value)
                 attributes[xml_key] = formatted_value
 
@@ -286,6 +334,7 @@ def set_fastq(
         nmdc_nucleotide_sequencing: list,
         nmdc_biosamples: list,
         nmdc_library_preparation: list,
+        all_instruments: dict,
     ):
         bsm_id_name_dict = {
             biosample["id"]: biosample["name"] for biosample in nmdc_biosamples
@@ -296,9 +345,10 @@ def set_fastq(
             biosample_ids = []
             nucleotide_sequencing_ids = {}
             lib_prep_protocol_names = {}
-            instrument_name = ""
             analyte_category = ""
             library_name = ""
+            instrument_vendor = ""
+            instrument_model = ""
 
             for biosample_id, data_objects in entry.items():
                 biosample_ids.append(biosample_id)
@@ -316,7 +366,11 @@ def set_fastq(
                             )
                             # Currently, we are making the assumption that only one instrument
                             # is used to sequence a Biosample
-                            instrument_name = ntseq.get("instrument_used", "")[0]
+                            instrument_id = ntseq.get("instrument_used", "")[0]
+                            instrument = all_instruments.get(instrument_id, {})
+                            instrument_vendor = instrument.get("vendor", "")
+                            instrument_model = instrument.get("model", "")
+
                             analyte_category = ntseq.get("analyte_category", "")
                             library_name = bsm_id_name_dict.get(biosample_id, "")
 
@@ -353,9 +407,9 @@ def set_fastq(
                                 "RefId",
                                 children=[
                                     self.set_element(
-                                        "SPUID",
+                                        "PrimaryId",
                                         bioproject_id,
-                                        {"spuid_namespace": org},
+                                        {"db": "BioProject"},
                                     )
                                 ],
                             )
@@ -384,11 +438,11 @@ def set_fastq(
                     )
 
                 sra_attributes = []
-                if instrument_name.lower().startswith("illumina"):
+                if instrument_vendor == "illumina":
                     sra_attributes.append(
                         self.set_element("Attribute", "ILLUMINA", {"name": "platform"})
                     )
-                    if "nextseq550" in instrument_name.lower():
+                    if instrument_model == "nextseq_550":
                         sra_attributes.append(
                             self.set_element(
                                 "Attribute", "NextSeq 550", {"name": "instrument_model"}
@@ -501,6 +555,7 @@ def get_submission_xml(
         biosample_nucleotide_sequencing_list: list,
         biosample_data_objects_list: list,
         biosample_library_preparation_list: list,
+        instruments_dict: dict,
     ):
         data_type = None
         ncbi_project_id = None
@@ -545,6 +600,7 @@ def get_submission_xml(
             nmdc_nucleotide_sequencing=biosample_nucleotide_sequencing_list,
             nmdc_biosamples=biosamples_list,
             nmdc_library_preparation=biosample_library_preparation_list,
+            all_instruments=instruments_dict,
         )
 
         rough_string = ET.tostring(self.root, "unicode")

diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py
@@ -20,6 +20,31 @@ def get_classname_from_typecode(doc_id):
     return class_map.get(typecode)
 
 
+def get_instruments(instrument_set_collection):
+    # Build and return a lookup of instruments keyed by instrument id, read
+    # from the given collection (only `.find(query)` is called on it). Shape:
+    # {"instrument_id": {"vendor": "vendor_name", "model": "model_name"}}
+    all_instruments = {}
+
+    try:
+        query = {"type": "nmdc:Instrument"}  # only documents typed as Instrument
+        cursor = instrument_set_collection.find(query)
+
+        for document in cursor:
+            instrument_id = document.get("id")
+            vendor = document.get("vendor")
+            model = document.get("model")
+
+            if not instrument_id or not vendor or not model:
+                continue  # skip documents with a missing or empty id, vendor, or model
+
+            all_instruments[instrument_id] = {"vendor": vendor, "model": model}
+
+        return all_instruments
+    except Exception as e:  # any failure in the query or loop is re-raised as RuntimeError
+        raise RuntimeError(f"An error occurred while fetching instrument data: {e}")
+
+
 def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
     biosample_data_objects = []
 

diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py
@@ -53,6 +53,7 @@
     get_data_objects_from_biosamples,
     get_nucleotide_sequencing_from_biosamples,
     get_library_preparation_from_biosamples,
+    get_all_instruments,
     get_ncbi_export_pipeline_inputs,
     ncbi_submission_xml_from_nmdc_study,
     ncbi_submission_xml_asset,
@@ -449,12 +450,14 @@ def nmdc_study_to_ncbi_submission_export():
     )
     data_object_records = get_data_objects_from_biosamples(biosamples)
     library_preparation_records = get_library_preparation_from_biosamples(biosamples)
+    all_instruments = get_all_instruments()
     xml_data = ncbi_submission_xml_from_nmdc_study(
         nmdc_study,
         ncbi_submission_metadata,
         biosamples,
         nucleotide_sequencing_records,
         data_object_records,
         library_preparation_records,
+        all_instruments,
     )
     ncbi_submission_xml_asset(xml_data)
diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
@@ -68,6 +68,7 @@
     fetch_data_objects_from_biosamples,
     fetch_nucleotide_sequencing_from_biosamples,
     fetch_library_preparation_from_biosamples,
+    get_instruments,
 )
 from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
 from nmdc_runtime.site.resources import (
@@ -1221,6 +1222,14 @@ def get_library_preparation_from_biosamples(
     return biosample_lib_prep
 
 
+@op(required_resource_keys={"mongo"})  # declares the "mongo" resource read below
+def get_all_instruments(context: OpExecutionContext):
+    mdb = context.resources.mongo.db
+    instrument_set_collection = mdb["instrument_set"]  # collection handed to get_instruments
+    all_instruments = get_instruments(instrument_set_collection)
+    return all_instruments  # whatever get_instruments returns, passed through unchanged
+
+
 @op
 def ncbi_submission_xml_from_nmdc_study(
     context: OpExecutionContext,
@@ -1230,12 +1239,14 @@ def ncbi_submission_xml_from_nmdc_study(
     omics_processing_records: list,
     data_object_records: list,
     library_preparation_records: list,
+    all_instruments: dict,
 ) -> str:
     ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata)
     ncbi_xml = ncbi_exporter.get_submission_xml(
         biosamples,
         omics_processing_records,
         data_object_records,
         library_preparation_records,
+        all_instruments,
     )
     return ncbi_xml