Skip to content

Commit

Permalink
Merge pull request #807 from microbiomedata/post-berk-ncbi-export-pip…
Browse files Browse the repository at this point in the history
…eline-updates

Post-Berkeley schema NCBI export pipeline updates
  • Loading branch information
sujaypatil96 authored Dec 5, 2024
2 parents da435c1 + d1cca89 commit 92caec1
Show file tree
Hide file tree
Showing 5 changed files with 224 additions and 43 deletions.
70 changes: 63 additions & 7 deletions nmdc_runtime/site/export/ncbi_xml.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import os
import re
import datetime
import xml.etree.ElementTree as ET
import xml.dom.minidom

from typing import Any
from urllib.parse import urlparse
from nmdc_runtime.site.export.ncbi_xml_utils import (
get_instruments,
handle_controlled_identified_term_value,
handle_controlled_term_value,
handle_geolocation_value,
Expand Down Expand Up @@ -170,7 +172,39 @@ def set_biosample(

for json_key, value in biosample.items():
if isinstance(value, list):
continue # Skip processing for list values
for item in value:
if json_key not in attribute_mappings:
continue

xml_key = attribute_mappings[json_key]
value_type = slot_range_mappings.get(json_key, "string")
handler = self.type_handlers.get(
value_type, handle_string_value
)

# Special handling for "elev" key
if json_key == "elev":
value = f"{float(value)} m" # Convert to float if possible
attributes[xml_key] = value
continue # Skip applying the handler to this key

# Special handling for "host_taxid"
if json_key == "host_taxid" and isinstance(value, dict):
if "term" in value and "id" in value["term"]:
value = re.findall(
r"\d+", value["term"]["id"].split(":")[1]
)[0]
attributes[xml_key] = value
continue # Skip applying the handler to this key

formatted_value = handler(item)

# Combine multiple values with a separator for list elements
if xml_key in attributes:
attributes[xml_key] += f"| {formatted_value}"
else:
attributes[xml_key] = formatted_value
continue

if json_key == "env_package":
env_package = f"MIMS.me.{handle_text_value(value)}.6.0"
Expand All @@ -187,6 +221,20 @@ def set_biosample(
value_type = slot_range_mappings.get(json_key, "string")
handler = self.type_handlers.get(value_type, handle_string_value)

# Special handling for "elev" key
if json_key == "elev":
value = f"{float(value)} m" # Convert to float if possible
attributes[xml_key] = value
continue # Skip applying the handler to this key

# Special handling for "host_taxid"
if json_key == "host_taxid" and isinstance(value, dict):
if "term" in value and "id" in value["term"]:
value = re.findall(r"\d+", value["term"]["id"].split(":")[1])[0]
attributes[xml_key] = value
continue # Skip applying the handler to this key

# Default processing for other keys
formatted_value = handler(value)
attributes[xml_key] = formatted_value

Expand Down Expand Up @@ -286,6 +334,7 @@ def set_fastq(
nmdc_nucleotide_sequencing: list,
nmdc_biosamples: list,
nmdc_library_preparation: list,
all_instruments: dict,
):
bsm_id_name_dict = {
biosample["id"]: biosample["name"] for biosample in nmdc_biosamples
Expand All @@ -296,9 +345,10 @@ def set_fastq(
biosample_ids = []
nucleotide_sequencing_ids = {}
lib_prep_protocol_names = {}
instrument_name = ""
analyte_category = ""
library_name = ""
instrument_vendor = ""
instrument_model = ""

for biosample_id, data_objects in entry.items():
biosample_ids.append(biosample_id)
Expand All @@ -316,7 +366,11 @@ def set_fastq(
)
# Currently, we are making the assumption that only one instrument
# is used to sequence a Biosample
instrument_name = ntseq.get("instrument_used", "")[0]
instrument_id = ntseq.get("instrument_used", "")[0]
instrument = all_instruments.get(instrument_id, {})
instrument_vendor = instrument.get("vendor", "")
instrument_model = instrument.get("model", "")

analyte_category = ntseq.get("analyte_category", "")
library_name = bsm_id_name_dict.get(biosample_id, "")

Expand Down Expand Up @@ -353,9 +407,9 @@ def set_fastq(
"RefId",
children=[
self.set_element(
"SPUID",
"PrimaryId",
bioproject_id,
{"spuid_namespace": org},
{"db": "BioProject"},
)
],
)
Expand Down Expand Up @@ -384,11 +438,11 @@ def set_fastq(
)

sra_attributes = []
if instrument_name.lower().startswith("illumina"):
if instrument_vendor == "illumina":
sra_attributes.append(
self.set_element("Attribute", "ILLUMINA", {"name": "platform"})
)
if "nextseq550" in instrument_name.lower():
if instrument_model == "nextseq_550":
sra_attributes.append(
self.set_element(
"Attribute", "NextSeq 550", {"name": "instrument_model"}
Expand Down Expand Up @@ -501,6 +555,7 @@ def get_submission_xml(
biosample_nucleotide_sequencing_list: list,
biosample_data_objects_list: list,
biosample_library_preparation_list: list,
instruments_dict: dict,
):
data_type = None
ncbi_project_id = None
Expand Down Expand Up @@ -545,6 +600,7 @@ def get_submission_xml(
nmdc_nucleotide_sequencing=biosample_nucleotide_sequencing_list,
nmdc_biosamples=biosamples_list,
nmdc_library_preparation=biosample_library_preparation_list,
all_instruments=instruments_dict,
)

rough_string = ET.tostring(self.root, "unicode")
Expand Down
25 changes: 25 additions & 0 deletions nmdc_runtime/site/export/ncbi_xml_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,31 @@ def get_classname_from_typecode(doc_id):
return class_map.get(typecode)


def get_instruments(instrument_set_collection):
    """Build a lookup of instrument vendor and model keyed by instrument id.

    Queries ``instrument_set_collection`` for documents whose ``type`` is
    ``"nmdc:Instrument"`` and records every document that carries a truthy
    ``id``, ``vendor`` and ``model``. Documents missing any of the three
    fields are skipped.

    :param instrument_set_collection: object exposing ``find(query)`` that
        returns an iterable of mapping-like documents (presumably a MongoDB
        collection -- confirm against the caller).
    :return: dictionary shaped as
        ``{"instrument_id": {"vendor": "vendor_name", "model": "model_name"}}``.
    :raises RuntimeError: if querying or iterating the collection fails; the
        underlying exception is attached as ``__cause__``.
    """
    query = {"type": "nmdc:Instrument"}
    all_instruments = {}

    try:
        for document in instrument_set_collection.find(query):
            instrument_id = document.get("id")
            vendor = document.get("vendor")
            model = document.get("model")

            # Only complete records are useful to callers; skip the rest.
            if not (instrument_id and vendor and model):
                continue

            all_instruments[instrument_id] = {"vendor": vendor, "model": model}
    except Exception as e:
        # Chain explicitly so the original failure is preserved as the cause.
        raise RuntimeError(
            f"An error occurred while fetching instrument data: {e}"
        ) from e

    return all_instruments


def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
biosample_data_objects = []

Expand Down
3 changes: 3 additions & 0 deletions nmdc_runtime/site/graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
get_data_objects_from_biosamples,
get_nucleotide_sequencing_from_biosamples,
get_library_preparation_from_biosamples,
get_all_instruments,
get_ncbi_export_pipeline_inputs,
ncbi_submission_xml_from_nmdc_study,
ncbi_submission_xml_asset,
Expand Down Expand Up @@ -449,12 +450,14 @@ def nmdc_study_to_ncbi_submission_export():
)
data_object_records = get_data_objects_from_biosamples(biosamples)
library_preparation_records = get_library_preparation_from_biosamples(biosamples)
all_instruments = get_all_instruments()
xml_data = ncbi_submission_xml_from_nmdc_study(
nmdc_study,
ncbi_submission_metadata,
biosamples,
nucleotide_sequencing_records,
data_object_records,
library_preparation_records,
all_instruments,
)
ncbi_submission_xml_asset(xml_data)
11 changes: 11 additions & 0 deletions nmdc_runtime/site/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
fetch_data_objects_from_biosamples,
fetch_nucleotide_sequencing_from_biosamples,
fetch_library_preparation_from_biosamples,
get_instruments,
)
from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
from nmdc_runtime.site.resources import (
Expand Down Expand Up @@ -1221,6 +1222,14 @@ def get_library_preparation_from_biosamples(
return biosample_lib_prep


@op(required_resource_keys={"mongo"})
def get_all_instruments(context: OpExecutionContext):
    """Return the instrument lookup built from the ``instrument_set`` collection.

    Reads the database handle from the ``mongo`` resource on ``context`` and
    passes its ``instrument_set`` collection to ``get_instruments``.
    """
    instrument_set = context.resources.mongo.db["instrument_set"]
    return get_instruments(instrument_set)


@op
def ncbi_submission_xml_from_nmdc_study(
context: OpExecutionContext,
Expand All @@ -1230,12 +1239,14 @@ def ncbi_submission_xml_from_nmdc_study(
omics_processing_records: list,
data_object_records: list,
library_preparation_records: list,
all_instruments: dict,
) -> str:
ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata)
ncbi_xml = ncbi_exporter.get_submission_xml(
biosamples,
omics_processing_records,
data_object_records,
library_preparation_records,
all_instruments,
)
return ncbi_xml
Loading

0 comments on commit 92caec1

Please sign in to comment.