From 4c77eeae0cf1a1f540097b647f503fd1a65c8342 Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Tue, 7 May 2024 17:02:23 -0700
Subject: [PATCH 01/27] dagster harness for NMDC-to-NCBI export code

---
 nmdc_runtime/site/export/ncbi_xml.py        | 126 ++++++++++++++++++++
 nmdc_runtime/site/export/nmdc_api_client.py |  34 ++++++
 nmdc_runtime/site/graphs.py                 |  10 ++
 nmdc_runtime/site/ops.py                    |  35 ++++++
 nmdc_runtime/site/repository.py             |  13 ++
 nmdc_runtime/site/workspace.yaml            |   3 +
 6 files changed, 221 insertions(+)
 create mode 100644 nmdc_runtime/site/export/ncbi_xml.py
 create mode 100644 nmdc_runtime/site/export/nmdc_api_client.py

diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
new file mode 100644
index 00000000..7b88aa62
--- /dev/null
+++ b/nmdc_runtime/site/export/ncbi_xml.py
@@ -0,0 +1,126 @@
+import datetime
+import xml.etree.ElementTree as ET
+import xml.dom.minidom
+
+
+class NCBISubmissionXML:
+    def __init__(
+        self, study_id: str, org="National Microbiome Data Collaborative (NMDC)"
+    ):
+        self.root = ET.Element("Submission")
+        self.study_id = study_id
+        self.org = org
+
+    def set_element(self, tag, text="", attrib=None, children=None):
+        attrib = attrib or {}
+        children = children or []
+        element = ET.Element(tag, attrib=attrib)
+        element.text = text
+        for child in children:
+            element.append(child)
+        return element
+
+    def set_description(
+        self, email="aclum@lbl.gov", user="NMDC", first="Alicia", last="Clum", date=None
+    ):
+        date = date or datetime.datetime.now().strftime("%Y-%m-%d")
+        description = self.set_element(
+            "Description",
+            children=[
+                self.set_element("Comment", f"NMDC Submission for {self.study_id}"),
+                self.set_element("Submitter", attrib={"user_name": user}),
+                self.set_element(
+                    "Organization",
+                    attrib={"role": "owner", "type": "center"},
+                    children=[
+                        self.set_element("Name", self.org),
+                        self.set_element(
+                            "Contact",
+                            attrib={"email": email},
+                            children=[
+                                self.set_element(
+                                    "Name",
+                                    children=[
+                                        self.set_element("First", first),
+                                        self.set_element("Last", last),
+                                    ],
+                                )
+                            ],
+                        ),
+                    ],
+                ),
+                self.set_element("Hold", attrib={"release_date": date}),
+            ],
+        )
+        self.root.append(description)
+
+    def set_biosample(self, title, spuid, sid, name, pkg, attributes=None):
+        attributes = attributes or {}
+        biosample = self.set_element(
+            "BioSample",
+            attrib={"schema_version": "2.0"},
+            children=[
+                self.set_element(
+                    "SampleId",
+                    children=[
+                        self.set_element("SPUID", sid, {"spuid_namespace": self.org})
+                    ],
+                ),
+                self.set_element(
+                    "Descriptor",
+                    children=[
+                        self.set_element("Title", title),
+                        self.set_element(
+                            "Description", children=[self.set_element("p", spuid)]
+                        ),
+                    ],
+                ),
+                self.set_element(
+                    "Organism", children=[self.set_element("OrganismName", name)]
+                ),
+                self.set_element("Package", pkg),
+                self.set_element(
+                    "Attributes",
+                    children=[
+                        self.set_element(
+                            "Attribute", attributes[key], {"attribute_name": key}
+                        )
+                        for key in sorted(attributes)
+                    ],
+                ),
+            ],
+        )
+        action = self.set_element(
+            "Action",
+            children=[
+                self.set_element(
+                    "AddData",
+                    attrib={"target_db": "BioSample"},
+                    children=[
+                        self.set_element(
+                            "Data",
+                            attrib={"content_type": "XML"},
+                            children=[
+                                self.set_element("XmlContent", children=[biosample])
+                            ],
+                        ),
+                        self.set_element(
+                            "Identifier",
+                            children=[
+                                self.set_element(
+                                    "SPUID", sid, {"spuid_namespace": self.org}
+                                )
+                            ],
+                        ),
+                    ],
+                )
+            ],
+        )
+        self.root.append(action)
+
+    def get_submission_xml(self):
+        self.set_description()
+
+        rough_string = ET.tostring(self.root, "unicode")
+        reparsed = xml.dom.minidom.parseString(rough_string)
+        return reparsed.toprettyxml(indent="    ", newl="\n")
diff --git a/nmdc_runtime/site/export/nmdc_api_client.py b/nmdc_runtime/site/export/nmdc_api_client.py
new file mode 100644
index 00000000..6d7938e9
--- /dev/null
+++ b/nmdc_runtime/site/export/nmdc_api_client.py
@@ -0,0 +1,34 @@
+import requests
+
+
+class NMDCApiClient:
+    def __init__(self, api_base_url):
+        if not api_base_url.endswith("/"):
+            api_base_url += "/"
+        self.base_url = api_base_url
+        self.headers = {
+            "accept": "application/json",
+            "Content-Type": "application/json",
+        }
+
+    def get_biosamples_part_of_study(self, study_id: str) -> list[dict]:
+        """
+        Get the biosamples that are part of a study.
+        """
+        biosample_records = []
+        params = {
+            "filter": '{"part_of": "' + study_id + '"}',
+            "max_page_size": "1000",
+        }
+        url = self.base_url + "nmdcschema/biosample_set"
+        response = requests.get(url, params=params, headers=self.headers)
+        response.raise_for_status()
+        biosample_records.extend(response.json()["resources"])
+        # Get the next page of results, if any
+        while response.json().get("next_page_token") is not None:
+            params["page_token"] = response.json()["next_page_token"]
+            response = requests.get(url, params=params, headers=self.headers)
+            response.raise_for_status()
+            biosample_records.extend(response.json()["resources"])
+
+        return biosample_records
diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py
index 076eb498..fccefe9a 100644
--- a/nmdc_runtime/site/graphs.py
+++ b/nmdc_runtime/site/graphs.py
@@ -49,6 +49,9 @@
     get_neon_pipeline_inputs,
     get_df_from_url,
     site_code_mapping,
+    get_ncbi_export_pipeline_inputs,
+    ncbi_submission_xml_from_nmdc_study,
+    ncbi_submission_xml_asset,
 )
 
 
@@ -381,3 +384,10 @@ def ingest_neon_surface_water_metadata():
     )
     run_id = submit_metadata_to_db(database)
     poll_for_run_completion(run_id)
+
+
+@graph
+def nmdc_study_to_ncbi_submission_export():
+    study_id = get_ncbi_export_pipeline_inputs()
+    xml_data = ncbi_submission_xml_from_nmdc_study(study_id)
+    ncbi_submission_xml_asset(xml_data)
diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 59c45fd6..749cb4a9 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -9,6 +9,7 @@
 from io import BytesIO, StringIO
 from typing import Tuple
 from zipfile import ZipFile
+# import xml.etree.ElementTree as ET
 import pandas as pd
 import requests
 
@@ -55,6 +56,7 @@
     _add_run_complete_event,
 )
 from nmdc_runtime.api.models.util import ResultT
+from nmdc_runtime.site.export.ncbi_xml import NCBISubmissionXML
 from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
 from nmdc_runtime.site.resources import (
     NmdcPortalApiClient,
@@ -768,6 +770,24 @@ def export_json_to_drs(
     return ["/objects/" + drs_object["id"]]
 
 
+@op(
+    description="NCBI Submission XML file rendered in a Dagster Asset",
+    out=Out(description="XML content rendered through Dagit UI")
+)
+def ncbi_submission_xml_asset(context: OpExecutionContext, data: str):
+    context.log_event(
+        AssetMaterialization(
+            asset_key="ncbi_submission_xml",
+            description="NCBI Submission XML Data",
+            metadata={
+                "xml": MetadataValue.text(data)
+            }
+        )
+    )
+
+    return Output(data)
+
+
 def unique_field_values(docs: List[Dict[str, Any]], field: str):
     return {doc[field] for doc in docs if field in doc}
 
@@ -977,3 +997,18 @@ def site_code_mapping() -> dict:
         raise Exception(
             f"Failed to fetch site data from {endpoint}. Status code: {response.status_code}, Content: {response.content}"
         )
+    
+
+@op(config_schema={"study_id": str})
+def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
+    return context.op_config["study_id"]
+
+
+@op
+def ncbi_submission_xml_from_nmdc_study(
+    context: OpExecutionContext,
+    study_id: str,
+    ) -> str:
+    ncbi_exporter = NCBISubmissionXML(study_id)
+    ncbi_xml = ncbi_exporter.get_submission_xml()
+    return ncbi_xml
diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py
index fada8da1..c716a0a9 100644
--- a/nmdc_runtime/site/repository.py
+++ b/nmdc_runtime/site/repository.py
@@ -42,6 +42,7 @@
     ingest_neon_soil_metadata,
     ingest_neon_benthic_metadata,
     ingest_neon_surface_water_metadata,
+    nmdc_study_to_ncbi_submission_export,
 )
 from nmdc_runtime.site.resources import (
     get_mongo,
@@ -852,6 +853,18 @@ def biosample_submission_ingest():
     ]
 
 
+@repository
+def biosample_export():
+    return [
+        nmdc_study_to_ncbi_submission_export.to_job(
+            config={
+                "ops": {
+                    "get_ncbi_export_pipeline_inputs": {"config": {"study_id": ""}},
+                },
+            },
+        ),
+    ]
+
 # @repository
 # def validation():
 #     graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job]
diff --git a/nmdc_runtime/site/workspace.yaml b/nmdc_runtime/site/workspace.yaml
index e594197e..5da09ab9 100644
--- a/nmdc_runtime/site/workspace.yaml
+++ b/nmdc_runtime/site/workspace.yaml
@@ -11,6 +11,9 @@ load_from:
   - python_package:
       package_name: nmdc_runtime.site.repository
       attribute: biosample_submission_ingest
+  - python_package:
+      package_name: nmdc_runtime.site.repository
+      attribute: biosample_export
 #  - python_package:
 #      package_name: nmdc_runtime.site.repository
 #      attribute: validation

From 43f6baf5dc285e8a572e3816a0afc411bed5c4ea Mon Sep 17 00:00:00 2001
From: github-actions <github-actions@github.com>
Date: Wed, 8 May 2024 21:39:44 +0000
Subject: [PATCH 02/27] style: reformat

---
 nmdc_runtime/site/ops.py        | 11 +++++------
 nmdc_runtime/site/repository.py |  1 +
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 749cb4a9..0d19ffaa 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -9,6 +9,7 @@
 from io import BytesIO, StringIO
 from typing import Tuple
 from zipfile import ZipFile
+
 # import xml.etree.ElementTree as ET
 import pandas as pd
 import requests
@@ -772,16 +773,14 @@ def export_json_to_drs(
 
 @op(
     description="NCBI Submission XML file rendered in a Dagster Asset",
-    out=Out(description="XML content rendered through Dagit UI")
+    out=Out(description="XML content rendered through Dagit UI"),
 )
 def ncbi_submission_xml_asset(context: OpExecutionContext, data: str):
     context.log_event(
         AssetMaterialization(
             asset_key="ncbi_submission_xml",
             description="NCBI Submission XML Data",
-            metadata={
-                "xml": MetadataValue.text(data)
-            }
+            metadata={"xml": MetadataValue.text(data)},
         )
     )
 
@@ -997,7 +996,7 @@ def site_code_mapping() -> dict:
         raise Exception(
             f"Failed to fetch site data from {endpoint}. Status code: {response.status_code}, Content: {response.content}"
         )
-    
+
 
 @op(config_schema={"study_id": str})
 def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
@@ -1008,7 +1007,7 @@ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
 def ncbi_submission_xml_from_nmdc_study(
     context: OpExecutionContext,
     study_id: str,
-    ) -> str:
+) -> str:
     ncbi_exporter = NCBISubmissionXML(study_id)
     ncbi_xml = ncbi_exporter.get_submission_xml()
     return ncbi_xml
diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py
index c716a0a9..9503d9b6 100644
--- a/nmdc_runtime/site/repository.py
+++ b/nmdc_runtime/site/repository.py
@@ -865,6 +865,7 @@ def biosample_export():
         ),
     ]
 
+
 # @repository
 # def validation():
 #     graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job]

From 2653223684291c4677422f8c9cda617147356829 Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Wed, 8 May 2024 15:54:04 -0700
Subject: [PATCH 03/27] type handlers and capability to parse information out
 from nested NMDC slot structure

---
 nmdc_runtime/site/export/ncbi_xml.py       | 133 +++++++++++++++------
 nmdc_runtime/site/export/ncbi_xml_utils.py |  95 +++++++++++++++
 2 files changed, 192 insertions(+), 36 deletions(-)
 create mode 100644 nmdc_runtime/site/export/ncbi_xml_utils.py

diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
index 7b88aa62..2ff527a7 100644
--- a/nmdc_runtime/site/export/ncbi_xml.py
+++ b/nmdc_runtime/site/export/ncbi_xml.py
@@ -1,3 +1,14 @@
+from nmdc_runtime.site.export.ncbi_xml_utils import (
+    handle_controlled_identified_term_value,
+    handle_controlled_term_value,
+    handle_geolocation_value,
+    handle_quantity_value,
+    handle_text_value,
+    handle_timestamp_value,
+    handle_float_value,
+    handle_string_value,
+    load_mappings,
+)
 import datetime
 import xml.etree.ElementTree as ET
 import xml.dom.minidom
@@ -11,6 +22,19 @@ def __init__(
         self.study_id = study_id
         self.org = org
 
+        # dispatcher dictionary capturing handlers for NMDC object to NCBI flat Attribute
+        # type handlers
+        self.type_handlers = {
+            "QuantityValue": handle_quantity_value,
+            "TextValue": handle_text_value,
+            "TimestampValue": handle_timestamp_value,
+            "ControlledTermValue": handle_controlled_term_value,
+            "ControlledIdentifiedTermValue": handle_controlled_identified_term_value,
+            "GeolocationValue": handle_geolocation_value,
+            "float": handle_float_value,
+            "string": handle_string_value,
+        }
+
     def set_element(self, tag, text="", attrib=None, children=None):
         attrib = attrib or {}
         children = children or []
@@ -54,42 +78,67 @@ def set_description(
         )
         self.root.append(description)
 
-    def set_biosample(self, title, spuid, sid, name, pkg, attributes=None):
-        attributes = attributes or {}
-        biosample = self.set_element(
-            "BioSample",
-            attrib={"schema_version": "2.0"},
-            children=[
-                self.set_element(
-                    "SampleId",
-                    children=[
-                        self.set_element("SPUID", sid, {"spuid_namespace": self.org})
-                    ],
-                ),
-                self.set_element(
-                    "Descriptor",
-                    children=[
-                        self.set_element("Title", title),
-                        self.set_element(
-                            "Description", children=[self.set_element("p", spuid)]
-                        ),
-                    ],
-                ),
-                self.set_element(
-                    "Organism", children=[self.set_element("OrganismName", name)]
-                ),
-                self.set_element("Package", pkg),
-                self.set_element(
-                    "Attributes",
-                    children=[
-                        self.set_element(
-                            "Attribute", attributes[key], {"attribute_name": key}
-                        )
-                        for key in sorted(attributes)
-                    ],
-                ),
-            ],
+    def set_biosample(
+        self,
+        title,
+        spuid,
+        sid,
+        name,
+        pkg,
+        nmdc_biosample,
+    ):
+        attribute_mappings, slot_range_mappings = load_mappings(
+            "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/issue-1940/assets/ncbi_mappings/ncbi_attribute_mappings_filled.tsv"
         )
+
+        attributes = {}
+        for json_key, value in nmdc_biosample.items():
+            if isinstance(value, list):
+                continue
+
+            xml_key = attribute_mappings.get(json_key, json_key)
+            value_type = slot_range_mappings.get(
+                json_key, "string"
+            )
+            handler = self.type_handlers.get(
+                value_type, handle_string_value
+            )
+
+            formatted_value = handler(value)
+            attributes[xml_key] = formatted_value
+
+        # Create the BioSample XML block with these attributes
+        biosample_elements = [
+            self.set_element(
+                "SampleId",
+                children=[
+                    self.set_element("SPUID", sid, {"spuid_namespace": self.org})
+                ],
+            ),
+            self.set_element(
+                "Descriptor",
+                children=[
+                    self.set_element("Title", title),
+                    self.set_element(
+                        "Description", children=[self.set_element("p", spuid)]
+                    ),
+                ],
+            ),
+            self.set_element(
+                "Organism", children=[self.set_element("OrganismName", name)]
+            ),
+            self.set_element("Package", pkg),
+            self.set_element(
+                "Attributes",
+                children=[
+                    self.set_element(
+                        "Attribute", attributes[key], {"attribute_name": key}
+                    )
+                    for key in sorted(attributes)
+                ],
+            ),
+        ]
+
         action = self.set_element(
             "Action",
             children=[
@@ -101,7 +150,16 @@ def set_biosample(self, title, spuid, sid, name, pkg, attributes=None):
                             "Data",
                             attrib={"content_type": "XML"},
                             children=[
-                                self.set_element("XmlContent", children=[biosample])
+                                self.set_element(
+                                    "XmlContent",
+                                    children=[
+                                        self.set_element(
+                                            "BioSample",
+                                            attrib={"schema_version": "2.0"},
+                                            children=biosample_elements,
+                                        )
+                                    ],
+                                )
                             ],
                         ),
                         self.set_element(
@@ -121,6 +179,9 @@ def set_biosample(self, title, spuid, sid, name, pkg, attributes=None):
     def get_submission_xml(self):
         self.set_description()
 
+        # TODO: iterate over all biosamples in the study
+        # make call to self.set_biosample() here
+
         rough_string = ET.tostring(self.root, "unicode")
         reparsed = xml.dom.minidom.parseString(rough_string)
         return reparsed.toprettyxml(indent="    ", newl="\n")
diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py
new file mode 100644
index 00000000..e34cae6d
--- /dev/null
+++ b/nmdc_runtime/site/export/ncbi_xml_utils.py
@@ -0,0 +1,95 @@
+from io import StringIO
+import csv
+import requests
+
+
+def handle_quantity_value(slot_value):
+    if "has_numeric_value" in slot_value and "has_unit" in slot_value:
+        return f"{slot_value['has_numeric_value']} {slot_value['has_unit']}"
+    elif (
+        "has_maximum_numeric_value" in slot_value
+        and "has_minimum_numeric_value" in slot_value
+        and "has_unit" in slot_value
+    ):
+        range_value = (
+            slot_value["has_maximum_numeric_value"]
+            - slot_value["has_minimum_numeric_value"]
+        )
+        return f"({range_value}) {slot_value['has_unit']}"
+    elif "has_raw_value" in slot_value:
+        return slot_value["has_raw_value"]
+    return "Unknown format"
+
+
+def handle_text_value(slot_value):
+    return slot_value.get("has_raw_value", "Unknown format")
+
+
+def handle_timestamp_value(slot_value):
+    return slot_value.get("has_raw_value", "Unknown format")
+
+
+def handle_controlled_term_value(slot_value):
+    if "term" in slot_value:
+        term = slot_value["term"]
+        if "name" in term and "id" in term:
+            return f"{term['name']} [{term['id']}]"
+        elif "id" in term:
+            return term["id"]
+        elif "name" in term:
+            return term["name"]
+    elif "has_raw_value" in slot_value:
+        return slot_value["has_raw_value"]
+    return "Unknown format"
+
+
+def handle_controlled_identified_term_value(slot_value):
+    if "term" in slot_value:
+        term = slot_value["term"]
+        if "name" in term and "id" in term:
+            return f"{term['name']} [{term['id']}]"
+    elif "has_raw_value" in slot_value:
+        return slot_value["has_raw_value"]
+    return "Unknown format"
+
+
+def handle_geolocation_value(slot_value):
+    if "latitude" in slot_value and "longitude" in slot_value:
+        return f"{slot_value['latitude']} {slot_value['longitude']}"
+    elif "has_raw_value" in slot_value:
+        return slot_value["has_raw_value"]
+    return "Unknown format"
+
+
+def handle_float_value(slot_value):
+    return f"{slot_value:.2f}"
+
+
+def handle_string_value(slot_value):
+    return f"{slot_value}"
+
+
+def load_mappings(url):
+    response = requests.get(url)
+    response.raise_for_status()
+    file_content = response.text
+
+    attribute_mappings = {}
+    slot_range_mappings = {}
+    reader = csv.DictReader(StringIO(file_content), delimiter="\t")
+    for row in reader:
+        if row["ignore"].strip():
+            continue
+
+        json_key = row["nmdc_schema_slot"]
+        # attribute mappings
+        xml_attribute_name = row["ncbi_biosample_attribute_name"]
+        attribute_mappings[json_key] = (
+            xml_attribute_name if xml_attribute_name else json_key
+        )
+
+        # slot range mappings
+        data_type = row["nmdc_schema_slot_range"]
+        slot_range_mappings[json_key] = data_type if data_type else "default"
+
+    return attribute_mappings, slot_range_mappings

From 7b714a5c0e86c639fa27b7496867265c4821f555 Mon Sep 17 00:00:00 2001
From: github-actions <github-actions@github.com>
Date: Wed, 8 May 2024 22:56:06 +0000
Subject: [PATCH 04/27] style: reformat

---
 nmdc_runtime/site/export/ncbi_xml.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
index 2ff527a7..da3577d4 100644
--- a/nmdc_runtime/site/export/ncbi_xml.py
+++ b/nmdc_runtime/site/export/ncbi_xml.py
@@ -97,12 +97,8 @@ def set_biosample(
                 continue
 
             xml_key = attribute_mappings.get(json_key, json_key)
-            value_type = slot_range_mappings.get(
-                json_key, "string"
-            )
-            handler = self.type_handlers.get(
-                value_type, handle_string_value
-            )
+            value_type = slot_range_mappings.get(json_key, "string")
+            handler = self.type_handlers.get(value_type, handle_string_value)
 
             formatted_value = handler(value)
             attributes[xml_key] = formatted_value

From d992f1f6b4089fb1ce0a1f12711736b3094c8edf Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Thu, 9 May 2024 11:20:15 -0700
Subject: [PATCH 05/27] implement set_bioproject() method to create <Action>
 block for NCBI BioProject

---
 nmdc_runtime/site/export/ncbi_xml.py | 68 ++++++++++++++++++++++++++--
 1 file changed, 65 insertions(+), 3 deletions(-)

diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
index da3577d4..c32fa0ed 100644
--- a/nmdc_runtime/site/export/ncbi_xml.py
+++ b/nmdc_runtime/site/export/ncbi_xml.py
@@ -1,3 +1,8 @@
+import json
+import datetime
+import xml.etree.ElementTree as ET
+import xml.dom.minidom
+
 from nmdc_runtime.site.export.ncbi_xml_utils import (
     handle_controlled_identified_term_value,
     handle_controlled_term_value,
@@ -9,9 +14,6 @@
     handle_string_value,
     load_mappings,
 )
-import datetime
-import xml.etree.ElementTree as ET
-import xml.dom.minidom
 
 
 class NCBISubmissionXML:
@@ -78,6 +80,64 @@ def set_description(
         )
         self.root.append(description)
 
+    def set_descriptor(self, title, description, url):
+        descriptor_elements = []
+        descriptor_elements.append(self.set_element("Title", title))
+        descriptor_elements.append(
+            self.set_element(
+                "Description", children=[self.set_element("p", description)]
+            )
+        )
+
+        external_resources = json.loads(url)
+        for label, link in external_resources.items():
+            external_link = self.set_element("ExternalLink", attrib={"label": label})
+            url_element = self.set_element("URL", link)
+            external_link.append(url_element)
+            descriptor_elements.append(external_link)
+
+        return descriptor_elements
+
+    def set_bioproject(self, title, project_id, description, data_type, url):
+        action = self.set_element("Action")
+        add_data = self.set_element("AddData", attrib={"target_db": "BioProject"})
+
+        data_element = self.set_element("Data", attrib={"content_type": "XML"})
+        xml_content = self.set_element("XmlContent")
+        project = self.set_element("Project", attrib={"schema_version": "2.0"})
+
+        project_id_element = self.set_element("ProjectID")
+        spuid = self.set_element("SPUID", project_id, {"spuid_namespace": self.org})
+        project_id_element.append(spuid)
+
+        descriptor = self.set_descriptor(title, description, url)
+        project_type = self.set_element("ProjectType")
+        project_type_submission = self.set_element(
+            "ProjectTypeSubmission", attrib={"sample_scope": "eEnvironment"}
+        )
+        intended_data_type_set = self.set_element("IntendedDataTypeSet")
+        data_type_element = self.set_element("DataType", data_type)
+
+        intended_data_type_set.append(data_type_element)
+        project_type_submission.append(intended_data_type_set)
+        project_type.append(project_type_submission)
+
+        project.extend([project_id_element] + descriptor + [project_type])
+
+        xml_content.append(project)
+        data_element.append(xml_content)
+        add_data.append(data_element)
+
+        identifier = self.set_element("Identifier")
+        spuid_identifier = self.set_element(
+            "SPUID", project_id, {"spuid_namespace": self.org}
+        )
+        identifier.append(spuid_identifier)
+        add_data.append(identifier)
+
+        action.append(add_data)
+        self.root.append(action)
+
     def set_biosample(
         self,
         title,
@@ -175,6 +235,8 @@ def set_biosample(
     def get_submission_xml(self):
         self.set_description()
 
+        # initialize/make call to self.set_bioproject() here
+
         # TODO: iterate over all biosamples in the study
         # make call to self.set_biosample() here
 

From 71be54f242f46a9099a39ed520d41b503bc42660 Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Thu, 9 May 2024 17:06:01 -0700
Subject: [PATCH 06/27] capture non-Attribute submission metadata through Dagit
 repo interface

---
 nmdc_runtime/site/export/ncbi_xml.py | 63 +++++++++++++++++-----------
 nmdc_runtime/site/graphs.py          |  4 +-
 nmdc_runtime/site/ops.py             | 62 +++++++++++++++++++++++++--
 nmdc_runtime/site/repository.py      | 26 +++++++++++-
 4 files changed, 124 insertions(+), 31 deletions(-)

diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
index c32fa0ed..f868174c 100644
--- a/nmdc_runtime/site/export/ncbi_xml.py
+++ b/nmdc_runtime/site/export/ncbi_xml.py
@@ -18,11 +18,13 @@
 
 class NCBISubmissionXML:
     def __init__(
-        self, study_id: str, org="National Microbiome Data Collaborative (NMDC)"
+        self, ncbi_submission_fields: dict
     ):
         self.root = ET.Element("Submission")
-        self.study_id = study_id
-        self.org = org
+        self.nmdc_study_id = ncbi_submission_fields.get("nmdc_study_id")
+        self.ncbi_submission_metadata = ncbi_submission_fields.get("ncbi_submission_metadata", {})
+        self.ncbi_bioproject_metadata = ncbi_submission_fields.get("ncbi_bioproject_metadata", {})
+        self.ncbi_biosample_metadata = ncbi_submission_fields.get("ncbi_biosample_metadata", {})
 
         # dispatcher dictionary capturing handlers for NMDC object to NCBI flat Attribute
         # type handlers
@@ -47,19 +49,19 @@ def set_element(self, tag, text="", attrib=None, children=None):
         return element
 
     def set_description(
-        self, email="aclum@lbl.gov", user="NMDC", first="Alicia", last="Clum", date=None
+        self, email, user, first, last, org, date=None
     ):
         date = date or datetime.datetime.now().strftime("%Y-%m-%d")
         description = self.set_element(
             "Description",
             children=[
-                self.set_element("Comment", f"NMDC Submission for {self.study_id}"),
+                self.set_element("Comment", f"NMDC Submission for {self.nmdc_study_id}"),
                 self.set_element("Submitter", attrib={"user_name": user}),
                 self.set_element(
                     "Organization",
                     attrib={"role": "owner", "type": "center"},
                     children=[
-                        self.set_element("Name", self.org),
+                        self.set_element("Name", org),
                         self.set_element(
                             "Contact",
                             attrib={"email": email},
@@ -80,7 +82,7 @@ def set_description(
         )
         self.root.append(description)
 
-    def set_descriptor(self, title, description, url):
+    def set_descriptor(self, title, description):
         descriptor_elements = []
         descriptor_elements.append(self.set_element("Title", title))
         descriptor_elements.append(
@@ -89,16 +91,9 @@ def set_descriptor(self, title, description, url):
             )
         )
 
-        external_resources = json.loads(url)
-        for label, link in external_resources.items():
-            external_link = self.set_element("ExternalLink", attrib={"label": label})
-            url_element = self.set_element("URL", link)
-            external_link.append(url_element)
-            descriptor_elements.append(external_link)
-
         return descriptor_elements
 
-    def set_bioproject(self, title, project_id, description, data_type, url):
+    def set_bioproject(self, title, project_id, description, data_type, org):
         action = self.set_element("Action")
         add_data = self.set_element("AddData", attrib={"target_db": "BioProject"})
 
@@ -107,10 +102,10 @@ def set_bioproject(self, title, project_id, description, data_type, url):
         project = self.set_element("Project", attrib={"schema_version": "2.0"})
 
         project_id_element = self.set_element("ProjectID")
-        spuid = self.set_element("SPUID", project_id, {"spuid_namespace": self.org})
+        spuid = self.set_element("SPUID", project_id, {"spuid_namespace": org})
         project_id_element.append(spuid)
 
-        descriptor = self.set_descriptor(title, description, url)
+        descriptor = self.set_descriptor(title, description)
         project_type = self.set_element("ProjectType")
         project_type_submission = self.set_element(
             "ProjectTypeSubmission", attrib={"sample_scope": "eEnvironment"}
@@ -130,7 +125,7 @@ def set_bioproject(self, title, project_id, description, data_type, url):
 
         identifier = self.set_element("Identifier")
         spuid_identifier = self.set_element(
-            "SPUID", project_id, {"spuid_namespace": self.org}
+            "SPUID", project_id, {"spuid_namespace": org}
         )
         identifier.append(spuid_identifier)
         add_data.append(identifier)
@@ -145,6 +140,7 @@ def set_biosample(
         sid,
         name,
         pkg,
+        org,
         nmdc_biosample,
     ):
         attribute_mappings, slot_range_mappings = load_mappings(
@@ -168,7 +164,7 @@ def set_biosample(
             self.set_element(
                 "SampleId",
                 children=[
-                    self.set_element("SPUID", sid, {"spuid_namespace": self.org})
+                    self.set_element("SPUID", sid, {"spuid_namespace": org})
                 ],
             ),
             self.set_element(
@@ -222,7 +218,7 @@ def set_biosample(
                             "Identifier",
                             children=[
                                 self.set_element(
-                                    "SPUID", sid, {"spuid_namespace": self.org}
+                                    "SPUID", sid, {"spuid_namespace": org}
                                 )
                             ],
                         ),
@@ -233,12 +229,31 @@ def set_biosample(
         self.root.append(action)
 
     def get_submission_xml(self):
-        self.set_description()
+        self.set_description(
+            email=self.ncbi_submission_metadata.get("email", ""),
+            user=self.ncbi_submission_metadata.get("user", ""),
+            first=self.ncbi_submission_metadata.get("first", ""),
+            last=self.ncbi_submission_metadata.get("last", ""),
+            org=self.ncbi_submission_metadata.get("organization", ""),
+        )
 
-        # initialize/make call to self.set_bioproject() here
+        self.set_bioproject(
+            title=self.ncbi_bioproject_metadata.get("title", ""),
+            project_id=self.ncbi_bioproject_metadata.get("project_id", ""),
+            description=self.ncbi_bioproject_metadata.get("description", ""),
+            data_type=self.ncbi_bioproject_metadata.get("data_type", ""),
+            org=self.ncbi_submission_metadata.get("organization", ""),
+        )
 
-        # TODO: iterate over all biosamples in the study
-        # make call to self.set_biosample() here
+        self.set_biosample(
+            title=self.ncbi_biosample_metadata.get("title", ""),
+            spuid=self.ncbi_biosample_metadata.get("spuid", ""),
+            sid=self.ncbi_biosample_metadata.get("sid", ""),
+            name=self.ncbi_biosample_metadata.get("name", ""),
+            pkg=self.ncbi_biosample_metadata.get("pkg", ""),
+            org=self.ncbi_submission_metadata.get("organization", ""),
+            nmdc_biosample={}
+        )
 
         rough_string = ET.tostring(self.root, "unicode")
         reparsed = xml.dom.minidom.parseString(rough_string)
diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py
index fccefe9a..f1b755d6 100644
--- a/nmdc_runtime/site/graphs.py
+++ b/nmdc_runtime/site/graphs.py
@@ -388,6 +388,6 @@ def ingest_neon_surface_water_metadata():
 
 @graph
 def nmdc_study_to_ncbi_submission_export():
-    study_id = get_ncbi_export_pipeline_inputs()
-    xml_data = ncbi_submission_xml_from_nmdc_study(study_id)
+    ncbi_submission_fields = get_ncbi_export_pipeline_inputs()
+    xml_data = ncbi_submission_xml_from_nmdc_study(ncbi_submission_fields)
     ncbi_submission_xml_asset(xml_data)
diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 0d19ffaa..58352640 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -31,6 +31,8 @@
     String,
     op,
     Optional,
+    Field,
+    Permissive,
 )
 from gridfs import GridFS
 from linkml_runtime.dumpers import json_dumper
@@ -998,16 +1000,68 @@ def site_code_mapping() -> dict:
         )
 
 
-@op(config_schema={"study_id": str})
+@op(
+    config_schema={
+        "nmdc_study_id": str,
+        "ncbi_submission_metadata": Field(
+            Permissive(
+                {
+                    "email": String,
+                    "first": String,
+                    "last": String,
+                    "user": String,
+                }
+            ),
+            is_required=True,
+            description="General metadata about the NCBI submission.",
+        ),
+        "ncbi_bioproject_metadata": Field(
+            Permissive(
+                {
+                    "title": String,
+                    "project_id": String,
+                    "description": String,
+                    "data_type": String,
+                }
+            ),
+            is_required=True,
+            description="Metadata for NCBI BioProject in the Submission.",
+        ),
+        "ncbi_biosample_metadata": Field(
+            Permissive(
+                {
+                    "title": String,
+                    "spuid": String,
+                    "sid": String,
+                    "name": String,
+                    "pkg": String,
+                }
+            ),
+            is_required=True,
+            description="Metadata for one or many NCBI BioSample in the Submission.",
+        ),
+    },
+    out=Out(Dict),
+)
 def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
-    return context.op_config["study_id"]
+    nmdc_study_id = context.op_config["nmdc_study_id"]
+    ncbi_submission_metadata = context.op_config.get("ncbi_submission_metadata", {})
+    ncbi_bioproject_metadata = context.op_config.get("ncbi_bioproject_metadata", {})
+    ncbi_biosample_metadata = context.op_config.get("ncbi_biosample_metadata", {})
+
+    return {
+        "nmdc_study_id": nmdc_study_id,
+        "ncbi_submission_metadata": ncbi_submission_metadata,
+        "ncbi_bioproject_metadata": ncbi_bioproject_metadata,
+        "ncbi_biosample_metadata": ncbi_biosample_metadata,
+    }
 
 
 @op
 def ncbi_submission_xml_from_nmdc_study(
     context: OpExecutionContext,
-    study_id: str,
+    ncbi_exporter_metadata: dict,
 ) -> str:
-    ncbi_exporter = NCBISubmissionXML(study_id)
+    ncbi_exporter = NCBISubmissionXML(ncbi_exporter_metadata)
     ncbi_xml = ncbi_exporter.get_submission_xml()
     return ncbi_xml
diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py
index 9503d9b6..90651210 100644
--- a/nmdc_runtime/site/repository.py
+++ b/nmdc_runtime/site/repository.py
@@ -859,7 +859,31 @@ def biosample_export():
         nmdc_study_to_ncbi_submission_export.to_job(
             config={
                 "ops": {
-                    "get_ncbi_export_pipeline_inputs": {"config": {"study_id": ""}},
+                    "get_ncbi_export_pipeline_inputs": {
+                        "config": {
+                            "nmdc_study_id": "",
+                            "ncbi_submission_metadata": {
+                                "email": "",
+                                "first": "",
+                                "last": "",
+                                "user": "",
+                                "organization": "",
+                            },
+                            "ncbi_bioproject_metadata": {
+                                "title": "",
+                                "project_id": "",
+                                "description": "",
+                                "data_type": "",
+                            },
+                            "ncbi_biosample_metadata": {
+                                "title": "",
+                                "spuid": "",
+                                "sid": "",
+                                "name": "",
+                                "pkg": "",
+                            },
+                        }
+                    },
                 },
             },
         ),

From 7fb364f997e7d52b6a693fe2b4031cc077afe3ef Mon Sep 17 00:00:00 2001
From: github-actions <github-actions@github.com>
Date: Fri, 10 May 2024 00:06:43 +0000
Subject: [PATCH 07/27] style: reformat

---
 nmdc_runtime/site/export/ncbi_xml.py | 34 ++++++++++++++--------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
index f868174c..314baa75 100644
--- a/nmdc_runtime/site/export/ncbi_xml.py
+++ b/nmdc_runtime/site/export/ncbi_xml.py
@@ -17,14 +17,18 @@
 
 
 class NCBISubmissionXML:
-    def __init__(
-        self, ncbi_submission_fields: dict
-    ):
+    def __init__(self, ncbi_submission_fields: dict):
         self.root = ET.Element("Submission")
         self.nmdc_study_id = ncbi_submission_fields.get("nmdc_study_id")
-        self.ncbi_submission_metadata = ncbi_submission_fields.get("ncbi_submission_metadata", {})
-        self.ncbi_bioproject_metadata = ncbi_submission_fields.get("ncbi_bioproject_metadata", {})
-        self.ncbi_biosample_metadata = ncbi_submission_fields.get("ncbi_biosample_metadata", {})
+        self.ncbi_submission_metadata = ncbi_submission_fields.get(
+            "ncbi_submission_metadata", {}
+        )
+        self.ncbi_bioproject_metadata = ncbi_submission_fields.get(
+            "ncbi_bioproject_metadata", {}
+        )
+        self.ncbi_biosample_metadata = ncbi_submission_fields.get(
+            "ncbi_biosample_metadata", {}
+        )
 
         # dispatcher dictionary capturing handlers for NMDC object to NCBI flat Attribute
         # type handlers
@@ -48,14 +52,14 @@ def set_element(self, tag, text="", attrib=None, children=None):
             element.append(child)
         return element
 
-    def set_description(
-        self, email, user, first, last, org, date=None
-    ):
+    def set_description(self, email, user, first, last, org, date=None):
         date = date or datetime.datetime.now().strftime("%Y-%m-%d")
         description = self.set_element(
             "Description",
             children=[
-                self.set_element("Comment", f"NMDC Submission for {self.nmdc_study_id}"),
+                self.set_element(
+                    "Comment", f"NMDC Submission for {self.nmdc_study_id}"
+                ),
                 self.set_element("Submitter", attrib={"user_name": user}),
                 self.set_element(
                     "Organization",
@@ -163,9 +167,7 @@ def set_biosample(
         biosample_elements = [
             self.set_element(
                 "SampleId",
-                children=[
-                    self.set_element("SPUID", sid, {"spuid_namespace": org})
-                ],
+                children=[self.set_element("SPUID", sid, {"spuid_namespace": org})],
             ),
             self.set_element(
                 "Descriptor",
@@ -217,9 +219,7 @@ def set_biosample(
                         self.set_element(
                             "Identifier",
                             children=[
-                                self.set_element(
-                                    "SPUID", sid, {"spuid_namespace": org}
-                                )
+                                self.set_element("SPUID", sid, {"spuid_namespace": org})
                             ],
                         ),
                     ],
@@ -252,7 +252,7 @@ def get_submission_xml(self):
             name=self.ncbi_biosample_metadata.get("name", ""),
             pkg=self.ncbi_biosample_metadata.get("pkg", ""),
             org=self.ncbi_submission_metadata.get("organization", ""),
-            nmdc_biosample={}
+            nmdc_biosample={},
         )
 
         rough_string = ET.tostring(self.root, "unicode")

From ae062d16e44e858bdd072932392d6d6398780fea Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Fri, 10 May 2024 12:51:22 -0700
Subject: [PATCH 08/27] process all biosamples from a given NMDC study for NCBI
 XML translation

---
 nmdc_runtime/site/export/ncbi_xml.py        | 157 +++++++++++---------
 nmdc_runtime/site/export/nmdc_api_client.py |  37 +++--
 nmdc_runtime/site/ops.py                    |   5 +-
 nmdc_runtime/site/repository.py             |   4 +-
 4 files changed, 110 insertions(+), 93 deletions(-)

diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
index 314baa75..03092930 100644
--- a/nmdc_runtime/site/export/ncbi_xml.py
+++ b/nmdc_runtime/site/export/ncbi_xml.py
@@ -1,4 +1,3 @@
-import json
 import datetime
 import xml.etree.ElementTree as ET
 import xml.dom.minidom
@@ -14,6 +13,7 @@
     handle_string_value,
     load_mappings,
 )
+from nmdc_runtime.site.export.nmdc_api_client import NMDCApiClient
 
 
 class NCBISubmissionXML:
@@ -29,6 +29,7 @@ def __init__(self, ncbi_submission_fields: dict):
         self.ncbi_biosample_metadata = ncbi_submission_fields.get(
             "ncbi_biosample_metadata", {}
         )
+        self.nmdc_api_client = NMDCApiClient()
 
         # dispatcher dictionary capturing handlers for NMDC object to NCBI flat Attribute
         # type handlers
@@ -142,91 +143,95 @@ def set_biosample(
         title,
         spuid,
         sid,
-        name,
-        pkg,
+        organism_name,
+        package,
         org,
-        nmdc_biosample,
+        nmdc_biosamples,
     ):
         attribute_mappings, slot_range_mappings = load_mappings(
             "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/issue-1940/assets/ncbi_mappings/ncbi_attribute_mappings_filled.tsv"
         )
 
-        attributes = {}
-        for json_key, value in nmdc_biosample.items():
-            if isinstance(value, list):
-                continue
+        for biosample in nmdc_biosamples:
+            attributes = {}
+            for json_key, value in biosample.items():
+                if isinstance(value, list):
+                    continue  # Skip processing for list values
 
-            xml_key = attribute_mappings.get(json_key, json_key)
-            value_type = slot_range_mappings.get(json_key, "string")
-            handler = self.type_handlers.get(value_type, handle_string_value)
+                xml_key = attribute_mappings.get(json_key, json_key)
+                value_type = slot_range_mappings.get(json_key, "string")
+                handler = self.type_handlers.get(value_type, handle_string_value)
 
-            formatted_value = handler(value)
-            attributes[xml_key] = formatted_value
+                formatted_value = handler(value)
+                attributes[xml_key] = formatted_value
 
-        # Create the BioSample XML block with these attributes
-        biosample_elements = [
-            self.set_element(
-                "SampleId",
-                children=[self.set_element("SPUID", sid, {"spuid_namespace": org})],
-            ),
-            self.set_element(
-                "Descriptor",
-                children=[
-                    self.set_element("Title", title),
-                    self.set_element(
-                        "Description", children=[self.set_element("p", spuid)]
-                    ),
-                ],
-            ),
-            self.set_element(
-                "Organism", children=[self.set_element("OrganismName", name)]
-            ),
-            self.set_element("Package", pkg),
-            self.set_element(
-                "Attributes",
-                children=[
-                    self.set_element(
-                        "Attribute", attributes[key], {"attribute_name": key}
-                    )
-                    for key in sorted(attributes)
-                ],
-            ),
-        ]
-
-        action = self.set_element(
-            "Action",
-            children=[
+            # Create the BioSample XML block with these attributes for each biosample
+            biosample_elements = [
                 self.set_element(
-                    "AddData",
-                    attrib={"target_db": "BioSample"},
+                    "SampleId",
+                    children=[self.set_element("SPUID", sid, {"spuid_namespace": org})],
+                ),
+                self.set_element(
+                    "Descriptor",
                     children=[
+                        self.set_element("Title", title),
                         self.set_element(
-                            "Data",
-                            attrib={"content_type": "XML"},
-                            children=[
-                                self.set_element(
-                                    "XmlContent",
-                                    children=[
-                                        self.set_element(
-                                            "BioSample",
-                                            attrib={"schema_version": "2.0"},
-                                            children=biosample_elements,
-                                        )
-                                    ],
-                                )
-                            ],
+                            "Description", children=[self.set_element("p", spuid)]
                         ),
+                    ],
+                ),
+                self.set_element(
+                    "Organism",
+                    children=[self.set_element("OrganismName", organism_name)],
+                ),
+                self.set_element("Package", package),
+                self.set_element(
+                    "Attributes",
+                    children=[
                         self.set_element(
-                            "Identifier",
-                            children=[
-                                self.set_element("SPUID", sid, {"spuid_namespace": org})
-                            ],
-                        ),
+                            "Attribute", attributes[key], {"attribute_name": key}
+                        )
+                        for key in sorted(attributes)
                     ],
-                )
-            ],
-        )
-        self.root.append(action)
+                ),
+            ]
+
+            action = self.set_element(
+                "Action",
+                children=[
+                    self.set_element(
+                        "AddData",
+                        attrib={"target_db": "BioSample"},
+                        children=[
+                            self.set_element(
+                                "Data",
+                                attrib={"content_type": "XML"},
+                                children=[
+                                    self.set_element(
+                                        "XmlContent",
+                                        children=[
+                                            self.set_element(
+                                                "BioSample",
+                                                attrib={"schema_version": "2.0"},
+                                                children=biosample_elements,
+                                            ),
+                                        ],
+                                    ),
+                                ],
+                            ),
+                            self.set_element(
+                                "Identifier",
+                                children=[
+                                    self.set_element(
+                                        "SPUID", sid, {"spuid_namespace": org}
+                                    ),
+                                ],
+                            ),
+                        ],
+                    ),
+                ],
+            )
+            self.root.append(action)
 
     def get_submission_xml(self):
         self.set_description(
@@ -245,14 +250,18 @@ def get_submission_xml(self):
             org=self.ncbi_submission_metadata.get("organization", ""),
         )
 
+        biosamples_list = self.nmdc_api_client.get_biosamples_part_of_study(
+            self.nmdc_study_id
+        )
+
         self.set_biosample(
             title=self.ncbi_biosample_metadata.get("title", ""),
             spuid=self.ncbi_biosample_metadata.get("spuid", ""),
             sid=self.ncbi_biosample_metadata.get("sid", ""),
-            name=self.ncbi_biosample_metadata.get("name", ""),
-            pkg=self.ncbi_biosample_metadata.get("pkg", ""),
+            organism_name=self.ncbi_biosample_metadata.get("organism_name", ""),
+            package=self.ncbi_biosample_metadata.get("package", ""),
             org=self.ncbi_submission_metadata.get("organization", ""),
-            nmdc_biosample={},
+            nmdc_biosamples=biosamples_list,
         )
 
         rough_string = ET.tostring(self.root, "unicode")
diff --git a/nmdc_runtime/site/export/nmdc_api_client.py b/nmdc_runtime/site/export/nmdc_api_client.py
index 6d7938e9..b4dd38d7 100644
--- a/nmdc_runtime/site/export/nmdc_api_client.py
+++ b/nmdc_runtime/site/export/nmdc_api_client.py
@@ -1,11 +1,18 @@
+import os
+import json
 import requests
 
+from dotenv import load_dotenv
+
 
 class NMDCApiClient:
-    def __init__(self, api_base_url):
-        if not api_base_url.endswith("/"):
-            api_base_url += "/"
-        self.base_url = api_base_url
+    def __init__(self, api_base_url=None):
+        load_dotenv()
+        self.base_url = api_base_url or os.getenv("API_HOST")
+        if not self.base_url:
+            raise ValueError("API base URL for runtime environment is required.")
+        if not self.base_url.endswith("/"):
+            self.base_url += "/"
         self.headers = {
             "accept": "application/json",
             "Content-Type": "application/json",
@@ -16,19 +23,19 @@ def get_biosamples_part_of_study(self, study_id: str) -> list[dict]:
         Get the biosamples that are part of a study.
         """
         biosample_records = []
-        params = {
-            "filter": '{"part_of": "' + study_id + '"}',
-            "max_page_size": "1000",
-        }
+        params = {"filter": json.dumps({"part_of": study_id}), "max_page_size": "1000"}
         url = self.base_url + "nmdcschema/biosample_set"
-        response = requests.get(url, params=params, headers=self.headers)
-        response.raise_for_status()
-        biosample_records.extend(response.json()["resources"])
-        # Get the next page of results, if any
-        while response.json().get("next_page_token") is not None:
-            params["page_token"] = response.json()["next_page_token"]
+
+        while True:
             response = requests.get(url, params=params, headers=self.headers)
             response.raise_for_status()
-            biosample_records.extend(response.json()["resources"])
+            data = response.json()
+            biosample_records.extend(data["resources"])
+
+            # Check if there's a next page
+            next_page_token = data.get("next_page_token")
+            if not next_page_token:
+                break
+            params["page_token"] = next_page_token
 
         return biosample_records
diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 58352640..c327ee9d 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -1010,6 +1010,7 @@ def site_code_mapping() -> dict:
                     "first": String,
                     "last": String,
                     "user": String,
+                    "organization": String,
                 }
             ),
             is_required=True,
@@ -1033,8 +1034,8 @@ def site_code_mapping() -> dict:
                     "title": String,
                     "spuid": String,
                     "sid": String,
-                    "name": String,
-                    "pkg": String,
+                    "organism_name": String,
+                    "package": String,
                 }
             ),
             is_required=True,
diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py
index 90651210..1d7d4691 100644
--- a/nmdc_runtime/site/repository.py
+++ b/nmdc_runtime/site/repository.py
@@ -879,8 +879,8 @@ def biosample_export():
                                 "title": "",
                                 "spuid": "",
                                 "sid": "",
-                                "name": "",
-                                "pkg": "",
+                                "organism_name": "",
+                                "package": "",
                             },
                         }
                     },

From 24e8ad97b024d18f446f14f257b16e23c905b586 Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Fri, 10 May 2024 14:15:01 -0700
Subject: [PATCH 09/27] allow users to pass in the attribute mapping file URL
 through the Dagit interface

---
 nmdc_runtime/site/export/ncbi_xml.py | 5 ++++-
 nmdc_runtime/site/ops.py             | 5 +++++
 nmdc_runtime/site/repository.py      | 1 +
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
index 03092930..ab40f3d9 100644
--- a/nmdc_runtime/site/export/ncbi_xml.py
+++ b/nmdc_runtime/site/export/ncbi_xml.py
@@ -20,6 +20,9 @@ class NCBISubmissionXML:
     def __init__(self, ncbi_submission_fields: dict):
         self.root = ET.Element("Submission")
         self.nmdc_study_id = ncbi_submission_fields.get("nmdc_study_id")
+        self.nmdc_ncbi_attribute_mapping_file_url = ncbi_submission_fields.get(
+            "nmdc_ncbi_attribute_mapping_file_url"
+        )
         self.ncbi_submission_metadata = ncbi_submission_fields.get(
             "ncbi_submission_metadata", {}
         )
@@ -149,7 +152,7 @@ def set_biosample(
         nmdc_biosamples,
     ):
         attribute_mappings, slot_range_mappings = load_mappings(
-            "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/issue-1940/assets/ncbi_mappings/ncbi_attribute_mappings_filled.tsv"
+            self.nmdc_ncbi_attribute_mapping_file_url
         )
 
         for biosample in nmdc_biosamples:
diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index c327ee9d..29545e78 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -1003,6 +1003,7 @@ def site_code_mapping() -> dict:
 @op(
     config_schema={
         "nmdc_study_id": str,
+        "nmdc_ncbi_attribute_mapping_file_url": str,
         "ncbi_submission_metadata": Field(
             Permissive(
                 {
@@ -1046,12 +1047,16 @@ def site_code_mapping() -> dict:
 )
 def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
     nmdc_study_id = context.op_config["nmdc_study_id"]
+    nmdc_ncbi_attribute_mapping_file_url = context.op_config[
+        "nmdc_ncbi_attribute_mapping_file_url"
+    ]
     ncbi_submission_metadata = context.op_config.get("ncbi_submission_metadata", {})
     ncbi_bioproject_metadata = context.op_config.get("ncbi_bioproject_metadata", {})
     ncbi_biosample_metadata = context.op_config.get("ncbi_biosample_metadata", {})
 
     return {
         "nmdc_study_id": nmdc_study_id,
+        "nmdc_ncbi_attribute_mapping_file_url": nmdc_ncbi_attribute_mapping_file_url,
         "ncbi_submission_metadata": ncbi_submission_metadata,
         "ncbi_bioproject_metadata": ncbi_bioproject_metadata,
         "ncbi_biosample_metadata": ncbi_biosample_metadata,
diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py
index 1d7d4691..8075551d 100644
--- a/nmdc_runtime/site/repository.py
+++ b/nmdc_runtime/site/repository.py
@@ -862,6 +862,7 @@ def biosample_export():
                     "get_ncbi_export_pipeline_inputs": {
                         "config": {
                             "nmdc_study_id": "",
+                            "nmdc_ncbi_attribute_mapping_file_url": "",
                             "ncbi_submission_metadata": {
                                 "email": "",
                                 "first": "",

From 48e7be347c41a4101d01c447ec48305a0609e093 Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Mon, 13 May 2024 12:13:11 -0700
Subject: [PATCH 10/27] remove spuid, sid and title NCBI BioSample
 configuration parameters

---
 nmdc_runtime/site/export/ncbi_xml.py | 27 +++++++++++++++++----------
 nmdc_runtime/site/ops.py             |  3 ---
 nmdc_runtime/site/repository.py      |  3 ---
 3 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
index ab40f3d9..311b057a 100644
--- a/nmdc_runtime/site/export/ncbi_xml.py
+++ b/nmdc_runtime/site/export/ncbi_xml.py
@@ -143,9 +143,6 @@ def set_bioproject(self, title, project_id, description, data_type, org):
 
     def set_biosample(
         self,
-        title,
-        spuid,
-        sid,
         organism_name,
         package,
         org,
@@ -157,10 +154,17 @@ def set_biosample(
 
         for biosample in nmdc_biosamples:
             attributes = {}
+            sample_id_value = None
+
             for json_key, value in biosample.items():
                 if isinstance(value, list):
                     continue  # Skip processing for list values
 
+                # Special handling for NMDC Biosample "id"
+                if json_key == "id":
+                    sample_id_value = value
+                    continue
+
                 xml_key = attribute_mappings.get(json_key, json_key)
                 value_type = slot_range_mappings.get(json_key, "string")
                 handler = self.type_handlers.get(value_type, handle_string_value)
@@ -172,14 +176,18 @@ def set_biosample(
             biosample_elements = [
                 self.set_element(
                     "SampleId",
-                    children=[self.set_element("SPUID", sid, {"spuid_namespace": org})],
+                    children=[
+                        self.set_element(
+                            "SPUID", sample_id_value, {"spuid_namespace": org}
+                        )
+                    ],
                 ),
                 self.set_element(
                     "Descriptor",
                     children=[
-                        self.set_element("Title", title),
                         self.set_element(
-                            "Description", children=[self.set_element("p", spuid)]
+                            "Title",
+                            f"NMDC Biosample {sample_id_value} from {organism_name} part of {self.nmdc_study_id} study",
                         ),
                     ],
                 ),
@@ -226,7 +234,9 @@ def set_biosample(
                                 "Identifier",
                                 children=[
                                     self.set_element(
-                                        "SPUID", sid, {"spuid_namespace": org}
+                                        "SPUID",
+                                        sample_id_value,
+                                        {"spuid_namespace": org},
                                     ),
                                 ],
                             ),
@@ -258,9 +268,6 @@ def get_submission_xml(self):
         )
 
         self.set_biosample(
-            title=self.ncbi_biosample_metadata.get("title", ""),
-            spuid=self.ncbi_biosample_metadata.get("spuid", ""),
-            sid=self.ncbi_biosample_metadata.get("sid", ""),
             organism_name=self.ncbi_biosample_metadata.get("organism_name", ""),
             package=self.ncbi_biosample_metadata.get("package", ""),
             org=self.ncbi_submission_metadata.get("organization", ""),
diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 29545e78..df6b07a1 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -1032,9 +1032,6 @@ def site_code_mapping() -> dict:
         "ncbi_biosample_metadata": Field(
             Permissive(
                 {
-                    "title": String,
-                    "spuid": String,
-                    "sid": String,
                     "organism_name": String,
                     "package": String,
                 }
diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py
index 8075551d..3e68892e 100644
--- a/nmdc_runtime/site/repository.py
+++ b/nmdc_runtime/site/repository.py
@@ -877,9 +877,6 @@ def biosample_export():
                                 "data_type": "",
                             },
                             "ncbi_biosample_metadata": {
-                                "title": "",
-                                "spuid": "",
-                                "sid": "",
                                 "organism_name": "",
                                 "package": "",
                             },

From b7a8000a66449cf7127b46366b34ba818cdbec88 Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Mon, 13 May 2024 12:24:03 -0700
Subject: [PATCH 11/27] update handle_quantity_value() in ncbi_xml_utils.py

---
 nmdc_runtime/site/export/ncbi_xml_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py
index e34cae6d..e8e66047 100644
--- a/nmdc_runtime/site/export/ncbi_xml_utils.py
+++ b/nmdc_runtime/site/export/ncbi_xml_utils.py
@@ -15,7 +15,7 @@ def handle_quantity_value(slot_value):
             slot_value["has_maximum_numeric_value"]
             - slot_value["has_minimum_numeric_value"]
         )
-        return f"({range_value}) {slot_value['has_unit']}"
+        return f"{range_value} {slot_value['has_unit']}"
     elif "has_raw_value" in slot_value:
         return slot_value["has_raw_value"]
     return "Unknown format"

From dae0d13045fdb34c06b01f9c35aeeae9331ad88c Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Mon, 13 May 2024 18:09:07 -0700
Subject: [PATCH 12/27] if an NMDC biosample key is not in mapping file, ignore
 it

---
 nmdc_runtime/site/export/ncbi_xml.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
index 311b057a..9fd3ba80 100644
--- a/nmdc_runtime/site/export/ncbi_xml.py
+++ b/nmdc_runtime/site/export/ncbi_xml.py
@@ -165,7 +165,10 @@ def set_biosample(
                     sample_id_value = value
                     continue
 
-                xml_key = attribute_mappings.get(json_key, json_key)
+                if json_key not in attribute_mappings:
+                    continue
+
+                xml_key = attribute_mappings[json_key]
                 value_type = slot_range_mappings.get(json_key, "string")
                 handler = self.type_handlers.get(value_type, handle_string_value)
 

From dfe5f4161585156a7f0f043ea948105c64758394 Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Wed, 15 May 2024 17:31:25 -0700
Subject: [PATCH 13/27] comprehensive test suite for NMDC-to-NCBI export
 pipeline

---
 nmdc_runtime/site/export/ncbi_xml_utils.py |   2 +
 requirements/dev.in                        |   3 +-
 requirements/main.in                       |   1 +
 tests/test_data/test_ncbi_xml.py           | 434 +++++++++++++++++++++
 4 files changed, 439 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_data/test_ncbi_xml.py

diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py
index e8e66047..1ad34aca 100644
--- a/nmdc_runtime/site/export/ncbi_xml_utils.py
+++ b/nmdc_runtime/site/export/ncbi_xml_utils.py
@@ -48,6 +48,8 @@ def handle_controlled_identified_term_value(slot_value):
         term = slot_value["term"]
         if "name" in term and "id" in term:
             return f"{term['name']} [{term['id']}]"
+        elif "id" in term:
+            return term["id"]
     elif "has_raw_value" in slot_value:
         return slot_value["has_raw_value"]
     return "Unknown format"
diff --git a/requirements/dev.in b/requirements/dev.in
index dbe7b8e9..601370de 100644
--- a/requirements/dev.in
+++ b/requirements/dev.in
@@ -11,4 +11,5 @@ pytest-cov
 requests-mock
 setuptools
 twine
-requests-cache
\ No newline at end of file
+requests-cache
+pytest-mock
\ No newline at end of file
diff --git a/requirements/main.in b/requirements/main.in
index 45998c84..4f58b6e3 100644
--- a/requirements/main.in
+++ b/requirements/main.in
@@ -30,6 +30,7 @@ pandas
 passlib[bcrypt]
 pymongo
 pydantic[email]>=1.10.0
+pytest-mock
 python-dotenv
 python-jose[cryptography]
 python-multipart
diff --git a/tests/test_data/test_ncbi_xml.py b/tests/test_data/test_ncbi_xml.py
new file mode 100644
index 00000000..3cd2913c
--- /dev/null
+++ b/tests/test_data/test_ncbi_xml.py
@@ -0,0 +1,434 @@
+from unittest.mock import MagicMock
+import pytest
+from requests.exceptions import HTTPError
+import xml.etree.ElementTree as ET
+
+from nmdc_runtime.site.export.ncbi_xml import NCBISubmissionXML
+from nmdc_runtime.site.export.ncbi_xml_utils import (
+    load_mappings,
+    handle_quantity_value,
+    handle_text_value,
+    handle_timestamp_value,
+    handle_controlled_term_value,
+    handle_controlled_identified_term_value,
+    handle_geolocation_value,
+    handle_float_value,
+    handle_string_value,
+)
+from nmdc_runtime.site.export.nmdc_api_client import NMDCApiClient
+
+MOCK_SUBMISSION_FIELDS = {
+    "nmdc_study_id": "nmdc:sty-11-12345",
+    "nmdc_ncbi_attribute_mapping_file_url": "http://example.com/mappings.tsv",
+    "ncbi_submission_metadata": {
+        "email": "user@example.com",
+        "user": "testuser",
+        "first": "Test",
+        "last": "User",
+        "organization": "Test Org",
+    },
+    "ncbi_bioproject_metadata": {
+        "title": "Test Project",
+        "project_id": "PRJNA12345",
+        "description": "A test project",
+        "data_type": "metagenome",
+    },
+    "ncbi_biosample_metadata": {
+        # "title"/"spuid"/"sid" were dropped: set_biosample uses each biosample's id.
+        "organism_name": "E. coli",
+        "package": "Test Package",
+    },
+}
+
+
+@pytest.fixture
+def ncbi_submission_client():  # exporter under test, fed MOCK_SUBMISSION_FIELDS
+    return NCBISubmissionXML(ncbi_submission_fields=MOCK_SUBMISSION_FIELDS)
+
+
+@pytest.fixture
+def nmdc_api_client():  # base URL is a placeholder; tests patch requests.get
+    return NMDCApiClient(api_base_url="http://fakeapi.com/")
+
+
+@pytest.fixture
+def nmdc_biosample():  # one-element list holding a single Biosample document
+    return [
+        {
+            "analysis_type": ["metagenomics"],  # set_biosample skips list values
+            "biosample_categories": ["NEON"],
+            "collection_date": {"has_raw_value": "2014-08-05T18:40Z"},
+            "conduc": {"has_numeric_value": 567, "has_unit": "uS/cm"},
+            "elev": 1178.7,
+            "env_broad_scale": {
+                "term": {"id": "ENVO:03605008", "name": "freshwater stream biome"}
+            },
+            "env_local_scale": {
+                "term": {"id": "ENVO:03605007", "name": "freshwater stream"}
+            },
+            "env_medium": {"term": {"id": "ENVO:03605006", "name": "stream water"}},
+            "env_package": {"has_raw_value": "water"},
+            "geo_loc_name": {"has_raw_value": "USA: Colorado, Arikaree River"},
+            "id": "nmdc:bsm-12-gnfpt483",  # used as the SPUID, not as an attribute
+            "lat_lon": {"latitude": 39.758359, "longitude": -102.448595},
+            "name": "ARIK.SS.20140805",
+            "part_of": ["nmdc:sty-11-hht5sb92"],
+            "samp_collec_device": "Grab",
+            "temp": {"has_numeric_value": 20.1, "has_unit": "Cel"},
+            "type": "nmdc:Biosample",
+        }
+    ]
+
+
+class TestNCBISubmissionXML:  # each test gets a fresh ncbi_submission_client
+    def test_set_element(self, ncbi_submission_client):  # args: tag, text, attrib
+        element = ncbi_submission_client.set_element("Test", "Hello", {"attr": "value"})
+        assert element.tag == "Test"
+        assert element.text == "Hello"
+        assert element.attrib == {"attr": "value"}
+
+    def test_set_description(self, ncbi_submission_client):
+        ncbi_submission_client.set_description(
+            MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["email"],
+            MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["user"],
+            MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["first"],
+            MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["last"],
+            MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["organization"],
+        )
+        description = ET.tostring(
+            ncbi_submission_client.root.find("Description"), "unicode"
+        )
+        # Re-parse the serialized <Description> and check each field individually.
+        root = ET.fromstring(description)
+        comment = root.find("Comment").text
+        submitter = root.find("Submitter").attrib["user_name"]
+        org_name = root.find("Organization/Name").text
+        contact_email = root.find("Organization/Contact").attrib["email"]
+        contact_first = root.find("Organization/Contact/Name/First").text
+        contact_last = root.find("Organization/Contact/Name/Last").text
+
+        assert comment == "NMDC Submission for nmdc:sty-11-12345"
+        assert submitter == "testuser"
+        assert org_name == "Test Org"
+        assert contact_email == "user@example.com"
+        assert contact_first == "Test"
+        assert contact_last == "User"
+
+    def test_set_bioproject(self, ncbi_submission_client):  # substring checks only
+        ncbi_submission_client.set_bioproject(
+            title=MOCK_SUBMISSION_FIELDS["ncbi_bioproject_metadata"]["title"],
+            project_id=MOCK_SUBMISSION_FIELDS["ncbi_bioproject_metadata"]["project_id"],
+            description=MOCK_SUBMISSION_FIELDS["ncbi_bioproject_metadata"][
+                "description"
+            ],
+            data_type=MOCK_SUBMISSION_FIELDS["ncbi_bioproject_metadata"]["data_type"],
+            org=MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["organization"],
+        )
+        bioproject_xml = ET.tostring(
+            ncbi_submission_client.root.find(".//Project"), "unicode"
+        )
+        assert "Test Project" in bioproject_xml
+        assert "PRJNA12345" in bioproject_xml
+        assert "A test project" in bioproject_xml
+        assert "metagenome" in bioproject_xml
+        assert "Test Org" in bioproject_xml
+
+    def test_set_biosample(self, ncbi_submission_client, nmdc_biosample, mocker):
+        mocker.patch(
+            "nmdc_runtime.site.export.ncbi_xml.load_mappings",
+            return_value=(  # (attribute_mappings, slot_range_mappings)
+                {
+                    "analysis_type": "",
+                    "biosample_categories": "",
+                    "collection_date": "collection_date",
+                    "conduc": "conduc",
+                    "elev": "elev",
+                    "env_broad_scale": "env_broad_scale",
+                    "env_local_scale": "env_local_scale",
+                    "env_medium": "env_medium",
+                    "env_package": "env_package",
+                    "geo_loc_name": "geo_loc_name",
+                    "id": "",
+                    "lat_lon": "lat_lon",
+                    "name": "sample_name",
+                    "part_of": "",
+                    "samp_collec_device": "samp_collect_device",
+                    "temp": "temp",
+                    "type": "",
+                },
+                {
+                    "analysis_type": "AnalysisTypeEnum",
+                    "biosample_categories": "BiosampleCategoryEnum",
+                    "collection_date": "TimestampValue",
+                    "conduc": "QuantityValue",
+                    "elev": "float",
+                    "env_broad_scale": "ControlledIdentifiedTermValue",
+                    "env_local_scale": "ControlledIdentifiedTermValue",
+                    "env_medium": "ControlledIdentifiedTermValue",
+                    "env_package": "TextValue",
+                    "geo_loc_name": "TextValue",
+                    "id": "uriorcurie",
+                    "lat_lon": "GeolocationValue",
+                    "name": "string",
+                    "part_of": "Study",
+                    "samp_collec_device": "string",
+                    "temp": "QuantityValue",
+                    "type": "string",
+                },
+            ),
+        )
+        ncbi_submission_client.set_biosample(
+            organism_name=MOCK_SUBMISSION_FIELDS["ncbi_biosample_metadata"][
+                "organism_name"
+            ],
+            package=MOCK_SUBMISSION_FIELDS["ncbi_biosample_metadata"]["package"],
+            org=MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["organization"],
+            nmdc_biosamples=nmdc_biosample,
+        )
+        biosample_xml = ET.tostring(
+            ncbi_submission_client.root.find(".//BioSample"), "unicode"
+        )
+        assert "E. coli" in biosample_xml
+        assert "Test Package" in biosample_xml
+        assert "Test Org" in biosample_xml
+
+    def test_get_submission_xml(self, mocker, ncbi_submission_client, nmdc_biosample):
+        mocker.patch(
+            "nmdc_runtime.site.export.ncbi_xml.load_mappings",
+            return_value=(  # (attribute_mappings, slot_range_mappings)
+                {
+                    "analysis_type": "",
+                    "biosample_categories": "",
+                    "collection_date": "collection_date",
+                    "conduc": "conduc",
+                    "elev": "elev",
+                    "env_broad_scale": "env_broad_scale",
+                    "env_local_scale": "env_local_scale",
+                    "env_medium": "env_medium",
+                    "env_package": "env_package",
+                    "geo_loc_name": "geo_loc_name",
+                    "id": "",
+                    "lat_lon": "lat_lon",
+                    "name": "sample_name",
+                    "part_of": "",
+                    "samp_collec_device": "samp_collect_device",
+                    "temp": "temp",
+                    "type": "",
+                },
+                {
+                    "analysis_type": "AnalysisTypeEnum",
+                    "biosample_categories": "BiosampleCategoryEnum",
+                    "collection_date": "TimestampValue",
+                    "conduc": "QuantityValue",
+                    "elev": "float",
+                    "env_broad_scale": "ControlledIdentifiedTermValue",
+                    "env_local_scale": "ControlledIdentifiedTermValue",
+                    "env_medium": "ControlledIdentifiedTermValue",
+                    "env_package": "TextValue",
+                    "geo_loc_name": "TextValue",
+                    "id": "uriorcurie",
+                    "lat_lon": "GeolocationValue",
+                    "name": "string",
+                    "part_of": "Study",
+                    "samp_collec_device": "string",
+                    "temp": "QuantityValue",
+                    "type": "string",
+                },
+            ),
+        )
+        # Serve the fixture in place of a live biosamples request to the NMDC API.
+        mocker.patch.object(
+            NMDCApiClient, "get_biosamples_part_of_study", return_value=nmdc_biosample
+        )
+
+        submission_xml = ncbi_submission_client.get_submission_xml()
+
+        assert "nmdc:bsm-12-gnfpt483" in submission_xml
+        assert "E. coli" in submission_xml
+        assert "stream water" in submission_xml
+        assert "USA: Colorado, Arikaree River" in submission_xml
+        assert "2014-08-05T18:40Z" in submission_xml
+        assert "testuser" in submission_xml
+        assert "Test Project" in submission_xml
+
+
+class TestNMDCApiClient:
+    # requests.get is patched in both tests; no real NMDC API call is made.
+    def test_get_biosamples_part_of_study_success(self, mocker, nmdc_api_client):
+        single_page = mocker.MagicMock()
+        single_page.json.return_value = {
+            "resources": [
+                {"id": "nmdc:bsm-12-gnfpt483", "part_of": ["nmdc:sty-11-hht5sb92"]}
+            ],
+            "next_page_token": None,
+        }
+        mocker.patch("requests.get", return_value=single_page)
+        expected = [{"id": "nmdc:bsm-12-gnfpt483", "part_of": ["nmdc:sty-11-hht5sb92"]}]
+        fetched = nmdc_api_client.get_biosamples_part_of_study("nmdc:sty-11-hht5sb92")
+        assert fetched == expected
+
+    def test_get_biosamples_part_of_study_failure(self, mocker, nmdc_api_client):
+        mocker.patch("requests.get", side_effect=HTTPError("API Error"))
+        with pytest.raises(HTTPError):
+            nmdc_api_client.get_biosamples_part_of_study("nmdc:sty-11-hht5sb92")
+
+
+class TestNCBIXMLUtils:  # value handlers and mapping loader from ncbi_xml_utils
+    def test_handle_quantity_value(self):
+        assert (
+            handle_quantity_value({"has_numeric_value": 10, "has_unit": "mg"})
+            == "10 mg"
+        )
+        assert (
+            handle_quantity_value(
+                {
+                    "has_maximum_numeric_value": 15,
+                    "has_minimum_numeric_value": 5,
+                    "has_unit": "kg",
+                }
+            )
+            == "10 kg"  # a min/max pair is rendered as max - min
+        )
+        assert handle_quantity_value({"has_raw_value": "20 units"}) == "20 units"
+        assert handle_quantity_value({}) == "Unknown format"
+
+    def test_handle_text_value(self):
+        assert handle_text_value({"has_raw_value": "Sample Text"}) == "Sample Text"
+        assert handle_text_value({}) == "Unknown format"
+
+    def test_handle_timestamp_value(self):
+        assert handle_timestamp_value({"has_raw_value": "2021-01-01"}) == "2021-01-01"
+        assert handle_timestamp_value({}) == "Unknown format"
+
+    def test_handle_controlled_term_value(self):
+        term_data = {"term": {"name": "Homo sapiens", "id": "NCBITaxon:9606"}}
+        assert (
+            handle_controlled_term_value(term_data) == "Homo sapiens [NCBITaxon:9606]"
+        )
+        assert (
+            handle_controlled_term_value({"term": {"id": "NCBITaxon:9606"}})
+            == "NCBITaxon:9606"
+        )
+        assert (
+            handle_controlled_term_value({"term": {"name": "Homo sapiens"}})
+            == "Homo sapiens"
+        )
+        assert (
+            handle_controlled_term_value(
+                {"has_raw_value": "Homo sapiens [NCBITaxon:9606]"}
+            )
+            == "Homo sapiens [NCBITaxon:9606]"
+        )
+        assert handle_controlled_term_value({}) == "Unknown format"
+
+    def test_handle_controlled_identified_term_value(self):
+        term_data = {"term": {"name": "Homo sapiens", "id": "NCBITaxon:9606"}}
+        assert (
+            handle_controlled_identified_term_value(term_data)
+            == "Homo sapiens [NCBITaxon:9606]"
+        )
+        assert (
+            handle_controlled_identified_term_value({"term": {"id": "NCBITaxon:9606"}})
+            == "NCBITaxon:9606"
+        )
+        assert (
+            handle_controlled_identified_term_value({"term": {"name": "Homo sapiens"}})
+            == "Unknown format"  # a bare name without an id is rejected
+        )
+        assert (
+            handle_controlled_identified_term_value(
+                {"has_raw_value": "Homo sapiens [NCBITaxon:9606]"}
+            )
+            == "Homo sapiens [NCBITaxon:9606]"
+        )
+        assert handle_controlled_identified_term_value({}) == "Unknown format"
+
+    def test_handle_geolocation_value(self):
+        assert (
+            handle_geolocation_value({"latitude": 34.05, "longitude": -118.25})
+            == "34.05 -118.25"
+        )
+        assert (
+            handle_geolocation_value({"has_raw_value": "34.05, -118.25"})
+            == "34.05, -118.25"
+        )
+        assert handle_geolocation_value({}) == "Unknown format"
+
+    def test_handle_float_value(self):
+        assert handle_float_value(10.1234) == "10.12"
+
+    def test_handle_string_value(self):
+        assert handle_string_value("Foo") == "Foo"
+
+    def test_load_mappings(self, mocker):
+        mock_tsv_content = (
+            "nmdc_schema_class\tnmdc_schema_slot\tnmdc_schema_slot_range\tncbi_biosample_attribute_name\tstatic_value\tignore\n"
+            "Biosample\tanalysis_type\tAnalysisTypeEnum\t\t\t\n"
+            "Biosample\tbiosample_categories\tBiosampleCategoryEnum\t\t\t\n"
+            "Biosample\tcollection_date\tTimestampValue\tcollection_date\t\t\n"
+            "Biosample\tconduc\tQuantityValue\tconduc\t\t\n"
+            "Biosample\telev\tfloat\telev\t\t\n"
+            "Biosample\tenv_broad_scale\tControlledIdentifiedTermValue\tenv_broad_scale\t\t\n"
+            "Biosample\tenv_local_scale\tControlledIdentifiedTermValue\tenv_local_scale\t\t\n"
+            "Biosample\tenv_medium\tControlledIdentifiedTermValue\tenv_medium\t\t\n"
+            "Biosample\tenv_package\tTextValue\tenv_package\t\t\n"
+            "Biosample\tgeo_loc_name\tTextValue\tgeo_loc_name\t\t\n"
+            "Biosample\tid\turiorcurie\t\t\t\n"
+            "Biosample\tlat_lon\tGeolocationValue\tlat_lon\t\t\n"
+            "Biosample\tname\tstring\tsample_name\t\t\n"
+            "Biosample\tpart_of\tStudy\t\t\t\n"
+            "Biosample\tsamp_collec_device\tstring\tsamp_collect_device\t\t\n"
+            "Biosample\ttemp\tQuantityValue\ttemp\t\t\n"
+            "Biosample\ttype\tstring\t\t\t\n"
+        )
+        # load_mappings fetches the TSV over HTTP; serve the text above instead.
+        mock_response = MagicMock()
+        mock_response.text = mock_tsv_content
+        mocker.patch("requests.get", return_value=mock_response)
+
+        attribute_mappings, slot_range_mappings = load_mappings(
+            "http://example.com/mappings.tsv"
+        )
+        # A blank ncbi_biosample_attribute_name falls back to the NMDC slot name.
+        expected_attribute_mappings = {
+            "analysis_type": "analysis_type",
+            "biosample_categories": "biosample_categories",
+            "collection_date": "collection_date",
+            "conduc": "conduc",
+            "elev": "elev",
+            "env_broad_scale": "env_broad_scale",
+            "env_local_scale": "env_local_scale",
+            "env_medium": "env_medium",
+            "env_package": "env_package",
+            "geo_loc_name": "geo_loc_name",
+            "id": "id",
+            "lat_lon": "lat_lon",
+            "name": "sample_name",
+            "part_of": "part_of",
+            "samp_collec_device": "samp_collect_device",
+            "temp": "temp",
+            "type": "type",
+        }
+
+        expected_slot_range_mappings = {
+            "analysis_type": "AnalysisTypeEnum",
+            "biosample_categories": "BiosampleCategoryEnum",
+            "collection_date": "TimestampValue",
+            "conduc": "QuantityValue",
+            "elev": "float",
+            "env_broad_scale": "ControlledIdentifiedTermValue",
+            "env_local_scale": "ControlledIdentifiedTermValue",
+            "env_medium": "ControlledIdentifiedTermValue",
+            "env_package": "TextValue",
+            "geo_loc_name": "TextValue",
+            "id": "uriorcurie",
+            "lat_lon": "GeolocationValue",
+            "name": "string",
+            "part_of": "Study",
+            "samp_collec_device": "string",
+            "temp": "QuantityValue",
+            "type": "string",
+        }
+
+        assert attribute_mappings == expected_attribute_mappings
+        assert slot_range_mappings == expected_slot_range_mappings

From 4ceb10a8221adc84c21ab7a6fb43911a6e3612c8 Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Wed, 15 May 2024 17:42:04 -0700
Subject: [PATCH 14/27] update dev.txt and main.txt in requirements folder

---
 requirements/dev.txt  |  92 +++++++-------
 requirements/main.txt | 280 +++++++++++++++++++++++-------------------
 2 files changed, 195 insertions(+), 177 deletions(-)

diff --git a/requirements/dev.txt b/requirements/dev.txt
index f0238446..8eb8d791 100644
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@@ -9,11 +9,13 @@ attrs==23.2.0
     #   -c requirements/main.txt
     #   cattrs
     #   requests-cache
-black==24.2.0
+backports-tarfile==1.1.1
+    # via jaraco-context
+black==24.4.2
     # via
     #   -c requirements/main.txt
     #   -r requirements/dev.in
-build==1.1.1
+build==1.2.1
     # via pip-tools
 cattrs==23.2.3
     # via
@@ -23,10 +25,6 @@ certifi==2024.2.2
     # via
     #   -c requirements/main.txt
     #   requests
-cffi==1.16.0
-    # via
-    #   -c requirements/main.txt
-    #   cryptography
 charset-normalizer==3.3.2
     # via
     #   -c requirements/main.txt
@@ -36,31 +34,28 @@ click==8.1.7
     #   -c requirements/main.txt
     #   black
     #   pip-tools
-coverage==7.4.3
+coverage==7.5.1
     # via
     #   -r requirements/dev.in
     #   pytest-cov
-cryptography==42.0.5
-    # via
-    #   -c requirements/main.txt
-    #   secretstorage
-docutils==0.20.1
+docutils==0.21.2
     # via
     #   -c requirements/main.txt
     #   readme-renderer
-exceptiongroup==1.2.0
+exceptiongroup==1.2.1
     # via
     #   -c requirements/main.txt
     #   cattrs
     #   pytest
 flake8==7.0.0
     # via -r requirements/dev.in
-idna==3.6
+idna==3.7
     # via
     #   -c requirements/main.txt
     #   requests
-importlib-metadata==7.0.1
+importlib-metadata==7.1.0
     # via
+    #   build
     #   keyring
     #   twine
 iniconfig==2.0.0
@@ -69,13 +64,13 @@ iniconfig==2.0.0
     #   pytest
 invoke==2.2.0
     # via -r requirements/dev.in
-jaraco-classes==3.3.1
+jaraco-classes==3.4.0
     # via keyring
-jeepney==0.8.0
-    # via
-    #   keyring
-    #   secretstorage
-keyring==24.3.1
+jaraco-context==5.3.0
+    # via keyring
+jaraco-functools==4.0.1
+    # via keyring
+keyring==25.2.1
     # via twine
 markdown-it-py==3.0.0
     # via
@@ -88,14 +83,16 @@ mdurl==0.1.2
     #   -c requirements/main.txt
     #   markdown-it-py
 more-itertools==10.2.0
-    # via jaraco-classes
+    # via
+    #   jaraco-classes
+    #   jaraco-functools
 mypy-extensions==1.0.0
     # via
     #   -c requirements/main.txt
     #   black
-nh3==0.2.15
+nh3==0.2.17
     # via readme-renderer
-packaging==23.2
+packaging==24.0
     # via
     #   -c requirements/main.txt
     #   black
@@ -105,48 +102,47 @@ pathspec==0.12.1
     # via
     #   -c requirements/main.txt
     #   black
-pip-tools==7.4.0
+pip-tools==7.4.1
     # via -r requirements/dev.in
-pkginfo==1.9.6
+pkginfo==1.10.0
     # via twine
-platformdirs==4.2.0
+platformdirs==4.2.2
     # via
     #   -c requirements/main.txt
     #   black
     #   requests-cache
-pluggy==1.4.0
+pluggy==1.5.0
     # via
     #   -c requirements/main.txt
     #   pytest
 pycodestyle==2.11.1
     # via flake8
-pycparser==2.21
-    # via
-    #   -c requirements/main.txt
-    #   cffi
 pyflakes==3.2.0
-    # via
-    #   -c requirements/main.txt
-    #   flake8
-pygments==2.17.2
+    # via flake8
+pygments==2.18.0
     # via
     #   -c requirements/main.txt
     #   readme-renderer
     #   rich
-pyproject-hooks==1.0.0
+pyproject-hooks==1.1.0
     # via
     #   build
     #   pip-tools
-pytest==8.0.2
+pytest==8.2.0
     # via
     #   -c requirements/main.txt
     #   -r requirements/dev.in
     #   pytest-asyncio
     #   pytest-cov
-pytest-asyncio==0.23.5
+    #   pytest-mock
+pytest-asyncio==0.23.6
     # via -r requirements/dev.in
-pytest-cov==4.1.0
+pytest-cov==5.0.0
     # via -r requirements/dev.in
+pytest-mock==3.14.0
+    # via
+    #   -c requirements/main.txt
+    #   -r requirements/dev.in
 readme-renderer==43.0
     # via twine
 requests==2.31.0
@@ -160,7 +156,7 @@ requests-cache==1.2.0
     # via
     #   -c requirements/main.txt
     #   -r requirements/dev.in
-requests-mock==1.11.0
+requests-mock==1.12.1
     # via -r requirements/dev.in
 requests-toolbelt==1.0.0
     # via
@@ -172,12 +168,9 @@ rich==13.7.1
     # via
     #   -c requirements/main.txt
     #   twine
-secretstorage==3.3.3
-    # via keyring
 six==1.16.0
     # via
     #   -c requirements/main.txt
-    #   requests-mock
     #   url-normalize
 tomli==2.0.1
     # via
@@ -186,11 +179,10 @@ tomli==2.0.1
     #   build
     #   coverage
     #   pip-tools
-    #   pyproject-hooks
     #   pytest
 twine==5.0.0
     # via -r requirements/dev.in
-typing-extensions==4.10.0
+typing-extensions==4.11.0
     # via
     #   -c requirements/main.txt
     #   black
@@ -199,15 +191,15 @@ url-normalize==1.4.3
     # via
     #   -c requirements/main.txt
     #   requests-cache
-urllib3==2.0.7
+urllib3==2.2.1
     # via
     #   -c requirements/main.txt
     #   requests
     #   requests-cache
     #   twine
-wheel==0.42.0
+wheel==0.43.0
     # via pip-tools
-zipp==3.17.0
+zipp==3.18.1
     # via importlib-metadata
 
 # The following packages are considered to be unsafe in a requirements file:
@@ -215,7 +207,7 @@ pip==24.0
     # via
     #   -r requirements/dev.in
     #   pip-tools
-setuptools==69.1.1
+setuptools==69.5.1
     # via
     #   -c requirements/main.txt
     #   -r requirements/dev.in
diff --git a/requirements/main.txt b/requirements/main.txt
index fd18a174..3548c168 100644
--- a/requirements/main.txt
+++ b/requirements/main.txt
@@ -42,9 +42,7 @@ attrs==23.2.0
     #   jsonschema
     #   referencing
     #   requests-cache
-autoflake==2.3.0
-    # via shed
-babel==2.14.0
+babel==2.15.0
     # via
     #   jupyterlab-server
     #   mkdocs-material
@@ -53,9 +51,9 @@ backoff==2.2.1
     # via gql
 base32-lib==1.0.2
     # via -r requirements/main.in
-bcrypt==4.1.2
+bcrypt==4.1.3
     # via passlib
-beanie==1.25.0
+beanie==1.26.0
     # via -r requirements/main.in
 beautifulsoup4==4.12.3
     # via
@@ -64,13 +62,13 @@ beautifulsoup4==4.12.3
     #   nbconvert
 bioregistry==0.10.158
     # via nmdc-schema
-black==24.2.0
+black==24.4.2
     # via shed
 bleach==6.1.0
     # via nbconvert
-boto3==1.34.54
+boto3==1.34.105
     # via -r requirements/main.in
-botocore==1.34.54
+botocore==1.34.105
     # via
     #   boto3
     #   s3transfer
@@ -109,6 +107,7 @@ click==8.1.7
     #   prefixcommons
     #   pystow
     #   terminusdb-client
+    #   typer
     #   uvicorn
 colorama==0.4.6
     # via mkdocs-material
@@ -116,36 +115,36 @@ coloredlogs==14.0
     # via dagster
 com2ann==0.3.0
     # via shed
-comm==0.2.1
+comm==0.2.2
     # via
     #   ipykernel
     #   ipywidgets
-croniter==2.0.2
+croniter==2.0.5
     # via dagster
-cryptography==42.0.5
+cryptography==42.0.7
     # via python-jose
-curies==0.7.7
+curies==0.7.9
     # via
     #   bioregistry
     #   linkml-runtime
     #   prefixmaps
-dagit==1.6.8
+dagit==1.7.5
     # via -r requirements/main.in
-dagster==1.6.8
+dagster==1.7.5
     # via
     #   -r requirements/main.in
     #   dagster-graphql
     #   dagster-postgres
     #   dagster-webserver
-dagster-graphql==1.6.8
+dagster-graphql==1.7.5
     # via
     #   -r requirements/main.in
     #   dagster-webserver
-dagster-pipes==1.6.8
+dagster-pipes==1.7.5
     # via dagster
-dagster-postgres==0.22.8
+dagster-postgres==0.23.5
     # via -r requirements/main.in
-dagster-webserver==1.6.8
+dagster-webserver==1.7.5
     # via dagit
 debugpy==1.8.1
     # via ipykernel
@@ -161,21 +160,23 @@ dnspython==2.6.1
     # via
     #   email-validator
     #   pymongo
-docstring-parser==0.15
+docstring-parser==0.16
     # via dagster
-docutils==0.20.1
+docutils==0.21.2
     # via sphinx
 dotted-dict==1.1.3
     # via -r requirements/main.in
-ecdsa==0.18.0
+ecdsa==0.19.0
     # via python-jose
 editorconfig==0.12.4
     # via jsbeautifier
 email-validator==2.1.1
-    # via pydantic
+    # via
+    #   fastapi
+    #   pydantic
 et-xmlfile==1.1.0
     # via openpyxl
-exceptiongroup==1.2.0
+exceptiongroup==1.2.1
     # via
     #   anyio
     #   cattrs
@@ -183,19 +184,25 @@ exceptiongroup==1.2.0
     #   pytest
 executing==2.0.1
     # via stack-data
-fastapi==0.110.0
-    # via -r requirements/main.in
+fastapi==0.111.0
+    # via
+    #   -r requirements/main.in
+    #   fastapi-cli
+fastapi-cli==0.0.3
+    # via fastapi
 fastjsonschema==2.19.1
     # via
     #   -r requirements/main.in
     #   nbformat
+filelock==3.14.0
+    # via dagster
 fnc==0.5.3
     # via -r requirements/main.in
 fqdn==1.5.1
     # via jsonschema
-frozendict==2.4.0
+frozendict==2.4.4
     # via -r requirements/main.in
-fsspec==2024.2.0
+fsspec==2024.3.1
     # via universal-pathlib
 ghp-import==2.1.0
     # via mkdocs
@@ -212,13 +219,15 @@ graphql-core==3.2.3
     #   graphql-relay
 graphql-relay==3.2.0
     # via graphene
-graphviz==0.20.1
+graphviz==0.20.3
     # via linkml
-grpcio==1.62.0
+greenlet==3.0.3
+    # via sqlalchemy
+grpcio==1.63.0
     # via
     #   dagster
     #   grpcio-health-checking
-grpcio-health-checking==1.62.0
+grpcio-health-checking==1.62.2
     # via dagster
 h11==0.14.0
     # via
@@ -229,15 +238,17 @@ hbreader==0.9.1
     #   jsonasobj2
     #   linkml
     #   linkml-runtime
-httpcore==1.0.4
+httpcore==1.0.5
     # via httpx
 httptools==0.6.1
     # via uvicorn
 httpx==0.27.0
-    # via jupyterlab
+    # via
+    #   fastapi
+    #   jupyterlab
 humanfriendly==10.0
     # via coloredlogs
-idna==3.6
+idna==3.7
     # via
     #   anyio
     #   email-validator
@@ -249,14 +260,14 @@ imagesize==1.4.1
     # via sphinx
 iniconfig==2.0.0
     # via pytest
-ipykernel==6.29.3
+ipykernel==6.29.4
     # via
     #   jupyter
     #   jupyter-console
     #   jupyterlab
     #   mkdocs-jupyter
     #   qtconsole
-ipython==8.22.1
+ipython==8.24.0
     # via
     #   ipykernel
     #   ipywidgets
@@ -269,13 +280,12 @@ isodate==0.6.1
     #   rdflib
 isoduration==20.11.0
     # via jsonschema
-isort==5.13.2
-    # via shed
 jedi==0.19.1
     # via ipython
-jinja2==3.1.3
+jinja2==3.1.4
     # via
     #   dagster
+    #   fastapi
     #   jupyter-server
     #   jupyterlab
     #   jupyterlab-server
@@ -284,19 +294,18 @@ jinja2==3.1.3
     #   mkdocs
     #   mkdocs-material
     #   nbconvert
-    #   numpydoc
     #   sphinx
 jmespath==1.0.1
     # via
     #   boto3
     #   botocore
-jq==1.6.0
+jq==1.7.0
     # via -r requirements/main.in
 jsbeautifier==1.15.1
     # via mkdocs-mermaid2-plugin
 json-flattener==0.1.9
     # via linkml-runtime
-json5==0.9.18
+json5==0.9.25
     # via jupyterlab-server
 jsonasobj==1.3.1
     # via
@@ -315,7 +324,7 @@ jsonpointer==2.4
     # via
     #   jsonpatch
     #   jsonschema
-jsonschema==4.21.1
+jsonschema==4.22.0
     # via
     #   jupyter-events
     #   jupyterlab-server
@@ -326,7 +335,7 @@ jsonschema-specifications==2023.12.1
     # via jsonschema
 jupyter==1.0.0
     # via -r requirements/main.in
-jupyter-client==8.6.0
+jupyter-client==8.6.1
     # via
     #   ipykernel
     #   jupyter-console
@@ -335,7 +344,7 @@ jupyter-client==8.6.0
     #   qtconsole
 jupyter-console==6.6.3
     # via jupyter
-jupyter-core==5.7.1
+jupyter-core==5.7.2
     # via
     #   ipykernel
     #   jupyter-client
@@ -346,52 +355,52 @@ jupyter-core==5.7.1
     #   nbconvert
     #   nbformat
     #   qtconsole
-jupyter-events==0.9.0
+jupyter-events==0.10.0
     # via jupyter-server
-jupyter-lsp==2.2.3
+jupyter-lsp==2.2.5
     # via jupyterlab
-jupyter-server==2.12.5
+jupyter-server==2.14.0
     # via
     #   jupyter-lsp
     #   jupyterlab
     #   jupyterlab-server
     #   notebook
     #   notebook-shim
-jupyter-server-terminals==0.5.2
+jupyter-server-terminals==0.5.3
     # via jupyter-server
-jupyterlab==4.1.2
+jupyterlab==4.1.8
     # via
     #   -r requirements/main.in
     #   notebook
 jupyterlab-pygments==0.3.0
     # via nbconvert
-jupyterlab-server==2.25.3
+jupyterlab-server==2.27.1
     # via
     #   jupyterlab
     #   notebook
 jupyterlab-widgets==3.0.10
     # via ipywidgets
-jupytext==1.16.1
+jupytext==1.16.2
     # via mkdocs-jupyter
 lazy-model==0.2.0
     # via beanie
-libcst==1.2.0
+libcst==1.3.1
     # via shed
-linkml==1.7.5
+linkml==1.7.10
     # via
     #   -r requirements/main.in
     #   nmdc-schema
 linkml-dataops==0.1.0
     # via linkml
-linkml-runtime==1.7.2
+linkml-runtime==1.7.5
     # via
     #   -r requirements/main.in
     #   linkml
     #   linkml-dataops
     #   nmdc-schema
-mako==1.3.2
+mako==1.3.5
     # via alembic
-markdown==3.5.2
+markdown==3.6
     # via
     #   mkdocs
     #   mkdocs-material
@@ -407,28 +416,32 @@ markupsafe==2.1.5
     #   mako
     #   mkdocs
     #   nbconvert
-matplotlib-inline==0.1.6
+matplotlib-inline==0.1.7
     # via
     #   ipykernel
     #   ipython
-mdit-py-plugins==0.4.0
+mdit-py-plugins==0.4.1
     # via jupytext
 mdurl==0.1.2
     # via markdown-it-py
 mergedeep==1.3.4
-    # via mkdocs
+    # via
+    #   mkdocs
+    #   mkdocs-get-deps
 mistune==3.0.2
     # via nbconvert
-mkdocs==1.5.3
+mkdocs==1.6.0
     # via
     #   mkdocs-jupyter
     #   mkdocs-material
     #   mkdocs-mermaid2-plugin
     #   mkdocs-redirects
     #   nmdc-schema
-mkdocs-jupyter==0.24.6
+mkdocs-get-deps==0.2.0
+    # via mkdocs
+mkdocs-jupyter==0.24.7
     # via -r requirements/main.in
-mkdocs-material==9.5.12
+mkdocs-material==9.5.22
     # via
     #   -r requirements/main.in
     #   mkdocs-jupyter
@@ -443,24 +456,22 @@ mkdocs-redirects==1.2.1
     # via nmdc-schema
 more-click==0.1.2
     # via bioregistry
-motor==3.3.2
+motor==3.4.0
     # via
     #   -r requirements/main.in
     #   beanie
 multidict==6.0.5
     # via yarl
 mypy-extensions==1.0.0
-    # via
-    #   black
-    #   typing-inspect
-nbclient==0.9.0
+    # via black
+nbclient==0.10.0
     # via nbconvert
-nbconvert==7.16.1
+nbconvert==7.16.4
     # via
     #   jupyter
     #   jupyter-server
     #   mkdocs-jupyter
-nbformat==5.9.2
+nbformat==5.10.4
     # via
     #   jupyter-server
     #   jupytext
@@ -470,7 +481,7 @@ nest-asyncio==1.6.0
     # via ipykernel
 nmdc-schema==10.2.0
     # via -r requirements/main.in
-notebook==7.1.1
+notebook==7.1.3
     # via jupyter
 notebook-shim==0.2.4
     # via
@@ -480,15 +491,17 @@ numpy==1.26.4
     # via
     #   pandas
     #   terminusdb-client
-numpydoc==1.6.0
+numpydoc==1.7.0
     # via terminusdb-client
 openpyxl==3.1.2
     # via
     #   -r requirements/main.in
     #   linkml
+orjson==3.10.3
+    # via fastapi
 overrides==7.7.0
     # via jupyter-server
-packaging==23.2
+packaging==24.0
     # via
     #   black
     #   dagster
@@ -506,7 +519,7 @@ packaging==23.2
     #   sphinx
 paginate==0.5.6
     # via mkdocs-material
-pandas==2.2.1
+pandas==2.2.2
     # via
     #   -r requirements/main.in
     #   terminusdb-client
@@ -514,7 +527,7 @@ pandocfilters==1.5.1
     # via nbconvert
 parse==1.20.1
     # via linkml
-parso==0.8.3
+parso==0.8.4
     # via jedi
 passlib==1.7.4
     # via -r requirements/main.in
@@ -526,13 +539,13 @@ pendulum==3.0.0
     # via dagster
 pexpect==4.9.0
     # via ipython
-platformdirs==4.2.0
+platformdirs==4.2.2
     # via
     #   black
     #   jupyter-core
-    #   mkdocs
+    #   mkdocs-get-deps
     #   requests-cache
-pluggy==1.4.0
+pluggy==1.5.0
     # via pytest
 ply==3.11
     # via jsonpath-ng
@@ -540,7 +553,7 @@ prefixcommons==0.1.12
     # via
     #   linkml
     #   linkml-runtime
-prefixmaps==0.2.2
+prefixmaps==0.2.4
     # via
     #   linkml
     #   linkml-runtime
@@ -564,13 +577,13 @@ ptyprocess==0.7.0
     #   terminado
 pure-eval==0.2.2
     # via stack-data
-pyasn1==0.5.1
+pyasn1==0.6.0
     # via
     #   python-jose
     #   rsa
-pycparser==2.21
+pycparser==2.22
     # via cffi
-pydantic==2.6.3
+pydantic==2.7.1
     # via
     #   -r requirements/main.in
     #   beanie
@@ -581,11 +594,9 @@ pydantic==2.6.3
     #   lazy-model
     #   linkml
     #   linkml-runtime
-pydantic-core==2.16.3
+pydantic-core==2.18.2
     # via pydantic
-pyflakes==3.2.0
-    # via autoflake
-pygments==2.17.2
+pygments==2.18.0
     # via
     #   ipython
     #   jupyter-console
@@ -600,16 +611,16 @@ pyjsg==0.11.10
     #   linkml
     #   pyshexc
     #   shexjsg
-pymdown-extensions==10.7
+pymdown-extensions==10.8.1
     # via
     #   mkdocs-material
     #   mkdocs-mermaid2-plugin
-pymongo==4.6.2
+pymongo==4.7.2
     # via
     #   -r requirements/main.in
     #   motor
     #   nmdc-schema
-pyparsing==3.1.1
+pyparsing==3.1.2
     # via rdflib
 pyshex==0.8.1
     # via linkml
@@ -619,10 +630,14 @@ pyshexc==0.9.1
     #   pyshex
 pystow==0.5.4
     # via bioregistry
-pytest==8.0.2
-    # via pytest-logging
+pytest==8.2.0
+    # via
+    #   pytest-logging
+    #   pytest-mock
 pytest-logging==2015.11.4
     # via prefixcommons
+pytest-mock==3.14.0
+    # via -r requirements/main.in
 python-dateutil==2.9.0.post0
     # via
     #   arrow
@@ -645,7 +660,9 @@ python-jose==3.3.0
 python-json-logger==2.0.7
     # via jupyter-events
 python-multipart==0.0.9
-    # via -r requirements/main.in
+    # via
+    #   -r requirements/main.in
+    #   fastapi
 pytrie==0.4.0
     # via curies
 pytz==2024.1
@@ -653,7 +670,7 @@ pytz==2024.1
     #   croniter
     #   dagster
     #   pandas
-pyupgrade==3.15.1
+pyupgrade==3.15.2
     # via shed
 pyyaml==6.0.1
     # via
@@ -666,6 +683,7 @@ pyyaml==6.0.1
     #   linkml
     #   linkml-runtime
     #   mkdocs
+    #   mkdocs-get-deps
     #   mkdocs-mermaid2-plugin
     #   prefixcommons
     #   prefixmaps
@@ -674,14 +692,14 @@ pyyaml==6.0.1
     #   uvicorn
 pyyaml-env-tag==0.1
     # via mkdocs
-pyzmq==25.1.2
+pyzmq==26.0.3
     # via
     #   ipykernel
     #   jupyter-client
     #   jupyter-console
     #   jupyter-server
     #   qtconsole
-qtconsole==5.5.1
+qtconsole==5.5.2
     # via jupyter
 qtpy==2.4.1
     # via qtconsole
@@ -701,12 +719,12 @@ rdflib-shim==1.0.3
     #   pyshex
     #   pyshexc
     #   sparqlslurper
-referencing==0.33.0
+referencing==0.35.1
     # via
     #   jsonschema
     #   jsonschema-specifications
     #   jupyter-events
-regex==2023.12.25
+regex==2024.5.10
     # via mkdocs-material
 requests==2.31.0
     # via
@@ -743,8 +761,10 @@ rfc3986-validator==0.1.1
 rfc3987==1.3.8
     # via jsonschema
 rich==13.7.1
-    # via dagster
-rpds-py==0.18.0
+    # via
+    #   dagster
+    #   typer
+rpds-py==0.18.1
     # via
     #   jsonschema
     #   referencing
@@ -754,16 +774,20 @@ ruamel-yaml==0.18.6
     # via linkml-dataops
 ruamel-yaml-clib==0.2.8
     # via ruamel-yaml
-s3transfer==0.10.0
+ruff==0.4.4
+    # via shed
+s3transfer==0.10.1
     # via boto3
 semver==3.0.2
     # via -r requirements/main.in
-send2trash==1.8.2
+send2trash==1.8.3
     # via jupyter-server
-setuptools-scm==8.0.4
+setuptools-scm==8.1.0
     # via -r requirements/main.in
-shed==2024.1.1
+shed==2024.3.1
     # via terminusdb-client
+shellingham==1.5.4
+    # via typer
 shexjsg==0.8.2
     # via
     #   pyshex
@@ -796,7 +820,7 @@ sparqlwrapper==2.0.0
     # via
     #   pyshex
     #   sparqlslurper
-sphinx==7.2.6
+sphinx==7.3.7
     # via numpydoc
 sphinxcontrib-applehelp==1.0.8
     # via sphinx
@@ -810,14 +834,14 @@ sphinxcontrib-qthelp==1.0.7
     # via sphinx
 sphinxcontrib-serializinghtml==1.1.10
     # via sphinx
-sqlalchemy==2.0.27
+sqlalchemy==2.0.30
     # via
     #   alembic
     #   dagster
     #   linkml
 stack-data==0.6.3
     # via ipython
-starlette==0.36.3
+starlette==0.37.2
     # via
     #   dagster-graphql
     #   dagster-webserver
@@ -828,33 +852,32 @@ tabulate==0.9.0
     # via
     #   dagster
     #   numpydoc
-tenacity==8.2.3
+tenacity==8.3.0
     # via -r requirements/main.in
-terminado==0.18.0
+terminado==0.18.1
     # via
     #   jupyter-server
     #   jupyter-server-terminals
 terminusdb-client==10.2.6
     # via -r requirements/main.in
-time-machine==2.13.0
+time-machine==2.14.1
     # via pendulum
-tinycss2==1.2.1
+tinycss2==1.3.0
     # via nbconvert
 tokenize-rt==5.2.0
     # via pyupgrade
 toml==0.10.2
-    # via
-    #   beanie
-    #   jupytext
+    # via beanie
 tomli==2.0.1
     # via
-    #   autoflake
     #   black
     #   dagster
     #   jupyterlab
+    #   jupytext
     #   numpydoc
     #   pytest
     #   setuptools-scm
+    #   sphinx
 toolz==0.12.1
     # via -r requirements/main.in
 toposort==1.10
@@ -867,14 +890,14 @@ tornado==6.4
     #   jupyterlab
     #   notebook
     #   terminado
-tqdm==4.66.2
+tqdm==4.66.4
     # via
     #   -r requirements/main.in
     #   bioregistry
     #   dagster
     #   pystow
     #   terminusdb-client
-traitlets==5.14.1
+traitlets==5.14.3
     # via
     #   comm
     #   ipykernel
@@ -893,9 +916,11 @@ traitlets==5.14.1
     #   qtconsole
 typeguard==2.13.3
     # via terminusdb-client
-types-python-dateutil==2.8.19.20240106
+typer==0.12.3
+    # via fastapi-cli
+types-python-dateutil==2.9.0.20240316
     # via arrow
-typing-extensions==4.10.0
+typing-extensions==4.11.0
     # via
     #   alembic
     #   anyio
@@ -905,35 +930,36 @@ typing-extensions==4.10.0
     #   cattrs
     #   dagster
     #   fastapi
-    #   libcst
+    #   ipython
     #   pydantic
     #   pydantic-core
-    #   setuptools-scm
     #   sqlalchemy
-    #   typing-inspect
+    #   typer
     #   uvicorn
-typing-inspect==0.9.0
-    # via libcst
 tzdata==2024.1
     # via
     #   pandas
     #   pendulum
-universal-pathlib==0.2.1
+ujson==5.10.0
+    # via fastapi
+universal-pathlib==0.2.2
     # via dagster
 uri-template==1.3.0
     # via jsonschema
 url-normalize==1.4.3
     # via requests-cache
-urllib3==2.0.7
+urllib3==2.2.1
     # via
     #   botocore
     #   pyshex
     #   requests
     #   requests-cache
-uvicorn==0.27.1
+uvicorn==0.29.0
     # via
     #   -r requirements/main.in
     #   dagster-webserver
+    #   fastapi
+    #   fastapi-cli
 uvloop==0.19.0
     # via uvicorn
 watchdog==4.0.0
@@ -951,7 +977,7 @@ webencodings==0.5.1
     # via
     #   bleach
     #   tinycss2
-websocket-client==1.7.0
+websocket-client==1.8.0
     # via jupyter-server
 websockets==12.0
     # via uvicorn
@@ -967,7 +993,7 @@ yarl==1.9.4
     # via gql
 
 # The following packages are considered to be unsafe in a requirements file:
-setuptools==69.1.1
+setuptools==69.5.1
     # via
     #   dagster
     #   mkdocs-mermaid2-plugin

From c7d3da9be48951bacbefb858158789eb18a0fa62 Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Thu, 16 May 2024 13:52:24 -0700
Subject: [PATCH 15/27] logic for validating generated XML against XSD schemas

---
 nmdc_runtime/site/export/ncbi_xml.py       | 16 ++++++++++++++-
 nmdc_runtime/site/export/ncbi_xml_utils.py | 23 +++++++++++++++++++++-
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
index 9fd3ba80..78b3bc51 100644
--- a/nmdc_runtime/site/export/ncbi_xml.py
+++ b/nmdc_runtime/site/export/ncbi_xml.py
@@ -12,6 +12,7 @@
     handle_float_value,
     handle_string_value,
     load_mappings,
+    validate_xml,
 )
 from nmdc_runtime.site.export.nmdc_api_client import NMDCApiClient
 
@@ -277,6 +278,19 @@ def get_submission_xml(self):
             nmdc_biosamples=biosamples_list,
         )
 
+
         rough_string = ET.tostring(self.root, "unicode")
         reparsed = xml.dom.minidom.parseString(rough_string)
-        return reparsed.toprettyxml(indent="    ", newl="\n")
+        submission_xml = reparsed.toprettyxml(indent="    ", newl="\n")
+
+        # ============= Uncomment the following code to validate the XML against NCBI XSDs ============ #
+        # submission_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/submission.xsd?view=co"
+        # submission_xsd_validation = validate_xml(submission_xml, submission_xsd_url)
+        
+        # bioproject_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/bioproject.xsd?view=co"
+        # bioproject_xsd_validation = validate_xml(submission_xml, bioproject_xsd_url)
+        
+        # biosample_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/biosample.xsd?view=co"
+        # biosample_xsd_validation = validate_xml(submission_xml, biosample_xsd_url)
+
+        return submission_xml
\ No newline at end of file
diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py
index 1ad34aca..59b99b56 100644
--- a/nmdc_runtime/site/export/ncbi_xml_utils.py
+++ b/nmdc_runtime/site/export/ncbi_xml_utils.py
@@ -1,4 +1,5 @@
-from io import StringIO
+from lxml import etree
+from io import BytesIO, StringIO
 import csv
 import requests
 
@@ -95,3 +96,23 @@ def load_mappings(url):
         slot_range_mappings[json_key] = data_type if data_type else "default"
 
     return attribute_mappings, slot_range_mappings
+
+
+def validate_xml(xml, xsd_url):
+    response = requests.get(xsd_url)
+    response.raise_for_status()
+    xsd_content = response.text
+
+    xml_schema_doc = etree.parse(BytesIO(xsd_content.encode('utf-8')))
+    xml_schema = etree.XMLSchema(xml_schema_doc)
+
+    if '<?xml' in xml:
+        xml_doc = etree.parse(BytesIO(xml.encode('utf-8')))
+    else:
+        xml_doc = etree.parse(StringIO(xml))
+
+    xml_doc = etree.parse(StringIO(xml))
+
+    if not xml_schema.validate(xml_doc):
+        raise ValueError(f"There were errors while validating against: {xsd_url}")
+    return True

From 41a54680e294fa98d53c36228471a5b63b173a71 Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Thu, 16 May 2024 13:54:52 -0700
Subject: [PATCH 16/27] black formatting NCBI XML related files

---
 nmdc_runtime/site/export/ncbi_xml.py       |  7 +++----
 nmdc_runtime/site/export/ncbi_xml_utils.py |  6 +++---
 requirements/dev.in                        |  3 ++-
 requirements/dev.txt                       |  4 ++++
 requirements/main.in                       |  1 +
 requirements/main.txt                      | 12 +++++++-----
 6 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
index 78b3bc51..a9b433a4 100644
--- a/nmdc_runtime/site/export/ncbi_xml.py
+++ b/nmdc_runtime/site/export/ncbi_xml.py
@@ -278,7 +278,6 @@ def get_submission_xml(self):
             nmdc_biosamples=biosamples_list,
         )
 
-
         rough_string = ET.tostring(self.root, "unicode")
         reparsed = xml.dom.minidom.parseString(rough_string)
         submission_xml = reparsed.toprettyxml(indent="    ", newl="\n")
@@ -286,11 +285,11 @@ def get_submission_xml(self):
         # ============= Uncomment the following code to validate the XML against NCBI XSDs ============ #
         # submission_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/submission.xsd?view=co"
         # submission_xsd_validation = validate_xml(submission_xml, submission_xsd_url)
-        
+
         # bioproject_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/bioproject.xsd?view=co"
         # bioproject_xsd_validation = validate_xml(submission_xml, bioproject_xsd_url)
-        
+
         # biosample_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/biosample.xsd?view=co"
         # biosample_xsd_validation = validate_xml(submission_xml, biosample_xsd_url)
 
-        return submission_xml
\ No newline at end of file
+        return submission_xml
diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py
index 59b99b56..64183202 100644
--- a/nmdc_runtime/site/export/ncbi_xml_utils.py
+++ b/nmdc_runtime/site/export/ncbi_xml_utils.py
@@ -103,11 +103,11 @@ def validate_xml(xml, xsd_url):
     response.raise_for_status()
     xsd_content = response.text
 
-    xml_schema_doc = etree.parse(BytesIO(xsd_content.encode('utf-8')))
+    xml_schema_doc = etree.parse(BytesIO(xsd_content.encode("utf-8")))
     xml_schema = etree.XMLSchema(xml_schema_doc)
 
-    if '<?xml' in xml:
-        xml_doc = etree.parse(BytesIO(xml.encode('utf-8')))
+    if "<?xml" in xml:
+        xml_doc = etree.parse(BytesIO(xml.encode("utf-8")))
     else:
         xml_doc = etree.parse(StringIO(xml))
 
diff --git a/requirements/dev.in b/requirements/dev.in
index 601370de..9689b9bc 100644
--- a/requirements/dev.in
+++ b/requirements/dev.in
@@ -12,4 +12,5 @@ requests-mock
 setuptools
 twine
 requests-cache
-pytest-mock
\ No newline at end of file
+pytest-mock
+lxml
\ No newline at end of file
diff --git a/requirements/dev.txt b/requirements/dev.txt
index 8eb8d791..11fc76bf 100644
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@@ -72,6 +72,10 @@ jaraco-functools==4.0.1
     # via keyring
 keyring==25.2.1
     # via twine
+lxml==5.2.2
+    # via
+    #   -c requirements/main.txt
+    #   -r requirements/dev.in
 markdown-it-py==3.0.0
     # via
     #   -c requirements/main.txt
diff --git a/requirements/main.in b/requirements/main.in
index 4f58b6e3..0f8042e8 100644
--- a/requirements/main.in
+++ b/requirements/main.in
@@ -20,6 +20,7 @@ jupyter
 jupyterlab
 linkml
 linkml-runtime
+lxml
 mkdocs-jupyter
 mkdocs-material
 mkdocs-mermaid2-plugin
diff --git a/requirements/main.txt b/requirements/main.txt
index 3548c168..bef99191 100644
--- a/requirements/main.txt
+++ b/requirements/main.txt
@@ -66,9 +66,9 @@ black==24.4.2
     # via shed
 bleach==6.1.0
     # via nbconvert
-boto3==1.34.105
+boto3==1.34.106
     # via -r requirements/main.in
-botocore==1.34.105
+botocore==1.34.106
     # via
     #   boto3
     #   s3transfer
@@ -202,7 +202,7 @@ fqdn==1.5.1
     # via jsonschema
 frozendict==2.4.4
     # via -r requirements/main.in
-fsspec==2024.3.1
+fsspec==2024.5.0
     # via universal-pathlib
 ghp-import==2.1.0
     # via mkdocs
@@ -398,6 +398,8 @@ linkml-runtime==1.7.5
     #   linkml
     #   linkml-dataops
     #   nmdc-schema
+lxml==5.2.2
+    # via -r requirements/main.in
 mako==1.3.5
     # via alembic
 markdown==3.6
@@ -441,7 +443,7 @@ mkdocs-get-deps==0.2.0
     # via mkdocs
 mkdocs-jupyter==0.24.7
     # via -r requirements/main.in
-mkdocs-material==9.5.22
+mkdocs-material==9.5.23
     # via
     #   -r requirements/main.in
     #   mkdocs-jupyter
@@ -724,7 +726,7 @@ referencing==0.35.1
     #   jsonschema
     #   jsonschema-specifications
     #   jupyter-events
-regex==2024.5.10
+regex==2024.5.15
     # via mkdocs-material
 requests==2.31.0
     # via

From e9ac70ddab4e72790c8fe9a150f261a448a85a10 Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Wed, 29 May 2024 12:35:39 -0700
Subject: [PATCH 17/27] use RuntimeApiSiteClient instead of defining new
 NMDCApiClient class

---
 nmdc_runtime/site/export/ncbi_xml.py        | 22 +++---
 nmdc_runtime/site/export/nmdc_api_client.py | 41 -----------
 nmdc_runtime/site/export/study_metadata.py  | 29 ++++++--
 nmdc_runtime/site/graphs.py                 | 10 ++-
 nmdc_runtime/site/ops.py                    | 14 ++--
 nmdc_runtime/site/repository.py             | 21 +++++-
 tests/test_data/test_ncbi_xml.py            | 77 ++++++++-------------
 7 files changed, 100 insertions(+), 114 deletions(-)
 delete mode 100644 nmdc_runtime/site/export/nmdc_api_client.py

diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
index a9b433a4..3eba0c44 100644
--- a/nmdc_runtime/site/export/ncbi_xml.py
+++ b/nmdc_runtime/site/export/ncbi_xml.py
@@ -14,26 +14,26 @@
     load_mappings,
     validate_xml,
 )
-from nmdc_runtime.site.export.nmdc_api_client import NMDCApiClient
 
 
 class NCBISubmissionXML:
-    def __init__(self, ncbi_submission_fields: dict):
+    def __init__(self, nmdc_study_id: str, ncbi_submission_metadata: dict):
         self.root = ET.Element("Submission")
-        self.nmdc_study_id = ncbi_submission_fields.get("nmdc_study_id")
-        self.nmdc_ncbi_attribute_mapping_file_url = ncbi_submission_fields.get(
+
+        self.nmdc_study_id = nmdc_study_id
+
+        self.nmdc_ncbi_attribute_mapping_file_url = ncbi_submission_metadata.get(
             "nmdc_ncbi_attribute_mapping_file_url"
         )
-        self.ncbi_submission_metadata = ncbi_submission_fields.get(
+        self.ncbi_submission_metadata = ncbi_submission_metadata.get(
             "ncbi_submission_metadata", {}
         )
-        self.ncbi_bioproject_metadata = ncbi_submission_fields.get(
+        self.ncbi_bioproject_metadata = ncbi_submission_metadata.get(
             "ncbi_bioproject_metadata", {}
         )
-        self.ncbi_biosample_metadata = ncbi_submission_fields.get(
+        self.ncbi_biosample_metadata = ncbi_submission_metadata.get(
             "ncbi_biosample_metadata", {}
         )
-        self.nmdc_api_client = NMDCApiClient()
 
         # dispatcher dictionary capturing handlers for NMDC object to NCBI flat Attribute
         # type handlers
@@ -250,7 +250,7 @@ def set_biosample(
             )
             self.root.append(action)
 
-    def get_submission_xml(self):
+    def get_submission_xml(self, biosamples_list: list):
         self.set_description(
             email=self.ncbi_submission_metadata.get("email", ""),
             user=self.ncbi_submission_metadata.get("user", ""),
@@ -267,10 +267,6 @@ def get_submission_xml(self):
             org=self.ncbi_submission_metadata.get("organization", ""),
         )
 
-        biosamples_list = self.nmdc_api_client.get_biosamples_part_of_study(
-            self.nmdc_study_id
-        )
-
         self.set_biosample(
             organism_name=self.ncbi_biosample_metadata.get("organism_name", ""),
             package=self.ncbi_biosample_metadata.get("package", ""),
diff --git a/nmdc_runtime/site/export/nmdc_api_client.py b/nmdc_runtime/site/export/nmdc_api_client.py
deleted file mode 100644
index b4dd38d7..00000000
--- a/nmdc_runtime/site/export/nmdc_api_client.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import os
-import json
-import requests
-
-from dotenv import load_dotenv
-
-
-class NMDCApiClient:
-    def __init__(self, api_base_url=None):
-        load_dotenv()
-        self.base_url = api_base_url or os.getenv("API_HOST")
-        if not self.base_url:
-            raise ValueError("API base URL for runtime environment is required.")
-        if not self.base_url.endswith("/"):
-            self.base_url += "/"
-        self.headers = {
-            "accept": "application/json",
-            "Content-Type": "application/json",
-        }
-
-    def get_biosamples_part_of_study(self, study_id: str) -> list[dict]:
-        """
-        Get the biosamples that are part of a study.
-        """
-        biosample_records = []
-        params = {"filter": json.dumps({"part_of": study_id}), "max_page_size": "1000"}
-        url = self.base_url + "nmdcschema/biosample_set"
-
-        while True:
-            response = requests.get(url, params=params, headers=self.headers)
-            response.raise_for_status()
-            data = response.json()
-            biosample_records.extend(data["resources"])
-
-            # Check if there's a next page
-            next_page_token = data.get("next_page_token")
-            if not next_page_token:
-                break
-            params["page_token"] = next_page_token
-
-        return biosample_records
diff --git a/nmdc_runtime/site/export/study_metadata.py b/nmdc_runtime/site/export/study_metadata.py
index cdcfef8e..626ce01b 100644
--- a/nmdc_runtime/site/export/study_metadata.py
+++ b/nmdc_runtime/site/export/study_metadata.py
@@ -5,7 +5,6 @@
 import csv
 from io import StringIO
 
-import requests
 from dagster import (
     op,
     get_dagster_logger,
@@ -26,13 +25,27 @@ def get_all_docs(client, collection, filter_):
     per_page = 200
     url_base = f"/{collection}?filter={filter_}&per_page={per_page}"
     results = []
-    rv = client.request("GET", url_base).json()
+    response = client.request("GET", url_base)
+    if response.status_code != 200:
+        raise Exception(
+            f"Runtime API request failed with status {response.status_code}."
+            f" Check URL: {url_base}"
+        )
+    rv = response.json()
     results.extend(rv.get("results", []))
     page, count = rv["meta"]["page"], rv["meta"]["count"]
     assert count <= 10_000
     while page * per_page < count:
-        rv = requests.get(url_base + f"&page={page + 1}").json()
-        results.extend(rv["results"])
+        page += 1
+        url = f"{url_base}&page={page}"
+        response = client.request("GET", url)
+        if response.status_code != 200:
+            raise Exception(
+                f"Runtime API request failed with status {response.status_code}."
+                f" Check URL: {url}"
+            )
+        rv = response.json()
+        results.extend(rv.get("results", []))
     return results
 
 
@@ -115,3 +128,11 @@ def export_study_biosamples_as_csv(context: OpExecutionContext, study_export_inf
 def export_study_biosamples_metadata():
     outputs = export_study_biosamples_as_csv(get_study_biosamples_metadata())
     add_output_run_event(outputs)
+
+
+@op(required_resource_keys={"runtime_api_site_client"})
+def get_biosamples_by_study_id(context: OpExecutionContext, nmdc_study_id: str):
+    # nmdc_study_id = context.op_config["nmdc_study_id"]
+    client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
+    biosamples = get_all_docs(client, "biosamples", f"part_of:{nmdc_study_id}")
+    return biosamples
diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py
index f1b755d6..c5b485a6 100644
--- a/nmdc_runtime/site/graphs.py
+++ b/nmdc_runtime/site/graphs.py
@@ -49,10 +49,12 @@
     get_neon_pipeline_inputs,
     get_df_from_url,
     site_code_mapping,
+    get_ncbi_export_pipeline_study_id,
     get_ncbi_export_pipeline_inputs,
     ncbi_submission_xml_from_nmdc_study,
     ncbi_submission_xml_asset,
 )
+from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id
 
 
 @graph
@@ -388,6 +390,10 @@ def ingest_neon_surface_water_metadata():
 
 @graph
 def nmdc_study_to_ncbi_submission_export():
-    ncbi_submission_fields = get_ncbi_export_pipeline_inputs()
-    xml_data = ncbi_submission_xml_from_nmdc_study(ncbi_submission_fields)
+    nmdc_study_id = get_ncbi_export_pipeline_study_id()
+    biosamples = get_biosamples_by_study_id(nmdc_study_id)
+    ncbi_submission_metadata = get_ncbi_export_pipeline_inputs()
+    xml_data = ncbi_submission_xml_from_nmdc_study(
+        nmdc_study_id, ncbi_submission_metadata, biosamples
+    )
     ncbi_submission_xml_asset(xml_data)
diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index df6b07a1..ecd4f864 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -1000,9 +1000,13 @@ def site_code_mapping() -> dict:
         )
 
 
+@op(config_schema={"nmdc_study_id": str})
+def get_ncbi_export_pipeline_study_id(context: OpExecutionContext) -> str:
+    return context.op_config["nmdc_study_id"]
+
+
 @op(
     config_schema={
-        "nmdc_study_id": str,
         "nmdc_ncbi_attribute_mapping_file_url": str,
         "ncbi_submission_metadata": Field(
             Permissive(
@@ -1043,7 +1047,6 @@ def site_code_mapping() -> dict:
     out=Out(Dict),
 )
 def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
-    nmdc_study_id = context.op_config["nmdc_study_id"]
     nmdc_ncbi_attribute_mapping_file_url = context.op_config[
         "nmdc_ncbi_attribute_mapping_file_url"
     ]
@@ -1052,7 +1055,6 @@ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
     ncbi_biosample_metadata = context.op_config.get("ncbi_biosample_metadata", {})
 
     return {
-        "nmdc_study_id": nmdc_study_id,
         "nmdc_ncbi_attribute_mapping_file_url": nmdc_ncbi_attribute_mapping_file_url,
         "ncbi_submission_metadata": ncbi_submission_metadata,
         "ncbi_bioproject_metadata": ncbi_bioproject_metadata,
@@ -1063,8 +1065,10 @@ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
 @op
 def ncbi_submission_xml_from_nmdc_study(
     context: OpExecutionContext,
+    nmdc_study_id: str,
     ncbi_exporter_metadata: dict,
+    biosamples: list,
 ) -> str:
-    ncbi_exporter = NCBISubmissionXML(ncbi_exporter_metadata)
-    ncbi_xml = ncbi_exporter.get_submission_xml()
+    ncbi_exporter = NCBISubmissionXML(nmdc_study_id, ncbi_exporter_metadata)
+    ncbi_xml = ncbi_exporter.get_submission_xml(biosamples)
     return ncbi_xml
diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py
index 3e68892e..9719cede 100644
--- a/nmdc_runtime/site/repository.py
+++ b/nmdc_runtime/site/repository.py
@@ -855,13 +855,32 @@ def biosample_submission_ingest():
 
 @repository
 def biosample_export():
+    normal_resources = run_config_frozen__normal_env["resources"]
     return [
         nmdc_study_to_ncbi_submission_export.to_job(
+            resource_defs=resource_defs,
             config={
+                "resources": merge(
+                    unfreeze(normal_resources),
+                    {
+                        "runtime_api_site_client": {
+                            "config": {
+                                "base_url": {"env": "API_HOST"},
+                                "client_id": {"env": "API_SITE_CLIENT_ID"},
+                                "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
+                                "site_id": {"env": "API_SITE_ID"},
+                            },
+                        },
+                    },
+                ),
                 "ops": {
-                    "get_ncbi_export_pipeline_inputs": {
+                    "get_ncbi_export_pipeline_study_id": {
                         "config": {
                             "nmdc_study_id": "",
+                        }
+                    },
+                    "get_ncbi_export_pipeline_inputs": {
+                        "config": {
                             "nmdc_ncbi_attribute_mapping_file_url": "",
                             "ncbi_submission_metadata": {
                                 "email": "",
diff --git a/tests/test_data/test_ncbi_xml.py b/tests/test_data/test_ncbi_xml.py
index 3cd2913c..f8974f68 100644
--- a/tests/test_data/test_ncbi_xml.py
+++ b/tests/test_data/test_ncbi_xml.py
@@ -15,10 +15,10 @@
     handle_float_value,
     handle_string_value,
 )
-from nmdc_runtime.site.export.nmdc_api_client import NMDCApiClient
 
-MOCK_SUBMISSION_FIELDS = {
-    "nmdc_study_id": "nmdc:sty-11-12345",
+MOCK_NCBI_NMDC_STUDY_ID = "nmdc:sty-11-12345"
+
+MOCK_NCBI_SUBMISSION_METADATA = {
     "nmdc_ncbi_attribute_mapping_file_url": "http://example.com/mappings.tsv",
     "ncbi_submission_metadata": {
         "email": "user@example.com",
@@ -43,12 +43,10 @@
 
 @pytest.fixture
 def ncbi_submission_client():
-    return NCBISubmissionXML(ncbi_submission_fields=MOCK_SUBMISSION_FIELDS)
-
-
-@pytest.fixture
-def nmdc_api_client():
-    return NMDCApiClient(api_base_url="http://fakeapi.com/")
+    return NCBISubmissionXML(
+        nmdc_study_id=MOCK_NCBI_NMDC_STUDY_ID,
+        ncbi_submission_metadata=MOCK_NCBI_SUBMISSION_METADATA,
+    )
 
 
 @pytest.fixture
@@ -89,11 +87,11 @@ def test_set_element(self, ncbi_submission_client):
 
     def test_set_description(self, ncbi_submission_client):
         ncbi_submission_client.set_description(
-            MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["email"],
-            MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["user"],
-            MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["first"],
-            MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["last"],
-            MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["organization"],
+            MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"]["email"],
+            MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"]["user"],
+            MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"]["first"],
+            MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"]["last"],
+            MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"]["organization"],
         )
         description = ET.tostring(
             ncbi_submission_client.root.find("Description"), "unicode"
@@ -116,13 +114,19 @@ def test_set_description(self, ncbi_submission_client):
 
     def test_set_bioproject(self, ncbi_submission_client):
         ncbi_submission_client.set_bioproject(
-            title=MOCK_SUBMISSION_FIELDS["ncbi_bioproject_metadata"]["title"],
-            project_id=MOCK_SUBMISSION_FIELDS["ncbi_bioproject_metadata"]["project_id"],
-            description=MOCK_SUBMISSION_FIELDS["ncbi_bioproject_metadata"][
+            title=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"]["title"],
+            project_id=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][
+                "project_id"
+            ],
+            description=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][
                 "description"
             ],
-            data_type=MOCK_SUBMISSION_FIELDS["ncbi_bioproject_metadata"]["data_type"],
-            org=MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["organization"],
+            data_type=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][
+                "data_type"
+            ],
+            org=MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"][
+                "organization"
+            ],
         )
         bioproject_xml = ET.tostring(
             ncbi_submission_client.root.find(".//Project"), "unicode"
@@ -178,11 +182,13 @@ def test_set_biosample(self, ncbi_submission_client, nmdc_biosample, mocker):
             ),
         )
         ncbi_submission_client.set_biosample(
-            organism_name=MOCK_SUBMISSION_FIELDS["ncbi_biosample_metadata"][
+            organism_name=MOCK_NCBI_SUBMISSION_METADATA["ncbi_biosample_metadata"][
                 "organism_name"
             ],
-            package=MOCK_SUBMISSION_FIELDS["ncbi_biosample_metadata"]["package"],
-            org=MOCK_SUBMISSION_FIELDS["ncbi_submission_metadata"]["organization"],
+            package=MOCK_NCBI_SUBMISSION_METADATA["ncbi_biosample_metadata"]["package"],
+            org=MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"][
+                "organization"
+            ],
             nmdc_biosamples=nmdc_biosample,
         )
         biosample_xml = ET.tostring(
@@ -237,11 +243,7 @@ def test_get_submission_xml(self, mocker, ncbi_submission_client, nmdc_biosample
             ),
         )
 
-        mocker.patch.object(
-            NMDCApiClient, "get_biosamples_part_of_study", return_value=nmdc_biosample
-        )
-
-        submission_xml = ncbi_submission_client.get_submission_xml()
+        submission_xml = ncbi_submission_client.get_submission_xml(nmdc_biosample)
 
         assert "nmdc:bsm-12-gnfpt483" in submission_xml
         assert "E. coli" in submission_xml
@@ -252,27 +254,6 @@ def test_get_submission_xml(self, mocker, ncbi_submission_client, nmdc_biosample
         assert "Test Project" in submission_xml
 
 
-class TestNMDCApiClient:
-    def test_get_biosamples_part_of_study_success(self, mocker, nmdc_api_client):
-        mock_response = mocker.MagicMock()
-        mock_response.json.return_value = {
-            "resources": [
-                {"id": "nmdc:bsm-12-gnfpt483", "part_of": ["nmdc:sty-11-hht5sb92"]}
-            ],
-            "next_page_token": None,
-        }
-        mocker.patch("requests.get", return_value=mock_response)
-        result = nmdc_api_client.get_biosamples_part_of_study("nmdc:sty-11-hht5sb92")
-        assert result == [
-            {"id": "nmdc:bsm-12-gnfpt483", "part_of": ["nmdc:sty-11-hht5sb92"]}
-        ]
-
-    def test_get_biosamples_part_of_study_failure(self, mocker, nmdc_api_client):
-        mocker.patch("requests.get", side_effect=HTTPError("API Error"))
-        with pytest.raises(HTTPError):
-            nmdc_api_client.get_biosamples_part_of_study("nmdc:sty-11-hht5sb92")
-
-
 class TestNCBIXMLUtils:
     def test_handle_quantity_value(self):
         assert (

From 0ab92b3913dea8505f92c8bbf8f08de8ef57652c Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Mon, 3 Jun 2024 19:34:12 -0700
Subject: [PATCH 18/27] add logic for autogenerating <Action> block for SRA db

---
 nmdc_runtime/site/export/ncbi_xml.py       | 97 +++++++++++++++++++++-
 nmdc_runtime/site/export/ncbi_xml_utils.py | 53 ++++++++++++
 nmdc_runtime/site/export/study_metadata.py |  1 -
 nmdc_runtime/site/graphs.py                |  6 +-
 nmdc_runtime/site/ops.py                   | 15 +++-
 nmdc_runtime/site/repository.py            |  8 ++
 6 files changed, 174 insertions(+), 6 deletions(-)

diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
index 3eba0c44..23346264 100644
--- a/nmdc_runtime/site/export/ncbi_xml.py
+++ b/nmdc_runtime/site/export/ncbi_xml.py
@@ -176,7 +176,6 @@ def set_biosample(
                 formatted_value = handler(value)
                 attributes[xml_key] = formatted_value
 
-            # Create the BioSample XML block with these attributes for each biosample
             biosample_elements = [
                 self.set_element(
                     "SampleId",
@@ -250,7 +249,95 @@ def set_biosample(
             )
             self.root.append(action)
 
-    def get_submission_xml(self, biosamples_list: list):
+    def set_fastq(
+        self,
+        biosample_data_objects: list,
+        bioproject_id: str,
+        org: str,
+    ):
+        fastq_files = []
+        biosample_ids = []
+
+        for entry in biosample_data_objects:
+            for biosample_id, data_objects in entry.items():
+                biosample_ids.append(biosample_id)
+                for data_object in data_objects:
+                    if "url" in data_object:
+                        fastq_files.append(data_object["url"])
+
+        if fastq_files:
+            files_elements = [
+                self.set_element(
+                    "File",
+                    "",
+                    {"file_path": f},
+                    [self.set_element("DataType", "generic-data")],
+                )
+                for f in fastq_files
+            ]
+
+            attribute_elements = [
+                self.set_element(
+                    "AttributeRefId",
+                    attrib={"name": "BioProject"},
+                    children=[
+                        self.set_element(
+                            "RefId",
+                            children=[
+                                self.set_element(
+                                    "SPUID",
+                                    bioproject_id,
+                                    {"spuid_namespace": org},
+                                )
+                            ],
+                        )
+                    ],
+                )
+            ]
+
+            for biosample_id in biosample_ids:
+                attribute_elements.append(
+                    self.set_element(
+                        "AttributeRefId",
+                        attrib={"name": "BioSample"},
+                        children=[
+                            self.set_element(
+                                "RefId",
+                                children=[
+                                    self.set_element(
+                                        "SPUID",
+                                        biosample_id,
+                                        {"spuid_namespace": org},
+                                    )
+                                ],
+                            )
+                        ],
+                    )
+                )
+
+            identifier_element = self.set_element(
+                "Identifier",
+                children=[
+                    self.set_element("SPUID", bioproject_id, {"spuid_namespace": org})
+                ],
+            )
+
+            action = self.set_element(
+                "Action",
+                children=[
+                    self.set_element(
+                        "AddFiles",
+                        attrib={"target_db": "SRA"},
+                        children=files_elements
+                        + attribute_elements
+                        + [identifier_element],
+                    ),
+                ],
+            )
+
+            self.root.append(action)
+
+    def get_submission_xml(self, biosamples_list: list, data_objects_list: list):
         self.set_description(
             email=self.ncbi_submission_metadata.get("email", ""),
             user=self.ncbi_submission_metadata.get("user", ""),
@@ -274,6 +361,12 @@ def get_submission_xml(self, biosamples_list: list):
             nmdc_biosamples=biosamples_list,
         )
 
+        self.set_fastq(
+            biosample_data_objects=data_objects_list,
+            bioproject_id=self.ncbi_bioproject_metadata.get("project_id", ""),
+            org=self.ncbi_submission_metadata.get("organization", ""),
+        )
+
         rough_string = ET.tostring(self.root, "unicode")
         reparsed = xml.dom.minidom.parseString(rough_string)
         submission_xml = reparsed.toprettyxml(indent="    ", newl="\n")
diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py
index 64183202..bf3d285e 100644
--- a/nmdc_runtime/site/export/ncbi_xml_utils.py
+++ b/nmdc_runtime/site/export/ncbi_xml_utils.py
@@ -4,6 +4,59 @@
 import requests
 
 
+# TODO: do not hardcode this mapping
+def get_classname_from_typecode(doc_id):
+    typecode = doc_id.split(":")[1].split("-")[0]
+    class_map = {
+        "bsm": "Biosample",
+        "extr": "Extraction",
+        "pool": "Pooling",
+        "libprep": "LibraryPreparation",
+        "procsm": "ProcessedSample",
+        "omprc": "OmicsProcessing",
+        "dobj": "DataObject",
+    }
+    return class_map.get(typecode)
+
+
+def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
+    biosample_data_objects = []
+
+    for biosample in biosamples_list:
+        current_ids = [biosample["id"]]
+        collected_data_objects = []
+
+        while current_ids:
+            new_current_ids = []
+            for current_id in current_ids:
+                query = {"has_input": current_id}
+                document = all_docs_collection.find_one(query)
+
+                if not document:
+                    continue
+
+                has_output = document.get("has_output")
+                if not has_output:
+                    continue
+
+                for output_id in has_output:
+                    if get_classname_from_typecode(output_id) == "DataObject":
+                        data_object_doc = all_docs_collection.find_one(
+                            {"id": output_id}
+                        )
+                        if data_object_doc:
+                            collected_data_objects.append(data_object_doc)
+                    else:
+                        new_current_ids.append(output_id)
+
+            current_ids = new_current_ids
+
+        if collected_data_objects:
+            biosample_data_objects.append({biosample["id"]: collected_data_objects})
+
+    return biosample_data_objects
+
+
 def handle_quantity_value(slot_value):
     if "has_numeric_value" in slot_value and "has_unit" in slot_value:
         return f"{slot_value['has_numeric_value']} {slot_value['has_unit']}"
diff --git a/nmdc_runtime/site/export/study_metadata.py b/nmdc_runtime/site/export/study_metadata.py
index 626ce01b..3cf9bc6d 100644
--- a/nmdc_runtime/site/export/study_metadata.py
+++ b/nmdc_runtime/site/export/study_metadata.py
@@ -132,7 +132,6 @@ def export_study_biosamples_metadata():
 
 @op(required_resource_keys={"runtime_api_site_client"})
 def get_biosamples_by_study_id(context: OpExecutionContext, nmdc_study_id: str):
-    # nmdc_study_id = context.op_config["nmdc_study_id"]
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
     biosamples = get_all_docs(client, "biosamples", f"part_of:{nmdc_study_id}")
     return biosamples
diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py
index c5b485a6..700ff6d7 100644
--- a/nmdc_runtime/site/graphs.py
+++ b/nmdc_runtime/site/graphs.py
@@ -50,6 +50,7 @@
     get_df_from_url,
     site_code_mapping,
     get_ncbi_export_pipeline_study_id,
+    get_data_objects_from_biosamples,
     get_ncbi_export_pipeline_inputs,
     ncbi_submission_xml_from_nmdc_study,
     ncbi_submission_xml_asset,
@@ -391,9 +392,10 @@ def ingest_neon_surface_water_metadata():
 @graph
 def nmdc_study_to_ncbi_submission_export():
     nmdc_study_id = get_ncbi_export_pipeline_study_id()
-    biosamples = get_biosamples_by_study_id(nmdc_study_id)
     ncbi_submission_metadata = get_ncbi_export_pipeline_inputs()
+    biosamples = get_biosamples_by_study_id(nmdc_study_id)
+    data_objects = get_data_objects_from_biosamples(biosamples)
     xml_data = ncbi_submission_xml_from_nmdc_study(
-        nmdc_study_id, ncbi_submission_metadata, biosamples
+        nmdc_study_id, ncbi_submission_metadata, biosamples, data_objects
     )
     ncbi_submission_xml_asset(xml_data)
diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index ecd4f864..330b056d 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -37,6 +37,7 @@
 from gridfs import GridFS
 from linkml_runtime.dumpers import json_dumper
 from linkml_runtime.utils.yamlutils import YAMLRoot
+from nmdc_runtime.api.db.mongo import get_mongo_db
 from nmdc_runtime.api.core.idgen import generate_one_id
 from nmdc_runtime.api.core.metadata import (
     _validate_changesheet,
@@ -60,6 +61,7 @@
 )
 from nmdc_runtime.api.models.util import ResultT
 from nmdc_runtime.site.export.ncbi_xml import NCBISubmissionXML
+from nmdc_runtime.site.export.ncbi_xml_utils import fetch_data_objects_from_biosamples
 from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
 from nmdc_runtime.site.resources import (
     NmdcPortalApiClient,
@@ -1062,13 +1064,24 @@ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
     }
 
 
+@op(required_resource_keys={"mongo"})
+def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: list):
+    mdb = context.resources.mongo.db
+    alldocs_collection = mdb["alldocs"]
+    biosample_data_objects = fetch_data_objects_from_biosamples(
+        alldocs_collection, biosamples
+    )
+    return biosample_data_objects
+
+
 @op
 def ncbi_submission_xml_from_nmdc_study(
     context: OpExecutionContext,
     nmdc_study_id: str,
     ncbi_exporter_metadata: dict,
     biosamples: list,
+    data_objects: list,
 ) -> str:
     ncbi_exporter = NCBISubmissionXML(nmdc_study_id, ncbi_exporter_metadata)
-    ncbi_xml = ncbi_exporter.get_submission_xml(biosamples)
+    ncbi_xml = ncbi_exporter.get_submission_xml(biosamples, data_objects)
     return ncbi_xml
diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py
index 9719cede..6d62b1cf 100644
--- a/nmdc_runtime/site/repository.py
+++ b/nmdc_runtime/site/repository.py
@@ -863,6 +863,14 @@ def biosample_export():
                 "resources": merge(
                     unfreeze(normal_resources),
                     {
+                        "mongo": {
+                            "config": {
+                                "host": {"env": "MONGO_HOST"},
+                                "username": {"env": "MONGO_USERNAME"},
+                                "password": {"env": "MONGO_PASSWORD"},
+                                "dbname": {"env": "MONGO_DBNAME"},
+                            },
+                        },
                         "runtime_api_site_client": {
                             "config": {
                                 "base_url": {"env": "API_HOST"},

From 151de042f0fd1ccf6e89db1af30cf4d215aa6267 Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Mon, 3 Jun 2024 19:57:30 -0700
Subject: [PATCH 19/27] update tests for new method set_fastq() in ncbi_xml.py

---
 tests/test_data/test_ncbi_xml.py | 104 +++++++++++++++++++++++++------
 1 file changed, 84 insertions(+), 20 deletions(-)

diff --git a/tests/test_data/test_ncbi_xml.py b/tests/test_data/test_ncbi_xml.py
index f8974f68..0af9ddc5 100644
--- a/tests/test_data/test_ncbi_xml.py
+++ b/tests/test_data/test_ncbi_xml.py
@@ -55,29 +55,52 @@ def nmdc_biosample():
         {
             "analysis_type": ["metagenomics"],
             "biosample_categories": ["NEON"],
-            "collection_date": {"has_raw_value": "2014-08-05T18:40Z"},
-            "conduc": {"has_numeric_value": 567, "has_unit": "uS/cm"},
-            "elev": 1178.7,
-            "env_broad_scale": {
-                "term": {"id": "ENVO:03605008", "name": "freshwater stream biome"}
+            "collection_date": {"has_raw_value": "2015-07-21T18:00Z"},
+            "depth": {
+                "has_maximum_numeric_value": 1,
+                "has_minimum_numeric_value": 0,
+                "has_unit": "meters",
             },
-            "env_local_scale": {
-                "term": {"id": "ENVO:03605007", "name": "freshwater stream"}
+            "elev": 1179.5,
+            "env_broad_scale": {
+                "term": {"id": "ENVO:01000253", "name": "freshwater river biome"}
             },
-            "env_medium": {"term": {"id": "ENVO:03605006", "name": "stream water"}},
-            "env_package": {"has_raw_value": "water"},
+            "env_local_scale": {"term": {"id": "ENVO:03600094", "name": "stream pool"}},
+            "env_medium": {"term": {"id": "ENVO:00002007", "name": "sediment"}},
             "geo_loc_name": {"has_raw_value": "USA: Colorado, Arikaree River"},
-            "id": "nmdc:bsm-12-gnfpt483",
-            "lat_lon": {"latitude": 39.758359, "longitude": -102.448595},
-            "name": "ARIK.SS.20140805",
-            "part_of": ["nmdc:sty-11-hht5sb92"],
-            "samp_collec_device": "Grab",
-            "temp": {"has_numeric_value": 20.1, "has_unit": "Cel"},
+            "id": "nmdc:bsm-12-p9q5v236",
+            "lat_lon": {"latitude": 39.758206, "longitude": -102.447148},
+            "name": "ARIK.20150721.AMC.EPIPSAMMON.3",
+            "part_of": ["nmdc:sty-11-34xj1150"],
             "type": "nmdc:Biosample",
         }
     ]
 
 
+@pytest.fixture
+def data_objects_list():
+    return [
+        {
+            "data_object_type": "Metagenome Raw Read 1",
+            "description": "sequencing results for BMI_HVKNKBGX5_Tube347_R1",
+            "id": "nmdc:dobj-12-b3ft7a80",
+            "md5_checksum": "cae0a9342d786e731ae71f6f37b76120",
+            "name": "BMI_HVKNKBGX5_Tube347_R1.fastq.gz",
+            "type": "nmdc:DataObject",
+            "url": "https://storage.neonscience.org/neon-microbial-raw-seq-files/2023/BMI_HVKNKBGX5_mms_R1/BMI_HVKNKBGX5_Tube347_R1.fastq.gz",
+        },
+        {
+            "data_object_type": "Metagenome Raw Read 2",
+            "description": "sequencing results for BMI_HVKNKBGX5_Tube347_R2",
+            "id": "nmdc:dobj-12-1zv4q961",
+            "md5_checksum": "7340fe25644183a4f56d36ce52389d83",
+            "name": "BMI_HVKNKBGX5_Tube347_R2.fastq.gz",
+            "type": "nmdc:DataObject",
+            "url": "https://storage.neonscience.org/neon-microbial-raw-seq-files/2023/BMI_HVKNKBGX5_mms_R2/BMI_HVKNKBGX5_Tube347_R2.fastq.gz",
+        },
+    ]
+
+
 class TestNCBISubmissionXML:
     def test_set_element(self, ncbi_submission_client):
         element = ncbi_submission_client.set_element("Test", "Hello", {"attr": "value"})
@@ -198,7 +221,32 @@ def test_set_biosample(self, ncbi_submission_client, nmdc_biosample, mocker):
         assert "Test Package" in biosample_xml
         assert "Test Org" in biosample_xml
 
-    def test_get_submission_xml(self, mocker, ncbi_submission_client, nmdc_biosample):
+    def test_set_fastq(self, ncbi_submission_client, data_objects_list, nmdc_biosample):
+        biosample_data_objects = [
+            {biosample["id"]: data_objects_list} for biosample in nmdc_biosample
+        ]
+
+        ncbi_submission_client.set_fastq(
+            biosample_data_objects=biosample_data_objects,
+            bioproject_id=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][
+                "project_id"
+            ],
+            org=MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"][
+                "organization"
+            ],
+        )
+
+        action_xml = ET.tostring(
+            ncbi_submission_client.root.find(".//Action"), "unicode"
+        )
+        assert "BMI_HVKNKBGX5_Tube347_R2.fastq.gz" in action_xml
+        assert "PRJNA12345" in action_xml
+        assert "nmdc:bsm-12-p9q5v236" in action_xml
+        assert "Test Org" in action_xml
+
+    def test_get_submission_xml(
+        self, mocker, ncbi_submission_client, nmdc_biosample, data_objects_list
+    ):
         mocker.patch(
             "nmdc_runtime.site.export.ncbi_xml.load_mappings",
             return_value=(
@@ -243,13 +291,29 @@ def test_get_submission_xml(self, mocker, ncbi_submission_client, nmdc_biosample
             ),
         )
 
-        submission_xml = ncbi_submission_client.get_submission_xml(nmdc_biosample)
+        biosample_data_objects = [
+            {biosample["id"]: data_objects_list} for biosample in nmdc_biosample
+        ]
+
+        ncbi_submission_client.set_fastq(
+            biosample_data_objects=biosample_data_objects,
+            bioproject_id=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][
+                "project_id"
+            ],
+            org=MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"][
+                "organization"
+            ],
+        )
+
+        submission_xml = ncbi_submission_client.get_submission_xml(
+            nmdc_biosample, biosample_data_objects
+        )
 
-        assert "nmdc:bsm-12-gnfpt483" in submission_xml
+        assert "nmdc:bsm-12-p9q5v236" in submission_xml
         assert "E. coli" in submission_xml
-        assert "stream water" in submission_xml
+        assert "sediment" in submission_xml
         assert "USA: Colorado, Arikaree River" in submission_xml
-        assert "2014-08-05T18:40Z" in submission_xml
+        assert "2015-07-21T18:00Z" in submission_xml
         assert "testuser" in submission_xml
         assert "Test Project" in submission_xml
 

From c9832374f2bd7e18aed5d48798702897d2541577 Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Tue, 4 Jun 2024 16:30:36 -0700
Subject: [PATCH 20/27] visual pass code review: fix structure of generated XML

---
 nmdc_runtime/site/export/ncbi_xml.py | 148 +++++++++++++++------------
 nmdc_runtime/site/ops.py             |   2 +
 nmdc_runtime/site/repository.py      |   1 +
 tests/test_data/test_ncbi_xml.py     |  31 ++++--
 4 files changed, 107 insertions(+), 75 deletions(-)

diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
index 23346264..0280a007 100644
--- a/nmdc_runtime/site/export/ncbi_xml.py
+++ b/nmdc_runtime/site/export/ncbi_xml.py
@@ -1,7 +1,9 @@
+import os
 import datetime
 import xml.etree.ElementTree as ET
 import xml.dom.minidom
 
+from urllib.parse import urlparse
 from nmdc_runtime.site.export.ncbi_xml_utils import (
     handle_controlled_identified_term_value,
     handle_controlled_term_value,
@@ -147,6 +149,7 @@ def set_biosample(
         organism_name,
         package,
         org,
+        bioproject_id,
         nmdc_biosamples,
     ):
         attribute_mappings, slot_range_mappings = load_mappings(
@@ -198,6 +201,14 @@ def set_biosample(
                     "Organism",
                     children=[self.set_element("OrganismName", organism_name)],
                 ),
+                self.set_element(
+                    "BioProject",
+                    children=[
+                        self.set_element(
+                            "PrimaryId", bioproject_id, {"db": "BioProject"}
+                        )
+                    ],
+                ),
                 self.set_element("Package", package),
                 self.set_element(
                     "Attributes",
@@ -255,87 +266,94 @@ def set_fastq(
         bioproject_id: str,
         org: str,
     ):
-        fastq_files = []
-        biosample_ids = []
-
         for entry in biosample_data_objects:
+            fastq_files = []
+            biosample_ids = []
+
             for biosample_id, data_objects in entry.items():
                 biosample_ids.append(biosample_id)
                 for data_object in data_objects:
                     if "url" in data_object:
-                        fastq_files.append(data_object["url"])
-
-        if fastq_files:
-            files_elements = [
-                self.set_element(
-                    "File",
-                    "",
-                    {"file_path": f},
-                    [self.set_element("DataType", "generic-data")],
-                )
-                for f in fastq_files
-            ]
-
-            attribute_elements = [
-                self.set_element(
-                    "AttributeRefId",
-                    attrib={"name": "BioProject"},
-                    children=[
-                        self.set_element(
-                            "RefId",
-                            children=[
-                                self.set_element(
-                                    "SPUID",
-                                    bioproject_id,
-                                    {"spuid_namespace": org},
-                                )
-                            ],
+                        url = urlparse(data_object["url"])
+                        file_path = os.path.join(
+                            os.path.basename(os.path.dirname(url.path)),
+                            os.path.basename(url.path),
                         )
-                    ],
-                )
-            ]
+                        fastq_files.append(file_path)
 
-            for biosample_id in biosample_ids:
-                attribute_elements.append(
+            if fastq_files:
+                files_elements = [
+                    self.set_element(
+                        "File",
+                        "",
+                        {"file_path": f},
+                        [self.set_element("DataType", "generic-data")],
+                    )
+                    for f in fastq_files
+                ]
+
+                attribute_elements = [
                     self.set_element(
                         "AttributeRefId",
-                        attrib={"name": "BioSample"},
+                        attrib={"name": "BioProject"},
                         children=[
                             self.set_element(
                                 "RefId",
                                 children=[
                                     self.set_element(
                                         "SPUID",
-                                        biosample_id,
+                                        bioproject_id,
                                         {"spuid_namespace": org},
                                     )
                                 ],
                             )
                         ],
                     )
-                )
+                ]
 
-            identifier_element = self.set_element(
-                "Identifier",
-                children=[
-                    self.set_element("SPUID", bioproject_id, {"spuid_namespace": org})
-                ],
-            )
+                for biosample_id in biosample_ids:
+                    attribute_elements.append(
+                        self.set_element(
+                            "AttributeRefId",
+                            attrib={"name": "BioSample"},
+                            children=[
+                                self.set_element(
+                                    "RefId",
+                                    children=[
+                                        self.set_element(
+                                            "SPUID",
+                                            biosample_id,
+                                            {"spuid_namespace": org},
+                                        )
+                                    ],
+                                )
+                            ],
+                        )
+                    )
 
-            action = self.set_element(
-                "Action",
-                children=[
-                    self.set_element(
-                        "AddFiles",
-                        attrib={"target_db": "SRA"},
-                        children=files_elements
-                        + attribute_elements
-                        + [identifier_element],
-                    ),
-                ],
-            )
+                identifier_element = self.set_element(
+                    "Identifier",
+                    children=[
+                        self.set_element(
+                            "SPUID", bioproject_id, {"spuid_namespace": org}
+                        )
+                    ],
+                )
 
-            self.root.append(action)
+                action = self.set_element(
+                    "Action",
+                    children=[
+                        self.set_element(
+                            "AddFiles",
+                            attrib={"target_db": "SRA"},
+                            children=files_elements
+                            + attribute_elements
+                            + [identifier_element],
+                        ),
+                    ],
+                )
+
+                self.root.append(action)
 
     def get_submission_xml(self, biosamples_list: list, data_objects_list: list):
         self.set_description(
@@ -346,18 +364,20 @@ def get_submission_xml(self, biosamples_list: list, data_objects_list: list):
             org=self.ncbi_submission_metadata.get("organization", ""),
         )
 
-        self.set_bioproject(
-            title=self.ncbi_bioproject_metadata.get("title", ""),
-            project_id=self.ncbi_bioproject_metadata.get("project_id", ""),
-            description=self.ncbi_bioproject_metadata.get("description", ""),
-            data_type=self.ncbi_bioproject_metadata.get("data_type", ""),
-            org=self.ncbi_submission_metadata.get("organization", ""),
-        )
+        if not self.ncbi_bioproject_metadata.get("exists"):
+            self.set_bioproject(
+                title=self.ncbi_bioproject_metadata.get("title", ""),
+                project_id=self.ncbi_bioproject_metadata.get("project_id", ""),
+                description=self.ncbi_bioproject_metadata.get("description", ""),
+                data_type=self.ncbi_bioproject_metadata.get("data_type", ""),
+                org=self.ncbi_submission_metadata.get("organization", ""),
+            )
 
         self.set_biosample(
             organism_name=self.ncbi_biosample_metadata.get("organism_name", ""),
             package=self.ncbi_biosample_metadata.get("package", ""),
             org=self.ncbi_submission_metadata.get("organization", ""),
+            bioproject_id=self.ncbi_bioproject_metadata.get("project_id", ""),
             nmdc_biosamples=biosamples_list,
         )
 
diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 330b056d..43a251ce 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -33,6 +33,7 @@
     Optional,
     Field,
     Permissive,
+    Bool,
 )
 from gridfs import GridFS
 from linkml_runtime.dumpers import json_dumper
@@ -1030,6 +1031,7 @@ def get_ncbi_export_pipeline_study_id(context: OpExecutionContext) -> str:
                     "project_id": String,
                     "description": String,
                     "data_type": String,
+                    "exists": Bool,
                 }
             ),
             is_required=True,
diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py
index 6d62b1cf..79919b2a 100644
--- a/nmdc_runtime/site/repository.py
+++ b/nmdc_runtime/site/repository.py
@@ -902,6 +902,7 @@ def biosample_export():
                                 "project_id": "",
                                 "description": "",
                                 "data_type": "",
+                                "exists": False,
                             },
                             "ncbi_biosample_metadata": {
                                 "organism_name": "",
diff --git a/tests/test_data/test_ncbi_xml.py b/tests/test_data/test_ncbi_xml.py
index 0af9ddc5..0816adbd 100644
--- a/tests/test_data/test_ncbi_xml.py
+++ b/tests/test_data/test_ncbi_xml.py
@@ -1,6 +1,5 @@
 from unittest.mock import MagicMock
 import pytest
-from requests.exceptions import HTTPError
 import xml.etree.ElementTree as ET
 
 from nmdc_runtime.site.export.ncbi_xml import NCBISubmissionXML
@@ -32,6 +31,7 @@
         "project_id": "PRJNA12345",
         "description": "A test project",
         "data_type": "metagenome",
+        "exists": False,
     },
     "ncbi_biosample_metadata": {
         "title": "Test Sample",
@@ -213,6 +213,9 @@ def test_set_biosample(self, ncbi_submission_client, nmdc_biosample, mocker):
                 "organization"
             ],
             nmdc_biosamples=nmdc_biosample,
+            bioproject_id=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][
+                "project_id"
+            ],
         )
         biosample_xml = ET.tostring(
             ncbi_submission_client.root.find(".//BioSample"), "unicode"
@@ -220,6 +223,7 @@ def test_set_biosample(self, ncbi_submission_client, nmdc_biosample, mocker):
         assert "E. coli" in biosample_xml
         assert "Test Package" in biosample_xml
         assert "Test Org" in biosample_xml
+        assert "PRJNA12345" in biosample_xml
 
     def test_set_fastq(self, ncbi_submission_client, data_objects_list, nmdc_biosample):
         biosample_data_objects = [
@@ -236,13 +240,18 @@ def test_set_fastq(self, ncbi_submission_client, data_objects_list, nmdc_biosamp
             ],
         )
 
-        action_xml = ET.tostring(
-            ncbi_submission_client.root.find(".//Action"), "unicode"
-        )
-        assert "BMI_HVKNKBGX5_Tube347_R2.fastq.gz" in action_xml
-        assert "PRJNA12345" in action_xml
-        assert "nmdc:bsm-12-p9q5v236" in action_xml
-        assert "Test Org" in action_xml
+        action_elements = ncbi_submission_client.root.findall(".//Action")
+        assert len(action_elements) == len(biosample_data_objects)
+
+        for action_element in action_elements:
+            action_xml = ET.tostring(action_element, "unicode")
+            assert (
+                "BMI_HVKNKBGX5_Tube347_R1.fastq.gz" in action_xml
+                or "BMI_HVKNKBGX5_Tube347_R2.fastq.gz" in action_xml
+            )
+            assert "PRJNA12345" in action_xml
+            assert "nmdc:bsm-12-p9q5v236" in action_xml
+            assert "Test Org" in action_xml
 
     def test_get_submission_xml(
         self, mocker, ncbi_submission_client, nmdc_biosample, data_objects_list
@@ -306,7 +315,7 @@ def test_get_submission_xml(
         )
 
         submission_xml = ncbi_submission_client.get_submission_xml(
-            nmdc_biosample, biosample_data_objects
+            nmdc_biosample, data_objects_list
         )
 
         assert "nmdc:bsm-12-p9q5v236" in submission_xml
@@ -417,7 +426,7 @@ def test_load_mappings(self, mocker):
             "Biosample\tenv_local_scale\tControlledIdentifiedTermValue\tenv_local_scale\t\t\n"
             "Biosample\tenv_medium\tControlledIdentifiedTermValue\tenv_medium\t\t\n"
             "Biosample\tenv_package\tTextValue\tenv_package\t\t\n"
-            "Biosample\tgeo_loc_name\tQuantityValue\tgeo_loc_name\t\t\n"
+            "Biosample\tgeo_loc_name\tTextValue\tgeo_loc_name\t\t\n"
             "Biosample\tid\turiorcurie\t\t\t\n"
             "Biosample\tlat_lon\tGeolocationValue\tlat_lon\t\t\n"
             "Biosample\tname\tstring\tsample_name\t\t\n"
@@ -465,7 +474,7 @@ def test_load_mappings(self, mocker):
             "env_local_scale": "ControlledIdentifiedTermValue",
             "env_medium": "ControlledIdentifiedTermValue",
             "env_package": "TextValue",
-            "geo_loc_name": "QuantityValue",
+            "geo_loc_name": "TextValue",
             "id": "uriorcurie",
             "lat_lon": "GeolocationValue",
             "name": "string",

From e5421a357be39b35006db249eef194caa8bcb7fa Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Wed, 5 Jun 2024 15:18:55 -0700
Subject: [PATCH 21/27] typecode class map inference from schema

---
 nmdc_runtime/site/export/ncbi_xml_utils.py | 23 +++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py
index bf3d285e..f7293795 100644
--- a/nmdc_runtime/site/export/ncbi_xml_utils.py
+++ b/nmdc_runtime/site/export/ncbi_xml_utils.py
@@ -1,21 +1,22 @@
-from lxml import etree
 from io import BytesIO, StringIO
+from nmdc_runtime.minter.config import typecodes
+from lxml import etree
+
 import csv
 import requests
 
 
-# TODO: do not hardcode this mapping
+def _build_class_map(class_map_data):
+    """Map each entry's "name" to the second ":"-separated field of its "schema_class"."""
+    pairs = ((row["name"], row["schema_class"].split(":")[1]) for row in class_map_data)
+    return dict(pairs)
+
+
 def get_classname_from_typecode(doc_id):
+    """Return the class name mapped to the typecode embedded in ``doc_id``, else None."""
+    class_map = _build_class_map(typecodes())
+
     typecode = doc_id.split(":")[1].split("-")[0]
-    class_map = {
-        "bsm": "Biosample",
-        "extr": "Extraction",
-        "pool": "Pooling",
-        "libprep": "LibraryPreparation",
-        "procsm": "ProcessedSample",
-        "omprc": "OmicsProcessing",
-        "dobj": "DataObject",
-    }
     return class_map.get(typecode)
 
 

From 80c4339245c2b4855747c908804f477bbd77101e Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Thu, 6 Jun 2024 15:08:22 -0700
Subject: [PATCH 22/27] allow copying of XML file contents from dagit UI

---
 nmdc_runtime/site/ops.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 43a251ce..d3bbbeb7 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -781,11 +781,22 @@ def export_json_to_drs(
     out=Out(description="XML content rendered through Dagit UI"),
 )
 def ncbi_submission_xml_asset(context: OpExecutionContext, data: str):
+    filename = "ncbi_submission.xml"
+    file_path = os.path.join(context.instance.storage_directory(), filename)
+
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+    with open(file_path, "w") as f:
+        f.write(data)
+
     context.log_event(
         AssetMaterialization(
             asset_key="ncbi_submission_xml",
             description="NCBI Submission XML Data",
-            metadata={"xml": MetadataValue.text(data)},
+            metadata={
+                "file_path": MetadataValue.path(file_path),
+                "xml": MetadataValue.text(data),
+            },
         )
     )
 

From adcd8e621b3db79bb2677693b9c9ec2295c9669d Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Fri, 14 Jun 2024 11:06:04 -0700
Subject: [PATCH 23/27] inference of some Submission and BioProject fields from
 existing metadata

---
 nmdc_runtime/site/export/ncbi_xml.py       | 26 +++++++++++++++-------
 nmdc_runtime/site/export/study_metadata.py |  4 ++--
 nmdc_runtime/site/graphs.py                |  8 +++----
 nmdc_runtime/site/ops.py                   | 20 ++++++++---------
 nmdc_runtime/site/repository.py            |  8 +------
 5 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
index 0280a007..2e27472e 100644
--- a/nmdc_runtime/site/export/ncbi_xml.py
+++ b/nmdc_runtime/site/export/ncbi_xml.py
@@ -3,6 +3,7 @@
 import xml.etree.ElementTree as ET
 import xml.dom.minidom
 
+from typing import Any
 from urllib.parse import urlparse
 from nmdc_runtime.site.export.ncbi_xml_utils import (
     handle_controlled_identified_term_value,
@@ -19,10 +20,19 @@
 
 
 class NCBISubmissionXML:
-    def __init__(self, nmdc_study_id: str, ncbi_submission_metadata: dict):
+    def __init__(self, nmdc_study: Any, ncbi_submission_metadata: dict):
         self.root = ET.Element("Submission")
 
-        self.nmdc_study_id = nmdc_study_id
+        self.nmdc_study_id = nmdc_study.get("id")
+        self.nmdc_study_title = nmdc_study.get("title")
+        self.nmdc_study_description = nmdc_study.get("description")
+        self.ncbi_bioproject_id = nmdc_study.get("insdc_bioproject_identifiers")
+        self.nmdc_pi_email = nmdc_study.get("principal_investigator", {}).get("email")
+        nmdc_study_pi_name = (
+            nmdc_study.get("principal_investigator", {}).get("name").split()
+        )
+        self.first_name = nmdc_study_pi_name[0]
+        self.last_name = nmdc_study_pi_name[1] if len(nmdc_study_pi_name) > 1 else None
 
         self.nmdc_ncbi_attribute_mapping_file_url = ncbi_submission_metadata.get(
             "nmdc_ncbi_attribute_mapping_file_url"
@@ -357,18 +367,18 @@ def set_fastq(
 
     def get_submission_xml(self, biosamples_list: list, data_objects_list: list):
         self.set_description(
-            email=self.ncbi_submission_metadata.get("email", ""),
-            user=self.ncbi_submission_metadata.get("user", ""),
-            first=self.ncbi_submission_metadata.get("first", ""),
-            last=self.ncbi_submission_metadata.get("last", ""),
+            email=self.nmdc_pi_email,
+            user="National Microbiome Data Collaborative (NMDC)",
+            first=self.first_name,
+            last=self.last_name,
             org=self.ncbi_submission_metadata.get("organization", ""),
         )
 
         if not self.ncbi_bioproject_metadata.get("exists"):
             self.set_bioproject(
-                title=self.ncbi_bioproject_metadata.get("title", ""),
+                title=self.nmdc_study_title,
                 project_id=self.ncbi_bioproject_metadata.get("project_id", ""),
-                description=self.ncbi_bioproject_metadata.get("description", ""),
+                description=self.nmdc_study_description,
                 data_type=self.ncbi_bioproject_metadata.get("data_type", ""),
                 org=self.ncbi_submission_metadata.get("organization", ""),
             )
diff --git a/nmdc_runtime/site/export/study_metadata.py b/nmdc_runtime/site/export/study_metadata.py
index 3cf9bc6d..d9bb2a97 100644
--- a/nmdc_runtime/site/export/study_metadata.py
+++ b/nmdc_runtime/site/export/study_metadata.py
@@ -131,7 +131,7 @@ def export_study_biosamples_metadata():
 
 
 @op(required_resource_keys={"runtime_api_site_client"})
-def get_biosamples_by_study_id(context: OpExecutionContext, nmdc_study_id: str):
+def get_biosamples_by_study_id(context: OpExecutionContext, nmdc_study: dict):
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
-    biosamples = get_all_docs(client, "biosamples", f"part_of:{nmdc_study_id}")
+    biosamples = get_all_docs(client, "biosamples", f"part_of:{nmdc_study['id']}")
     return biosamples
diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py
index 700ff6d7..2798bccb 100644
--- a/nmdc_runtime/site/graphs.py
+++ b/nmdc_runtime/site/graphs.py
@@ -49,7 +49,7 @@
     get_neon_pipeline_inputs,
     get_df_from_url,
     site_code_mapping,
-    get_ncbi_export_pipeline_study_id,
+    get_ncbi_export_pipeline_study,
     get_data_objects_from_biosamples,
     get_ncbi_export_pipeline_inputs,
     ncbi_submission_xml_from_nmdc_study,
@@ -391,11 +391,11 @@ def ingest_neon_surface_water_metadata():
 
 @graph
 def nmdc_study_to_ncbi_submission_export():
-    nmdc_study_id = get_ncbi_export_pipeline_study_id()
+    nmdc_study = get_ncbi_export_pipeline_study()
     ncbi_submission_metadata = get_ncbi_export_pipeline_inputs()
-    biosamples = get_biosamples_by_study_id(nmdc_study_id)
+    biosamples = get_biosamples_by_study_id(nmdc_study)
     data_objects = get_data_objects_from_biosamples(biosamples)
     xml_data = ncbi_submission_xml_from_nmdc_study(
-        nmdc_study_id, ncbi_submission_metadata, biosamples, data_objects
+        nmdc_study, ncbi_submission_metadata, biosamples, data_objects
     )
     ncbi_submission_xml_asset(xml_data)
diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index d3bbbeb7..51d8c22d 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -48,6 +48,7 @@
 )
 from nmdc_runtime.api.core.util import dotted_path_for, hash_from_str, json_clean, now
 from nmdc_runtime.api.endpoints.util import persist_content_and_get_drs_object
+from nmdc_runtime.api.endpoints.find import find_study_by_id
 from nmdc_runtime.api.models.job import Job, JobOperationMetadata
 from nmdc_runtime.api.models.metadata import ChangesheetIn
 from nmdc_runtime.api.models.operation import (
@@ -1014,9 +1015,12 @@ def site_code_mapping() -> dict:
         )
 
 
-@op(config_schema={"nmdc_study_id": str})
-def get_ncbi_export_pipeline_study_id(context: OpExecutionContext) -> str:
-    return context.op_config["nmdc_study_id"]
+@op(config_schema={"nmdc_study_id": str}, required_resource_keys={"mongo"})
+def get_ncbi_export_pipeline_study(context: OpExecutionContext) -> Any:
+    """Look up the study named by the ``nmdc_study_id`` op config via the mongo resource."""
+    study_id = context.op_config["nmdc_study_id"]
+    mdb = context.resources.mongo.db
+    return find_study_by_id(study_id, mdb)
 
 
 @op(
@@ -1025,10 +1029,6 @@ def get_ncbi_export_pipeline_study_id(context: OpExecutionContext) -> str:
         "ncbi_submission_metadata": Field(
             Permissive(
                 {
-                    "email": String,
-                    "first": String,
-                    "last": String,
-                    "user": String,
                     "organization": String,
                 }
             ),
@@ -1038,9 +1038,7 @@ def get_ncbi_export_pipeline_study_id(context: OpExecutionContext) -> str:
         "ncbi_bioproject_metadata": Field(
             Permissive(
                 {
-                    "title": String,
                     "project_id": String,
-                    "description": String,
                     "data_type": String,
                     "exists": Bool,
                 }
@@ -1090,11 +1088,11 @@ def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: li
 @op
 def ncbi_submission_xml_from_nmdc_study(
     context: OpExecutionContext,
-    nmdc_study_id: str,
+    nmdc_study: Any,
     ncbi_exporter_metadata: dict,
     biosamples: list,
     data_objects: list,
 ) -> str:
-    ncbi_exporter = NCBISubmissionXML(nmdc_study_id, ncbi_exporter_metadata)
+    ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata)
     ncbi_xml = ncbi_exporter.get_submission_xml(biosamples, data_objects)
     return ncbi_xml
diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py
index 79919b2a..c6788459 100644
--- a/nmdc_runtime/site/repository.py
+++ b/nmdc_runtime/site/repository.py
@@ -882,7 +882,7 @@ def biosample_export():
                     },
                 ),
                 "ops": {
-                    "get_ncbi_export_pipeline_study_id": {
+                    "get_ncbi_export_pipeline_study": {
                         "config": {
                             "nmdc_study_id": "",
                         }
@@ -891,16 +891,10 @@ def biosample_export():
                         "config": {
                             "nmdc_ncbi_attribute_mapping_file_url": "",
                             "ncbi_submission_metadata": {
-                                "email": "",
-                                "first": "",
-                                "last": "",
-                                "user": "",
                                 "organization": "",
                             },
                             "ncbi_bioproject_metadata": {
-                                "title": "",
                                 "project_id": "",
-                                "description": "",
                                 "data_type": "",
                                 "exists": False,
                             },

From 5a9e7498e8e7560cb617a846bcdb87d88ffa880c Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Fri, 21 Jun 2024 11:29:55 -0700
Subject: [PATCH 24/27] reduce manual metadata entry through Dagit UI fields

---
 nmdc_runtime/site/export/ncbi_xml.py       |  49 ++++--
 nmdc_runtime/site/export/ncbi_xml_utils.py |  37 +++++
 nmdc_runtime/site/graphs.py                |   8 +-
 nmdc_runtime/site/ops.py                   |  34 ++--
 nmdc_runtime/site/repository.py            |   6 -
 tests/test_data/test_ncbi_xml.py           | 176 +++++++++++++--------
 6 files changed, 209 insertions(+), 101 deletions(-)

diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
index 2e27472e..0e82e24a 100644
--- a/nmdc_runtime/site/export/ncbi_xml.py
+++ b/nmdc_runtime/site/export/ncbi_xml.py
@@ -40,9 +40,6 @@ def __init__(self, nmdc_study: Any, ncbi_submission_metadata: dict):
         self.ncbi_submission_metadata = ncbi_submission_metadata.get(
             "ncbi_submission_metadata", {}
         )
-        self.ncbi_bioproject_metadata = ncbi_submission_metadata.get(
-            "ncbi_bioproject_metadata", {}
-        )
         self.ncbi_biosample_metadata = ncbi_submission_metadata.get(
             "ncbi_biosample_metadata", {}
         )
@@ -157,10 +154,10 @@ def set_bioproject(self, title, project_id, description, data_type, org):
     def set_biosample(
         self,
         organism_name,
-        package,
         org,
         bioproject_id,
         nmdc_biosamples,
+        nmdc_omics_processing,
     ):
         attribute_mappings, slot_range_mappings = load_mappings(
             self.nmdc_ncbi_attribute_mapping_file_url
@@ -169,11 +166,15 @@ def set_biosample(
         for biosample in nmdc_biosamples:
             attributes = {}
             sample_id_value = None
+            env_package = None
 
             for json_key, value in biosample.items():
                 if isinstance(value, list):
                     continue  # Skip processing for list values
 
+                if json_key == "env_package":
+                    env_package = f"MIMS.me.{handle_text_value(value)}.6.0"
+
                 # Special handling for NMDC Biosample "id"
                 if json_key == "id":
                     sample_id_value = value
@@ -219,7 +220,7 @@ def set_biosample(
                         )
                     ],
                 ),
-                self.set_element("Package", package),
+                self.set_element("Package", env_package),
                 self.set_element(
                     "Attributes",
                     children=[
@@ -365,7 +366,23 @@ def set_fastq(
 
                 self.root.append(action)
 
-    def get_submission_xml(self, biosamples_list: list, data_objects_list: list):
+    def get_submission_xml(
+        self,
+        biosamples_list: list,
+        biosample_omics_processing_list: list,
+        biosample_data_objects_list: list,
+    ):
+        data_type = None
+        ncbi_project_id = None
+        for bsm_omprc in biosample_omics_processing_list:
+            for _, omprc_list in bsm_omprc.items():
+                for omprc in omprc_list:
+                    if "omics_type" in omprc:
+                        data_type = handle_text_value(omprc["omics_type"]).capitalize()
+
+                    if "ncbi_project_name" in omprc:
+                        ncbi_project_id = omprc["ncbi_project_name"]
+
         self.set_description(
             email=self.nmdc_pi_email,
             user="National Microbiome Data Collaborative (NMDC)",
@@ -374,26 +391,26 @@ def get_submission_xml(self, biosamples_list: list, data_objects_list: list):
             org=self.ncbi_submission_metadata.get("organization", ""),
         )
 
-        if not self.ncbi_bioproject_metadata.get("exists"):
+        if not ncbi_project_id:
             self.set_bioproject(
                 title=self.nmdc_study_title,
-                project_id=self.ncbi_bioproject_metadata.get("project_id", ""),
+                project_id=ncbi_project_id,
                 description=self.nmdc_study_description,
-                data_type=self.ncbi_bioproject_metadata.get("data_type", ""),
+                data_type=data_type,
                 org=self.ncbi_submission_metadata.get("organization", ""),
             )
 
         self.set_biosample(
             organism_name=self.ncbi_biosample_metadata.get("organism_name", ""),
-            package=self.ncbi_biosample_metadata.get("package", ""),
             org=self.ncbi_submission_metadata.get("organization", ""),
-            bioproject_id=self.ncbi_bioproject_metadata.get("project_id", ""),
+            bioproject_id=ncbi_project_id,
             nmdc_biosamples=biosamples_list,
+            nmdc_omics_processing=biosample_omics_processing_list,
         )
 
         self.set_fastq(
-            biosample_data_objects=data_objects_list,
-            bioproject_id=self.ncbi_bioproject_metadata.get("project_id", ""),
+            biosample_data_objects=biosample_data_objects_list,
+            bioproject_id=ncbi_project_id,
             org=self.ncbi_submission_metadata.get("organization", ""),
         )
 
@@ -403,12 +420,12 @@ def get_submission_xml(self, biosamples_list: list, data_objects_list: list):
 
         # ============= Uncomment the following code to validate the XML against NCBI XSDs ============ #
         # submission_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/submission.xsd?view=co"
-        # submission_xsd_validation = validate_xml(submission_xml, submission_xsd_url)
+        # validate_xml(submission_xml, submission_xsd_url)
 
         # bioproject_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/bioproject.xsd?view=co"
-        # bioproject_xsd_validation = validate_xml(submission_xml, bioproject_xsd_url)
+        # validate_xml(submission_xml, bioproject_xsd_url)
 
         # biosample_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/biosample.xsd?view=co"
-        # biosample_xsd_validation = validate_xml(submission_xml, biosample_xsd_url)
+        # validate_xml(submission_xml, biosample_xsd_url)
 
         return submission_xml
diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py
index f7293795..84ee95d6 100644
--- a/nmdc_runtime/site/export/ncbi_xml_utils.py
+++ b/nmdc_runtime/site/export/ncbi_xml_utils.py
@@ -57,6 +57,43 @@ def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
 
     return biosample_data_objects
 
+def fetch_omics_processing_from_biosamples(all_docs_collection, biosamples_list):
+    """Collect, per biosample, the documents whose outputs include a DataObject id.
+
+    For each biosample id, looks up one document whose ``has_input`` matches,
+    then follows that document's ``has_output`` ids the same way until ids that
+    ``get_classname_from_typecode`` maps to "DataObject" are reached. Returns a
+    list of ``{biosample_id: [documents]}`` dicts; biosamples with no collected
+    documents are left out.
+    """
+    results = []
+
+    for biosample in biosamples_list:
+        frontier = [biosample["id"]]
+        omics_docs = []
+
+        while frontier:
+            next_frontier = []
+            for input_id in frontier:
+                process = all_docs_collection.find_one({"has_input": input_id})
+                outputs = process.get("has_output") if process else None
+                if not outputs:
+                    continue  # nothing consumes this id, or the consumer lists no outputs
+
+                for output_id in outputs:
+                    if get_classname_from_typecode(output_id) != "DataObject":
+                        next_frontier.append(output_id)  # intermediate: keep walking
+                        continue
+                    # Re-read the producing document by its own id before keeping it.
+                    match = all_docs_collection.find_one({"id": process["id"]})
+                    if match:
+                        omics_docs.append(match)
+            frontier = next_frontier
+
+        if omics_docs:
+            results.append({biosample["id"]: omics_docs})
+    return results
+
 
 def handle_quantity_value(slot_value):
     if "has_numeric_value" in slot_value and "has_unit" in slot_value:
diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py
index 2798bccb..a3d2aebd 100644
--- a/nmdc_runtime/site/graphs.py
+++ b/nmdc_runtime/site/graphs.py
@@ -51,6 +51,7 @@
     site_code_mapping,
     get_ncbi_export_pipeline_study,
     get_data_objects_from_biosamples,
+    get_omics_processing_from_biosamples,
     get_ncbi_export_pipeline_inputs,
     ncbi_submission_xml_from_nmdc_study,
     ncbi_submission_xml_asset,
@@ -394,8 +395,13 @@ def nmdc_study_to_ncbi_submission_export():
     nmdc_study = get_ncbi_export_pipeline_study()
     ncbi_submission_metadata = get_ncbi_export_pipeline_inputs()
     biosamples = get_biosamples_by_study_id(nmdc_study)
+    omics_processing_records = get_omics_processing_from_biosamples(biosamples)
     data_objects = get_data_objects_from_biosamples(biosamples)
     xml_data = ncbi_submission_xml_from_nmdc_study(
-        nmdc_study, ncbi_submission_metadata, biosamples, data_objects
+        nmdc_study,
+        ncbi_submission_metadata,
+        biosamples,
+        omics_processing_records,
+        data_objects,
     )
     ncbi_submission_xml_asset(xml_data)
diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 51d8c22d..6d8d6ebf 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -63,7 +63,10 @@
 )
 from nmdc_runtime.api.models.util import ResultT
 from nmdc_runtime.site.export.ncbi_xml import NCBISubmissionXML
-from nmdc_runtime.site.export.ncbi_xml_utils import fetch_data_objects_from_biosamples
+from nmdc_runtime.site.export.ncbi_xml_utils import (
+    fetch_data_objects_from_biosamples,
+    fetch_omics_processing_from_biosamples,
+)
 from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
 from nmdc_runtime.site.resources import (
     NmdcPortalApiClient,
@@ -1035,22 +1038,10 @@ def get_ncbi_export_pipeline_study(context: OpExecutionContext) -> Any:
             is_required=True,
             description="General metadata about the NCBI submission.",
         ),
-        "ncbi_bioproject_metadata": Field(
-            Permissive(
-                {
-                    "project_id": String,
-                    "data_type": String,
-                    "exists": Bool,
-                }
-            ),
-            is_required=True,
-            description="Metadata for NCBI BioProject in the Submission.",
-        ),
         "ncbi_biosample_metadata": Field(
             Permissive(
                 {
                     "organism_name": String,
-                    "package": String,
                 }
             ),
             is_required=True,
@@ -1064,13 +1055,11 @@ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
         "nmdc_ncbi_attribute_mapping_file_url"
     ]
     ncbi_submission_metadata = context.op_config.get("ncbi_submission_metadata", {})
-    ncbi_bioproject_metadata = context.op_config.get("ncbi_bioproject_metadata", {})
     ncbi_biosample_metadata = context.op_config.get("ncbi_biosample_metadata", {})
 
     return {
         "nmdc_ncbi_attribute_mapping_file_url": nmdc_ncbi_attribute_mapping_file_url,
         "ncbi_submission_metadata": ncbi_submission_metadata,
-        "ncbi_bioproject_metadata": ncbi_bioproject_metadata,
         "ncbi_biosample_metadata": ncbi_biosample_metadata,
     }
 
@@ -1085,14 +1074,27 @@ def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: li
     return biosample_data_objects
 
 
+@op(required_resource_keys={"mongo"})
+def get_omics_processing_from_biosamples(context: OpExecutionContext, biosamples: list):
+    """Gather per-biosample omics processing documents from the ``alldocs`` collection.
+
+    Delegates the lookup to ``fetch_omics_processing_from_biosamples``.
+    """
+    alldocs = context.resources.mongo.db["alldocs"]
+    return fetch_omics_processing_from_biosamples(alldocs, biosamples)
+
+
 @op
 def ncbi_submission_xml_from_nmdc_study(
     context: OpExecutionContext,
     nmdc_study: Any,
     ncbi_exporter_metadata: dict,
     biosamples: list,
+    omics_processing_records: list,
     data_objects: list,
 ) -> str:
     ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata)
-    ncbi_xml = ncbi_exporter.get_submission_xml(biosamples, data_objects)
+    ncbi_xml = ncbi_exporter.get_submission_xml(
+        biosamples, omics_processing_records, data_objects
+    )
     return ncbi_xml
diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py
index c6788459..7a0fad22 100644
--- a/nmdc_runtime/site/repository.py
+++ b/nmdc_runtime/site/repository.py
@@ -893,14 +893,8 @@ def biosample_export():
                             "ncbi_submission_metadata": {
                                 "organization": "",
                             },
-                            "ncbi_bioproject_metadata": {
-                                "project_id": "",
-                                "data_type": "",
-                                "exists": False,
-                            },
                             "ncbi_biosample_metadata": {
                                 "organism_name": "",
-                                "package": "",
                             },
                         }
                     },
diff --git a/tests/test_data/test_ncbi_xml.py b/tests/test_data/test_ncbi_xml.py
index 0816adbd..7996b4d1 100644
--- a/tests/test_data/test_ncbi_xml.py
+++ b/tests/test_data/test_ncbi_xml.py
@@ -15,28 +15,72 @@
     handle_string_value,
 )
 
-MOCK_NCBI_NMDC_STUDY_ID = "nmdc:sty-11-12345"
+MOCK_NMDC_STUDY = {
+    "id": "nmdc:sty-11-34xj1150",
+    "name": "National Ecological Observatory Network: soil metagenomes (DP1.10107.001)",
+    "description": "This study contains the quality-controlled laboratory metadata and minimally processed sequence data from NEON's soil microbial shotgun metagenomics sequencing. Typically, measurements are done on plot-level composite samples and represent up to three randomly selected sampling locations within a plot.",
+    "gold_study_identifiers": ["gold:Gs0144570", "gold:Gs0161344"],
+    "principal_investigator": {
+        "has_raw_value": "Kate Thibault",
+        "email": "kthibault@battelleecology.org",
+        "name": "Kate Thibault",
+        "orcid": "orcid:0000-0003-3477-6424",
+        "profile_image_url": "https://portal.nersc.gov/project/m3408/profile_images/thibault_katy.jpg",
+    },
+    "title": "National Ecological Observatory Network: soil metagenomes (DP1.10107.001)",
+    "type": "nmdc:Study",
+    "websites": [
+        "https://data.neonscience.org/data-products/DP1.10107.001",
+        "https://data.neonscience.org/api/v0/documents/NEON.DOC.014048vO",
+        "https://data.neonscience.org/api/v0/documents/NEON_metagenomes_userGuide_vE.pdf",
+    ],
+    "study_image": [
+        {
+            "url": "https://portal.nersc.gov/project/m3408/profile_images/nmdc_sty-11-34xj1150.jpg"
+        }
+    ],
+    "funding_sources": [
+        "NSF#1724433 National Ecological Observatory Network: Operations Activities"
+    ],
+    "has_credit_associations": [
+        {
+            "applies_to_person": {
+                "name": "Hugh Cross",
+                "email": "crossh@battelleecology.org",
+                "orcid": "orcid:0000-0002-6745-9479",
+            },
+            "applied_roles": ["Methodology", "Data curation"],
+        },
+        {
+            "applies_to_person": {
+                "name": "Samantha Weintraub-Leff",
+                "email": "sweintraub@battelleecology.org",
+                "orcid": "orcid:0000-0003-4789-5086",
+            },
+            "applied_roles": ["Methodology", "Data curation"],
+        },
+        {
+            "applies_to_person": {
+                "name": "Kate Thibault",
+                "email": "kthibault@battelleecology.org",
+                "orcid": "orcid:0000-0003-3477-6424",
+            },
+            "applied_roles": ["Principal Investigator"],
+        },
+    ],
+    "part_of": ["nmdc:sty-11-nxrz9m96"],
+    "study_category": "consortium",
+    "insdc_bioproject_identifiers": ["bioproject:PRJNA1029061"],
+    "homepage_website": ["https://www.neonscience.org/"],
+}
 
 MOCK_NCBI_SUBMISSION_METADATA = {
     "nmdc_ncbi_attribute_mapping_file_url": "http://example.com/mappings.tsv",
     "ncbi_submission_metadata": {
-        "email": "user@example.com",
-        "user": "testuser",
-        "first": "Test",
-        "last": "User",
         "organization": "Test Org",
     },
-    "ncbi_bioproject_metadata": {
-        "title": "Test Project",
-        "project_id": "PRJNA12345",
-        "description": "A test project",
-        "data_type": "metagenome",
-        "exists": False,
-    },
     "ncbi_biosample_metadata": {
-        "title": "Test Sample",
         "organism_name": "E. coli",
-        "package": "Test Package",
     },
 }
 
@@ -44,7 +88,7 @@
 @pytest.fixture
 def ncbi_submission_client():
     return NCBISubmissionXML(
-        nmdc_study_id=MOCK_NCBI_NMDC_STUDY_ID,
+        nmdc_study=MOCK_NMDC_STUDY,
         ncbi_submission_metadata=MOCK_NCBI_SUBMISSION_METADATA,
     )
 
@@ -77,6 +121,24 @@ def nmdc_biosample():
     ]
 
 
+@pytest.fixture
+def omics_processing_list():
+    return [
+        {
+            "has_input": ["nmdc:procsm-12-ehktny16"],
+            "has_output": ["nmdc:dobj-12-1zv4q961", "nmdc:dobj-12-b3ft7a80"],
+            "id": "nmdc:omprc-12-zqm9p096",
+            "instrument_name": "Illumina NextSeq550",
+            "name": "Terrestrial soil microbial communities - ARIK.20150721.AMC.EPIPSAMMON.3-DNA1",
+            "ncbi_project_name": "PRJNA406976",
+            "omics_type": {"has_raw_value": "metagenome"},
+            "part_of": ["nmdc:sty-11-34xj1150"],
+            "processing_institution": "Battelle",
+            "type": "nmdc:OmicsProcessing",
+        }
+    ]
+
+
 @pytest.fixture
 def data_objects_list():
     return [
@@ -110,11 +172,11 @@ def test_set_element(self, ncbi_submission_client):
 
     def test_set_description(self, ncbi_submission_client):
         ncbi_submission_client.set_description(
-            MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"]["email"],
-            MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"]["user"],
-            MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"]["first"],
-            MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"]["last"],
-            MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"]["organization"],
+            ncbi_submission_client.nmdc_pi_email,
+            "testuser",
+            "Kate",
+            "Thibault",
+            "Test Org",
         )
         description = ET.tostring(
             ncbi_submission_client.root.find("Description"), "unicode"
@@ -128,35 +190,33 @@ def test_set_description(self, ncbi_submission_client):
         contact_first = root.find("Organization/Contact/Name/First").text
         contact_last = root.find("Organization/Contact/Name/Last").text
 
-        assert comment == "NMDC Submission for nmdc:sty-11-12345"
+        assert comment == f"NMDC Submission for {MOCK_NMDC_STUDY['id']}"
         assert submitter == "testuser"
         assert org_name == "Test Org"
-        assert contact_email == "user@example.com"
-        assert contact_first == "Test"
-        assert contact_last == "User"
+        assert contact_email == "kthibault@battelleecology.org"
+        assert contact_first == "Kate"
+        assert contact_last == "Thibault"
 
     def test_set_bioproject(self, ncbi_submission_client):
         ncbi_submission_client.set_bioproject(
-            title=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"]["title"],
-            project_id=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][
-                "project_id"
-            ],
-            description=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][
-                "description"
-            ],
-            data_type=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][
-                "data_type"
-            ],
-            org=MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"][
-                "organization"
-            ],
+            title=MOCK_NMDC_STUDY["title"],
+            project_id=MOCK_NMDC_STUDY["insdc_bioproject_identifiers"][0],
+            description=MOCK_NMDC_STUDY["description"],
+            data_type="metagenome",
+            org="Test Org",
         )
         bioproject_xml = ET.tostring(
             ncbi_submission_client.root.find(".//Project"), "unicode"
         )
-        assert "Test Project" in bioproject_xml
-        assert "PRJNA12345" in bioproject_xml
-        assert "A test project" in bioproject_xml
+        assert (
+            "National Ecological Observatory Network: soil metagenomes (DP1.10107.001)"
+            in bioproject_xml
+        )
+        assert "bioproject:PRJNA1029061" in bioproject_xml
+        assert (
+            "This study contains the quality-controlled laboratory metadata and minimally processed sequence data from NEON's soil microbial shotgun metagenomics sequencing."
+            in bioproject_xml
+        )
         assert "metagenome" in bioproject_xml
         assert "Test Org" in bioproject_xml
 
@@ -208,22 +268,19 @@ def test_set_biosample(self, ncbi_submission_client, nmdc_biosample, mocker):
             organism_name=MOCK_NCBI_SUBMISSION_METADATA["ncbi_biosample_metadata"][
                 "organism_name"
             ],
-            package=MOCK_NCBI_SUBMISSION_METADATA["ncbi_biosample_metadata"]["package"],
             org=MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"][
                 "organization"
             ],
+            bioproject_id=MOCK_NMDC_STUDY["insdc_bioproject_identifiers"][0],
             nmdc_biosamples=nmdc_biosample,
-            bioproject_id=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][
-                "project_id"
-            ],
+            nmdc_omics_processing=[],
         )
         biosample_xml = ET.tostring(
             ncbi_submission_client.root.find(".//BioSample"), "unicode"
         )
         assert "E. coli" in biosample_xml
-        assert "Test Package" in biosample_xml
         assert "Test Org" in biosample_xml
-        assert "PRJNA12345" in biosample_xml
+        assert "PRJNA1029061" in biosample_xml
 
     def test_set_fastq(self, ncbi_submission_client, data_objects_list, nmdc_biosample):
         biosample_data_objects = [
@@ -232,12 +289,8 @@ def test_set_fastq(self, ncbi_submission_client, data_objects_list, nmdc_biosamp
 
         ncbi_submission_client.set_fastq(
             biosample_data_objects=biosample_data_objects,
-            bioproject_id=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][
-                "project_id"
-            ],
-            org=MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"][
-                "organization"
-            ],
+            bioproject_id=MOCK_NMDC_STUDY["insdc_bioproject_identifiers"][0],
+            org="Test Org",
         )
 
         action_elements = ncbi_submission_client.root.findall(".//Action")
@@ -249,7 +302,7 @@ def test_set_fastq(self, ncbi_submission_client, data_objects_list, nmdc_biosamp
                 "BMI_HVKNKBGX5_Tube347_R1.fastq.gz" in action_xml
                 or "BMI_HVKNKBGX5_Tube347_R2.fastq.gz" in action_xml
             )
-            assert "PRJNA12345" in action_xml
+            assert "PRJNA1029061" in action_xml
             assert "nmdc:bsm-12-p9q5v236" in action_xml
             assert "Test Org" in action_xml
 
@@ -306,16 +359,12 @@ def test_get_submission_xml(
 
         ncbi_submission_client.set_fastq(
             biosample_data_objects=biosample_data_objects,
-            bioproject_id=MOCK_NCBI_SUBMISSION_METADATA["ncbi_bioproject_metadata"][
-                "project_id"
-            ],
-            org=MOCK_NCBI_SUBMISSION_METADATA["ncbi_submission_metadata"][
-                "organization"
-            ],
+            bioproject_id=MOCK_NMDC_STUDY["insdc_bioproject_identifiers"][0],
+            org="Test Org",
         )
 
         submission_xml = ncbi_submission_client.get_submission_xml(
-            nmdc_biosample, data_objects_list
+            nmdc_biosample, [], biosample_data_objects
         )
 
         assert "nmdc:bsm-12-p9q5v236" in submission_xml
@@ -323,8 +372,11 @@ def test_get_submission_xml(
         assert "sediment" in submission_xml
         assert "USA: Colorado, Arikaree River" in submission_xml
         assert "2015-07-21T18:00Z" in submission_xml
-        assert "testuser" in submission_xml
-        assert "Test Project" in submission_xml
+        assert "National Microbiome Data Collaborative (NMDC)" in submission_xml
+        assert (
+            "National Ecological Observatory Network: soil metagenomes (DP1.10107.001)"
+            in submission_xml
+        )
 
 
 class TestNCBIXMLUtils:

From 433e3174393f8a3f74048e84116b563c5587f314 Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Fri, 21 Jun 2024 11:47:23 -0700
Subject: [PATCH 25/27] black format nmdc_runtime/site/export/ncbi_xml_utils.py

---
 nmdc_runtime/site/export/ncbi_xml_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py
index 84ee95d6..ac710f93 100644
--- a/nmdc_runtime/site/export/ncbi_xml_utils.py
+++ b/nmdc_runtime/site/export/ncbi_xml_utils.py
@@ -57,6 +57,7 @@ def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
 
     return biosample_data_objects
 
+
 def fetch_omics_processing_from_biosamples(all_docs_collection, biosamples_list):
     biosample_data_objects = []
 

From e82780f6dacb159a83d9dd429b3d9f43d6f02827 Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Fri, 21 Jun 2024 12:21:28 -0700
Subject: [PATCH 26/27] fix code review comments

---
 nmdc_runtime/site/export/ncbi_xml_utils.py | 8 ++------
 nmdc_runtime/site/ops.py                   | 1 -
 requirements/dev.in                        | 3 +--
 requirements/main.in                       | 1 -
 4 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py
index ac710f93..e527245e 100644
--- a/nmdc_runtime/site/export/ncbi_xml_utils.py
+++ b/nmdc_runtime/site/export/ncbi_xml_utils.py
@@ -198,13 +198,9 @@ def validate_xml(xml, xsd_url):
     xml_schema_doc = etree.parse(BytesIO(xsd_content.encode("utf-8")))
     xml_schema = etree.XMLSchema(xml_schema_doc)
 
-    if "<?xml" in xml:
-        xml_doc = etree.parse(BytesIO(xml.encode("utf-8")))
-    else:
-        xml_doc = etree.parse(StringIO(xml))
-
-    xml_doc = etree.parse(StringIO(xml))
+    xml_doc = etree.parse(BytesIO(xml.encode("utf-8")))
 
     if not xml_schema.validate(xml_doc):
         raise ValueError(f"There were errors while validating against: {xsd_url}")
+
     return True
diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index c5aaeb62..4bd00f3b 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -10,7 +10,6 @@
 from typing import Tuple
 from zipfile import ZipFile
 
-# import xml.etree.ElementTree as ET
 import pandas as pd
 import requests
 
diff --git a/requirements/dev.in b/requirements/dev.in
index 9689b9bc..601370de 100644
--- a/requirements/dev.in
+++ b/requirements/dev.in
@@ -12,5 +12,4 @@ requests-mock
 setuptools
 twine
 requests-cache
-pytest-mock
-lxml
\ No newline at end of file
+pytest-mock
\ No newline at end of file
diff --git a/requirements/main.in b/requirements/main.in
index 1f280041..2ae359d7 100644
--- a/requirements/main.in
+++ b/requirements/main.in
@@ -31,7 +31,6 @@ pandas
 passlib[bcrypt]
 pymongo
 pydantic[email]>=1.10.0
-pytest-mock
 python-dotenv
 python-jose[cryptography]
 python-multipart

From b9c6d3809876d71b92df059dd6b8cbd3ac5758a9 Mon Sep 17 00:00:00 2001
From: Sujay Patil <sujaysanjeev.patil@gmail.com>
Date: Mon, 24 Jun 2024 13:15:58 -0700
Subject: [PATCH 27/27] clarify definition of hardcoded text 'eEnvironment'

---
 nmdc_runtime/site/export/ncbi_xml.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
index 0e82e24a..27a4371d 100644
--- a/nmdc_runtime/site/export/ncbi_xml.py
+++ b/nmdc_runtime/site/export/ncbi_xml.py
@@ -125,6 +125,8 @@ def set_bioproject(self, title, project_id, description, data_type, org):
 
         descriptor = self.set_descriptor(title, description)
         project_type = self.set_element("ProjectType")
+        # "sample_scope" is an enumeration field. Docs: https://www.ncbi.nlm.nih.gov/data_specs/schema/other/bioproject/Core.xsd
+        # scope is "eEnvironment" when "Content of species in a sample is not known, i.e. microbiome,metagenome, etc.."
         project_type_submission = self.set_element(
             "ProjectTypeSubmission", attrib={"sample_scope": "eEnvironment"}
         )
@@ -422,10 +424,10 @@ def get_submission_xml(
         # submission_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/submission.xsd?view=co"
         # validate_xml(submission_xml, submission_xsd_url)
 
-        # bioproject_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/bioproject.xsd?view=co"
+        # bioproject_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/bioproject/bioproject.xsd?view=co"
         # validate_xml(submission_xml, bioproject_xsd_url)
 
-        # biosample_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/biosample.xsd?view=co"
+        # biosample_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/biosample/biosample.xsd?view=co"
         # validate_xml(submission_xml, biosample_xsd_url)
 
         return submission_xml