Merge pull request #833 from microbiomedata/remove-site-info-gold-translator

Add option to GOLD translator to ignore field research sites
microbiomedata · Dec 13, 2024 · 22a9a58 · 22a9a58
2 parents fa94fd5 + 4004ca6
commit 22a9a58
Show file tree

Hide file tree

Showing 5 changed files with 55 additions and 32 deletions.
diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py
@@ -127,9 +127,12 @@ def apply_metadata_in():
 
 @graph
 def gold_study_to_database():
-    (study_id, study_type, gold_nmdc_instrument_mapping_file_url) = (
-        get_gold_study_pipeline_inputs()
-    )
+    (
+        study_id,
+        study_type,
+        gold_nmdc_instrument_mapping_file_url,
+        include_field_site_info,
+    ) = get_gold_study_pipeline_inputs()
 
     projects = gold_projects_by_study(study_id)
     biosamples = gold_biosamples_by_study(study_id)
@@ -144,6 +147,7 @@ def gold_study_to_database():
         biosamples,
         analysis_projects,
         gold_nmdc_instrument_map_df,
+        include_field_site_info,
     )
     database_dict = nmdc_schema_object_to_dict(database)
     filename = nmdc_schema_database_export_filename(study)

diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
@@ -589,18 +589,23 @@ def add_output_run_event(context: OpExecutionContext, outputs: List[str]):
         "study_id": str,
         "study_type": str,
         "gold_nmdc_instrument_mapping_file_url": str,
+        "include_field_site_info": bool,
     },
     out={
         "study_id": Out(str),
         "study_type": Out(str),
         "gold_nmdc_instrument_mapping_file_url": Out(str),
+        "include_field_site_info": Out(bool),
     },
 )
-def get_gold_study_pipeline_inputs(context: OpExecutionContext) -> Tuple[str, str, str]:
+def get_gold_study_pipeline_inputs(
+    context: OpExecutionContext,
+) -> Tuple[str, str, str, bool]:
     return (
         context.op_config["study_id"],
         context.op_config["study_type"],
         context.op_config["gold_nmdc_instrument_mapping_file_url"],
+        context.op_config["include_field_site_info"],
     )
 
 
@@ -643,6 +648,7 @@ def nmdc_schema_database_from_gold_study(
     biosamples: List[Dict[str, Any]],
     analysis_projects: List[Dict[str, Any]],
     gold_nmdc_instrument_map_df: pd.DataFrame,
+    include_field_site_info: bool,
 ) -> nmdc.Database:
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
 
@@ -657,6 +663,7 @@ def id_minter(*args, **kwargs):
         projects,
         analysis_projects,
         gold_nmdc_instrument_map_df,
+        include_field_site_info,
         id_minter=id_minter,
     )
     database = translator.get_database()

diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py
@@ -506,6 +506,7 @@ def biosample_submission_ingest():
                             "study_id": "",
                             "study_type": "research_study",
                             "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
+                            "include_field_site_info": False,
                         },
                     },
                     "export_json_to_drs": {"config": {"username": ""}},

diff --git a/nmdc_runtime/site/translation/gold_translator.py b/nmdc_runtime/site/translation/gold_translator.py
@@ -21,13 +21,15 @@ def __init__(
         projects: List[JSON_OBJECT] = [],
         analysis_projects: List[JSON_OBJECT] = [],
         gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
+        include_field_site_info: bool = False,
         *args,
         **kwargs,
     ) -> None:
         super().__init__(*args, **kwargs)
 
         self.study = study
         self.study_type = nmdc.StudyCategoryEnum(study_type)
+        self.include_field_site_info = include_field_site_info
         # Filter biosamples to only those with `sequencingStrategy` of
         # "Metagenome" or "Metatranscriptome"
         self.biosamples = [
@@ -629,7 +631,11 @@ def _translate_nucleotide_sequencing(
             principal_investigator=self._get_pi(gold_project),
             processing_institution=self._get_processing_institution(gold_project),
             instrument_used=self._get_instrument(gold_project),
-            analyte_category="metagenome",
+            analyte_category=(
+                gold_project.get("sequencingStrategy").lower()
+                if gold_project.get("sequencingStrategy")
+                else None
+            ),
             associated_studies=[nmdc_study_id],
         )
 
@@ -654,21 +660,24 @@ def get_database(self) -> nmdc.Database:
         nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(self.biosamples))
         gold_to_nmdc_biosample_ids = dict(zip(gold_biosample_ids, nmdc_biosample_ids))
 
-        gold_field_site_names = sorted(
-            {self._get_field_site_name(biosample) for biosample in self.biosamples}
-        )
-        nmdc_field_site_ids = self._id_minter(
-            "nmdc:FieldResearchSite", len(gold_field_site_names)
-        )
-        gold_name_to_nmdc_field_site_ids = dict(
-            zip(gold_field_site_names, nmdc_field_site_ids)
-        )
-        gold_biosample_to_nmdc_field_site_ids = {
-            biosample["biosampleGoldId"]: gold_name_to_nmdc_field_site_ids[
-                self._get_field_site_name(biosample)
-            ]
-            for biosample in self.biosamples
-        }
+        if self.include_field_site_info:
+            gold_field_site_names = sorted(
+                {self._get_field_site_name(biosample) for biosample in self.biosamples}
+            )
+            nmdc_field_site_ids = self._id_minter(
+                "nmdc:FieldResearchSite", len(gold_field_site_names)
+            )
+            gold_name_to_nmdc_field_site_ids = dict(
+                zip(gold_field_site_names, nmdc_field_site_ids)
+            )
+            gold_biosample_to_nmdc_field_site_ids = {
+                biosample["biosampleGoldId"]: gold_name_to_nmdc_field_site_ids[
+                    self._get_field_site_name(biosample)
+                ]
+                for biosample in self.biosamples
+            }
+        else:
+            gold_biosample_to_nmdc_field_site_ids = {}
 
         gold_project_ids = [project["projectGoldId"] for project in self.projects]
         nmdc_nucleotide_sequencing_ids = self._id_minter(
@@ -686,16 +695,17 @@ def get_database(self) -> nmdc.Database:
                     biosample["biosampleGoldId"]
                 ],
                 nmdc_study_id=nmdc_study_id,
-                nmdc_field_site_id=gold_biosample_to_nmdc_field_site_ids[
-                    biosample["biosampleGoldId"]
-                ],
+                nmdc_field_site_id=gold_biosample_to_nmdc_field_site_ids.get(
+                    biosample["biosampleGoldId"], None
+                ),
             )
             for biosample in self.biosamples
         ]
-        database.field_research_site_set = [
-            nmdc.FieldResearchSite(id=id, name=name, type="nmdc:FieldResearchSite")
-            for name, id in gold_name_to_nmdc_field_site_ids.items()
-        ]
+        if self.include_field_site_info:
+            database.field_research_site_set = [
+                nmdc.FieldResearchSite(id=id, name=name, type="nmdc:FieldResearchSite")
+                for name, id in gold_name_to_nmdc_field_site_ids.items()
+            ]
         database.data_generation_set = [
             self._translate_nucleotide_sequencing(
                 project,

diff --git a/tests/test_ops/test_gold_api_ops.py b/tests/test_ops/test_gold_api_ops.py
@@ -31,7 +31,8 @@ def op_context(client_config):
         op_config={
             "study_id": "Gs0149396",
             "study_type": "research_study",
-            "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
+            "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
+            "include_field_site_info": False,
         },
     )
 
@@ -55,7 +56,7 @@ def test_gold_biosamples_by_study(client_config, op_context):
             ],
         )
 
-        (study_id, _, _) = get_gold_study_pipeline_inputs(op_context)
+        (study_id, _, _, _) = get_gold_study_pipeline_inputs(op_context)
         gold_biosamples_by_study(op_context, study_id)
 
         assert (
@@ -72,7 +73,7 @@ def test_gold_projects_by_study(client_config, op_context):
             json=[{"projectGoldId": "Gp123456789"}],
         )
 
-        (study_id, _, _) = get_gold_study_pipeline_inputs(op_context)
+        (study_id, _, _, _) = get_gold_study_pipeline_inputs(op_context)
         gold_projects_by_study(op_context, study_id)
 
         assert len(mock.request_history) == 1
@@ -87,7 +88,7 @@ def test_gold_analysis_projects_by_study(client_config, op_context):
             json=[{"apGoldId": "Ga0499994"}],
         )
 
-        (study_id, _, _) = get_gold_study_pipeline_inputs(op_context)
+        (study_id, _, _, _) = get_gold_study_pipeline_inputs(op_context)
         gold_analysis_projects_by_study(op_context, study_id)
 
         assert len(mock.request_history) == 1
@@ -101,7 +102,7 @@ def test_gold_study(client_config, op_context):
             f'{client_config["base_url"]}/studies', json=[{"studyGoldId": "Gs0149396"}]
         )
 
-        (study_id, _, _) = get_gold_study_pipeline_inputs(op_context)
+        (study_id, _, _, _) = get_gold_study_pipeline_inputs(op_context)
         gold_study(op_context, study_id)
 
         assert len(mock.request_history) == 1