Skip to content

Commit

Permalink
Merge pull request #833 from microbiomedata/remove-site-info-gold-translator
Browse files Browse the repository at this point in the history

Add option to GOLD translator to ignore field research sites
  • Loading branch information
sujaypatil96 authored Dec 13, 2024
2 parents fa94fd5 + 4004ca6 commit 22a9a58
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 32 deletions.
10 changes: 7 additions & 3 deletions nmdc_runtime/site/graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,12 @@ def apply_metadata_in():

@graph
def gold_study_to_database():
(study_id, study_type, gold_nmdc_instrument_mapping_file_url) = (
get_gold_study_pipeline_inputs()
)
(
study_id,
study_type,
gold_nmdc_instrument_mapping_file_url,
include_field_site_info,
) = get_gold_study_pipeline_inputs()

projects = gold_projects_by_study(study_id)
biosamples = gold_biosamples_by_study(study_id)
Expand All @@ -144,6 +147,7 @@ def gold_study_to_database():
biosamples,
analysis_projects,
gold_nmdc_instrument_map_df,
include_field_site_info,
)
database_dict = nmdc_schema_object_to_dict(database)
filename = nmdc_schema_database_export_filename(study)
Expand Down
9 changes: 8 additions & 1 deletion nmdc_runtime/site/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,18 +589,23 @@ def add_output_run_event(context: OpExecutionContext, outputs: List[str]):
"study_id": str,
"study_type": str,
"gold_nmdc_instrument_mapping_file_url": str,
"include_field_site_info": bool,
},
out={
"study_id": Out(str),
"study_type": Out(str),
"gold_nmdc_instrument_mapping_file_url": Out(str),
"include_field_site_info": Out(bool),
},
)
def get_gold_study_pipeline_inputs(context: OpExecutionContext) -> Tuple[str, str, str]:
def get_gold_study_pipeline_inputs(
context: OpExecutionContext,
) -> Tuple[str, str, str, bool]:
return (
context.op_config["study_id"],
context.op_config["study_type"],
context.op_config["gold_nmdc_instrument_mapping_file_url"],
context.op_config["include_field_site_info"],
)


Expand Down Expand Up @@ -643,6 +648,7 @@ def nmdc_schema_database_from_gold_study(
biosamples: List[Dict[str, Any]],
analysis_projects: List[Dict[str, Any]],
gold_nmdc_instrument_map_df: pd.DataFrame,
include_field_site_info: bool,
) -> nmdc.Database:
client: RuntimeApiSiteClient = context.resources.runtime_api_site_client

Expand All @@ -657,6 +663,7 @@ def id_minter(*args, **kwargs):
projects,
analysis_projects,
gold_nmdc_instrument_map_df,
include_field_site_info,
id_minter=id_minter,
)
database = translator.get_database()
Expand Down
1 change: 1 addition & 0 deletions nmdc_runtime/site/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,7 @@ def biosample_submission_ingest():
"study_id": "",
"study_type": "research_study",
"gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
"include_field_site_info": False,
},
},
"export_json_to_drs": {"config": {"username": ""}},
Expand Down
56 changes: 33 additions & 23 deletions nmdc_runtime/site/translation/gold_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,15 @@ def __init__(
projects: List[JSON_OBJECT] = [],
analysis_projects: List[JSON_OBJECT] = [],
gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
include_field_site_info: bool = False,
*args,
**kwargs,
) -> None:
super().__init__(*args, **kwargs)

self.study = study
self.study_type = nmdc.StudyCategoryEnum(study_type)
self.include_field_site_info = include_field_site_info
# Filter biosamples to only those with `sequencingStrategy` of
# "Metagenome" or "Metatranscriptome"
self.biosamples = [
Expand Down Expand Up @@ -629,7 +631,11 @@ def _translate_nucleotide_sequencing(
principal_investigator=self._get_pi(gold_project),
processing_institution=self._get_processing_institution(gold_project),
instrument_used=self._get_instrument(gold_project),
analyte_category="metagenome",
analyte_category=(
gold_project.get("sequencingStrategy").lower()
if gold_project.get("sequencingStrategy")
else None
),
associated_studies=[nmdc_study_id],
)

Expand All @@ -654,21 +660,24 @@ def get_database(self) -> nmdc.Database:
nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(self.biosamples))
gold_to_nmdc_biosample_ids = dict(zip(gold_biosample_ids, nmdc_biosample_ids))

gold_field_site_names = sorted(
{self._get_field_site_name(biosample) for biosample in self.biosamples}
)
nmdc_field_site_ids = self._id_minter(
"nmdc:FieldResearchSite", len(gold_field_site_names)
)
gold_name_to_nmdc_field_site_ids = dict(
zip(gold_field_site_names, nmdc_field_site_ids)
)
gold_biosample_to_nmdc_field_site_ids = {
biosample["biosampleGoldId"]: gold_name_to_nmdc_field_site_ids[
self._get_field_site_name(biosample)
]
for biosample in self.biosamples
}
if self.include_field_site_info:
gold_field_site_names = sorted(
{self._get_field_site_name(biosample) for biosample in self.biosamples}
)
nmdc_field_site_ids = self._id_minter(
"nmdc:FieldResearchSite", len(gold_field_site_names)
)
gold_name_to_nmdc_field_site_ids = dict(
zip(gold_field_site_names, nmdc_field_site_ids)
)
gold_biosample_to_nmdc_field_site_ids = {
biosample["biosampleGoldId"]: gold_name_to_nmdc_field_site_ids[
self._get_field_site_name(biosample)
]
for biosample in self.biosamples
}
else:
gold_biosample_to_nmdc_field_site_ids = {}

gold_project_ids = [project["projectGoldId"] for project in self.projects]
nmdc_nucleotide_sequencing_ids = self._id_minter(
Expand All @@ -686,16 +695,17 @@ def get_database(self) -> nmdc.Database:
biosample["biosampleGoldId"]
],
nmdc_study_id=nmdc_study_id,
nmdc_field_site_id=gold_biosample_to_nmdc_field_site_ids[
biosample["biosampleGoldId"]
],
nmdc_field_site_id=gold_biosample_to_nmdc_field_site_ids.get(
biosample["biosampleGoldId"], None
),
)
for biosample in self.biosamples
]
database.field_research_site_set = [
nmdc.FieldResearchSite(id=id, name=name, type="nmdc:FieldResearchSite")
for name, id in gold_name_to_nmdc_field_site_ids.items()
]
if self.include_field_site_info:
database.field_research_site_set = [
nmdc.FieldResearchSite(id=id, name=name, type="nmdc:FieldResearchSite")
for name, id in gold_name_to_nmdc_field_site_ids.items()
]
database.data_generation_set = [
self._translate_nucleotide_sequencing(
project,
Expand Down
11 changes: 6 additions & 5 deletions tests/test_ops/test_gold_api_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ def op_context(client_config):
op_config={
"study_id": "Gs0149396",
"study_type": "research_study",
"gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
"gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
"include_field_site_info": False,
},
)

Expand All @@ -55,7 +56,7 @@ def test_gold_biosamples_by_study(client_config, op_context):
],
)

(study_id, _, _) = get_gold_study_pipeline_inputs(op_context)
(study_id, _, _, _) = get_gold_study_pipeline_inputs(op_context)
gold_biosamples_by_study(op_context, study_id)

assert (
Expand All @@ -72,7 +73,7 @@ def test_gold_projects_by_study(client_config, op_context):
json=[{"projectGoldId": "Gp123456789"}],
)

(study_id, _, _) = get_gold_study_pipeline_inputs(op_context)
(study_id, _, _, _) = get_gold_study_pipeline_inputs(op_context)
gold_projects_by_study(op_context, study_id)

assert len(mock.request_history) == 1
Expand All @@ -87,7 +88,7 @@ def test_gold_analysis_projects_by_study(client_config, op_context):
json=[{"apGoldId": "Ga0499994"}],
)

(study_id, _, _) = get_gold_study_pipeline_inputs(op_context)
(study_id, _, _, _) = get_gold_study_pipeline_inputs(op_context)
gold_analysis_projects_by_study(op_context, study_id)

assert len(mock.request_history) == 1
Expand All @@ -101,7 +102,7 @@ def test_gold_study(client_config, op_context):
f'{client_config["base_url"]}/studies', json=[{"studyGoldId": "Gs0149396"}]
)

(study_id, _, _) = get_gold_study_pipeline_inputs(op_context)
(study_id, _, _, _) = get_gold_study_pipeline_inputs(op_context)
gold_study(op_context, study_id)

assert len(mock.request_history) == 1
Expand Down

0 comments on commit 22a9a58

Please sign in to comment.