Skip to content

Commit

Permalink
Merge pull request #773 from microbiomedata/fix-gold-jgi-ids-submissi…
Browse files Browse the repository at this point in the history
…on-translator

Fix handling of GOLD and JGI IDs in submission data translator
  • Loading branch information
pkalita-lbl authored Nov 15, 2024
2 parents c4c4a8d + 7c8719f commit a6012f4
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 14 deletions.
16 changes: 11 additions & 5 deletions nmdc_runtime/site/translation/gold_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,9 @@ def _get_insdc_biosample_identifiers(self, gold_biosample_id: str) -> List[str]:
for id in self._project_ids_by_biosample_id[gold_biosample_id]
)
return [
self._get_curie("biosample", project["ncbiBioSampleAccession"])
self._ensure_curie(
project["ncbiBioSampleAccession"], default_prefix="biosample"
)
for project in biosample_projects
if project["ncbiBioSampleAccession"]
]
Expand Down Expand Up @@ -471,7 +473,9 @@ def _translate_study(
"""
return nmdc.Study(
description=gold_study.get("description"),
gold_study_identifiers=self._get_curie("gold", gold_study["studyGoldId"]),
gold_study_identifiers=self._ensure_curie(
gold_study["studyGoldId"], default_prefix="gold"
),
id=nmdc_study_id,
name=gold_study.get("studyName"),
principal_investigator=self._get_pi(gold_study),
Expand Down Expand Up @@ -522,7 +526,9 @@ def _translate_biosample(
env_local_scale=self._get_env_term_value(gold_biosample, "envoLocalScale"),
env_medium=self._get_env_term_value(gold_biosample, "envoMedium"),
geo_loc_name=self._get_text_value(gold_biosample, "geoLocation"),
gold_biosample_identifiers=self._get_curie("gold", gold_biosample_id),
gold_biosample_identifiers=self._ensure_curie(
gold_biosample_id, default_prefix="gold"
),
habitat=gold_biosample.get("habitat"),
host_name=gold_biosample.get("hostName"),
host_taxid=self._get_host_taxid(gold_biosample),
Expand Down Expand Up @@ -579,8 +585,8 @@ def _translate_nucleotide_sequencing(
return nmdc.NucleotideSequencing(
id=nmdc_nucleotide_sequencing_id,
name=gold_project.get("projectName"),
gold_sequencing_project_identifiers=self._get_curie(
"gold", gold_project_id
gold_sequencing_project_identifiers=self._ensure_curie(
gold_project_id, default_prefix="gold"
),
ncbi_project_name=gold_project.get("projectName"),
type="nmdc:NucleotideSequencing",
Expand Down
22 changes: 18 additions & 4 deletions nmdc_runtime/site/translation/submission_portal_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,21 @@ def _get_gold_study_identifiers(
if not gold_study_id:
return None

return [self._get_curie("GOLD", gold_study_id)]
return [self._ensure_curie(gold_study_id, default_prefix="gold")]

def _get_jgi_study_identifiers(
    self, metadata_submission: JSON_OBJECT
) -> Union[List[str], None]:
    """Build the JGI proposal identifier list from the multi-omics form data.

    Looks up ``multiOmicsForm.JGIStudyId`` in the submission and passes it
    through ``_ensure_curie`` with ``jgi.proposal`` as the default prefix.

    :param metadata_submission: submission portal entry
    :return: single-element list holding the JGI proposal CURIE, or None
        when the looked-up JGI study ID is falsy (e.g. an empty string)
    """
    proposal_id = get_in(["multiOmicsForm", "JGIStudyId"], metadata_submission)
    if proposal_id:
        return [self._ensure_curie(proposal_id, default_prefix="jgi.proposal")]

    return None

def _get_quantity_value(
self, raw_value: Optional[str], unit: Optional[str] = None
Expand Down Expand Up @@ -410,9 +424,6 @@ def _translate_study(
:return: nmdc:Study object
"""
return nmdc.Study(
alternative_identifiers=self._get_from(
metadata_submission, ["multiOmicsForm", "JGIStudyId"]
),
alternative_names=self._get_from(
metadata_submission, ["multiOmicsForm", "alternativeNames"]
),
Expand All @@ -436,6 +447,9 @@ def _translate_study(
insdc_bioproject_identifiers=self._get_from(
metadata_submission, ["multiOmicsForm", "NCBIBioProjectId"]
),
jgi_portal_study_identifiers=self._get_jgi_study_identifiers(
metadata_submission
),
name=self._get_from(metadata_submission, ["studyForm", "studyName"]),
notes=self._get_from(metadata_submission, ["studyForm", "notes"]),
principal_investigator=self._get_pi(metadata_submission),
Expand Down
11 changes: 9 additions & 2 deletions nmdc_runtime/site/translation/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,15 @@ def __init__(
def _index_by_id(self, collection, id):
return {item[id]: item for item in collection}

def _get_curie(self, prefix: str, local: str) -> str:
return f"{prefix}:{local}"
@staticmethod
def _ensure_curie(identifier: str, *, default_prefix: str) -> str:
identifier_parts = identifier.split(":", 1)

# Don't add prefix if identifier is already a CURIE
if len(identifier_parts) == 2:
return identifier

return f"{default_prefix}:{identifier_parts[0]}"

@abstractmethod
def get_database(self) -> nmdc.Database:
Expand Down
2 changes: 1 addition & 1 deletion tests/test_data/test_submission_portal_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def test_get_gold_study_identifiers():
)
assert gold_ids is not None
assert len(gold_ids) == 1
assert gold_ids[0] == "GOLD:Gs000000"
assert gold_ids[0] == "gold:Gs000000"

gold_ids = translator._get_gold_study_identifiers(
{"multiOmicsForm": {"GOLDStudyId": ""}}
Expand Down
8 changes: 6 additions & 2 deletions tests/test_data/test_submission_portal_translator_data.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@ input:
multiOmicsForm:
alternativeNames: []
studyNumber: ''
GOLDStudyId: ''
JGIStudyId: ''
GOLDStudyId: 'Gs0123456'
JGIStudyId: '123456'
NCBIBioProjectId: ''
omicsProcessingTypes:
- mg
Expand Down Expand Up @@ -991,6 +991,10 @@ output:
- Principal Investigator
- Funding acquisition
type: nmdc:CreditAssociation
gold_study_identifiers:
- gold:Gs0123456
jgi_portal_study_identifiers:
- jgi.proposal:123456
---
input:
metadata_submission:
Expand Down
16 changes: 16 additions & 0 deletions tests/test_data/test_translator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from nmdc_schema import nmdc

from nmdc_runtime.site.translation.translator import Translator


class TestTranslator(Translator):
    """Minimal concrete ``Translator`` used to exercise its shared helpers.

    ``Translator.get_database`` is abstract, so a subclass is needed before
    the helpers can be reached through a concrete class in tests.
    """

    # The ``Test`` name prefix matches pytest's default class-collection
    # pattern, and this class inherits a constructor from ``Translator``.
    # Opt out of collection explicitly so pytest does not emit a
    # PytestCollectionWarning for a class that is only a test helper.
    __test__ = False

    def get_database(self) -> nmdc.Database:
        # Intentionally a no-op: only present to satisfy the abstract method.
        pass


def test_ensure_curie():
    """Check ``_ensure_curie`` with ``nmdc`` as the default prefix.

    Covers an identifier already carrying the default prefix, a bare
    identifier that needs the prefix added, and an identifier carrying a
    different prefix that must be left alone.
    """
    cases = [
        ("nmdc:bsm-11-z8x8p723", "nmdc:bsm-11-z8x8p723"),
        ("bsm-11-z8x8p723", "nmdc:bsm-11-z8x8p723"),
        ("gold:Gb0123456", "gold:Gb0123456"),
    ]
    for identifier, expected in cases:
        actual = TestTranslator._ensure_curie(identifier, default_prefix="nmdc")
        assert actual == expected

0 comments on commit a6012f4

Please sign in to comment.