From ce83e35f72cf7f0b997c554fd4830f047e4df405 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 4 Dec 2024 13:37:58 +0100 Subject: [PATCH 1/4] Add NLM Catalog --- src/pyobo/sources/__init__.py | 2 + src/pyobo/sources/nlm_catalog.py | 81 ++++++++++++++++++++++++++++++++ src/pyobo/struct/struct.py | 4 ++ 3 files changed, 87 insertions(+) create mode 100644 src/pyobo/sources/nlm_catalog.py diff --git a/src/pyobo/sources/__init__.py b/src/pyobo/sources/__init__.py index 22908b40..082dd024 100644 --- a/src/pyobo/sources/__init__.py +++ b/src/pyobo/sources/__init__.py @@ -39,6 +39,7 @@ from .mirbase_mature import MiRBaseMatureGetter from .msigdb import MSigDBGetter from .ncbigene import NCBIGeneGetter +from .nlm_catalog import NLMCatalogGetter from .npass import NPASSGetter from .omim_ps import OMIMPSGetter from .pathbank import PathBankGetter @@ -101,6 +102,7 @@ "MiRBaseGetter", "MiRBaseMatureGetter", "NCBIGeneGetter", + "NLMCatalogGetter", "NPASSGetter", "OMIMPSGetter", "PIDGetter", diff --git a/src/pyobo/sources/nlm_catalog.py b/src/pyobo/sources/nlm_catalog.py new file mode 100644 index 00000000..16b4f0d8 --- /dev/null +++ b/src/pyobo/sources/nlm_catalog.py @@ -0,0 +1,81 @@ +"""Converter for NLM Providers.""" + +from collections.abc import Iterable +from xml.etree import ElementTree + +from pyobo.struct import Obo, Reference, Term, TypeDef, default_reference +from pyobo.utils.path import ensure_df, ensure_path + +__all__ = [ + "NLMCatalogGetter", +] + +PREFIX = "nlm" +CATALOG_TO_PUBLISHER = "https://ftp.ncbi.nlm.nih.gov/pubmed/xmlprovidernames.txt" +JOURNAL_INFO_PATH = "https://ftp.ncbi.nlm.nih.gov/pubmed/jourcache.xml" +PUBLISHER = TypeDef.default(PREFIX, "has_publisher", name="has publisher") +START_YEAR = TypeDef.default(PREFIX, "has_start_year", name="has start year") +END_YEAR = TypeDef.default(PREFIX, "has_end_year", name="has end year") + + +# TODO enrich with context from https://ftp.ncbi.nlm.nih.gov/pubmed/J_Entrez.txt and 
https://ftp.ncbi.nlm.nih.gov/pubmed/J_Medline.txt + + +class NLMCatalogGetter(Obo): + """An ontology representation of NLM Providers.""" + + bioversions_key = ontology = PREFIX + dynamic_version = True + typedefs = [PUBLISHER, START_YEAR, END_YEAR] + idspaces = { + PREFIX: "https://www.ncbi.nlm.nih.gov/nlmcatalog/", + } + + def iter_terms(self, force: bool = False) -> Iterable[Term]: + """Iterate over terms for NLM Catalog.""" + yield from get_terms() + + +def get_terms(force: bool = False) -> Iterable[Term]: + """Get NLM Catalog terms.""" + path = ensure_path(PREFIX, url=JOURNAL_INFO_PATH) + root = ElementTree.parse(path).getroot() + + journal_to_publisher_df = ensure_df( + PREFIX, url=CATALOG_TO_PUBLISHER, sep="|", force=force, dtype=str + ) + journal_id_to_publisher_key = { + # TODO change to external prefix later + journal_id: default_reference(PREFIX, key, name) + for journal_id, key, name in journal_to_publisher_df.values + } + for element in root.findall("Journal"): + term = _process_journal(element) + if pr := journal_id_to_publisher_key.get(term.identifier): + term.annotate_object(PUBLISHER, pr) + yield term + for k in sorted(set(journal_id_to_publisher_key.values())): + yield Term(reference=k) + + +def _process_journal(element) -> Term: + nlm_id = element.findtext("NlmUniqueID") + name = element.findtext("Name") + issns = [(issn.text, issn.attrib["type"]) for issn in element.findall("Issn")] + term = Term( + reference=Reference(prefix=PREFIX, identifier=nlm_id, name=name), + is_obsolete=element.findtext("ActivityFlag") == "0", + ) + for synonym in element.findall("Alias"): + term.append_synonym(synonym.text) + for issn, _issn_type in issns: + term.append_xref(Reference(prefix="issn", identifier=issn)) + if start_year := element.findtext("StartYear"): + term.annotate_integer(START_YEAR, start_year) + if end_year := element.findtext("EndYear"): + term.annotate_integer(END_YEAR, end_year) + return term + + +if __name__ == "__main__": + 
NLMCatalogGetter().cli() diff --git a/src/pyobo/struct/struct.py b/src/pyobo/struct/struct.py index a2490534..6a0032a1 100644 --- a/src/pyobo/struct/struct.py +++ b/src/pyobo/struct/struct.py @@ -494,6 +494,10 @@ def annotate_boolean(self, prop: ReferenceHint, value: bool) -> Self: prop, str(value).lower(), Reference(prefix="xsd", identifier="boolean") ) + def annotate_integer(self, prop: ReferenceHint, value: str) -> Self: + """Append an integer annotation.""" + return self.annotate_literal(prop, value, Reference(prefix="xsd", identifier="integer")) + def _definition_fp(self) -> str: definition = obo_escape_slim(self.definition) if self.definition else "" return f'"{definition}" [{comma_separate_references(self.provenance)}]' From 59fed2f625b19edd32003526832da58cb7341fa5 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 4 Dec 2024 13:41:05 +0100 Subject: [PATCH 2/4] Update nlm_catalog.py --- src/pyobo/sources/nlm_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pyobo/sources/nlm_catalog.py b/src/pyobo/sources/nlm_catalog.py index 16b4f0d8..e4b556f4 100644 --- a/src/pyobo/sources/nlm_catalog.py +++ b/src/pyobo/sources/nlm_catalog.py @@ -62,9 +62,9 @@ def _process_journal(element) -> Term: nlm_id = element.findtext("NlmUniqueID") name = element.findtext("Name") issns = [(issn.text, issn.attrib["type"]) for issn in element.findall("Issn")] + # ActivityFlag is either "0" or "1" term = Term( reference=Reference(prefix=PREFIX, identifier=nlm_id, name=name), - is_obsolete=element.findtext("ActivityFlag") == "0", ) for synonym in element.findall("Alias"): term.append_synonym(synonym.text) From 49140a7d9ae1ac45da4e6d36aeb70cfe045c8dbb Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 4 Dec 2024 13:51:37 +0100 Subject: [PATCH 3/4] Update --- src/pyobo/sources/nlm_catalog.py | 2 ++ src/pyobo/struct/struct.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/pyobo/sources/nlm_catalog.py 
b/src/pyobo/sources/nlm_catalog.py index e4b556f4..150e7eb5 100644 --- a/src/pyobo/sources/nlm_catalog.py +++ b/src/pyobo/sources/nlm_catalog.py @@ -69,6 +69,8 @@ def _process_journal(element) -> Term: for synonym in element.findall("Alias"): term.append_synonym(synonym.text) for issn, _issn_type in issns: + # TODO include ISSN type, this is important + # to determine a "canonical" one term.append_xref(Reference(prefix="issn", identifier=issn)) if start_year := element.findtext("StartYear"): term.annotate_integer(START_YEAR, start_year) diff --git a/src/pyobo/struct/struct.py b/src/pyobo/struct/struct.py index 6a0032a1..c6f1aa18 100644 --- a/src/pyobo/struct/struct.py +++ b/src/pyobo/struct/struct.py @@ -574,7 +574,7 @@ def _emit_relations( for typedef, reference in self.iterate_relations(): _typedef_warn(prefix=ontology_prefix, predicate=typedef, typedefs=typedefs) predicate_reference = self._reference(typedef, ontology_prefix) - s = f"relationship: {predicate_reference} {reference.preferred_curie}" + s = f"relationship: {predicate_reference} {self._reference(reference, ontology_prefix)}" if typedef.name or reference.name: s += " !" if typedef.name: @@ -596,7 +596,7 @@ def _emit_object_properties( _typedef_warn(prefix=ontology_prefix, predicate=predicate, typedefs=typedefs) predicate_curie = self._reference(predicate, ontology_prefix) for value in sorted(values): - yv = f"{predicate_curie} {value.preferred_curie}" + yv = f"{predicate_curie} {self._reference(value, ontology_prefix)}" if predicate.name and value.name: yv += f" ! 
{predicate.name} {value.name}" yield yv From 3aeee1f471988364c9ae973e514dfa059ac3eeed Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 4 Dec 2024 13:54:41 +0100 Subject: [PATCH 4/4] Update nlm_catalog.py --- src/pyobo/sources/nlm_catalog.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/pyobo/sources/nlm_catalog.py b/src/pyobo/sources/nlm_catalog.py index 150e7eb5..8a6b7f3b 100644 --- a/src/pyobo/sources/nlm_catalog.py +++ b/src/pyobo/sources/nlm_catalog.py @@ -44,21 +44,18 @@ def get_terms(force: bool = False) -> Iterable[Term]: journal_to_publisher_df = ensure_df( PREFIX, url=CATALOG_TO_PUBLISHER, sep="|", force=force, dtype=str ) - journal_id_to_publisher_key = { + journal_id_to_publisher_key: dict[str, Reference] = { # TODO change to external prefix later journal_id: default_reference(PREFIX, key, name) for journal_id, key, name in journal_to_publisher_df.values } for element in root.findall("Journal"): - term = _process_journal(element) - if pr := journal_id_to_publisher_key.get(term.identifier): - term.annotate_object(PUBLISHER, pr) - yield term + yield _process_journal(element, journal_id_to_publisher_key) for k in sorted(set(journal_id_to_publisher_key.values())): yield Term(reference=k) -def _process_journal(element) -> Term: +def _process_journal(element, journal_id_to_publisher_key: dict[str, Reference]) -> Term: nlm_id = element.findtext("NlmUniqueID") name = element.findtext("Name") issns = [(issn.text, issn.attrib["type"]) for issn in element.findall("Issn")] @@ -76,6 +73,8 @@ def _process_journal(element) -> Term: term.annotate_integer(START_YEAR, start_year) if end_year := element.findtext("EndYear"): term.annotate_integer(END_YEAR, end_year) + if publisher_reference := journal_id_to_publisher_key.get(term.identifier): + term.annotate_object(PUBLISHER, publisher_reference) return term