Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add NLM Catalog and fix default prefix on object properties and annotation properties #263

Merged
merged 4 commits into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/pyobo/sources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from .mirbase_mature import MiRBaseMatureGetter
from .msigdb import MSigDBGetter
from .ncbigene import NCBIGeneGetter
from .nlm_catalog import NLMCatalogGetter
from .npass import NPASSGetter
from .omim_ps import OMIMPSGetter
from .pathbank import PathBankGetter
Expand Down Expand Up @@ -101,6 +102,7 @@
"MiRBaseGetter",
"MiRBaseMatureGetter",
"NCBIGeneGetter",
"NLMCatalogGetter",
"NPASSGetter",
"OMIMPSGetter",
"PIDGetter",
Expand Down
82 changes: 82 additions & 0 deletions src/pyobo/sources/nlm_catalog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""Converter for NLM Providers."""

from collections.abc import Iterable
from xml.etree import ElementTree

from pyobo.struct import Obo, Reference, Term, TypeDef, default_reference
from pyobo.utils.path import ensure_df, ensure_path

__all__ = [
"NLMCatalogGetter",
]

# Prefix used for the ontology itself and for the journal term references
PREFIX = "nlm"
# Pipe-delimited table, read below as three columns:
# journal identifier, publisher key, publisher name
CATALOG_TO_PUBLISHER = "https://ftp.ncbi.nlm.nih.gov/pubmed/xmlprovidernames.txt"
# XML file whose ``Journal`` elements are each converted into a term
JOURNAL_INFO_PATH = "https://ftp.ncbi.nlm.nih.gov/pubmed/jourcache.xml"
# Properties created with ``TypeDef.default`` under this prefix and
# registered on the ontology class via ``typedefs``
PUBLISHER = TypeDef.default(PREFIX, "has_publisher", name="has publisher")
START_YEAR = TypeDef.default(PREFIX, "has_start_year", name="has start year")
END_YEAR = TypeDef.default(PREFIX, "has_end_year", name="has end year")


# TODO enrich with context from https://ftp.ncbi.nlm.nih.gov/pubmed/J_Entrez.txt and https://ftp.ncbi.nlm.nih.gov/pubmed/J_Medline.txt


class NLMCatalogGetter(Obo):
    """An ontology representation of the journals and publishers in the NLM Catalog."""

    bioversions_key = ontology = PREFIX
    dynamic_version = True
    typedefs = [PUBLISHER, START_YEAR, END_YEAR]
    idspaces = {
        PREFIX: "https://www.ncbi.nlm.nih.gov/nlmcatalog/",
    }

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over journal and publisher terms for NLM Catalog.

        :param force: Forwarded to :func:`get_terms`, which hands it to the
            helpers that fetch the source files.
        :yields: The terms produced by :func:`get_terms`.
        """
        # Forward ``force`` so the caller's request is not silently dropped
        yield from get_terms(force=force)


def get_terms(force: bool = False) -> Iterable[Term]:
    """Get NLM Catalog terms.

    :param force: Forwarded to both ``ensure_path`` (journal XML) and
        ``ensure_df`` (journal-to-publisher table) so the two source files
        are handled consistently.
    :yields: One term per ``Journal`` element in the XML, followed by one
        term per distinct publisher reference, in sorted order.
    """
    # ``force`` must reach this call too, otherwise only the publisher table
    # honors the flag and the two files can get out of sync
    path = ensure_path(PREFIX, url=JOURNAL_INFO_PATH, force=force)
    root = ElementTree.parse(path).getroot()

    journal_to_publisher_df = ensure_df(
        PREFIX, url=CATALOG_TO_PUBLISHER, sep="|", force=force, dtype=str
    )
    # Maps each journal identifier to a reference for its publisher
    journal_id_to_publisher_key: dict[str, Reference] = {
        # TODO change to external prefix later
        journal_id: default_reference(PREFIX, key, name)
        for journal_id, key, name in journal_to_publisher_df.values
    }
    for element in root.findall("Journal"):
        yield _process_journal(element, journal_id_to_publisher_key)
    # Several journals can share a publisher, so de-duplicate before emitting
    for k in sorted(set(journal_id_to_publisher_key.values())):
        yield Term(reference=k)


def _process_journal(element, journal_id_to_publisher_key: dict[str, Reference]) -> Term:
    """Convert a single ``Journal`` XML element into a term.

    :param element: A ``Journal`` element; its ``NlmUniqueID``, ``Name``,
        ``Alias``, ``Issn``, ``StartYear``, and ``EndYear`` children are read.
    :param journal_id_to_publisher_key: Mapping from journal identifier to the
        reference of its publisher.
    :returns: A term with synonyms, ISSN cross-references, start/end year
        annotations, and a publisher annotation where the data is available.
    """
    nlm_id = element.findtext("NlmUniqueID")
    name = element.findtext("Name")
    # ActivityFlag is either "0" or "1"
    term = Term(
        reference=Reference(prefix=PREFIX, identifier=nlm_id, name=name),
    )
    for alias in element.findall("Alias"):
        # An empty element has ``text`` set to None, which is not a usable synonym
        if alias.text:
            term.append_synonym(alias.text)
    for issn in element.findall("Issn"):
        # TODO include ISSN type, this is important
        #  to determine a "canonical" one
        # The ``type`` attribute is deliberately not read until it is used,
        # so an element lacking it does not abort the whole conversion
        if issn.text:
            term.append_xref(Reference(prefix="issn", identifier=issn.text))
    if start_year := element.findtext("StartYear"):
        term.annotate_integer(START_YEAR, start_year)
    if end_year := element.findtext("EndYear"):
        term.annotate_integer(END_YEAR, end_year)
    if publisher_reference := journal_id_to_publisher_key.get(term.identifier):
        term.annotate_object(PUBLISHER, publisher_reference)
    return term


# Run the getter's ``cli()`` entry point when this module is executed as a script
if __name__ == "__main__":
    NLMCatalogGetter().cli()
8 changes: 6 additions & 2 deletions src/pyobo/struct/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,10 @@ def annotate_boolean(self, prop: ReferenceHint, value: bool) -> Self:
prop, str(value).lower(), Reference(prefix="xsd", identifier="boolean")
)

def annotate_integer(self, prop: ReferenceHint, value: int | str) -> Self:
    """Append an integer literal annotation typed as ``xsd:integer``.

    :param prop: The property under which the value is recorded.
    :param value: The integer, or its string form (e.g., as read from a
        text or XML source); it is converted with :func:`str` before being
        stored.
    :returns: Whatever ``annotate_literal`` returns, to allow chaining.
    """
    return self.annotate_literal(
        prop, str(value), Reference(prefix="xsd", identifier="integer")
    )

def _definition_fp(self) -> str:
    """Format the definition as a quoted string followed by its bracketed provenance."""
    if self.definition:
        escaped = obo_escape_slim(self.definition)
    else:
        # A missing or empty definition is rendered as an empty quoted string
        escaped = ""
    provenance = comma_separate_references(self.provenance)
    return f'"{escaped}" [{provenance}]'
Expand Down Expand Up @@ -570,7 +574,7 @@ def _emit_relations(
for typedef, reference in self.iterate_relations():
_typedef_warn(prefix=ontology_prefix, predicate=typedef, typedefs=typedefs)
predicate_reference = self._reference(typedef, ontology_prefix)
s = f"relationship: {predicate_reference} {reference.preferred_curie}"
s = f"relationship: {predicate_reference} {self._reference(reference, ontology_prefix)}"
if typedef.name or reference.name:
s += " !"
if typedef.name:
Expand All @@ -592,7 +596,7 @@ def _emit_object_properties(
_typedef_warn(prefix=ontology_prefix, predicate=predicate, typedefs=typedefs)
predicate_curie = self._reference(predicate, ontology_prefix)
for value in sorted(values):
yv = f"{predicate_curie} {value.preferred_curie}"
yv = f"{predicate_curie} {self._reference(value, ontology_prefix)}"
if predicate.name and value.name:
yv += f" ! {predicate.name} {value.name}"
yield yv
Expand Down
Loading