Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve URI parsing #242

Merged
merged 7 commits into from
Nov 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/pyobo/getters.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
IterHelperHelperDict,
SlimGetOntologyKwargs,
)
from .identifier_utils import MissingPrefixError, wrap_norm_prefix
from .identifier_utils import ParseError, wrap_norm_prefix
from .plugins import has_nomenclature_plugin, run_nomenclature_plugin
from .struct import Obo
from .utils.io import get_writer
Expand Down Expand Up @@ -360,8 +360,8 @@ def iter_helper_helper(
logger.warning("[%s] unable to download - %s", prefix, e.reason)
if strict and not bioregistry.is_deprecated(prefix):
raise
except MissingPrefixError as e:
logger.warning("[%s] missing prefix: %s", prefix, e)
except ParseError as e:
logger.warning("[%s] CURIE/IRI parse error: %s", prefix, e)
if strict and not bioregistry.is_deprecated(prefix):
raise e
except RuntimeError as e:
Expand Down
47 changes: 39 additions & 8 deletions src/pyobo/identifier_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import logging
from functools import wraps
from typing import ClassVar

import bioregistry
from curies import Reference, ReferenceTuple
Expand All @@ -25,16 +26,18 @@
logger = logging.getLogger(__name__)


class MissingPrefixError(ValueError):
class ParseError(ValueError):
"""Raised on a missing prefix."""

text: ClassVar[str]

def __init__(
self,
*,
curie: str,
ontology_prefix: str | None = None,
node: Reference | None = None,
):
) -> None:
"""Initialize the error."""
self.curie = curie
self.ontology_prefix = ontology_prefix
Expand All @@ -44,12 +47,24 @@ def __str__(self) -> str:
s = ""
if self.ontology_prefix:
s += f"[{self.ontology_prefix}] "
s += f"CURIE contains unhandled prefix: `{self.curie}`"
s += f"{self.text}: `{self.curie}`"
if self.node is not None:
s += f" from {self.node.curie}"
return s


class MissingPrefixError(ParseError):
"""Raised when a CURIE has a prefix that is not handled."""

# Message fragment that the parent's ``__str__`` renders before the offending CURIE
text = "CURIE contains unhandled prefix"


class UnparsableIRIError(ParseError):
"""Raised on an unparsable IRI."""

# Message fragment that the parent's ``__str__`` renders before the offending IRI
text = "IRI could not be parsed"


BAD_CURIES = set()


Expand All @@ -70,18 +85,26 @@ def normalize_curie(
- Normalizes the namespace
- Checks against a blacklist for the entire curie, for the namespace, and for suffixes.
"""
# Remap the curie with the full list
curie = remap_full(curie)

# Remap node's prefix (if necessary)
curie = remap_prefix(curie, ontology_prefix=ontology_prefix)

if curie_is_blacklisted(curie):
return None, None
if curie_has_blacklisted_prefix(curie):
return None, None
if curie_has_blacklisted_suffix(curie):
return None, None

# Remap the curie with the full list
curie = remap_full(curie)

# Remap node's prefix (if necessary)
curie = remap_prefix(curie, ontology_prefix=ontology_prefix)
if curie.startswith("http:") or curie.startswith("https:"):
if reference := parse_iri(curie):
return reference.pair
elif strict:
raise UnparsableIRIError(curie=curie, ontology_prefix=ontology_prefix, node=node)
else:
return None, None

try:
prefix, identifier = curie.split(":", 1)
Expand All @@ -104,6 +127,14 @@ def normalize_curie(
return None, None


def parse_iri(iri: str) -> Reference | None:
    """Turn an IRI into a reference, or return None when that is not possible.

    :param iri: The IRI to parse
    :returns: A reference built from the prefix and identifier that
        :func:`bioregistry.parse_iri` gives back, or ``None`` if either
        of the two is missing or empty.
    """
    prefix, identifier = bioregistry.parse_iri(iri)
    # Guard clause: both parts are required to build a reference
    if not (prefix and identifier):
        return None
    return Reference(prefix=prefix, identifier=identifier)


def wrap_norm_prefix(f):
"""Decorate a function that take in a prefix to auto-normalize, or return None if it can't be normalized."""

Expand Down
21 changes: 10 additions & 11 deletions src/pyobo/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,21 +293,20 @@ def iterate_graph_synonym_typedefs(
for s in graph.graph.get("synonymtypedef", []):
sid, name = s.split(" ", 1)
name = name.strip().strip('"')
if sid.startswith("http://") or sid.startswith("https://"):
reference = Reference.from_iri(sid, name=name)
elif ":" not in sid: # assume it's ad-hoc
reference = Reference(prefix=ontology_prefix, identifier=sid, name=name)
else: # assume it's a curie
reference = Reference.from_curie(sid, name=name, strict=strict)

if reference is None:
if strict:
if ":" not in sid:
# assume it's a default reference
yield SynonymTypeDef(reference=default_reference(ontology_prefix, sid, name=name))
else:
reference = Reference.from_curie(
sid, name=name, strict=strict, ontology_prefix=ontology_prefix
)
if reference is not None:
yield SynonymTypeDef(reference=reference)
elif strict:
raise ValueError(f"Could not parse {sid}")
else:
continue

yield SynonymTypeDef(reference=reference)


def iterate_graph_typedefs(
graph: nx.MultiDiGraph, *, ontology_prefix: str, strict: bool = True
Expand Down
10 changes: 1 addition & 9 deletions src/pyobo/registries/metaregistry.json
Original file line number Diff line number Diff line change
Expand Up @@ -358,18 +358,13 @@
"prefix": [
"Image:",
"Category",
"http://",
"https://",
"http://dbpedia.org",
"https://github.com",
"PERSON",
"similar to",
"modelled on",
"SUBMITTER",
"STRUCTURE_ChemicalName_IUPAC",
"STRUCTURE_Formula",
"stedman",
"From_Merriam-Webster's_Online_Dictionary_at_www.Merriam-Webster.com",
"value-type:",
"binary-data-type:MS\\",
"PECO_GIT",
Expand All @@ -381,8 +376,6 @@
"Germplasm:",
"IUPAC:",
"IUPHAR:GPCRListForward?",
"GOC:",
"goc:",
"GIOC:",
"MONDORULE:",
"MTH:",
Expand All @@ -395,8 +388,7 @@
"INFOODs:",
"NLCD:",
"TEMP:",
"PO_GIT:",
"URL:http"
"PO_GIT:"
],
"suffix": [
".jpg",
Expand Down
40 changes: 7 additions & 33 deletions src/pyobo/struct/reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,38 +93,12 @@ def from_curie( # type:ignore[override]
prefix, identifier = normalize_curie(
curie, strict=strict, ontology_prefix=ontology_prefix, node=node
)
return cls._materialize(prefix=prefix, identifier=identifier, name=name, auto=auto)

@classmethod
def from_iri(
cls,
iri: str,
name: str | None = None,
*,
auto: bool = False,
) -> Reference | None:
"""Get a reference from an IRI using the Bioregistry.

:param iri: The IRI to parse
:param name: The name associated with the CURIE
:param auto: Automatically look up name
"""
prefix, identifier = bioregistry.parse_iri(iri)
return cls._materialize(prefix=prefix, identifier=identifier, name=name, auto=auto)

@classmethod
def _materialize(
cls,
prefix: str | None,
identifier: str | None,
name: str | None = None,
*,
auto: bool = False,
) -> Reference | None:
if prefix is None or identifier is None:
return None
if name is None and auto:
return cls.auto(prefix=prefix, identifier=identifier)
from ..api import get_name

name = get_name(prefix, identifier)
return cls.model_validate({"prefix": prefix, "identifier": identifier, "name": name})

@property
Expand Down Expand Up @@ -196,20 +170,20 @@ def bioregistry_link(self) -> str:
return self.reference.bioregistry_link


def default_reference(prefix: str, part: str, name: str | None = None) -> Reference:
def default_reference(prefix: str, identifier: str, name: str | None = None) -> Reference:
"""Create a CURIE for an "unqualified" reference.

:param prefix: The prefix of the ontology in which the "unqualified" reference is made
:param part: The "unqualified" reference. For example, if you just write
:param identifier: The "unqualified" reference. For example, if you just write
"located_in" somewhere there is supposed to be a CURIE
:returns: A CURIE for the "unqualified" reference based on the OBO semantic space

>>> default_reference("chebi", "conjugate_base_of")
Reference(prefix="obo", identifier="chebi#conjugate_base_of")
"""
if not part.strip():
if not identifier.strip():
raise ValueError("default identifier is empty")
return Reference(prefix="obo", identifier=f"{prefix}#{part}", name=name)
return Reference(prefix="obo", identifier=f"{prefix}#{identifier}", name=name)


def reference_escape(predicate: Reference | Referenced, *, ontology_prefix: str) -> str:
Expand Down
6 changes: 3 additions & 3 deletions tests/test_get.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,17 +49,17 @@ def test_get_graph_synonym_typedefs(self):
sorted(
[
SynonymTypeDef(
reference=Reference(
reference=default_reference(
prefix="chebi", identifier="IUPAC_NAME", name="IUPAC NAME"
)
),
SynonymTypeDef(
reference=Reference(
reference=default_reference(
prefix="chebi", identifier="BRAND_NAME", name="BRAND NAME"
)
),
SynonymTypeDef(
reference=Reference(prefix="chebi", identifier="INN", name="INN")
reference=default_reference(prefix="chebi", identifier="INN", name="INN")
),
],
key=attrgetter("curie"),
Expand Down
63 changes: 63 additions & 0 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,45 @@ def test_property_literal_obo_purl(self) -> None:
row,
)

def test_property_object_url(self) -> None:
"""Test that a property value written as an IRI is parsed as an object annotation."""
# Both the predicate and the value of the property are written as full IRIs
ontology = _read("""\
ontology: chebi

[Term]
id: CHEBI:1234
property_value: http://purl.obolibrary.org/obo/RO_0018033 http://purl.obolibrary.org/obo/CHEBI_5678
""")
term = self.get_only_term(ontology)
# The IRI value is expected to be stored as an object annotation, not a literal one
self.assertEqual(0, len(list(term.annotations_literal)))
self.assertEqual(1, len(list(term.annotations_object)))
self.assertEqual("CHEBI:5678", term.get_property(is_conjugate_base_of))

# The properties table is expected to hold a single row with both IRIs compacted to CURIEs
df = ontology.get_properties_df()
self.assertEqual(4, len(df.columns))
self.assertEqual(1, len(df))
row = dict(df.iloc[0])
self.assertEqual(
{"chebi_id": "1234", "property": "RO:0018033", "value": "CHEBI:5678", "datatype": ""},
row,
)

def test_property_object_url_invalid(self) -> None:
"""Test parsing an object URI that cannot be resolved to a reference."""
text = """\
ontology: chebi

[Term]
id: CHEBI:1234
property_value: http://purl.obolibrary.org/obo/RO_0018033 http://example.org/nope:nope
"""
# Without ``strict=False``, reading the unparsable value is expected to raise
with self.assertRaises(ValueError):
_read(text)
# With ``strict=False``, reading succeeds and the annotation is expected to be dropped
ontology = _read(text, strict=False)
term = self.get_only_term(ontology)
self.assertEqual(0, len(list(term.annotations_literal)))
self.assertEqual(0, len(list(term.annotations_object)))

def test_property_literal_url(self) -> None:
"""Test using a full OBO PURL as the property."""
ontology = _read("""\
Expand Down Expand Up @@ -663,3 +702,27 @@ def test_synonym_full(self) -> None:
],
synonym.provenance,
)

def test_synonym_url(self) -> None:
"""Test parsing a synonym whose type is declared with a PURL in a synonymtypedef."""
# The synonymtypedef header uses a full PURL, while the synonym line refers to it by CURIE
ontology = _read(f"""\
ontology: chebi
synonymtypedef: http://purl.obolibrary.org/obo/OMO_1234567 ""

[Term]
id: CHEBI:1234
synonym: "LTEC I" EXACT OMO:1234567 [Orphanet:93938,{CHARLIE.curie}]
""")
term = self.get_only_term(ontology)
self.assertEqual(1, len(term.synonyms))
synonym = term.synonyms[0]
self.assertEqual("LTEC I", synonym.name)
self.assertEqual("EXACT", synonym.specificity)
# The synonym type is expected to come back as a normalized reference
self.assertEqual(Reference(prefix="omo", identifier="1234567"), synonym.type.reference)
self.assertEqual(
[
Reference(prefix="orphanet", identifier="93938"),
CHARLIE,
],
synonym.provenance,
)
5 changes: 5 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ def test_strip_prefix(self):
("ncit", "C1234"), normalize_curie("Thesaurus:C1234", ontology_prefix="enm")
)

# parsing IRIs
self.assertEqual(
("chebi", "1234"), normalize_curie("http://purl.obolibrary.org/obo/CHEBI_1234")
)

def test_parse_eccode_transfer(self):
"""Test parse_eccode_transfer."""
self.assertEqual(
Expand Down