biopragmatics · cthoyt · Nov 23, 2024 · Nov 23, 2024 · Nov 23, 2024 · Nov 23, 2024
diff --git a/src/pyobo/getters.py b/src/pyobo/getters.py
@@ -31,7 +31,7 @@
     IterHelperHelperDict,
     SlimGetOntologyKwargs,
 )
-from .identifier_utils import MissingPrefixError, wrap_norm_prefix
+from .identifier_utils import ParseError, wrap_norm_prefix
 from .plugins import has_nomenclature_plugin, run_nomenclature_plugin
 from .struct import Obo
 from .utils.io import get_writer
@@ -360,8 +360,8 @@ def iter_helper_helper(
             logger.warning("[%s] unable to download - %s", prefix, e.reason)
             if strict and not bioregistry.is_deprecated(prefix):
                 raise
-        except MissingPrefixError as e:
-            logger.warning("[%s] missing prefix: %s", prefix, e)
+        except ParseError as e:
+            logger.warning("[%s] CURIE/IRI parse error: %s", prefix, e)
             if strict and not bioregistry.is_deprecated(prefix):
                 raise e
         except RuntimeError as e:

diff --git a/src/pyobo/identifier_utils.py b/src/pyobo/identifier_utils.py
@@ -4,6 +4,7 @@
 
 import logging
 from functools import wraps
+from typing import ClassVar
 
 import bioregistry
 from curies import Reference, ReferenceTuple
@@ -25,16 +26,18 @@
 logger = logging.getLogger(__name__)
 
 
-class MissingPrefixError(ValueError):
+class ParseError(ValueError):
     """Raised on a missing prefix."""
 
+    text: ClassVar[str]
+
     def __init__(
         self,
         *,
         curie: str,
         ontology_prefix: str | None = None,
         node: Reference | None = None,
-    ):
+    ) -> None:
         """Initialize the error."""
         self.curie = curie
         self.ontology_prefix = ontology_prefix
@@ -44,12 +47,24 @@ def __str__(self) -> str:
         s = ""
         if self.ontology_prefix:
             s += f"[{self.ontology_prefix}] "
-        s += f"CURIE contains unhandled prefix: `{self.curie}`"
+        s += f"{self.text}: `{self.curie}`"
         if self.node is not None:
             s += f" from {self.node.curie}"
         return s
 
 
+class MissingPrefixError(ParseError):
+    """Raised on a missing prefix."""
+
+    text = "CURIE contains unhandled prefix"
+
+
+class UnparsableIRIError(ParseError):
+    """Raised on an unparsable IRI."""
+
+    text = "IRI could not be parsed"
+
+
 BAD_CURIES = set()
 
 
@@ -70,18 +85,26 @@ def normalize_curie(
     - Normalizes the namespace
     - Checks against a blacklist for the entire curie, for the namespace, and for suffixes.
     """
+    # Remap the curie with the full list
+    curie = remap_full(curie)
+
+    # Remap node's prefix (if necessary)
+    curie = remap_prefix(curie, ontology_prefix=ontology_prefix)
+
     if curie_is_blacklisted(curie):
         return None, None
     if curie_has_blacklisted_prefix(curie):
         return None, None
     if curie_has_blacklisted_suffix(curie):
         return None, None
 
-    # Remap the curie with the full list
-    curie = remap_full(curie)
-
-    # Remap node's prefix (if necessary)
-    curie = remap_prefix(curie, ontology_prefix=ontology_prefix)
+    if curie.startswith("http:") or curie.startswith("https:"):
+        if reference := parse_iri(curie):
+            return reference.pair
+        elif strict:
+            raise UnparsableIRIError(curie=curie, ontology_prefix=ontology_prefix, node=node)
+        else:
+            return None, None
 
     try:
         prefix, identifier = curie.split(":", 1)
@@ -104,6 +127,14 @@ def normalize_curie(
         return None, None
 
 
+def parse_iri(iri: str) -> Reference | None:
+    """Parse an IRI into a reference, if possible."""
+    p, i = bioregistry.parse_iri(iri)
+    if p and i:
+        return Reference(prefix=p, identifier=i)
+    return None
+
+
 def wrap_norm_prefix(f):
     """Decorate a function that take in a prefix to auto-normalize, or return None if it can't be normalized."""
 

diff --git a/src/pyobo/reader.py b/src/pyobo/reader.py
@@ -293,21 +293,20 @@ def iterate_graph_synonym_typedefs(
     for s in graph.graph.get("synonymtypedef", []):
         sid, name = s.split(" ", 1)
         name = name.strip().strip('"')
-        if sid.startswith("http://") or sid.startswith("https://"):
-            reference = Reference.from_iri(sid, name=name)
-        elif ":" not in sid:  # assume it's ad-hoc
-            reference = Reference(prefix=ontology_prefix, identifier=sid, name=name)
-        else:  # assume it's a curie
-            reference = Reference.from_curie(sid, name=name, strict=strict)
-
-        if reference is None:
-            if strict:
+        if ":" not in sid:
+            # assume it's a default reference
+            yield SynonymTypeDef(reference=default_reference(ontology_prefix, sid, name=name))
+        else:
+            reference = Reference.from_curie(
+                sid, name=name, strict=strict, ontology_prefix=ontology_prefix
+            )
+            if reference is not None:
+                yield SynonymTypeDef(reference=reference)
+            elif strict:
                 raise ValueError(f"Could not parse {sid}")
             else:
                 continue
 
-        yield SynonymTypeDef(reference=reference)
-
 
 def iterate_graph_typedefs(
     graph: nx.MultiDiGraph, *, ontology_prefix: str, strict: bool = True

diff --git a/src/pyobo/registries/metaregistry.json b/src/pyobo/registries/metaregistry.json
@@ -358,18 +358,13 @@
     "prefix": [
       "Image:",
       "Category",
-      "http://",
-      "https://",
-      "http://dbpedia.org",
-      "https://github.com",
       "PERSON",
       "similar to",
       "modelled on",
       "SUBMITTER",
       "STRUCTURE_ChemicalName_IUPAC",
       "STRUCTURE_Formula",
       "stedman",
-      "From_Merriam-Webster's_Online_Dictionary_at_www.Merriam-Webster.com",
       "value-type:",
       "binary-data-type:MS\\",
       "PECO_GIT",
@@ -381,8 +376,6 @@
       "Germplasm:",
       "IUPAC:",
       "IUPHAR:GPCRListForward?",
-      "GOC:",
-      "goc:",
       "GIOC:",
       "MONDORULE:",
       "MTH:",
@@ -395,8 +388,7 @@
       "INFOODs:",
       "NLCD:",
       "TEMP:",
-      "PO_GIT:",
-      "URL:http"
+      "PO_GIT:"
     ],
     "suffix": [
       ".jpg",

diff --git a/src/pyobo/struct/reference.py b/src/pyobo/struct/reference.py
@@ -93,38 +93,12 @@ def from_curie(  # type:ignore[override]
         prefix, identifier = normalize_curie(
             curie, strict=strict, ontology_prefix=ontology_prefix, node=node
         )
-        return cls._materialize(prefix=prefix, identifier=identifier, name=name, auto=auto)
-
-    @classmethod
-    def from_iri(
-        cls,
-        iri: str,
-        name: str | None = None,
-        *,
-        auto: bool = False,
-    ) -> Reference | None:
-        """Get a reference from an IRI using the Bioregistry.
-
-        :param iri: The IRI to parse
-        :param name: The name associated with the CURIE
-        :param auto: Automatically look up name
-        """
-        prefix, identifier = bioregistry.parse_iri(iri)
-        return cls._materialize(prefix=prefix, identifier=identifier, name=name, auto=auto)
-
-    @classmethod
-    def _materialize(
-        cls,
-        prefix: str | None,
-        identifier: str | None,
-        name: str | None = None,
-        *,
-        auto: bool = False,
-    ) -> Reference | None:
         if prefix is None or identifier is None:
             return None
         if name is None and auto:
-            return cls.auto(prefix=prefix, identifier=identifier)
+            from ..api import get_name
+
+            name = get_name(prefix, identifier)
         return cls.model_validate({"prefix": prefix, "identifier": identifier, "name": name})
 
     @property
@@ -196,20 +170,20 @@ def bioregistry_link(self) -> str:
         return self.reference.bioregistry_link
 
 
-def default_reference(prefix: str, part: str, name: str | None = None) -> Reference:
+def default_reference(prefix: str, identifier: str, name: str | None = None) -> Reference:
     """Create a CURIE for an "unqualified" reference.
 
     :param prefix: The prefix of the ontology in which the "unqualified" reference is made
-    :param part: The "unqualified" reference. For example, if you just write
+    :param identifier: The "unqualified" reference. For example, if you just write
         "located_in" somewhere there is supposed to be a CURIE
     :returns: A CURIE for the "unqualified" reference based on the OBO semantic space
 
     >>> default_reference("chebi", "conjugate_base_of")
     Reference(prefix="obo", identifier="chebi#conjugate_base_of")
     """
-    if not part.strip():
+    if not identifier.strip():
         raise ValueError("default identifier is empty")
-    return Reference(prefix="obo", identifier=f"{prefix}#{part}", name=name)
+    return Reference(prefix="obo", identifier=f"{prefix}#{identifier}", name=name)
 
 
 def reference_escape(predicate: Reference | Referenced, *, ontology_prefix: str) -> str:

diff --git a/tests/test_get.py b/tests/test_get.py
@@ -49,17 +49,17 @@ def test_get_graph_synonym_typedefs(self):
             sorted(
                 [
                     SynonymTypeDef(
-                        reference=Reference(
+                        reference=default_reference(
                             prefix="chebi", identifier="IUPAC_NAME", name="IUPAC NAME"
                         )
                     ),
                     SynonymTypeDef(
-                        reference=Reference(
+                        reference=default_reference(
                             prefix="chebi", identifier="BRAND_NAME", name="BRAND NAME"
                         )
                     ),
                     SynonymTypeDef(
-                        reference=Reference(prefix="chebi", identifier="INN", name="INN")
+                        reference=default_reference(prefix="chebi", identifier="INN", name="INN")
                     ),
                 ],
                 key=attrgetter("curie"),

diff --git a/tests/test_reader.py b/tests/test_reader.py
@@ -341,6 +341,45 @@ def test_property_literal_obo_purl(self) -> None:
             row,
         )
 
+    def test_property_object_url(self) -> None:
+        """Test parsing an object URI."""
+        ontology = _read("""\
+            ontology: chebi
+
+            [Term]
+            id: CHEBI:1234
+            property_value: http://purl.obolibrary.org/obo/RO_0018033 http://purl.obolibrary.org/obo/CHEBI_5678
+        """)
+        term = self.get_only_term(ontology)
+        self.assertEqual(0, len(list(term.annotations_literal)))
+        self.assertEqual(1, len(list(term.annotations_object)))
+        self.assertEqual("CHEBI:5678", term.get_property(is_conjugate_base_of))
+
+        df = ontology.get_properties_df()
+        self.assertEqual(4, len(df.columns))
+        self.assertEqual(1, len(df))
+        row = dict(df.iloc[0])
+        self.assertEqual(
+            {"chebi_id": "1234", "property": "RO:0018033", "value": "CHEBI:5678", "datatype": ""},
+            row,
+        )
+
+    def test_property_object_url_invalid(self) -> None:
+        """Test parsing an invalid (unparsable) object URI."""
+        text = """\
+            ontology: chebi
+
+            [Term]
+            id: CHEBI:1234
+            property_value: http://purl.obolibrary.org/obo/RO_0018033 http://example.org/nope:nope
+        """
+        with self.assertRaises(ValueError):
+            _read(text)
+        ontology = _read(text, strict=False)
+        term = self.get_only_term(ontology)
+        self.assertEqual(0, len(list(term.annotations_literal)))
+        self.assertEqual(0, len(list(term.annotations_object)))
+
     def test_property_literal_url(self) -> None:
         """Test using a full OBO PURL as the property."""
         ontology = _read("""\
@@ -663,3 +702,27 @@ def test_synonym_full(self) -> None:
             ],
             synonym.provenance,
         )
+
+    def test_synonym_url(self) -> None:
+        """Test parsing a synonym whose type is defined with a PURL."""
+        ontology = _read(f"""\
+            ontology: chebi
+            synonymtypedef: http://purl.obolibrary.org/obo/OMO_1234567 ""
+
+            [Term]
+            id: CHEBI:1234
+            synonym: "LTEC I" EXACT OMO:1234567 [Orphanet:93938,{CHARLIE.curie}]
+        """)
+        term = self.get_only_term(ontology)
+        self.assertEqual(1, len(term.synonyms))
+        synonym = term.synonyms[0]
+        self.assertEqual("LTEC I", synonym.name)
+        self.assertEqual("EXACT", synonym.specificity)
+        self.assertEqual(Reference(prefix="omo", identifier="1234567"), synonym.type.reference)
+        self.assertEqual(
+            [
+                Reference(prefix="orphanet", identifier="93938"),
+                CHARLIE,
+            ],
+            synonym.provenance,
+        )
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -29,6 +29,11 @@ def test_strip_prefix(self):
             ("ncit", "C1234"), normalize_curie("Thesaurus:C1234", ontology_prefix="enm")
         )
 
+        # parsing IRIs
+        self.assertEqual(
+            ("chebi", "1234"), normalize_curie("http://purl.obolibrary.org/obo/CHEBI_1234")
+        )
+
     def test_parse_eccode_transfer(self):
         """Test parse_eccode_transfer."""
         self.assertEqual(