biopragmatics · cthoyt · Feb 28, 2024 · Feb 28, 2024 · Mar 9, 2024 · Mar 11, 2024
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -66,4 +66,5 @@ for updating your code.
    discovery
    struct
    api
+   w3c
    services/index
diff --git a/docs/source/w3c.rst b/docs/source/w3c.rst
@@ -0,0 +1,96 @@
+W3C Compliance
+==============
+The World Wide Web Consortium (W3C) provides standards for
+`prefixes <https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName>`_ (i.e., ``NCName``),
+`CURIEs <https://www.w3.org/TR/2010/NOTE-curie-20101216/>`_, and
+`IRIs <https://www.ietf.org/rfc/rfc3987.txt>`_, but they are
+highly obfuscated and spread across many documents.
+
+In practice, some usages do not conform to these standards, often due
+to encoding things that aren't *really* supposed to be CURIEs, such as
+SMILES strings for molecules, UCUM codes for units,
+or other language-like "identifiers".
+
+Therefore, it's on the roadmap for the ``curies`` package to support
+operations for validating against the W3C standards and mapping
+between "loose" (i.e., un-URL-encoded) and strict (i.e., URL-encoded)
+CURIEs and IRIs. In practice, this will often solve issues with special
+characters like square brackets (``[`` and ``]``).
+
+.. code-block::
+
+     looseCURIE <-> strictCURIE
+          ^     \ /     ^
+          |      X      |
+          v     / \     v
+      looseURI  <->  strictURI
+
+A first step towards accomplishing this was implemented in https://github.com/biopragmatics/curies/pull/104
+by adding a ``w3c_validation`` flag to both the initialization of a :class:`curies.Converter` and the
+:meth:`curies.Converter.expand` method.
+
+Here's an example of using W3C validation during expansion:
+
+.. code-block::
+
+    import curies
+
+    converter = curies.Converter.from_prefix_map({
+        "smiles": "https://bioregistry.io/smiles:",
+    })
+
+    >>> converter.expand("smiles:CC(=O)NC([H])(C)C(=O)O")
+    'https://bioregistry.io/smiles:CC(=O)NC([H])(C)C(=O)O'
+
+    >>> converter.expand("smiles:CC(=O)NC([H])(C)C(=O)O", w3c_validation=True)
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+      File "/Users/cthoyt/dev/curies/src/curies/api.py", line 1362, in expand
+        raise W3CValidationError(f"CURIE is not valid under W3C spec: {curie}")
+    W3CValidationError: CURIE is not valid under W3C spec: smiles:CC(=O)NC([H])(C)C(=O)O
+
+This can also be used to extend :meth:`curies.Converter.is_curie`:
+
+.. code-block::
+
+    import curies
+
+    converter = curies.Converter.from_prefix_map({
+        "smiles": "https://bioregistry.io/smiles:",
+    })
+
+    >>> converter.is_curie("smiles:CC(=O)NC([H])(C)C(=O)O")
+    True
+    >>> converter.is_curie("smiles:CC(=O)NC([H])(C)C(=O)O", w3c_validation=True)
+    False
+
+Finally, this can be used during instantiation of a converter:
+
+.. code-block::
+
+    import curies
+
+    >>> curies.Converter.from_prefix_map(
+    ...     {"4dn.biosource": "https://data.4dnucleome.org/biosources/"},
+    ...     w3c_validation=True,
+    ... )
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+      File "/Users/cthoyt/dev/curies/src/curies/api.py", line 816, in from_prefix_map
+        return cls(
+               ^^^^
+      File "/Users/cthoyt/dev/curies/src/curies/api.py", line 527, in __init__
+        raise W3CValidationError(f"Records not conforming to W3C:\n\n{msg}")
+    curies.api.W3CValidationError: Records not conforming to W3C:
+
+      - Record(prefix='4dn.biosource', uri_prefix='https://data.4dnucleome.org/biosources/', prefix_synonyms=[], uri_prefix_synonyms=[], pattern=None)
+
+
+.. seealso::
+
+    1. Discussion on the ``curies`` issue tracker about handling CURIEs that include e.g. square brackets
+       and therefore don't conform to the W3C specification: https://github.com/biopragmatics/curies/issues/103
+    2. Discussion on languages that shouldn't really get encoded in CURIEs, but still do:
+       https://github.com/biopragmatics/bioregistry/issues/460
+    3. Related to (2) - discussion on how to properly encode UCUM in CURIEs:
+       https://github.com/biopragmatics/bioregistry/issues/648
diff --git a/src/curies/__init__.py b/src/curies/__init__.py
@@ -10,6 +10,7 @@
     Record,
     Reference,
     ReferenceTuple,
+    W3CValidationError,
     chain,
     load_extended_prefix_map,
     load_jsonld_context,
@@ -39,6 +40,7 @@
     "DuplicateValueError",
     "DuplicateURIPrefixes",
     "DuplicatePrefixes",
+    "W3CValidationError",
     "chain",
     "remap_curie_prefixes",
     "remap_uri_prefixes",

diff --git a/src/curies/api.py b/src/curies/api.py
@@ -37,6 +37,7 @@
 from pytrie import StringTrie
 
 from ._pydantic_compat import field_validator, get_field_validator_values
+from .w3c import CURIE_PREFIX_RE, CURIE_RE
 
 if TYPE_CHECKING:  # pragma: no cover
     import pandas
@@ -47,9 +48,11 @@
     "Reference",
     "ReferenceTuple",
     "Record",
+    # Exceptions
     "DuplicateValueError",
     "DuplicatePrefixes",
     "DuplicateURIPrefixes",
+    "W3CValidationError",
     # Utilities
     "chain",
     "upgrade_prefix_map",
@@ -305,6 +308,12 @@ def _key(self) -> RecordKey:
             ",".join(sorted(self.uri_prefix_synonyms)),
         )
 
+    def _w3c_validate(self) -> bool:
+        """Check if all prefixes in this record are w3c compliant."""
+        all_curie_prefixes_valid = all(curie_prefix_is_w3c(prefix) for prefix in self._all_prefixes)
+        # TODO extend to check URI prefixes?
+        return all_curie_prefixes_valid
+
 
 class DuplicateSummary(NamedTuple):
     """A triple representing two records that are duplicated, either based on a CURIE or URI prefix."""
@@ -370,6 +379,10 @@ class URIStandardizationError(StandardizationError):
     """An error raise when a URI can't be standardized."""
 
 
+class W3CValidationError(ValueError):
+    """An error raised when W3C validation fails.
+
+    Raised by :class:`Converter` on instantiation when ``w3c_validation`` is on and
+    some records do not conform, and by :meth:`Converter.expand` when
+    ``w3c_validation`` is on and the given CURIE is not valid.
+
+    Because this subclasses :class:`ValueError`, it is caught by the
+    ``except ValueError`` handler in :meth:`Converter.is_curie`.
+    """
+
+
 def _get_duplicate_uri_prefixes(records: List[Record]) -> List[DuplicateSummary]:
     return [
         DuplicateSummary(record_1, record_2, uri_prefix)
@@ -471,7 +484,14 @@ class Converter:
     #: .. warning:: patterns are an experimental feature
     pattern_map: Dict[str, str]
 
-    def __init__(self, records: List[Record], *, delimiter: str = ":", strict: bool = True) -> None:
+    def __init__(
+        self,
+        records: List[Record],
+        *,
+        delimiter: str = ":",
+        strict: bool = True,
+        w3c_validation: bool = False,
+    ) -> None:
         """Instantiate a converter.
 
         :param records:
@@ -480,8 +500,17 @@ def __init__(self, records: List[Record], *, delimiter: str = ":", strict: bool
             If true, raises issues on duplicate URI prefixes
         :param delimiter:
             The delimiter used for CURIEs. Defaults to a colon.
+        :param w3c_validation:
+            If true, validate all records against the
+            `W3C CURIE Syntax 1.0 <https://www.w3.org/TR/2010/NOTE-curie-20101216/>`_.
+            This includes the following:
+
+            1. Checking CURIE prefixes and CURIE prefix synonyms against the
+               W3C definition for `NCName <https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName>`_
+
         :raises DuplicatePrefixes: if any records share any synonyms
         :raises DuplicateURIPrefixes: if any records share any URI prefixes
+        :raises W3CValidationError: If w3c validation is on and there are non-conformant records
         """
         if strict:
             duplicate_uri_prefixes = _get_duplicate_uri_prefixes(records)
@@ -491,6 +520,12 @@ def __init__(self, records: List[Record], *, delimiter: str = ":", strict: bool
             if duplicate_prefixes:
                 raise DuplicatePrefixes(duplicate_prefixes)
 
+        if w3c_validation:
+            broken = [record for record in records if not record._w3c_validate()]
+            if broken:
+                msg = "\n".join(f"  - {record!r}" for record in records)
+                raise W3CValidationError(f"Records not conforming to W3C:\n\n{msg}")
+
         self.delimiter = delimiter
         self.records = sorted(records, key=lambda r: r.prefix)
         self.prefix_map = _get_prefix_map(records)
@@ -1201,10 +1236,12 @@ def parse_uri(self, uri: str) -> Union[ReferenceTuple, Tuple[None, None]]:
         else:
             return ReferenceTuple(prefix, uri[len(value) :])
 
-    def is_curie(self, s: str) -> bool:
+    def is_curie(self, s: str, *, w3c_validation: bool = False) -> bool:
         """Check if the string can be parsed as a CURIE by this converter.
 
         :param s: A string that might be a CURIE
+        :param w3c_validation: If true, requires CURIEs to be valid against the
+            `W3C CURIE specification <https://www.w3.org/TR/2010/NOTE-curie-20101216/>`_.
         :returns: If the string can be parsed as a CURIE by this converter.
             Note that some valid CURIEs, when passed to this function, will
             result in False if their prefixes are not registered with this
@@ -1225,7 +1262,7 @@ def is_curie(self, s: str) -> bool:
         False
         """
         try:
-            return self.expand(s) is not None
+            return self.expand(s, w3c_validation=w3c_validation) is not None
         except ValueError:
             return False
 
@@ -1311,36 +1348,60 @@ def expand_strict(self, curie: str) -> str:
     # docstr-coverage:excused `overload`
     @overload
     def expand(
-        self, curie: str, *, strict: Literal[True] = True, passthrough: bool = ...
+        self,
+        curie: str,
+        *,
+        strict: Literal[True] = True,
+        passthrough: bool = ...,
+        w3c_validation: bool = ...,
     ) -> str: ...
 
     # docstr-coverage:excused `overload`
     @overload
     def expand(
-        self, curie: str, *, strict: Literal[False] = False, passthrough: Literal[True] = True
+        self,
+        curie: str,
+        *,
+        strict: Literal[False] = False,
+        passthrough: Literal[True] = True,
+        w3c_validation: bool = ...,
     ) -> str: ...
 
     # docstr-coverage:excused `overload`
     @overload
     def expand(
-        self, curie: str, *, strict: Literal[False] = False, passthrough: Literal[False] = False
+        self,
+        curie: str,
+        *,
+        strict: Literal[False] = False,
+        passthrough: Literal[False] = False,
+        w3c_validation: bool = ...,
     ) -> Optional[str]: ...
 
     def expand(
-        self, curie: str, *, strict: bool = False, passthrough: bool = False
+        self,
+        curie: str,
+        *,
+        strict: bool = False,
+        passthrough: bool = False,
+        w3c_validation: bool = False,
     ) -> Optional[str]:
         """Expand a CURIE to a URI, if possible.
 
         :param curie:
             A string representing a compact URI (CURIE)
         :param strict: If true and the CURIE can't be expanded, returns an error. Defaults to false.
         :param passthrough: If true, strict is false, and the CURIE can't be expanded, return the input.
-            Defaults to false. If your strings can either be a CURIE _or_ a URI, consider using
+            Defaults to false. If your strings can either be a CURIE or a URI, consider using
             :meth:`Converter.expand_or_standardize` instead.
+        :param w3c_validation: If true, requires CURIEs to be valid against the
+            `W3C CURIE specification <https://www.w3.org/TR/2010/NOTE-curie-20101216/>`_.
         :returns:
             A URI if this converter contains a URI prefix for the prefix in this CURIE
         :raises ExpansionError:
             If strict is true and the CURIE can't be expanded
+        :raises W3CValidationError:
+            If W3C validation is turned on and the CURIE is not valid under the CURIE specification
 
         >>> from curies import Converter
         >>> converter = Converter.from_prefix_map({
@@ -1352,6 +1413,8 @@ def expand(
         'http://purl.obolibrary.org/obo/CHEBI_138488'
         >>> converter.expand("missing:0000000")
         """
+        if w3c_validation and not curie_is_w3c(curie):
+            raise W3CValidationError(f"CURIE is not valid under W3C spec: {curie}")
         prefix, identifier = self.parse_curie(curie)
         rv = self.expand_pair(prefix, identifier)
         if rv:
@@ -2295,3 +2358,48 @@ def upgrade_prefix_map(prefix_map: Mapping[str, str]) -> List[Record]:
         Record(prefix=prefix, prefix_synonyms=prefix_synonyms, uri_prefix=uri_prefix)
         for uri_prefix, (prefix, *prefix_synonyms) in sorted(priority_prefix_map.items())
     ]
+
+
+def curie_is_w3c(s: str) -> bool:
+    """Return if the CURIE is valid under the W3C specification.
+
+    :param s: A string to check if it is a valid CURIE under the W3C specification.
+    :return: True if the string is a valid CURIE under the W3C specification.
+
+    If no prefix is given, the host language chooses how to assign a default
+    prefix.
+
+    >>> curie_is_w3c(":test")
+    True
+
+    From the specification, regarding using an underscore as the prefix
+
+      The CURIE prefix '_' is reserved for use by languages that support RDF.
+      For this reason, the prefix '_' SHOULD be avoided by authors.
+
+    >>> curie_is_w3c("_:test")
+    True
+
+    This is invalid because a CURIE prefix isn't allowed to start with
+    a number. It has to start with either a letter, or an underscore.
+
+    >>> curie_is_w3c("4cdn:test")
+    False
+
+    Empty strings are explicitly noted as being invalid.
+
+    >>> curie_is_w3c("")
+    False
+    """
+    if "[" in s or "]" in s:
+        return False
+    if not s.strip():
+        return False
+    if s[0].isdigit():
+        return False  # TODO get that into the regex
+    return bool(CURIE_RE.match(s))
+
+
+def curie_prefix_is_w3c(s: str) -> bool:
+    """Return if the CURIE prefix is valid under the W3C specification."""
+    return bool(CURIE_PREFIX_RE.match(s))
diff --git a/src/curies/w3c.py b/src/curies/w3c.py
@@ -0,0 +1,32 @@
+"""
+Make it possible to check a CURIE against the W3C specification.
+
+https://github.com/linkml/linkml-runtime/blob/main/linkml_runtime/utils/uri_validator.py
+could serve as a good basis for extending this - adding documentation, improving readability,
+and making a more detailed testing suite would make this go a long way
+"""
+
+import re
+
+_CURIE_PREFIX_RE = r"[A-Za-z_][A-Za-z0-9\.\-_]*"
+"""The definition of a prefix, from https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName.
+
+.. code-block::
+
+    prefix := NCName
+    NCName := (Letter | '_') (NCNameChar)*
+    NCNameChar	::=	Letter | Digit | '.' | '-' | '_'
+"""
+
+#: Compiled pattern for a string that consists only of a CURIE prefix
+CURIE_PREFIX_RE = re.compile(f"^{_CURIE_PREFIX_RE}$")
+
+#: Borrowed from https://github.com/linkml/prefixmaps/blob/82bfdbc/src/prefixmaps/datamodel/context.py#L26C1-L26C60
+#: Still needs adapting to see if there's an actual standard to match this to,
+#: or if this is an opinionated implementation
+URI_PREFIX_RE = re.compile(r"^http[s]?://[\w\.\-\/]+[#/_:]$")
+
+#: Adapted from https://gist.github.com/niklasl/2506955
+_IDENTIFIER_RE = r"(/[^\s/][^\s]*|[^\s/][^\s]*|[^\s]?)"
+
+#: An optional ``prefix:`` group followed by the identifier pattern.
+#: NOTE(review): the ``?`` directly after the prefix pattern attaches to that
+#: pattern's trailing ``*`` (making it lazy) rather than making the prefix itself
+#: optional - confirm this is intended.
+CURIE_PATTERN = rf"^({_CURIE_PREFIX_RE}?:)?{_IDENTIFIER_RE}$"
+#: Compiled form of ``CURIE_PATTERN``
+CURIE_RE = re.compile(CURIE_PATTERN)
diff --git a/tests/resources/invalid_curies.txt b/tests/resources/invalid_curies.txt
@@ -0,0 +1,3 @@
+
+smiles:CC(=O)NC([H])(C)C(=O)O
+4cdn:test
-Original file line number
+Diff line change
@@ Expand Up / @@ -66,4 +66,5 @@ for updating your code. @@
        discovery
        struct
        api
+       w3c
        services/index