From ba7e5cc04e58ff3e457e9fd2d7c5e3457b12cdf8 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Wed, 28 Feb 2024 15:46:28 +0100
Subject: [PATCH 01/19] Add w3c strict expansion

---
 src/curies/api.py                  | 12 +++++++++++-
 src/curies/w3c.py                  | 19 +++++++++++++++++++
 tests/resources/invalid_curies.txt |  4 ++++
 tests/resources/valid_curies.txt   |  9 +++++++++
 tests/test_w3c.py                  | 24 ++++++++++++++++++++++++
 5 files changed, 67 insertions(+), 1 deletion(-)
 create mode 100644 src/curies/w3c.py
 create mode 100644 tests/resources/invalid_curies.txt
 create mode 100644 tests/resources/valid_curies.txt
 create mode 100644 tests/test_w3c.py

diff --git a/src/curies/api.py b/src/curies/api.py
index 730d4e6..cd94f8c 100644
--- a/src/curies/api.py
+++ b/src/curies/api.py
@@ -37,6 +37,7 @@
 from pytrie import StringTrie
 
 from ._pydantic_compat import field_validator, get_field_validator_values
+from .w3c import curie_is_w3c
 
 if TYPE_CHECKING:  # pragma: no cover
     import pandas
@@ -1316,7 +1317,12 @@ def expand(
     ) -> Optional[str]: ...
 
     def expand(
-        self, curie: str, *, strict: bool = False, passthrough: bool = False
+        self,
+        curie: str,
+        *,
+        strict: bool = False,
+        passthrough: bool = False,
+        require_w3c_spec: bool = False,
     ) -> Optional[str]:
         """Expand a CURIE to a URI, if possible.
 
@@ -1326,6 +1332,8 @@ def expand(
         :param passthrough: If true, strict is false, and the CURIE can't be expanded, return the input.
             Defaults to false. If your strings can either be a CURIE _or_ a URI, consider using
             :meth:`Converter.expand_ambiguous` instead.
+        :param require_w3c_spec: If true, requires CURIEs to be valid against the
+            `W3C CURIE specification <https://www.w3.org/TR/2010/NOTE-curie-20101216/>`_.
         :returns:
             A URI if this converter contains a URI prefix for the prefix in this CURIE
         :raises ExpansionError:
@@ -1350,6 +1358,8 @@ def expand(
             ``http://purl.obolibrary.org/obo/GO_0032571`` will return ``GO:0032571``
             instead of ``OBO:GO_0032571``.
         """
+        if require_w3c_spec and not curie_is_w3c(curie):
+            raise ValueError(f"CURIE is not valid under W3C spec: {curie}")
         prefix, identifier = self.parse_curie(curie)
         rv = self.expand_pair(prefix, identifier)
         if rv:
diff --git a/src/curies/w3c.py b/src/curies/w3c.py
new file mode 100644
index 0000000..61deb43
--- /dev/null
+++ b/src/curies/w3c.py
@@ -0,0 +1,19 @@
+"""
+Make it possible to check a CURIE against the W3C specification.
+"""
+
+import re
+
+__all__ = [
+    "curie_is_w3c",
+]
+
+# Borrowed from https://gist.github.com/niklasl/2506955
+CURIE_PATTERN = r"(([\i-[:]][\c-[:]]*)?:)?(/[^\s/][^\s]*|[^\s/][^\s]*|[^\s]?)"
+CURIE_PATTERN = CURIE_PATTERN.replace(r"\i-[:]", r"_A-Za-z").replace(r"\c-[:]", r"-._:A-Za-z0-9")
+CURIE_RE = re.compile(CURIE_PATTERN)
+
+
+def curie_is_w3c(curie) -> bool:
+    """Return if the CURIE is valid under the W3C specification."""
+    return bool(CURIE_RE.match(curie))
diff --git a/tests/resources/invalid_curies.txt b/tests/resources/invalid_curies.txt
new file mode 100644
index 0000000..74633dc
--- /dev/null
+++ b/tests/resources/invalid_curies.txt
@@ -0,0 +1,4 @@
+pfx://abc
+pfx://
+://
+/
diff --git a/tests/resources/valid_curies.txt b/tests/resources/valid_curies.txt
new file mode 100644
index 0000000..1066d8d
--- /dev/null
+++ b/tests/resources/valid_curies.txt
@@ -0,0 +1,9 @@
+pfx:abc
+:
+pfx:
+abc
+:abc
+
+pfx:/abc
+pfx:/
+:/
diff --git a/tests/test_w3c.py b/tests/test_w3c.py
new file mode 100644
index 0000000..1dd5b66
--- /dev/null
+++ b/tests/test_w3c.py
@@ -0,0 +1,24 @@
+"""Tests for W3C utilities."""
+
+import unittest
+from pathlib import Path
+
+from curies.w3c import curie_is_w3c
+
+HERE = Path(__file__).parent.resolve()
+RESOURCES = HERE.joinpath("resources")
+VALID_CURIES_PATH = RESOURCES.joinpath("valid_curies.txt")
+INVALID_CURIES_PATH = RESOURCES.joinpath("invalid_curies.txt")
+
+
+class TestW3C(unittest.TestCase):
+    """Tests for W3C utilities."""
+
+    def test_validating_curies(self):
+        """Test validating CURIEs."""
+        for curie in VALID_CURIES_PATH.read_text().splitlines():
+            with self.subTest(curie=curie):
+                self.assertTrue(curie_is_w3c(curie))
+        for curie in INVALID_CURIES_PATH.read_text().splitlines():
+            with self.subTest(curie=curie):
+                self.assertFalse(curie_is_w3c(curie))

From 6671887a12b407ce4e9eb27f464003b02bf558e6 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Wed, 28 Feb 2024 15:49:27 +0100
Subject: [PATCH 02/19] Update w3c.py

---
 src/curies/w3c.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/curies/w3c.py b/src/curies/w3c.py
index 61deb43..a21c31d 100644
--- a/src/curies/w3c.py
+++ b/src/curies/w3c.py
@@ -1,5 +1,9 @@
 """
 Make it possible to check a CURIE against the W3C specification.
+
+https://github.com/linkml/linkml-runtime/blob/main/linkml_runtime/utils/uri_validator.py
+could serve as a good basis for extending this - adding documentation, improving readability,
+and making a more detailed testing suite would make this go a long way
 """
 
 import re

From 93053043f28890f82cff314ea97f794cf169c3ac Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Sat, 9 Mar 2024 21:24:16 +0100
Subject: [PATCH 03/19] Add up-front check for w3c compliance for converters

---
 src/curies/api.py |  66 ++++++++++-
 src/curies/w3c.py |   9 --
 src/curies/xx.py  | 291 ++++++++++++++++++++++++++++++++++++++++++++++
 tests/test_api.py |  30 +++++
 tests/test_w3c.py |   2 +-
 5 files changed, 386 insertions(+), 12 deletions(-)
 create mode 100644 src/curies/xx.py

diff --git a/src/curies/api.py b/src/curies/api.py
index cd94f8c..e6dbfec 100644
--- a/src/curies/api.py
+++ b/src/curies/api.py
@@ -37,7 +37,8 @@
 from pytrie import StringTrie
 
 from ._pydantic_compat import field_validator, get_field_validator_values
-from .w3c import curie_is_w3c
+from .w3c import CURIE_RE
+from .xx import PREFIX_RE
 
 if TYPE_CHECKING:  # pragma: no cover
     import pandas
@@ -306,6 +307,12 @@ def _key(self) -> RecordKey:
             ",".join(sorted(self.uri_prefix_synonyms)),
         )
 
+    def is_w3c_compliant(self) -> bool:
+        """Check if all prefixes in this record are w3c compliant."""
+        all_curie_prefixes_valid = all(curie_prefix_is_w3c(prefix) for prefix in self._all_prefixes)
+        # TODO extend to check URI prefixes?
+        return all_curie_prefixes_valid
+
 
 class DuplicateSummary(NamedTuple):
     """A triple representing two records that are duplicated, either based on a CURIE or URI prefix."""
@@ -472,7 +479,9 @@ class Converter:
     #: .. warning:: patterns are an experimental feature
     pattern_map: Dict[str, str]
 
-    def __init__(self, records: List[Record], *, delimiter: str = ":", strict: bool = True) -> None:
+    def __init__(
+        self, records: List[Record], *, delimiter: str = ":", strict: bool = True, w3c: bool = False
+    ) -> None:
         """Instantiate a converter.
 
         :param records:
@@ -481,6 +490,14 @@ def __init__(self, records: List[Record], *, delimiter: str = ":", strict: bool
             If true, raises issues on duplicate URI prefixes
         :param delimiter:
             The delimiter used for CURIEs. Defaults to a colon.
+        :param w3c:
+            If true, validate all records against the
+            `W3C CURIE Syntax 1.0 <https://www.w3.org/TR/2010/NOTE-curie-20101216/>`_.
+            This includes the following:
+
+              1. Checking CURIE prefixes and CURIE prefix synonyms against the
+                 W3C definition for `NCName <https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName>`_
+
         :raises DuplicatePrefixes: if any records share any synonyms
         :raises DuplicateURIPrefixes: if any records share any URI prefixes
         """
@@ -492,6 +509,11 @@ def __init__(self, records: List[Record], *, delimiter: str = ":", strict: bool
             if duplicate_prefixes:
                 raise DuplicatePrefixes(duplicate_prefixes)
 
+        if w3c:
+            broken = [record for record in records if not record.is_w3c_compliant()]
+            if broken:
+                raise ValueError(f"Records not conforming to W3C: {broken}")
+
         self.delimiter = delimiter
         self.records = sorted(records, key=lambda r: r.prefix)
         self.prefix_map = _get_prefix_map(records)
@@ -2303,3 +2325,43 @@ def upgrade_prefix_map(prefix_map: Mapping[str, str]) -> List[Record]:
         Record(prefix=prefix, prefix_synonyms=prefix_synonyms, uri_prefix=uri_prefix)
         for uri_prefix, (prefix, *prefix_synonyms) in sorted(priority_prefix_map.items())
     ]
+
+
+def curie_is_w3c(s: str) -> bool:
+    """Return if the CURIE is valid under the W3C specification.
+
+    :param s: A string to check if it is a valid CURIE under the W3C specification.
+    :return: True if the string is a valid CURIE under the W3C specification.
+
+
+    If no prefix is given, the host language chooses how to assign a default
+    prefix.
+
+    >>> curie_is_w3c(":test")
+    True
+
+    From the specification, regarding using an underscore as the prefix
+
+      The CURIE prefix '_' is reserved for use by languages that support RDF.
+      For this reason, the prefix '_' SHOULD be avoided by authors.
+
+    >>> curie_is_w3c("_:test")
+    True
+
+    This is invalid because a CURIE prefix isn't allowed to start with
+    a number. It has to start with either a letter, or an underscore.
+
+    >>> curie_is_w3c("4cdn:test")
+    False
+
+    Empty strings are explicitly noted as being invalid.
+
+    >>> curie_is_w3c("")
+    False
+    """
+    return bool(CURIE_RE.match(s))
+
+
+def curie_prefix_is_w3c(s: str) -> bool:
+    """Return if the CURIE prefix is valid under the W3C specification."""
+    return bool(PREFIX_RE.match(s))
diff --git a/src/curies/w3c.py b/src/curies/w3c.py
index a21c31d..e7c1ae2 100644
--- a/src/curies/w3c.py
+++ b/src/curies/w3c.py
@@ -8,16 +8,7 @@
 
 import re
 
-__all__ = [
-    "curie_is_w3c",
-]
-
 # Borrowed from https://gist.github.com/niklasl/2506955
 CURIE_PATTERN = r"(([\i-[:]][\c-[:]]*)?:)?(/[^\s/][^\s]*|[^\s/][^\s]*|[^\s]?)"
 CURIE_PATTERN = CURIE_PATTERN.replace(r"\i-[:]", r"_A-Za-z").replace(r"\c-[:]", r"-._:A-Za-z0-9")
 CURIE_RE = re.compile(CURIE_PATTERN)
-
-
-def curie_is_w3c(curie) -> bool:
-    """Return if the CURIE is valid under the W3C specification."""
-    return bool(CURIE_RE.match(curie))
diff --git a/src/curies/xx.py b/src/curies/xx.py
new file mode 100644
index 0000000..31b3043
--- /dev/null
+++ b/src/curies/xx.py
@@ -0,0 +1,291 @@
+# Copyright Siemens 2023
+# SPDX-License-Identifier: CC0-1.0
+
+
+"""
+Regular-expression-based URI and CURIE validation functions
+
+These regex are directly derived from the official sources mentioned in each
+section.
+
+They should be processed with re.VERBOSE.
+
+Python named regular expression groups are being used to better understand the
+URI/CURIE parsing.
+"""
+
+import re
+
+#: Define DIGIT according RFC2234 section 3.4:
+#: https://datatracker.ietf.org/doc/html/rfc2234/#section-3.4
+DIGIT = r"[0-9]"
+
+#: Define ALPHA (i.e., Letter) according RFC2234 section 6.1:
+#: https://datatracker.ietf.org/doc/html/rfc2234/#section-6.1
+ALPHA = r"[A-Za-z]"
+
+#: Define HEXDIG according RFC2234 section 6.1:
+#: https://datatracker.ietf.org/doc/html/rfc2234/#section-6.1
+HEXDIG = "[0-9A-F]"
+
+#   pct-encoded   = "%" HEXDIG HEXDIG
+pct_encoded = rf"% {HEXDIG}{{2}}"
+
+#   unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
+unreserved = rf"(?: {ALPHA} | {DIGIT} | \- | \. | _ | ~ )"
+
+#   gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+gen_delims = r"(?: : | / | \? | \# | \[ | \] | @ )"
+
+#   sub-delims    = "!" / "$" / "&" / "'" / "("
+sub_delims = r"(?: ! | \$ | & | ' | \( | \) | \* | \+ | , | ; | = )"
+
+#   pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
+pchar = rf"(?: {unreserved} | {pct_encoded} | {sub_delims} | : | @ )"
+
+#   reserved      = gen-delims / sub-delims
+reserved = rf"(?: {gen_delims} | {sub_delims} )"
+
+### required for Authority
+
+#   dec-octet     = DIGIT                 ; 0-9
+#                 / %x31-39 DIGIT         ; 10-99
+#                 / "1" 2DIGIT            ; 100-199
+#                 / "2" %x30-34 DIGIT     ; 200-249
+#                 / "25" %x30-35          ; 250-255
+dec_octet = rf"""(?: {DIGIT} |
+                    [1-9] {DIGIT} |
+                    1 {DIGIT}{{2}} |
+                    2 [0-4] {DIGIT} |
+                    25 [0-5]
+                )
+"""
+
+#  IPv4address   = dec-octet "." dec-octet "." dec-octet "." dec-octet
+IPv4address = rf"{dec_octet} \. {dec_octet} \. {dec_octet} \. {dec_octet}"
+
+#  h16           = 1*4HEXDIG
+h16 = rf"(?: {HEXDIG} ){{1,4}}"
+
+#  ls32          = ( h16 ":" h16 ) / IPv4address
+ls32 = rf"(?: (?: {h16} : {h16} ) | {IPv4address} )"
+
+#   IPv6address   =                            6( h16 ":" ) ls32
+#                 /                       "::" 5( h16 ":" ) ls32
+#                 / [               h16 ] "::" 4( h16 ":" ) ls32
+#                 / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
+#                 / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
+#                 / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
+#                 / [ *4( h16 ":" ) h16 ] "::"              ls32
+#                 / [ *5( h16 ":" ) h16 ] "::"              h16
+#                 / [ *6( h16 ":" ) h16 ] "::"
+IPv6address = rf"""(?:                              (?: {h16} : ){{6}} {ls32} |
+                                                 :: (?: {h16} : ){{5}} {ls32} |
+                                    (?: {h16} )? :: (?: {h16} : ){{4}} {ls32} |
+               (?: (?: {h16} : )        {h16} )? :: (?: {h16} : ){{3}} {ls32} |
+               (?: (?: {h16} : ){{1,2}} {h16} )? :: (?: {h16} : ){{2}} {ls32} |
+               (?: (?: {h16} : ){{1,3}} {h16} )? ::     {h16} :        {ls32} |
+               (?: (?: {h16} : ){{1,4}} {h16} )? ::                    {ls32} |
+               (?: (?: {h16} : ){{1,5}} {h16} )? ::                    {h16}  |
+               (?: (?: {h16} : ){{1,6}} {h16} )? ::
+              )
+"""
+
+#   IPvFuture     = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
+IPvFuture = rf"v {HEXDIG}+ \. (?: {unreserved} | {sub_delims} | : )+"
+
+#   IP-literal    = "[" ( IPv6address / IPvFuture  ) "]"
+IP_literal = rf"\[ (?: {IPv6address} | {IPvFuture} ) \]"
+
+#   reg-name      = *( unreserved / pct-encoded / sub-delims )
+reg_name = rf"(?: {unreserved} | {pct_encoded} | {sub_delims} )*"
+
+### required for Path
+
+#   segment       = *pchar
+segment = rf"{pchar}*"
+
+#   segment-nz    = 1*pchar
+segment_nz = rf"{pchar}+"
+
+#   segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
+segment_nz_nc = rf"(?: {unreserved} | {pct_encoded} | {sub_delims} | @ )+"
+
+# -----------------------------------------------------------------------------
+#
+# Define SCHEME according RFC3986 section 3.1:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.1
+#
+
+#   scheme        = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+scheme = rf"(?P<scheme> {ALPHA} (?: {ALPHA} | {DIGIT} | \+ | \- | \. )* )"
+
+# -----------------------------------------------------------------------------
+#
+# Define AUTHORITY according RFC3986 section 3.2:
+
+# Define USER INFORMATION according RFC3986 section 3.2.1:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.1
+
+#   userinfo      = *( unreserved / pct-encoded / sub-delims / ":" )
+userinfo = rf"""(?P<userinfo>
+                    (?: {unreserved} | {pct_encoded} | {sub_delims} | : )*
+                )
+"""
+
+# Define HOST according RFC3986 section 3.2.2:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
+
+#   host          = IP-literal / IPv4address / reg-name
+host = rf"(?P<host> {IP_literal} | {IPv4address} | {reg_name} )"
+
+# Define PORT according RFC3986 section 3.2.3:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.3
+
+#   port          = *DIGIT
+port = rf"(?P<port> ( {DIGIT} )* )"
+
+# Define AUTHORITY according RFC3986 section 3.2:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2
+#
+
+#   authority     = [ userinfo "@" ] host [ ":" port ]
+# authority = rf"""(?: (?P<userinfo> {userinfo} ) @)?
+authority = rf"""(?P<authority>
+                    (?: {userinfo} @)?
+                    {host}
+                    (?: : {port} )?
+                )
+"""
+
+# -----------------------------------------------------------------------------
+#
+# Define different PATHs according RFC3986 section 3.3:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.3
+#
+
+#   path-abempty  = *( "/" segment )
+path_abempty = rf"( / {segment} )*"
+
+#   path-absolute = "/" [ segment-nz *( "/" segment ) ]
+path_absolute = rf"( / (?: {segment_nz} (?: / {segment} )* )? )"
+
+#   path-noscheme = segment-nz-nc *( "/" segment )
+path_noscheme = rf"( {segment_nz_nc} (?: / {segment} )* )"
+
+#   path-rootless = segment-nz *( "/" segment )
+path_rootless = rf"( {segment_nz} (?: / {segment} )* )"
+
+#   path-empty    = 0<pchar>
+path_empty = r""
+
+#   path          = path-abempty    ; begins with "/" or is empty
+#                 / path-absolute   ; begins with "/" but not "//"
+#                 / path-noscheme   ; begins with a non-colon segment
+#                 / path-rootless   ; begins with a segment
+#                 / path-empty      ; zero characters
+path = rf"""(?:
+               {path_abempty} |
+               {path_absolute} |
+               {path_noscheme} |
+               {path_rootless} |
+               {path_empty}
+            )
+"""
+
+# -----------------------------------------------------------------------------
+#
+# Define QUERY according RFC3986 section 3.4:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.4
+#
+
+#   query         = *( pchar / "/" / "?" )
+query = rf"(?P<query> (?: {pchar} | / | \? )* )"
+
+# -----------------------------------------------------------------------------
+#
+# Define FRAGMENT according RFC3986 section 3.5:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.5
+#
+
+#   fragment      = *( pchar / "/" / "?" )
+fragment = rf"(?P<fragment> (?: {pchar} | / | \? )* )"
+
+# -----------------------------------------------------------------------------
+#
+# Define URI and HIERARCHICAL PATH according RFC3986 section 3:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-3
+#
+
+#   hier-part     = "//" authority path-abempty
+#                 / path-absolute
+#                 / path-rootless
+#                 / path-empty
+hier_part = rf"""(?P<hier_part>
+                    (?: // {authority} {path_abempty} ) |
+                    {path_absolute} |
+                    {path_rootless} |
+                    {path_empty}
+                )
+"""
+
+#   URI           = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
+URI = rf"""(?P<uri>
+                {scheme} : {hier_part} (?: \? {query} )? (?: \# {fragment} )?
+            )
+"""
+
+# -----------------------------------------------------------------------------
+#
+# Define RELATIVE REFERENCE according RFC3986 section 4.2:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-4.2
+#
+
+#   relative-part = "//" authority path-abempty
+#                 / path-absolute
+#                 / path-noscheme
+#                 / path-empty
+#   relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
+relative_ref = rf"""(?P<relative_ref>
+                        (?:
+                            (?: //
+                                {authority}
+                                (?P<path_abempty> {path_abempty} )
+                            ) |
+                            (?P<path_absolute> {path_absolute} ) |
+                            (?P<path_noscheme> {path_noscheme} ) |
+                            (?P<path_empty> {path_empty} )
+                         )
+                         (?: \? {query} )?
+                         (?: \# {fragment} )?
+                     )
+"""
+
+# -----------------------------------------------------------------------------
+#
+# Define CURIE according W3C CURIE Syntax 1.0
+# https://www.w3.org/TR/curie/#s_syntax
+#
+
+# NCNameChar	::=	Letter | Digit | '.' | '-' | '_' | CombiningChar | Extender
+# !! IMPORTANT NOTE !!
+# As of now this module doesn't support NCNameChar IRI, but
+# relative-refs as defined in URI,
+# NCNameChar	::=	Letter | Digit | '.' | '-' | '_'
+NCNameChar = rf"(?: {ALPHA} | {DIGIT} | \. | \- | _ )"
+
+# prefix      :=   NCName
+# NCName  :=   (Letter | '_') (NCNameChar)*
+prefix = rf"(?: {ALPHA} | _ ) (?: {NCNameChar} )*"
+
+# reference   :=   irelative-ref (as defined in IRI)
+# !! IMPORTANT NOTE !!
+# As of now this module doesn't support irelative-refs as defined in IRI, but
+# relative-refs as defined in URI
+# curie       :=   [ [ prefix ] ':' ] reference
+# reference   :=   relative-ref (as defined in URI)
+CURIE = rf"""(?P<CURIE> (?: (?P<prefix> {prefix} )? : )? {relative_ref})
+"""
+
+PREFIX_RE = re.compile(f"^{prefix}$", re.VERBOSE)
+CURIE_RE = re.compile(f"^{CURIE}$", re.VERBOSE)
diff --git a/tests/test_api.py b/tests/test_api.py
index b404f4e..b361f41 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -41,6 +41,36 @@
 GO_URI_PREFIX = "http://purl.obolibrary.org/obo/GO_"
 
 
+class TestRecord(unittest.TestCase):
+    """Tests for the record data structure."""
+
+    def test_w3c_prefix(self):
+        """Test CURIE prefix correctness."""
+        valid_prefixes = [
+            "go",
+            "GO",
+            "NCBITaxon",
+            "ncbi.taxon",
+            "ncbi_taxon",
+            "_",
+            "_secret",
+            "secret_",
+            "_secret",
+        ]
+        invalid_prefixes = ["", "4dn", "GO:GO:", "GO:"]
+        examples = [
+            *((prefix, True) for prefix in valid_prefixes),
+            *((prefix, False) for prefix in invalid_prefixes),
+        ]
+        for prefix, value in examples:
+            uri_prefix = f"https://example.com/{prefix}"
+            r1 = Record(prefix=prefix, uri_prefix=uri_prefix)
+            r2 = Record(prefix="prefix", prefix_synonyms=[prefix], uri_prefix=uri_prefix)
+            with self.subTest(prefix=prefix):
+                self.assertEqual(value, r1.is_w3c_compliant())
+                self.assertEqual(value, r2.is_w3c_compliant())
+
+
 class TestAddRecord(unittest.TestCase):
     """Test adding records."""
 
diff --git a/tests/test_w3c.py b/tests/test_w3c.py
index 1dd5b66..d0e6172 100644
--- a/tests/test_w3c.py
+++ b/tests/test_w3c.py
@@ -3,7 +3,7 @@
 import unittest
 from pathlib import Path
 
-from curies.w3c import curie_is_w3c
+from curies.api import curie_is_w3c
 
 HERE = Path(__file__).parent.resolve()
 RESOURCES = HERE.joinpath("resources")

From 22934583e24ab49397450f460fe25003c5c7f21a Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Mon, 11 Mar 2024 10:18:31 +0100
Subject: [PATCH 04/19] Update

---
 src/curies/api.py                  |   2 +
 src/curies/xx.py                   | 132 ++++++++++++++++-------------
 tests/resources/invalid_curies.txt |   1 +
 tests/test_api.py                  |  12 +++
 4 files changed, 87 insertions(+), 60 deletions(-)

diff --git a/src/curies/api.py b/src/curies/api.py
index e6dbfec..704aa51 100644
--- a/src/curies/api.py
+++ b/src/curies/api.py
@@ -2359,6 +2359,8 @@ def curie_is_w3c(s: str) -> bool:
     >>> curie_is_w3c("")
     False
     """
+    if "[" in s or "]" in s:
+        return False
     return bool(CURIE_RE.match(s))
 
 
diff --git a/src/curies/xx.py b/src/curies/xx.py
index 31b3043..a3a2dc9 100644
--- a/src/curies/xx.py
+++ b/src/curies/xx.py
@@ -1,40 +1,51 @@
-# Copyright Siemens 2023
-# SPDX-License-Identifier: CC0-1.0
+"""A regular expression implementation of the W3C CURIEs Syntax.
 
-
-"""
-Regular-expression-based URI and CURIE validation functions
-
-These regex are directly derived from the official sources mentioned in each
+These regular expressions are directly derived from the official sources mentioned in each
 section.
 
-They should be processed with re.VERBOSE.
+They should be processed with :data:`re.VERBOSE` to remove comments and other
+non-essential annotations.
 
 Python named regular expression groups are being used to better understand the
 URI/CURIE parsing.
+
+adapted from https://github.com/linkml/linkml-runtime/blob/main/linkml_runtime/utils/uri_validator.py, which
+was originally distributed under the CC-0 license
+
+Relevant documents:
+
+1. W3C CURIE Syntax 1.0 in https://www.w3.org/TR/2010/NOTE-curie-20101216/
+2. NCName definition (i.e., prefix) in https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName
+3. IRI definition in https://www.ietf.org/rfc/rfc3987.txt
 """
 
 import re
 
 #: Define DIGIT according RFC2234 section 3.4:
 #: https://datatracker.ietf.org/doc/html/rfc2234/#section-3.4
-DIGIT = r"[0-9]"
+DIGIT = "[0-9]"
 
 #: Define ALPHA (i.e., Letter) according RFC2234 section 6.1:
 #: https://datatracker.ietf.org/doc/html/rfc2234/#section-6.1
-ALPHA = r"[A-Za-z]"
+ALPHA = "[A-Za-z]"
 
 #: Define HEXDIG according RFC2234 section 6.1:
 #: https://datatracker.ietf.org/doc/html/rfc2234/#section-6.1
 HEXDIG = "[0-9A-F]"
 
 #   pct-encoded   = "%" HEXDIG HEXDIG
-pct_encoded = rf"% {HEXDIG}{{2}}"
+pct_encoded = f"%{HEXDIG}{{2}}"
+
+# unreserved = rf"(?: {ALPHA} | {DIGIT} | \- | \. | _ | ~ )"
+unreserved = r"[A-Za-z0-9\-\._~]"
+"""Defined in page 8 of https://www.ietf.org/rfc/rfc3987.txt as:
 
-#   unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
-unreserved = rf"(?: {ALPHA} | {DIGIT} | \- | \. | _ | ~ )"
+.. code-block::
 
-#   gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+    unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
+"""
+
+# gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
 gen_delims = r"(?: : | / | \? | \# | \[ | \] | @ )"
 
 #   sub-delims    = "!" / "$" / "&" / "'" / "("
@@ -46,23 +57,20 @@
 #   reserved      = gen-delims / sub-delims
 reserved = rf"(?: {gen_delims} | {sub_delims} )"
 
-### required for Authority
-
-#   dec-octet     = DIGIT                 ; 0-9
-#                 / %x31-39 DIGIT         ; 10-99
-#                 / "1" 2DIGIT            ; 100-199
-#                 / "2" %x30-34 DIGIT     ; 200-249
-#                 / "25" %x30-35          ; 250-255
-dec_octet = rf"""(?: {DIGIT} |
-                    [1-9] {DIGIT} |
-                    1 {DIGIT}{{2}} |
-                    2 [0-4] {DIGIT} |
-                    25 [0-5]
-                )
+dec_octet = rf"(?: {DIGIT} | [1-9]{DIGIT} | 1{DIGIT}{{2}} | 2[0-4]{DIGIT} | 25[0-5])"
+"""A definition of numbers between 0-255.
+
+.. code-block::
+
+    dec-octet = DIGIT ; 0-9
+                / %x31-39 DIGIT         ; 10-99
+                / "1" 2DIGIT            ; 100-199
+                / "2" %x30-34 DIGIT     ; 200-249
+                / "25" %x30-35          ; 250-255
 """
 
 #  IPv4address   = dec-octet "." dec-octet "." dec-octet "." dec-octet
-IPv4address = rf"{dec_octet} \. {dec_octet} \. {dec_octet} \. {dec_octet}"
+IPv4address = rf"{dec_octet}\.{dec_octet}\.{dec_octet}\.{dec_octet}"
 
 #  h16           = 1*4HEXDIG
 h16 = rf"(?: {HEXDIG} ){{1,4}}"
@@ -185,12 +193,12 @@
 #                 / path-rootless   ; begins with a segment
 #                 / path-empty      ; zero characters
 path = rf"""(?:
-               {path_abempty} |
-               {path_absolute} |
-               {path_noscheme} |
-               {path_rootless} |
-               {path_empty}
-            )
+   {path_abempty} |
+   {path_absolute} |
+   {path_noscheme} |
+   {path_rootless} |
+   {path_empty}
+)
 """
 
 # -----------------------------------------------------------------------------
@@ -222,18 +230,15 @@
 #                 / path-rootless
 #                 / path-empty
 hier_part = rf"""(?P<hier_part>
-                    (?: // {authority} {path_abempty} ) |
-                    {path_absolute} |
-                    {path_rootless} |
-                    {path_empty}
-                )
+    (?: // {authority} {path_abempty} ) |
+    {path_absolute} |
+    {path_rootless} |
+    {path_empty}
+)
 """
 
 #   URI           = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
-URI = rf"""(?P<uri>
-                {scheme} : {hier_part} (?: \? {query} )? (?: \# {fragment} )?
-            )
-"""
+URI = rf"(?P<uri> {scheme} : {hier_part} (?: \? {query} )? (?: \# {fragment} )?)"
 
 # -----------------------------------------------------------------------------
 #
@@ -246,19 +251,20 @@
 #                 / path-noscheme
 #                 / path-empty
 #   relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
-relative_ref = rf"""(?P<relative_ref>
-                        (?:
-                            (?: //
-                                {authority}
-                                (?P<path_abempty> {path_abempty} )
-                            ) |
-                            (?P<path_absolute> {path_absolute} ) |
-                            (?P<path_noscheme> {path_noscheme} ) |
-                            (?P<path_empty> {path_empty} )
-                         )
-                         (?: \? {query} )?
-                         (?: \# {fragment} )?
-                     )
+relative_ref = rf"""\
+(?P<relative_ref>
+    (?:
+        (?: //
+        {authority}
+        (?P<path_abempty> {path_abempty} )
+        ) |
+        (?P<path_absolute> {path_absolute} ) |
+        (?P<path_noscheme> {path_noscheme} ) |
+        (?P<path_empty> {path_empty} )
+        )
+    (?: \? {query} )?
+    (?: \# {fragment} )?
+)
 """
 
 # -----------------------------------------------------------------------------
@@ -274,9 +280,16 @@
 # NCNameChar	::=	Letter | Digit | '.' | '-' | '_'
 NCNameChar = rf"(?: {ALPHA} | {DIGIT} | \. | \- | _ )"
 
-# prefix      :=   NCName
-# NCName  :=   (Letter | '_') (NCNameChar)*
 prefix = rf"(?: {ALPHA} | _ ) (?: {NCNameChar} )*"
+"""The definition of a prefix.
+
+.. seealso:: https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName
+
+.. code-block::
+
+    prefix := NCName
+    NCName := (Letter | '_') (NCNameChar)*
+"""
 
 # reference   :=   irelative-ref (as defined in IRI)
 # !! IMPORTANT NOTE !!
@@ -284,8 +297,7 @@
 # relative-refs as defined in URI
 # curie       :=   [ [ prefix ] ':' ] reference
 # reference   :=   relative-ref (as defined in URI)
-CURIE = rf"""(?P<CURIE> (?: (?P<prefix> {prefix} )? : )? {relative_ref})
-"""
+CURIE = rf"(?P<CURIE> (?: (?P<prefix> {prefix} )? : )? {relative_ref})"
 
 PREFIX_RE = re.compile(f"^{prefix}$", re.VERBOSE)
 CURIE_RE = re.compile(f"^{CURIE}$", re.VERBOSE)
diff --git a/tests/resources/invalid_curies.txt b/tests/resources/invalid_curies.txt
index 74633dc..75e5cd1 100644
--- a/tests/resources/invalid_curies.txt
+++ b/tests/resources/invalid_curies.txt
@@ -2,3 +2,4 @@ pfx://abc
 pfx://
 ://
 /
+smiles:CC(=O)NC([H])(C)C(=O)O
diff --git a/tests/test_api.py b/tests/test_api.py
index b361f41..dd32352 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -808,6 +808,18 @@ def test_rdflib(self):
         converter_2 = Converter.from_rdflib(graph.namespace_manager)
         self._assert_convert(converter_2)
 
+    def test_expand_w3c_invalid(self):
+        """Test that expanding a non-w3c-conformant CURIE can lead to errors."""
+        converter = Converter.from_prefix_map(
+            {
+                "smiles": "https://bioregistry.io/smiles:",
+            }
+        )
+        curie = "smiles:CC(=O)NC([H])(C)C(=O)O"
+        self.assertIsNotNone(converter.expand(curie))
+        with self.assertRaises(ValueError):
+            converter.expand(curie, require_w3c_spec=True)
+
     def test_expand_all(self):
         """Test expand all."""
         priority_prefix_map = {

From 4c3e12b3ac12998dcd6340deb22cb494d817cb5b Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Mon, 11 Mar 2024 10:29:34 +0100
Subject: [PATCH 05/19] Rename W3C options to w3c_validation / w3c_validate

---
 src/curies/api.py | 24 ++++++++++++++++--------
 tests/test_api.py |  6 +++---
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/curies/api.py b/src/curies/api.py
index 704aa51..7ccdef0 100644
--- a/src/curies/api.py
+++ b/src/curies/api.py
@@ -307,7 +307,7 @@ def _key(self) -> RecordKey:
             ",".join(sorted(self.uri_prefix_synonyms)),
         )
 
-    def is_w3c_compliant(self) -> bool:
+    def w3c_validate(self) -> bool:
         """Check if all prefixes in this record are w3c compliant."""
         all_curie_prefixes_valid = all(curie_prefix_is_w3c(prefix) for prefix in self._all_prefixes)
         # TODO extend to check URI prefixes?
@@ -480,7 +480,12 @@ class Converter:
     pattern_map: Dict[str, str]
 
     def __init__(
-        self, records: List[Record], *, delimiter: str = ":", strict: bool = True, w3c: bool = False
+        self,
+        records: List[Record],
+        *,
+        delimiter: str = ":",
+        strict: bool = True,
+        w3c_validation: bool = False,
     ) -> None:
         """Instantiate a converter.
 
@@ -490,7 +495,7 @@ def __init__(
             If true, raises issues on duplicate URI prefixes
         :param delimiter:
             The delimiter used for CURIEs. Defaults to a colon.
-        :param w3c:
+        :param w3c_validation:
             If true, validate all records against the
             `W3C CURIE Syntax 1.0 <https://www.w3.org/TR/2010/NOTE-curie-20101216/>`_.
             This includes the following:
@@ -500,6 +505,7 @@ def __init__(
 
         :raises DuplicatePrefixes: if any records share any synonyms
         :raises DuplicateURIPrefixes: if any records share any URI prefixes
+        :rasies ValueError: If w3c validation is on and there are non-conformant records
         """
         if strict:
             duplicate_uri_prefixes = _get_duplicate_uri_prefixes(records)
@@ -509,8 +515,8 @@ def __init__(
             if duplicate_prefixes:
                 raise DuplicatePrefixes(duplicate_prefixes)
 
-        if w3c:
-            broken = [record for record in records if not record.is_w3c_compliant()]
+        if w3c_validation:
+            broken = [record for record in records if not record.w3c_validate()]
             if broken:
                 raise ValueError(f"Records not conforming to W3C: {broken}")
 
@@ -1344,7 +1350,7 @@ def expand(
         *,
         strict: bool = False,
         passthrough: bool = False,
-        require_w3c_spec: bool = False,
+        w3c_validation: bool = False,
     ) -> Optional[str]:
         """Expand a CURIE to a URI, if possible.
 
@@ -1354,12 +1360,14 @@ def expand(
         :param passthrough: If true, strict is false, and the CURIE can't be expanded, return the input.
             Defaults to false. If your strings can either be a CURIE _or_ a URI, consider using
             :meth:`Converter.expand_ambiguous` instead.
-        :param require_w3c_spec: If true, requires CURIEs to be valid against the
+        :param w3c_validation: If true, requires CURIEs to be valid against the
             `W3C CURIE specification <https://www.w3.org/TR/2010/NOTE-curie-20101216/>`_.
         :returns:
             A URI if this converter contains a URI prefix for the prefix in this CURIE
         :raises ExpansionError:
             If strict is true and the CURIE can't be expanded
+        :raises ValueError:
+            If W3C validation is turned on and the CURIE is not valid under the CURIE specification
 
         >>> from curies import Converter
         >>> converter = Converter.from_prefix_map({
@@ -1380,7 +1388,7 @@ def expand(
             ``http://purl.obolibrary.org/obo/GO_0032571`` will return ``GO:0032571``
             instead of ``OBO:GO_0032571``.
         """
-        if require_w3c_spec and not curie_is_w3c(curie):
+        if w3c_validation and not curie_is_w3c(curie):
             raise ValueError(f"CURIE is not valid under W3C spec: {curie}")
         prefix, identifier = self.parse_curie(curie)
         rv = self.expand_pair(prefix, identifier)
diff --git a/tests/test_api.py b/tests/test_api.py
index dd32352..7391519 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -67,8 +67,8 @@ def test_w3c_prefix(self):
             r1 = Record(prefix=prefix, uri_prefix=uri_prefix)
             r2 = Record(prefix="prefix", prefix_synonyms=[prefix], uri_prefix=uri_prefix)
             with self.subTest(prefix=prefix):
-                self.assertEqual(value, r1.is_w3c_compliant())
-                self.assertEqual(value, r2.is_w3c_compliant())
+                self.assertEqual(value, r1.w3c_validate())
+                self.assertEqual(value, r2.w3c_validate())
 
 
 class TestAddRecord(unittest.TestCase):
@@ -818,7 +818,7 @@ def test_expand_w3c_invalid(self):
         curie = "smiles:CC(=O)NC([H])(C)C(=O)O"
         self.assertIsNotNone(converter.expand(curie))
         with self.assertRaises(ValueError):
-            converter.expand(curie, require_w3c_spec=True)
+            converter.expand(curie, w3c_validation=True)
 
     def test_expand_all(self):
         """Test expand all."""

From 011a9943e7390e77003c34490cbf73550e590a7a Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Mon, 11 Mar 2024 10:31:20 +0100
Subject: [PATCH 06/19] Fix flake8 comment warning and install tox-uv in CI

---
 .github/workflows/tests.yml | 2 +-
 src/curies/xx.py            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 5e83b4c..f3ff47b 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -43,7 +43,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
         run: |
-          pip install tox
+          pip install tox tox-uv
           sudo apt-get install graphviz
       - name: Check RST conformity with doc8
         run: tox run -e doc8
diff --git a/src/curies/xx.py b/src/curies/xx.py
index a3a2dc9..21fe11a 100644
--- a/src/curies/xx.py
+++ b/src/curies/xx.py
@@ -108,7 +108,7 @@
 #   reg-name      = *( unreserved / pct-encoded / sub-delims )
 reg_name = rf"(?: {unreserved} | {pct_encoded} | {sub_delims} )*"
 
-### required for Path
+# required for Path
 
 #   segment       = *pchar
 segment = rf"{pchar}*"

From e78078b021d27684b0be542576c0f33ea1bb3ecf Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Mon, 11 Mar 2024 10:33:18 +0100
Subject: [PATCH 07/19] Update api.py

---
 src/curies/api.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/curies/api.py b/src/curies/api.py
index 7ccdef0..468da30 100644
--- a/src/curies/api.py
+++ b/src/curies/api.py
@@ -500,12 +500,12 @@ def __init__(
             `W3C CURIE Syntax 1.0 <https://www.w3.org/TR/2010/NOTE-curie-20101216/>`_.
             This includes the following:
 
-              1. Checking CURIE prefixes and CURIE prefix synonyms against the
-                 W3C definition for `NCName <https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName>`_
+            1. Checking CURIE prefixes and CURIE prefix synonyms against the
+               W3C definition for `NCName <https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName>`_
 
         :raises DuplicatePrefixes: if any records share any synonyms
         :raises DuplicateURIPrefixes: if any records share any URI prefixes
-        :rasies ValueError: If w3c validation is on and there are non-conformant records
+        :raises ValueError: If w3c validation is on and there are non-conformant records
         """
         if strict:
             duplicate_uri_prefixes = _get_duplicate_uri_prefixes(records)

From 02e48f191ff93d3d7e3cc114a2f90bfcf9c44adc Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Mon, 11 Mar 2024 10:34:14 +0100
Subject: [PATCH 08/19] Update tests.yml

---
 .github/workflows/tests.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index f3ff47b..82d8d31 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -20,7 +20,7 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
-        run: pip install tox
+        run: pip install tox tox-uv
       - name: Check manifest
         run: tox run -e manifest
       - name: Check code quality with flake8
@@ -66,7 +66,7 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
-        run: pip install tox
+        run: pip install tox tox-uv
       - name: Test with pytest and generate coverage file
         run:
           tox run -e py-pydantic${{ matrix.pydantic }}

From 8c16a7d5552462e2caa734142b8d8d1a58c8f871 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Mon, 11 Mar 2024 10:40:56 +0100
Subject: [PATCH 09/19] Move CURIE prefix regex from xx.py into w3c.py

---
 src/curies/api.py |  3 +--
 src/curies/w3c.py | 15 ++++++++++++++-
 src/curies/xx.py  | 25 +------------------------
 3 files changed, 16 insertions(+), 27 deletions(-)

diff --git a/src/curies/api.py b/src/curies/api.py
index 468da30..e8ac76a 100644
--- a/src/curies/api.py
+++ b/src/curies/api.py
@@ -37,8 +37,7 @@
 from pytrie import StringTrie
 
 from ._pydantic_compat import field_validator, get_field_validator_values
-from .w3c import CURIE_RE
-from .xx import PREFIX_RE
+from .w3c import CURIE_RE, PREFIX_RE
 
 if TYPE_CHECKING:  # pragma: no cover
     import pandas
diff --git a/src/curies/w3c.py b/src/curies/w3c.py
index e7c1ae2..bc374b0 100644
--- a/src/curies/w3c.py
+++ b/src/curies/w3c.py
@@ -8,7 +8,20 @@
 
 import re
 
-# Borrowed from https://gist.github.com/niklasl/2506955
+_PREFIX_RE = rf"[A-Za-z_][A-Za-z0-9\.\-_]*"
+"""The definition of a prefix, from https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName.
+
+.. code-block::
+
+    prefix := NCName
+    NCName := (Letter | '_') (NCNameChar)*
+    NCNameChar	::=	Letter | Digit | '.' | '-' | '_'
+"""
+
+PREFIX_RE = re.compile(f"^{_PREFIX_RE}$")
+
+
+#: Borrowed from https://gist.github.com/niklasl/2506955
 CURIE_PATTERN = r"(([\i-[:]][\c-[:]]*)?:)?(/[^\s/][^\s]*|[^\s/][^\s]*|[^\s]?)"
 CURIE_PATTERN = CURIE_PATTERN.replace(r"\i-[:]", r"_A-Za-z").replace(r"\c-[:]", r"-._:A-Za-z0-9")
 CURIE_RE = re.compile(CURIE_PATTERN)
diff --git a/src/curies/xx.py b/src/curies/xx.py
index 21fe11a..8d886dd 100644
--- a/src/curies/xx.py
+++ b/src/curies/xx.py
@@ -267,29 +267,6 @@
 )
 """
 
-# -----------------------------------------------------------------------------
-#
-# Define CURIE according W3C CURIE Syntax 1.0
-# https://www.w3.org/TR/curie/#s_syntax
-#
-
-# NCNameChar	::=	Letter | Digit | '.' | '-' | '_' | CombiningChar | Extender
-# !! IMPORTANT NOTE !!
-# As of now this module doesn't support NCNameChar IRI, but
-# relative-refs as defined in URI,
-# NCNameChar	::=	Letter | Digit | '.' | '-' | '_'
-NCNameChar = rf"(?: {ALPHA} | {DIGIT} | \. | \- | _ )"
-
-prefix = rf"(?: {ALPHA} | _ ) (?: {NCNameChar} )*"
-"""The definition of a prefix.
-
-.. seealso:: https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName
-
-.. code-block::
-
-    prefix := NCName
-    NCName := (Letter | '_') (NCNameChar)*
-"""
 
 # reference   :=   irelative-ref (as defined in IRI)
 # !! IMPORTANT NOTE !!
@@ -299,5 +276,5 @@
 # reference   :=   relative-ref (as defined in URI)
 CURIE = rf"(?P<CURIE> (?: (?P<prefix> {prefix} )? : )? {relative_ref})"
 
-PREFIX_RE = re.compile(f"^{prefix}$", re.VERBOSE)
+
 CURIE_RE = re.compile(f"^{CURIE}$", re.VERBOSE)

From 039f729ec67f748bd8d3a3a797c4982fcba081a7 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Mon, 11 Mar 2024 10:47:10 +0100
Subject: [PATCH 10/19] Remove unused xx.py regex module

---
 src/curies/w3c.py |   2 +-
 src/curies/xx.py  | 280 ----------------------------------------------
 2 files changed, 1 insertion(+), 281 deletions(-)
 delete mode 100644 src/curies/xx.py

diff --git a/src/curies/w3c.py b/src/curies/w3c.py
index bc374b0..2317c24 100644
--- a/src/curies/w3c.py
+++ b/src/curies/w3c.py
@@ -8,7 +8,7 @@
 
 import re
 
-_PREFIX_RE = rf"[A-Za-z_][A-Za-z0-9\.\-_]*"
+_PREFIX_RE = r"[A-Za-z_][A-Za-z0-9\.\-_]*"
 """The definition of a prefix, from https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName.
 
 .. code-block::
diff --git a/src/curies/xx.py b/src/curies/xx.py
deleted file mode 100644
index 8d886dd..0000000
--- a/src/curies/xx.py
+++ /dev/null
@@ -1,280 +0,0 @@
-"""A regular expression implementation of the W3C CURIEs Syntax.
-
-These regular expressions are directly derived from the official sources mentioned in each
-section.
-
-They should be processed with :data:`re.VERBOSE` to remove comments and other
-non-essential annotations.
-
-Python named regular expression groups are being used to better understand the
-URI/CURIE parsing.
-
-adapted from https://github.com/linkml/linkml-runtime/blob/main/linkml_runtime/utils/uri_validator.py, which
-was originally distributed under the CC-0 license
-
-Relevant documents:
-
-1. W3C CURIES Syntax 1.0 in https://www.w3.org/TR/2010/NOTE-curie-20101216/
-2. NCName definition (i.e., prefix) in https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName
-2. IRI definition in https://www.ietf.org/rfc/rfc3987.txt
-"""
-
-import re
-
-#: Define DIGIT according RFC2234 section 3.4:
-#: https://datatracker.ietf.org/doc/html/rfc2234/#section-3.4
-DIGIT = "[0-9]"
-
-#: Define ALPHA (i.e., Letter) according RFC2234 section 6.1:
-#: https://datatracker.ietf.org/doc/html/rfc2234/#section-6.1
-ALPHA = "[A-Za-z]"
-
-#: Define HEXDIG according RFC2234 section 6.1:
-#: https://datatracker.ietf.org/doc/html/rfc2234/#section-6.1
-HEXDIG = "[0-9A-F]"
-
-#   pct-encoded   = "%" HEXDIG HEXDIG
-pct_encoded = f"%{HEXDIG}{{2}}"
-
-# unreserved = rf"(?: {ALPHA} | {DIGIT} | \- | \. | _ | ~ )"
-unreserved = r"[A-Za-z0-9\-\._~]"
-"""Defined in page 8 of https://www.ietf.org/rfc/rfc3987.txt as:
-
-.. code-block::
-
-    unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
-"""
-
-# gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
-gen_delims = r"(?: : | / | \? | \# | \[ | \] | @ )"
-
-#   sub-delims    = "!" / "$" / "&" / "'" / "("
-sub_delims = r"(?: ! | \$ | & | ' | \( | \) | \* | \+ | , | ; | = )"
-
-#   pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
-pchar = rf"(?: {unreserved} | {pct_encoded} | {sub_delims} | : | @ )"
-
-#   reserved      = gen-delims / sub-delims
-reserved = rf"(?: {gen_delims} | {sub_delims} )"
-
-dec_octet = rf"(?: {DIGIT} | [1-9]{DIGIT} | 1{DIGIT}{{2}} | 2[0-4]{DIGIT} | 25[0-5])"
-"""A definition of numbers between 1-255.
-
-.. code-block::
-
-    dec-octet = DIGIT ; 0-9
-                / %x31-39 DIGIT         ; 10-99
-                / "1" 2DIGIT            ; 100-199
-                / "2" %x30-34 DIGIT     ; 200-249
-                / "25" %x30-35          ; 250-255
-"""
-
-#  IPv4address   = dec-octet "." dec-octet "." dec-octet "." dec-octet
-IPv4address = rf"{dec_octet}\.{dec_octet}\.{dec_octet}\.{dec_octet}"
-
-#  h16           = 1*4HEXDIG
-h16 = rf"(?: {HEXDIG} ){{1,4}}"
-
-#  ls32          = ( h16 ":" h16 ) / IPv4address
-ls32 = rf"(?: (?: {h16} : {h16} ) | {IPv4address} )"
-
-#   IPv6address   =                            6( h16 ":" ) ls32
-#                 /                       "::" 5( h16 ":" ) ls32
-#                 / [               h16 ] "::" 4( h16 ":" ) ls32
-#                 / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
-#                 / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
-#                 / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
-#                 / [ *4( h16 ":" ) h16 ] "::"              ls32
-#                 / [ *5( h16 ":" ) h16 ] "::"              h16
-#                 / [ *6( h16 ":" ) h16 ] "::"
-IPv6address = rf"""(?:                              (?: {h16} : ){{6}} {ls32} |
-                                                 :: (?: {h16} : ){{5}} {ls32} |
-                                    (?: {h16} )? :: (?: {h16} : ){{4}} {ls32} |
-               (?: (?: {h16} : )        {h16} )? :: (?: {h16} : ){{3}} {ls32} |
-               (?: (?: {h16} : ){{1,2}} {h16} )? :: (?: {h16} : ){{2}} {ls32} |
-               (?: (?: {h16} : ){{1,3}} {h16} )? ::     {h16} :        {ls32} |
-               (?: (?: {h16} : ){{1,4}} {h16} )? ::                    {ls32} |
-               (?: (?: {h16} : ){{1,5}} {h16} )? ::                    {h16}  |
-               (?: (?: {h16} : ){{1,6}} {h16} )? ::
-              )
-"""
-
-#   IPvFuture     = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
-IPvFuture = rf"v {HEXDIG}+ \. (?: {unreserved} | {sub_delims} | : )+"
-
-#   IP-literal    = "[" ( IPv6address / IPvFuture  ) "]"
-IP_literal = rf"\[ (?: {IPv6address} | {IPvFuture} ) \]"
-
-#   reg-name      = *( unreserved / pct-encoded / sub-delims )
-reg_name = rf"(?: {unreserved} | {pct_encoded} | {sub_delims} )*"
-
-# required for Path
-
-#   segment       = *pchar
-segment = rf"{pchar}*"
-
-#   segment-nz    = 1*pchar
-segment_nz = rf"{pchar}+"
-
-#   segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
-segment_nz_nc = rf"(?: {unreserved} | {pct_encoded} | {sub_delims} | @ )+"
-
-# -----------------------------------------------------------------------------
-#
-# Define SCHEME according RFC3986 section 3.1:
-# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.1
-#
-
-#   scheme        = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
-scheme = rf"(?P<scheme> {ALPHA} (?: {ALPHA} | {DIGIT} | \+ | \- | \. )* )"
-
-# -----------------------------------------------------------------------------
-#
-# Define AUTHORITY according RFC3986 section 3.2:
-
-# Define USER INFORMATION according RFC3986 section 3.2.1:
-# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.1
-
-#   userinfo      = *( unreserved / pct-encoded / sub-delims / ":" )
-userinfo = rf"""(?P<userinfo>
-                    (?: {unreserved} | {pct_encoded} | {sub_delims} | : )*
-                )
-"""
-
-# Define HOST according RFC3986 section 3.2.2:
-# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
-
-#   host          = IP-literal / IPv4address / reg-name
-host = rf"(?P<host> {IP_literal} | {IPv4address} | {reg_name} )"
-
-# Define PORT according RFC3986 section 3.2.3:
-# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.3
-
-#   port          = *DIGIT
-port = rf"(?P<port> ( {DIGIT} )* )"
-
-# Define AUTHORITY according RFC3986 section 3.2:
-# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2
-#
-
-#   authority     = [ userinfo "@" ] host [ ":" port ]
-# authority = rf"""(?: (?P<userinfo> {userinfo} ) @)?
-authority = rf"""(?P<authority>
-                    (?: {userinfo} @)?
-                    {host}
-                    (?: : {port} )?
-                )
-"""
-
-# -----------------------------------------------------------------------------
-#
-# Define different PATHs according RFC3986 section 3.3:
-# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.3
-#
-
-#   path-abempty  = *( "/" segment )
-path_abempty = rf"( / {segment} )*"
-
-#   path-absolute = "/" [ segment-nz *( "/" segment ) ]
-path_absolute = rf"( / (?: {segment_nz} (?: / {segment} )* )? )"
-
-#   path-noscheme = segment-nz-nc *( "/" segment )
-path_noscheme = rf"( {segment_nz_nc} (?: / {segment} )* )"
-
-#   path-rootless = segment-nz *( "/" segment )
-path_rootless = rf"( {segment_nz} (?: / {segment} )* )"
-
-#   path-empty    = 0<pchar>
-path_empty = r""
-
-#   path          = path-abempty    ; begins with "/" or is empty
-#                 / path-absolute   ; begins with "/" but not "//"
-#                 / path-noscheme   ; begins with a non-colon segment
-#                 / path-rootless   ; begins with a segment
-#                 / path-empty      ; zero characters
-path = rf"""(?:
-   {path_abempty} |
-   {path_absolute} |
-   {path_noscheme} |
-   {path_rootless} |
-   {path_empty}
-)
-"""
-
-# -----------------------------------------------------------------------------
-#
-# Define QUERY according RFC3986 section 3.4:
-# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.4
-#
-
-#   query         = *( pchar / "/" / "?" )
-query = rf"(?P<query> (?: {pchar} | / | \? )* )"
-
-# -----------------------------------------------------------------------------
-#
-# Define FRAGMENT according RFC3986 section 3.5:
-# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.5
-#
-
-#   fragment      = *( pchar / "/" / "?" )
-fragment = rf"(?P<fragment> (?: {pchar} | / | \? )* )"
-
-# -----------------------------------------------------------------------------
-#
-# Define URI and HIERARCHICAL PATH according RFC3986 section 3:
-# https://datatracker.ietf.org/doc/html/rfc3986/#section-3
-#
-
-#   hier-part     = "//" authority path-abempty
-#                 / path-absolute
-#                 / path-rootless
-#                 / path-empty
-hier_part = rf"""(?P<hier_part>
-    (?: // {authority} {path_abempty} ) |
-    {path_absolute} |
-    {path_rootless} |
-    {path_empty}
-)
-"""
-
-#   URI           = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
-URI = rf"(?P<uri> {scheme} : {hier_part} (?: \? {query} )? (?: \# {fragment} )?)"
-
-# -----------------------------------------------------------------------------
-#
-# Define RELATIVE REFERENCE according RFC3986 section 4.2:
-# https://datatracker.ietf.org/doc/html/rfc3986/#section-4.2
-#
-
-#   relative-part = "//" authority path-abempty
-#                 / path-absolute
-#                 / path-noscheme
-#                 / path-empty
-#   relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
-relative_ref = rf"""\
-(?P<relative_ref>
-    (?:
-        (?: //
-        {authority}
-        (?P<path_abempty> {path_abempty} )
-        ) |
-        (?P<path_absolute> {path_absolute} ) |
-        (?P<path_noscheme> {path_noscheme} ) |
-        (?P<path_empty> {path_empty} )
-        )
-    (?: \? {query} )?
-    (?: \# {fragment} )?
-)
-"""
-
-
-# reference   :=   irelative-ref (as defined in IRI)
-# !! IMPORTANT NOTE !!
-# As of now this module don't support irelative-refs as defined in IRI, but
-# relative-refs as defined in URI
-# curie       :=   [ [ prefix ] ':' ] reference
-# reference   :=   relative-ref (as defined in URI)
-CURIE = rf"(?P<CURIE> (?: (?P<prefix> {prefix} )? : )? {relative_ref})"
-
-
-CURIE_RE = re.compile(f"^{CURIE}$", re.VERBOSE)

From 6bc348ef022984c7a68e88612a270714081e1631 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Mon, 11 Mar 2024 10:48:41 +0100
Subject: [PATCH 11/19] Update tests.yml

---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 82d8d31..4be0327 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -20,7 +20,7 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
-        run: pip install tox tox-uv
+        run: pip install tox
       - name: Check manifest
         run: tox run -e manifest
       - name: Check code quality with flake8

From 62c6e6e501ae218cf3b10b18eebbbdd06a95539a Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Mon, 11 Mar 2024 10:56:10 +0100
Subject: [PATCH 12/19] Add explicit error

---
 src/curies/api.py                  | 14 ++++++++++----
 tests/resources/invalid_curies.txt |  4 ----
 tests/test_api.py                  |  3 ++-
 tests/test_w3c.py                  | 15 ++++++++++++++-
 4 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/src/curies/api.py b/src/curies/api.py
index e8ac76a..fddb1a4 100644
--- a/src/curies/api.py
+++ b/src/curies/api.py
@@ -48,9 +48,11 @@
     "Reference",
     "ReferenceTuple",
     "Record",
+    # Exceptions
     "DuplicateValueError",
     "DuplicatePrefixes",
     "DuplicateURIPrefixes",
+    "W3CValidationError",
     # Utilities
     "chain",
     "upgrade_prefix_map",
@@ -377,6 +379,10 @@ class URIStandardizationError(StandardizationError):
     """An error raise when a URI can't be standardized."""
 
 
+class W3CValidationError(ValueError):
+    """An error when W3C validation fails."""
+
+
 def _get_duplicate_uri_prefixes(records: List[Record]) -> List[DuplicateSummary]:
     return [
         DuplicateSummary(record_1, record_2, uri_prefix)
@@ -504,7 +510,7 @@ def __init__(
 
         :raises DuplicatePrefixes: if any records share any synonyms
         :raises DuplicateURIPrefixes: if any records share any URI prefixes
-        :raises ValueError: If w3c validation is on and there are non-conformant records
+        :raises W3CValidationError: If w3c validation is on and there are non-conformant records
         """
         if strict:
             duplicate_uri_prefixes = _get_duplicate_uri_prefixes(records)
@@ -517,7 +523,7 @@ def __init__(
         if w3c_validation:
             broken = [record for record in records if not record.w3c_validate()]
             if broken:
-                raise ValueError(f"Records not conforming to W3C: {broken}")
+                raise W3CValidationError(f"Records not conforming to W3C: {broken}")
 
         self.delimiter = delimiter
         self.records = sorted(records, key=lambda r: r.prefix)
@@ -1365,7 +1371,7 @@ def expand(
             A URI if this converter contains a URI prefix for the prefix in this CURIE
         :raises ExpansionError:
             If strict is true and the CURIE can't be expanded
-        :raises ValueError:
+        :raises W3CValidationError:
             If W3C validation is turned on and the CURIE is not valid under the CURIE specification
 
         >>> from curies import Converter
@@ -1388,7 +1394,7 @@ def expand(
             instead of ``OBO:GO_0032571``.
         """
         if w3c_validation and not curie_is_w3c(curie):
-            raise ValueError(f"CURIE is not valid under W3C spec: {curie}")
+            raise W3CValidationError(f"CURIE is not valid under W3C spec: {curie}")
         prefix, identifier = self.parse_curie(curie)
         rv = self.expand_pair(prefix, identifier)
         if rv:
diff --git a/tests/resources/invalid_curies.txt b/tests/resources/invalid_curies.txt
index 75e5cd1..696858d 100644
--- a/tests/resources/invalid_curies.txt
+++ b/tests/resources/invalid_curies.txt
@@ -1,5 +1 @@
-pfx://abc
-pfx://
-://
-/
 smiles:CC(=O)NC([H])(C)C(=O)O
diff --git a/tests/test_api.py b/tests/test_api.py
index 7391519..3c1d192 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -24,6 +24,7 @@
     Reference,
     ReferenceTuple,
     URIStandardizationError,
+    W3CValidationError,
     chain,
     upgrade_prefix_map,
 )
@@ -817,7 +818,7 @@ def test_expand_w3c_invalid(self):
         )
         curie = "smiles:CC(=O)NC([H])(C)C(=O)O"
         self.assertIsNotNone(converter.expand(curie))
-        with self.assertRaises(ValueError):
+        with self.assertRaises(W3CValidationError):
             converter.expand(curie, w3c_validation=True)
 
     def test_expand_all(self):
diff --git a/tests/test_w3c.py b/tests/test_w3c.py
index d0e6172..8e9fed4 100644
--- a/tests/test_w3c.py
+++ b/tests/test_w3c.py
@@ -14,11 +14,24 @@
 class TestW3C(unittest.TestCase):
     """Tests for W3C utilities."""
 
-    def test_validating_curies(self):
+    def test_valid_curies(self):
         """Test validating CURIEs."""
         for curie in VALID_CURIES_PATH.read_text().splitlines():
             with self.subTest(curie=curie):
                 self.assertTrue(curie_is_w3c(curie))
+
+    def test_invalid_curies(self):
+        """Test validating CURIEs.
+
+        .. todo::
+
+            Later, extend this to the following:
+
+            1. ``pfx://abc``
+            2. ``pfx://``
+            3. ``://``
+            4. ``/``
+        """
         for curie in INVALID_CURIES_PATH.read_text().splitlines():
             with self.subTest(curie=curie):
                 self.assertFalse(curie_is_w3c(curie))

From 0398a67669032d8e1f7514feeab83a959039862e Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Mon, 11 Mar 2024 11:07:18 +0100
Subject: [PATCH 13/19] Reject empty and digit-leading CURIEs in curie_is_w3c

---
 src/curies/api.py                  | 4 ++++
 tests/resources/invalid_curies.txt | 2 ++
 tests/resources/valid_curies.txt   | 1 -
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/curies/api.py b/src/curies/api.py
index fddb1a4..f6fcbaa 100644
--- a/src/curies/api.py
+++ b/src/curies/api.py
@@ -2374,6 +2374,10 @@ def curie_is_w3c(s: str) -> bool:
     """
     if "[" in s or "]" in s:
         return False
+    if not s.strip():
+        return False
+    if s[0].isdigit():
+        return False  # TODO get that into the regex
     return bool(CURIE_RE.match(s))
 
 
diff --git a/tests/resources/invalid_curies.txt b/tests/resources/invalid_curies.txt
index 696858d..d5d9efd 100644
--- a/tests/resources/invalid_curies.txt
+++ b/tests/resources/invalid_curies.txt
@@ -1 +1,3 @@
 smiles:CC(=O)NC([H])(C)C(=O)O
+4cdn:test
+
diff --git a/tests/resources/valid_curies.txt b/tests/resources/valid_curies.txt
index 1066d8d..8ed858c 100644
--- a/tests/resources/valid_curies.txt
+++ b/tests/resources/valid_curies.txt
@@ -3,7 +3,6 @@ pfx:abc
 pfx:
 abc
 :abc
-
 pfx:/abc
 pfx:/
 :/

From 80c3038c6b25ca0b5204d05c3c786df1d23d36ea Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Tue, 12 Mar 2024 07:33:49 +0100
Subject: [PATCH 14/19] Rename PREFIX_RE to CURIE_PREFIX_RE and add URI_PREFIX_RE

---
 src/curies/api.py                  |  4 ++--
 src/curies/w3c.py                  | 15 ++++++++++-----
 tests/resources/invalid_curies.txt |  2 +-
 tests/resources/valid_curies.txt   |  3 +++
 4 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/curies/api.py b/src/curies/api.py
index f6fcbaa..b256925 100644
--- a/src/curies/api.py
+++ b/src/curies/api.py
@@ -37,7 +37,7 @@
 from pytrie import StringTrie
 
 from ._pydantic_compat import field_validator, get_field_validator_values
-from .w3c import CURIE_RE, PREFIX_RE
+from .w3c import CURIE_PREFIX_RE, CURIE_RE, URI_PREFIX_RE
 
 if TYPE_CHECKING:  # pragma: no cover
     import pandas
@@ -2383,4 +2383,4 @@ def curie_is_w3c(s: str) -> bool:
 
 def curie_prefix_is_w3c(s: str) -> bool:
     """Return if the CURIE prefix is valid under the W3C specification."""
-    return bool(PREFIX_RE.match(s))
+    return bool(CURIE_PREFIX_RE.match(s))
diff --git a/src/curies/w3c.py b/src/curies/w3c.py
index 2317c24..ed3a652 100644
--- a/src/curies/w3c.py
+++ b/src/curies/w3c.py
@@ -8,7 +8,7 @@
 
 import re
 
-_PREFIX_RE = r"[A-Za-z_][A-Za-z0-9\.\-_]*"
+_CURIE_PREFIX_RE = r"[A-Za-z_][A-Za-z0-9\.\-_]*"
 """The definition of a prefix, from https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName.
 
 .. code-block::
@@ -18,10 +18,15 @@
     NCNameChar	::=	Letter | Digit | '.' | '-' | '_'
 """
 
-PREFIX_RE = re.compile(f"^{_PREFIX_RE}$")
+CURIE_PREFIX_RE = re.compile(f"^{_CURIE_PREFIX_RE}$")
 
+#: Borrowed from https://github.com/linkml/prefixmaps/blob/82bfdbc/src/prefixmaps/datamodel/context.py#L26C1-L26C60
+#: Still needs adapting to see if there's an actual standard to match this to,
+#: or if this is an opinionated implementation
+URI_PREFIX_RE = re.compile(r"http[s]?://[\w\.\-\/]+[#/_:]$")
 
-#: Borrowed from https://gist.github.com/niklasl/2506955
-CURIE_PATTERN = r"(([\i-[:]][\c-[:]]*)?:)?(/[^\s/][^\s]*|[^\s/][^\s]*|[^\s]?)"
-CURIE_PATTERN = CURIE_PATTERN.replace(r"\i-[:]", r"_A-Za-z").replace(r"\c-[:]", r"-._:A-Za-z0-9")
+#: Adapted from https://gist.github.com/niklasl/2506955
+_IDENTIFIER_RE = r"(/[^\s/][^\s]*|[^\s/][^\s]*|[^\s]?)"
+
+CURIE_PATTERN = rf"({_CURIE_PREFIX_RE}?:)?{_IDENTIFIER_RE}"
 CURIE_RE = re.compile(CURIE_PATTERN)
diff --git a/tests/resources/invalid_curies.txt b/tests/resources/invalid_curies.txt
index d5d9efd..b77661c 100644
--- a/tests/resources/invalid_curies.txt
+++ b/tests/resources/invalid_curies.txt
@@ -1,3 +1,3 @@
+
 smiles:CC(=O)NC([H])(C)C(=O)O
 4cdn:test
-
diff --git a/tests/resources/valid_curies.txt b/tests/resources/valid_curies.txt
index 8ed858c..7125989 100644
--- a/tests/resources/valid_curies.txt
+++ b/tests/resources/valid_curies.txt
@@ -6,3 +6,6 @@ abc
 pfx:/abc
 pfx:/
 :/
+bioregistry:bioregistry
+GO:0000012
+go:0123456

From c15a08e4af7db08efd6dbbaddd57ca7aaf9cefb2 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Tue, 12 Mar 2024 07:42:52 +0100
Subject: [PATCH 15/19] Update type checking

---
 src/curies/api.py                | 29 +++++++++++++++++++++++------
 src/curies/w3c.py                |  4 ++--
 tests/resources/valid_curies.txt |  1 +
 tox.ini                          |  4 +++-
 4 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/src/curies/api.py b/src/curies/api.py
index b256925..5e99790 100644
--- a/src/curies/api.py
+++ b/src/curies/api.py
@@ -37,7 +37,7 @@
 from pytrie import StringTrie
 
 from ._pydantic_compat import field_validator, get_field_validator_values
-from .w3c import CURIE_PREFIX_RE, CURIE_RE, URI_PREFIX_RE
+from .w3c import CURIE_PREFIX_RE, CURIE_RE
 
 if TYPE_CHECKING:  # pragma: no cover
     import pandas
@@ -1224,10 +1224,12 @@ def parse_uri(self, uri: str) -> Union[ReferenceTuple, Tuple[None, None]]:
         else:
             return ReferenceTuple(prefix, uri[len(value) :])
 
-    def is_curie(self, s: str) -> bool:
+    def is_curie(self, s: str, *, w3c_validation: bool = False) -> bool:
         """Check if the string can be parsed as a CURIE by this converter.
 
         :param s: A string that might be a CURIE
+        :param w3c_validation: If true, requires CURIEs to be valid against the
+            `W3C CURIE specification <https://www.w3.org/TR/2010/NOTE-curie-20101216/>`_.
         :returns: If the string can be parsed as a CURIE by this converter.
             Note that some valid CURIEs, when passed to this function, will
             result in False if their prefixes are not registered with this
@@ -1248,7 +1250,7 @@ def is_curie(self, s: str) -> bool:
         False
         """
         try:
-            return self.expand(s) is not None
+            return self.expand(s, w3c_validation=w3c_validation) is not None
         except ValueError:
             return False
 
@@ -1334,19 +1336,34 @@ def expand_strict(self, curie: str) -> str:
     # docstr-coverage:excused `overload`
     @overload
     def expand(
-        self, curie: str, *, strict: Literal[True] = True, passthrough: bool = False
+        self,
+        curie: str,
+        *,
+        strict: Literal[True] = True,
+        passthrough: bool = ...,
+        w3c_validation: bool = ...,
     ) -> str: ...
 
     # docstr-coverage:excused `overload`
     @overload
     def expand(
-        self, curie: str, *, strict: Literal[False] = False, passthrough: Literal[True] = True
+        self,
+        curie: str,
+        *,
+        strict: Literal[False] = False,
+        passthrough: Literal[True] = True,
+        w3c_validation: bool = ...,
     ) -> str: ...
 
     # docstr-coverage:excused `overload`
     @overload
     def expand(
-        self, curie: str, *, strict: Literal[False] = False, passthrough: Literal[False] = False
+        self,
+        curie: str,
+        *,
+        strict: Literal[False] = False,
+        passthrough: Literal[False] = False,
+        w3c_validation: bool = ...,
     ) -> Optional[str]: ...
 
     def expand(
diff --git a/src/curies/w3c.py b/src/curies/w3c.py
index ed3a652..2d92ade 100644
--- a/src/curies/w3c.py
+++ b/src/curies/w3c.py
@@ -23,10 +23,10 @@
 #: Borrowed from https://github.com/linkml/prefixmaps/blob/82bfdbc/src/prefixmaps/datamodel/context.py#L26C1-L26C60
 #: Still needs adapting to see if there's an actual standard to match this to,
 #: or if this is an opinionated implementation
-URI_PREFIX_RE = re.compile(r"http[s]?://[\w\.\-\/]+[#/_:]$")
+URI_PREFIX_RE = re.compile(r"^http[s]?://[\w\.\-\/]+[#/_:]$")
 
 #: Adapted from https://gist.github.com/niklasl/2506955
 _IDENTIFIER_RE = r"(/[^\s/][^\s]*|[^\s/][^\s]*|[^\s]?)"
 
-CURIE_PATTERN = rf"({_CURIE_PREFIX_RE}?:)?{_IDENTIFIER_RE}"
+CURIE_PATTERN = rf"^({_CURIE_PREFIX_RE}?:)?{_IDENTIFIER_RE}$"
 CURIE_RE = re.compile(CURIE_PATTERN)
diff --git a/tests/resources/valid_curies.txt b/tests/resources/valid_curies.txt
index 7125989..d919dd8 100644
--- a/tests/resources/valid_curies.txt
+++ b/tests/resources/valid_curies.txt
@@ -3,6 +3,7 @@ pfx:abc
 pfx:
 abc
 :abc
+_:abc
 pfx:/abc
 pfx:/
 :/
diff --git a/tox.ini b/tox.ini
index 68f4354..620ffcb 100644
--- a/tox.ini
+++ b/tox.ini
@@ -110,7 +110,9 @@ commands = pyroma --min=10 .
 description = Run the pyroma tool to check the package friendliness of the project.
 
 [testenv:mypy]
-deps = mypy
+deps =
+    mypy
+    types-requests
 skip_install = true
 commands = mypy --install-types --non-interactive --ignore-missing-imports --strict src/
 description = Run the mypy tool to check static typing on the project.

From 291836aa562e23006f586dfb4738b8de3b83001d Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Tue, 12 Mar 2024 07:59:46 +0100
Subject: [PATCH 16/19] Add more docs

---
 docs/source/index.rst |  1 +
 docs/source/w3c.rst   | 39 +++++++++++++++++++++++++++++++++++++++
 src/curies/api.py     |  1 -
 3 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/w3c.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 8bbe6ba..389e33f 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -66,4 +66,5 @@ for updating your code.
    discovery
    struct
    api
+   w3c
    services/index
diff --git a/docs/source/w3c.rst b/docs/source/w3c.rst
new file mode 100644
index 0000000..17561c8
--- /dev/null
+++ b/docs/source/w3c.rst
@@ -0,0 +1,39 @@
+W3C Compliance
+==============
+The Worldwide Web Consortium (W3C) provides standards for
+`prefixes (i.e., NCName) <https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName>`_),
+`CURIEs <https://www.w3.org/TR/2010/NOTE-curie-20101216/>`_, and
+`IRIs <https://www.ietf.org/rfc/rfc3987.txt>`_, but they are
+highly obfuscated and spread across many documents.
+
+In practice, some usages do not conform to these standards, often due
+to encoding things that aren't *really* supposed to be CURIEs, such as
+SMILES strings for molecules, UCUM codes for units,
+or other language-like "identifiers".
+
+Therefore, it's on the roadmap for the ``curies`` package to support
+operations for validating against the W3C standards and mapping
+between "loose" (i.e., un-URL-encoded) and strict (i.e., URL-encoded)
+CURIEs and IRIs. In practice, this will often solve issues with special
+characters like square brackets (``[`` and ``]``).
+
+.. code-block::
+
+     looseCURIE <-> strictCURIE
+          ^.    \./.    ^
+          |      X      |
+          v     / \.    v
+      looseURI  <->  strictURI
+
+A first step towards accomplishing this was implemented in https://github.com/biopragmatics/curies/pull/104
+by adding a ``w3c_validation`` flag to both the initialization of a :mod:`curies.Converter` as well as in the
+:meth:`curies.Converter.expand` function.
+
+.. seealso::
+
+    1. Discussion on the ``curies`` issue tracker about handling CURIEs that include e.g. square brackets
+       and therefore don't conform to the W3C specification: https://github.com/biopragmatics/curies/issues/103
+    2. Discussion on languages that shouldn't really get encoded in CURIEs, but still do:
+       https://github.com/biopragmatics/bioregistry/issues/460
+    3. Related to (2) - discussion on how to properly encode UCUM in CURIEs:
+       https://github.com/biopragmatics/bioregistry/issues/648
diff --git a/src/curies/api.py b/src/curies/api.py
index 5e99790..b9671e2 100644
--- a/src/curies/api.py
+++ b/src/curies/api.py
@@ -2363,7 +2363,6 @@ def curie_is_w3c(s: str) -> bool:
     :param s: A string to check if it is a valid CURIE under the W3C specification.
     :return: True if the string is a valid CURIE under the W3C specification.
 
-
     If no prefix is given, the host language chooses how to assign a default
     prefix.
 

From 9d8fd8a70fa80866cf3603d04edb1cadb50980fa Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Tue, 12 Mar 2024 08:05:57 +0100
Subject: [PATCH 17/19] Update docs

---
 docs/source/w3c.rst    |  2 +-
 src/curies/__init__.py |  2 ++
 src/curies/api.py      | 30 ++++++++++++++++--------------
 3 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/docs/source/w3c.rst b/docs/source/w3c.rst
index 17561c8..b19a9df 100644
--- a/docs/source/w3c.rst
+++ b/docs/source/w3c.rst
@@ -1,7 +1,7 @@
 W3C Compliance
 ==============
 The Worldwide Web Consortium (W3C) provides standards for
-`prefixes (i.e., NCName) <https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName>`_),
+`prefixes <https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName>`_ (i.e., ``NCName``),
 `CURIEs <https://www.w3.org/TR/2010/NOTE-curie-20101216/>`_, and
 `IRIs <https://www.ietf.org/rfc/rfc3987.txt>`_, but they are
 highly obfuscated and spread across many documents.
diff --git a/src/curies/__init__.py b/src/curies/__init__.py
index 718f198..efc4cf1 100644
--- a/src/curies/__init__.py
+++ b/src/curies/__init__.py
@@ -10,6 +10,7 @@
     Record,
     Reference,
     ReferenceTuple,
+    W3CValidationError,
     chain,
     load_extended_prefix_map,
     load_jsonld_context,
@@ -39,6 +40,7 @@
     "DuplicateValueError",
     "DuplicateURIPrefixes",
     "DuplicatePrefixes",
+    "W3CValidationError",
     "chain",
     "remap_curie_prefixes",
     "remap_uri_prefixes",
diff --git a/src/curies/api.py b/src/curies/api.py
index b9671e2..2344aaf 100644
--- a/src/curies/api.py
+++ b/src/curies/api.py
@@ -1178,16 +1178,27 @@ def compress(
         :raises CompressionError:
             If strict is set to true and the URI can't be compressed
 
-
         >>> from curies import Converter
         >>> converter = Converter.from_prefix_map({
         ...    "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
         ...    "MONDO": "http://purl.obolibrary.org/obo/MONDO_",
         ...    "GO": "http://purl.obolibrary.org/obo/GO_",
+        ...    "OBO": "http://purl.obolibrary.org/obo/",
         ... })
-        >>> converter.compress("http://purl.obolibrary.org/obo/CHEBI_138488")
-        'CHEBI:138488'
+        >>> converter.compress("http://purl.obolibrary.org/obo/GO_0032571")
+        'GO:0032571'
+        >>> converter.compress("http://purl.obolibrary.org/obo/go.owl")
+        'OBO:go.owl'
         >>> converter.compress("http://example.org/missing:0000000")
+
+        .. note::
+
+            If there are partially overlapping *URI prefixes* in this converter
+            (e.g., ``http://purl.obolibrary.org/obo/GO_`` for the prefix ``GO`` and
+            ``http://purl.obolibrary.org/obo/`` for the prefix ``OBO``), the longest
+            URI prefix will always be matched. For example, parsing
+            ``http://purl.obolibrary.org/obo/GO_0032571`` will return ``GO:0032571``
+            instead of ``OBO:GO_0032571``.
         """
         prefix, identifier = self.parse_uri(uri)
         if prefix and identifier:
@@ -1380,8 +1391,8 @@ def expand(
             A string representing a compact URI (CURIE)
         :param strict: If true and the CURIE can't be expanded, returns an error. Defaults to false.
         :param passthrough: If true, strict is false, and the CURIE can't be expanded, return the input.
-            Defaults to false. If your strings can either be a CURIE _or_ a URI, consider using
-            :meth:`Converter.expand_ambiguous` instead.
+            Defaults to false. If your strings can either be a CURIE or a URI, consider using
+            :meth:`Converter.expand_or_standardize` instead.
         :param w3c_validation: If true, requires CURIEs to be valid against the
             `W3C CURIE specification <https://www.w3.org/TR/2010/NOTE-curie-20101216/>`_.
         :returns:
@@ -1400,15 +1411,6 @@ def expand(
         >>> converter.expand("CHEBI:138488")
         'http://purl.obolibrary.org/obo/CHEBI_138488'
         >>> converter.expand("missing:0000000")
-
-        .. note::
-
-            If there are partially overlapping *URI prefixes* in this converter
-            (e.g., ``http://purl.obolibrary.org/obo/GO_`` for the prefix ``GO`` and
-            ``http://purl.obolibrary.org/obo/`` for the prefix ``OBO``), the longest
-            URI prefix will always be matched. For example, parsing
-            ``http://purl.obolibrary.org/obo/GO_0032571`` will return ``GO:0032571``
-            instead of ``OBO:GO_0032571``.
         """
         if w3c_validation and not curie_is_w3c(curie):
             raise W3CValidationError(f"CURIE is not valid under W3C spec: {curie}")

From 24a15b67430e17057733455291b49d845de228dd Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Tue, 12 Mar 2024 09:21:23 +0100
Subject: [PATCH 18/19] Add specific code examples

---
 docs/source/w3c.rst | 57 +++++++++++++++++++++++++++++++++++++++++++++
 src/curies/api.py   |  3 ++-
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/docs/source/w3c.rst b/docs/source/w3c.rst
index b19a9df..2c985ba 100644
--- a/docs/source/w3c.rst
+++ b/docs/source/w3c.rst
@@ -29,6 +29,63 @@ A first step towards accomplishing this was implemented in https://github.com/bi
 by adding a ``w3c_validation`` flag to both the initialization of a :mod:`curies.Converter` as well as in the
 :meth:`curies.Converter.expand` function.
 
+Here's an example of using W3C validation during expansion:
+
+.. code-block::
+
+    import curies
+
+    converter = curies.Converter.from_prefix_map({
+        "smiles": "https://bioregistry.io/smiles:",
+    })
+
+    >>> converter.expand("smiles:CC(=O)NC([H])(C)C(=O)O")
+    'https://bioregistry.io/smiles:CC(=O)NC([H])(C)C(=O)O'
+
+    >>> converter.expand("smiles:CC(=O)NC([H])(C)C(=O)O", w3c_validation=True)
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+      File "/Users/cthoyt/dev/curies/src/curies/api.py", line 1362, in expand
+        raise W3CValidationError(f"CURIE is not valid under W3C spec: {curie}")
+    curies.api.W3CValidationError: CURIE is not valid under W3C spec: smiles:CC(=O)NC([H])(C)C(=O)O
+
+This can also be used to extend :meth:`curies.Converter.is_curie`:
+
+.. code-block::
+
+    import curies
+
+    converter = curies.Converter.from_prefix_map({
+        "smiles": "https://bioregistry.io/smiles:",
+    })
+
+    >>> converter.is_curie("smiles:CC(=O)NC([H])(C)C(=O)O")
+    True
+    >>> converter.is_curie("smiles:CC(=O)NC([H])(C)C(=O)O", w3c_validation=True)
+    False
+
+Finally, this can be used during instantiation of a converter:
+
+.. code-block::
+
+    import curies
+
+    >>> curies.Converter.from_prefix_map(
+    ...     {"4dn.biosource": "https://data.4dnucleome.org/biosources/"},
+    ...     w3c_validation=True,
+    ... )
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+      File "/Users/cthoyt/dev/curies/src/curies/api.py", line 816, in from_prefix_map
+        return cls(
+               ^^^^
+      File "/Users/cthoyt/dev/curies/src/curies/api.py", line 527, in __init__
+        raise W3CValidationError(f"Records not conforming to W3C:\n\n{msg}")
+    curies.api.W3CValidationError: Records not conforming to W3C:
+
+      - Record(prefix='4dn.biosource', uri_prefix='https://data.4dnucleome.org/biosources/', prefix_synonyms=[], uri_prefix_synonyms=[], pattern=None)
+
+
 .. seealso::
 
     1. Discussion on the ``curies`` issue tracker about handling CURIEs that include e.g. square brackets
diff --git a/src/curies/api.py b/src/curies/api.py
index e397ee6..65f924e 100644
--- a/src/curies/api.py
+++ b/src/curies/api.py
@@ -523,7 +523,8 @@ def __init__(
         if w3c_validation:
             broken = [record for record in records if not record.w3c_validate()]
             if broken:
-                raise W3CValidationError(f"Records not conforming to W3C: {broken}")
+                msg = "\n".join(f"  - {record!r}" for record in records)
+                raise W3CValidationError(f"Records not conforming to W3C:\n\n{msg}")
 
         self.delimiter = delimiter
         self.records = sorted(records, key=lambda r: r.prefix)

From 483736a9829667fa57258ee4206a413f98ecd6f8 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Tue, 12 Mar 2024 09:28:39 +0100
Subject: [PATCH 19/19] Hide internal code

---
 src/curies/api.py | 4 ++--
 tests/test_api.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/curies/api.py b/src/curies/api.py
index 65f924e..46c83a2 100644
--- a/src/curies/api.py
+++ b/src/curies/api.py
@@ -308,7 +308,7 @@ def _key(self) -> RecordKey:
             ",".join(sorted(self.uri_prefix_synonyms)),
         )
 
-    def w3c_validate(self) -> bool:
+    def _w3c_validate(self) -> bool:
         """Check if all prefixes in this record are w3c compliant."""
         all_curie_prefixes_valid = all(curie_prefix_is_w3c(prefix) for prefix in self._all_prefixes)
         # TODO extend to check URI prefixes?
@@ -521,7 +521,7 @@ def __init__(
                 raise DuplicatePrefixes(duplicate_prefixes)
 
         if w3c_validation:
-            broken = [record for record in records if not record.w3c_validate()]
+            broken = [record for record in records if not record._w3c_validate()]
             if broken:
                 msg = "\n".join(f"  - {record!r}" for record in records)
                 raise W3CValidationError(f"Records not conforming to W3C:\n\n{msg}")
diff --git a/tests/test_api.py b/tests/test_api.py
index 3c1d192..1986454 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -68,8 +68,8 @@ def test_w3c_prefix(self):
             r1 = Record(prefix=prefix, uri_prefix=uri_prefix)
             r2 = Record(prefix="prefix", prefix_synonyms=[prefix], uri_prefix=uri_prefix)
             with self.subTest(prefix=prefix):
-                self.assertEqual(value, r1.w3c_validate())
-                self.assertEqual(value, r2.w3c_validate())
+                self.assertEqual(value, r1._w3c_validate())
+                self.assertEqual(value, r2._w3c_validate())
 
 
 class TestAddRecord(unittest.TestCase):