From ba7e5cc04e58ff3e457e9fd2d7c5e3457b12cdf8 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 28 Feb 2024 15:46:28 +0100 Subject: [PATCH 01/19] Add w3c strict expansion --- src/curies/api.py | 12 +++++++++++- src/curies/w3c.py | 19 +++++++++++++++++++ tests/resources/invalid_curies.txt | 4 ++++ tests/resources/valid_curies.txt | 9 +++++++++ tests/test_w3c.py | 24 ++++++++++++++++++++++++ 5 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 src/curies/w3c.py create mode 100644 tests/resources/invalid_curies.txt create mode 100644 tests/resources/valid_curies.txt create mode 100644 tests/test_w3c.py diff --git a/src/curies/api.py b/src/curies/api.py index 730d4e6..cd94f8c 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -37,6 +37,7 @@ from pytrie import StringTrie from ._pydantic_compat import field_validator, get_field_validator_values +from .w3c import curie_is_w3c if TYPE_CHECKING: # pragma: no cover import pandas @@ -1316,7 +1317,12 @@ def expand( ) -> Optional[str]: ... def expand( - self, curie: str, *, strict: bool = False, passthrough: bool = False + self, + curie: str, + *, + strict: bool = False, + passthrough: bool = False, + require_w3c_spec: bool = False, ) -> Optional[str]: """Expand a CURIE to a URI, if possible. @@ -1326,6 +1332,8 @@ def expand( :param passthrough: If true, strict is false, and the CURIE can't be expanded, return the input. Defaults to false. If your strings can either be a CURIE _or_ a URI, consider using :meth:`Converter.expand_ambiguous` instead. + :param require_w3c_spec: If true, requires CURIEs to be valid against the + `W3C CURIE specification `_. :returns: A URI if this converter contains a URI prefix for the prefix in this CURIE :raises ExpansionError: @@ -1350,6 +1358,8 @@ def expand( ``http://purl.obolibrary.org/obo/GO_0032571`` will return ``GO:0032571`` instead of ``OBO:GO_0032571``. 
""" + if require_w3c_spec and not curie_is_w3c(curie): + raise ValueError(f"CURIE is not valid under W3C spec: {curie}") prefix, identifier = self.parse_curie(curie) rv = self.expand_pair(prefix, identifier) if rv: diff --git a/src/curies/w3c.py b/src/curies/w3c.py new file mode 100644 index 0000000..61deb43 --- /dev/null +++ b/src/curies/w3c.py @@ -0,0 +1,19 @@ +""" +Make it possible to check a CURIE against the W3C specification. +""" + +import re + +__all__ = [ + "curie_is_w3c", +] + +# Borrowed from https://gist.github.com/niklasl/2506955 +CURIE_PATTERN = r"(([\i-[:]][\c-[:]]*)?:)?(/[^\s/][^\s]*|[^\s/][^\s]*|[^\s]?)" +CURIE_PATTERN = CURIE_PATTERN.replace(r"\i-[:]", r"_A-Za-z").replace(r"\c-[:]", r"-._:A-Za-z0-9") +CURIE_RE = re.compile(CURIE_PATTERN) + + +def curie_is_w3c(curie) -> bool: + """Return if the CURIE is valid under the W3C specification.""" + return bool(CURIE_RE.match(curie)) diff --git a/tests/resources/invalid_curies.txt b/tests/resources/invalid_curies.txt new file mode 100644 index 0000000..74633dc --- /dev/null +++ b/tests/resources/invalid_curies.txt @@ -0,0 +1,4 @@ +pfx://abc +pfx:// +:// +/ diff --git a/tests/resources/valid_curies.txt b/tests/resources/valid_curies.txt new file mode 100644 index 0000000..1066d8d --- /dev/null +++ b/tests/resources/valid_curies.txt @@ -0,0 +1,9 @@ +pfx:abc +: +pfx: +abc +:abc + +pfx:/abc +pfx:/ +:/ diff --git a/tests/test_w3c.py b/tests/test_w3c.py new file mode 100644 index 0000000..1dd5b66 --- /dev/null +++ b/tests/test_w3c.py @@ -0,0 +1,24 @@ +"""Tests for W3C utilities.""" + +import unittest +from pathlib import Path + +from curies.w3c import curie_is_w3c + +HERE = Path(__file__).parent.resolve() +RESOURCES = HERE.joinpath("resources") +VALID_CURIES_PATH = RESOURCES.joinpath("valid_curies.txt") +INVALID_CURIES_PATH = RESOURCES.joinpath("invalid_curies.txt") + + +class TestW3C(unittest.TestCase): + """Tests for W3C utilities.""" + + def test_validating_curies(self): + """Test validating CURIEs.""" + for 
curie in VALID_CURIES_PATH.read_text().splitlines(): + with self.subTest(curie=curie): + self.assertTrue(curie_is_w3c(curie)) + for curie in INVALID_CURIES_PATH.read_text().splitlines(): + with self.subTest(curie=curie): + self.assertFalse(curie_is_w3c(curie)) From 6671887a12b407ce4e9eb27f464003b02bf558e6 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 28 Feb 2024 15:49:27 +0100 Subject: [PATCH 02/19] Update w3c.py --- src/curies/w3c.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/curies/w3c.py b/src/curies/w3c.py index 61deb43..a21c31d 100644 --- a/src/curies/w3c.py +++ b/src/curies/w3c.py @@ -1,5 +1,9 @@ """ Make it possible to check a CURIE against the W3C specification. + +https://github.com/linkml/linkml-runtime/blob/main/linkml_runtime/utils/uri_validator.py +could serve as a good basis for extending this - adding documentation, improving readability, +and making a more detailed testing suite would make this go a long way """ import re From 93053043f28890f82cff314ea97f794cf169c3ac Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Sat, 9 Mar 2024 21:24:16 +0100 Subject: [PATCH 03/19] Add up-front check for w3c compliance for converters --- src/curies/api.py | 66 ++++++++++- src/curies/w3c.py | 9 -- src/curies/xx.py | 291 ++++++++++++++++++++++++++++++++++++++++++++++ tests/test_api.py | 30 +++++ tests/test_w3c.py | 2 +- 5 files changed, 386 insertions(+), 12 deletions(-) create mode 100644 src/curies/xx.py diff --git a/src/curies/api.py b/src/curies/api.py index cd94f8c..e6dbfec 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -37,7 +37,8 @@ from pytrie import StringTrie from ._pydantic_compat import field_validator, get_field_validator_values -from .w3c import curie_is_w3c +from .w3c import CURIE_RE +from .xx import PREFIX_RE if TYPE_CHECKING: # pragma: no cover import pandas @@ -306,6 +307,12 @@ def _key(self) -> RecordKey: ",".join(sorted(self.uri_prefix_synonyms)), ) + def is_w3c_compliant(self) -> bool: + 
"""Check if all prefixes in this record are w3c compliant.""" + all_curie_prefixes_valid = all(curie_prefix_is_w3c(prefix) for prefix in self._all_prefixes) + # TODO extend to check URI prefixes? + return all_curie_prefixes_valid + class DuplicateSummary(NamedTuple): """A triple representing two records that are duplicated, either based on a CURIE or URI prefix.""" @@ -472,7 +479,9 @@ class Converter: #: .. warning:: patterns are an experimental feature pattern_map: Dict[str, str] - def __init__(self, records: List[Record], *, delimiter: str = ":", strict: bool = True) -> None: + def __init__( + self, records: List[Record], *, delimiter: str = ":", strict: bool = True, w3c: bool = False + ) -> None: """Instantiate a converter. :param records: @@ -481,6 +490,14 @@ def __init__(self, records: List[Record], *, delimiter: str = ":", strict: bool If true, raises issues on duplicate URI prefixes :param delimiter: The delimiter used for CURIEs. Defaults to a colon. + :param w3c: + If true, validate all records against the + `W3C CURIE Syntax 1.0 `_. + This includes the following: + + 1. 
Checking CURIE prefixes and CURIE prefix synonyms against the + W3C definition for `NCName `_ + :raises DuplicatePrefixes: if any records share any synonyms :raises DuplicateURIPrefixes: if any records share any URI prefixes """ @@ -492,6 +509,11 @@ def __init__(self, records: List[Record], *, delimiter: str = ":", strict: bool if duplicate_prefixes: raise DuplicatePrefixes(duplicate_prefixes) + if w3c: + broken = [record for record in records if not record.is_w3c_compliant()] + if broken: + raise ValueError(f"Records not conforming to W3C: {broken}") + self.delimiter = delimiter self.records = sorted(records, key=lambda r: r.prefix) self.prefix_map = _get_prefix_map(records) @@ -2303,3 +2325,43 @@ def upgrade_prefix_map(prefix_map: Mapping[str, str]) -> List[Record]: Record(prefix=prefix, prefix_synonyms=prefix_synonyms, uri_prefix=uri_prefix) for uri_prefix, (prefix, *prefix_synonyms) in sorted(priority_prefix_map.items()) ] + + +def curie_is_w3c(s: str) -> bool: + """Return if the CURIE is valid under the W3C specification. + + :param s: A string to check if it is a valid CURIE under the W3C specification. + :return: True if the string is a valid CURIE under the W3C specification. + + + If no prefix is given, the host language chooses how to assign a default + prefix. + + >>> curie_is_w3c(":test") + True + + From the specification, regarding using an underscore as the prefix + + The CURIE prefix '_' is reserved for use by languages that support RDF. + For this reason, the prefix '_' SHOULD be avoided by authors. + + >>> curie_is_w3c("_:test") + True + + This is invalid because a CURIE prefix isn't allowed to start with + a number. It has to start with either a letter, or an underscore. + + >>> curie_is_w3c("4cdn:test") + False + + Empty strings are explicitly noted as being invalid. 
+ + >>> curie_is_w3c("") + False + """ + return bool(CURIE_RE.match(s)) + + +def curie_prefix_is_w3c(s: str) -> bool: + """Return if the CURIE prefix is valid under the W3C specification.""" + return bool(PREFIX_RE.match(s)) diff --git a/src/curies/w3c.py b/src/curies/w3c.py index a21c31d..e7c1ae2 100644 --- a/src/curies/w3c.py +++ b/src/curies/w3c.py @@ -8,16 +8,7 @@ import re -__all__ = [ - "curie_is_w3c", -] - # Borrowed from https://gist.github.com/niklasl/2506955 CURIE_PATTERN = r"(([\i-[:]][\c-[:]]*)?:)?(/[^\s/][^\s]*|[^\s/][^\s]*|[^\s]?)" CURIE_PATTERN = CURIE_PATTERN.replace(r"\i-[:]", r"_A-Za-z").replace(r"\c-[:]", r"-._:A-Za-z0-9") CURIE_RE = re.compile(CURIE_PATTERN) - - -def curie_is_w3c(curie) -> bool: - """Return if the CURIE is valid under the W3C specification.""" - return bool(CURIE_RE.match(curie)) diff --git a/src/curies/xx.py b/src/curies/xx.py new file mode 100644 index 0000000..31b3043 --- /dev/null +++ b/src/curies/xx.py @@ -0,0 +1,291 @@ +# Copyright Siemens 2023 +# SPDX-License-Identifier: CC0-1.0 + + +""" +Regular-expression-based URI and CURIE validation functions + +These regex are directly derived from the official sources mentioned in each +section. + +They should be processed with re.VERBOSE. + +Python named regular expression groups are being used to better understand the +URI/CURIE parsing. +""" + +import re + +#: Define DIGIT according RFC2234 section 3.4: +#: https://datatracker.ietf.org/doc/html/rfc2234/#section-3.4 +DIGIT = r"[0-9]" + +#: Define ALPHA (i.e., Letter) according RFC2234 section 6.1: +#: https://datatracker.ietf.org/doc/html/rfc2234/#section-6.1 +ALPHA = r"[A-Za-z]" + +#: Define HEXDIG according RFC2234 section 6.1: +#: https://datatracker.ietf.org/doc/html/rfc2234/#section-6.1 +HEXDIG = "[0-9A-F]" + +# pct-encoded = "%" HEXDIG HEXDIG +pct_encoded = rf"% {HEXDIG}{{2}}" + +# unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" +unreserved = rf"(?: {ALPHA} | {DIGIT} | \- | \. | _ | ~ )" + +# gen-delims = ":" / "/" / "?" 
/ "#" / "[" / "]" / "@" +gen_delims = r"(?: : | / | \? | \# | \[ | \] | @ )" + +# sub-delims = "!" / "$" / "&" / "'" / "(" +sub_delims = r"(?: ! | \$ | & | ' | \( | \) | \* | \+ | , | ; | = )" + +# pchar = unreserved / pct-encoded / sub-delims / ":" / "@" +pchar = rf"(?: {unreserved} | {pct_encoded} | {sub_delims} | : | @ )" + +# reserved = gen-delims / sub-delims +reserved = rf"(?: {gen_delims} | {sub_delims} )" + +### required for Authority + +# dec-octet = DIGIT ; 0-9 +# / %x31-39 DIGIT ; 10-99 +# / "1" 2DIGIT ; 100-199 +# / "2" %x30-34 DIGIT ; 200-249 +# / "25" %x30-35 ; 250-255 +dec_octet = rf"""(?: {DIGIT} | + [1-9] {DIGIT} | + 1 {DIGIT}{{2}} | + 2 [0-4] {DIGIT} | + 25 [0-5] + ) +""" + +# IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet +IPv4address = rf"{dec_octet} \. {dec_octet} \. {dec_octet} \. {dec_octet}" + +# h16 = 1*4HEXDIG +h16 = rf"(?: {HEXDIG} ){{1,4}}" + +# ls32 = ( h16 ":" h16 ) / IPv4address +ls32 = rf"(?: (?: {h16} : {h16} ) | {IPv4address} )" + +# IPv6address = 6( h16 ":" ) ls32 +# / "::" 5( h16 ":" ) ls32 +# / [ h16 ] "::" 4( h16 ":" ) ls32 +# / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 +# / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 +# / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 +# / [ *4( h16 ":" ) h16 ] "::" ls32 +# / [ *5( h16 ":" ) h16 ] "::" h16 +# / [ *6( h16 ":" ) h16 ] "::" +IPv6address = rf"""(?: (?: {h16} : ){{6}} {ls32} | + :: (?: {h16} : ){{5}} {ls32} | + (?: {h16} )? :: (?: {h16} : ){{4}} {ls32} | + (?: (?: {h16} : ) {h16} )? :: (?: {h16} : ){{3}} {ls32} | + (?: (?: {h16} : ){{1,2}} {h16} )? :: (?: {h16} : ){{2}} {ls32} | + (?: (?: {h16} : ){{1,3}} {h16} )? :: {h16} : {ls32} | + (?: (?: {h16} : ){{1,4}} {h16} )? :: {ls32} | + (?: (?: {h16} : ){{1,5}} {h16} )? :: {h16} | + (?: (?: {h16} : ){{1,6}} {h16} )? :: + ) +""" + +# IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) +IPvFuture = rf"v {HEXDIG}+ \. 
(?: {unreserved} | {sub_delims} | : )+" + +# IP-literal = "[" ( IPv6address / IPvFuture ) "]" +IP_literal = rf"\[ (?: {IPv6address} | {IPvFuture} ) \]" + +# reg-name = *( unreserved / pct-encoded / sub-delims ) +reg_name = rf"(?: {unreserved} | {pct_encoded} | {sub_delims} )*" + +### required for Path + +# segment = *pchar +segment = rf"{pchar}*" + +# segment-nz = 1*pchar +segment_nz = rf"{pchar}+" + +# segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" ) +segment_nz_nc = rf"(?: {unreserved} | {pct_encoded} | {sub_delims} | @ )+" + +# ----------------------------------------------------------------------------- +# +# Define SCHEME according RFC3986 section 3.1: +# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.1 +# + +# scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) +scheme = rf"(?P {ALPHA} (?: {ALPHA} | {DIGIT} | \+ | \- | \. )* )" + +# ----------------------------------------------------------------------------- +# +# Define AUTHORITY according RFC3986 section 3.2: + +# Define USER INFORMATION according RFC3986 section 3.2.1: +# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.1 + +# userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) +userinfo = rf"""(?P + (?: {unreserved} | {pct_encoded} | {sub_delims} | : )* + ) +""" + +# Define HOST according RFC3986 section 3.2.2: +# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2 + +# host = IP-literal / IPv4address / reg-name +host = rf"(?P {IP_literal} | {IPv4address} | {reg_name} )" + +# Define PORT according RFC3986 section 3.2.3: +# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.3 + +# port = *DIGIT +port = rf"(?P ( {DIGIT} )* )" + +# Define AUTHORITY according RFC3986 section 3.2: +# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2 +# + +# authority = [ userinfo "@" ] host [ ":" port ] +# authority = rf"""(?: (?P {userinfo} ) @)? +authority = rf"""(?P + (?: {userinfo} @)? + {host} + (?: : {port} )? 
+ ) +""" + +# ----------------------------------------------------------------------------- +# +# Define different PATHs according RFC3986 section 3.3: +# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.3 +# + +# path-abempty = *( "/" segment ) +path_abempty = rf"( / {segment} )*" + +# path-absolute = "/" [ segment-nz *( "/" segment ) ] +path_absolute = rf"( / (?: {segment_nz} (?: / {segment} )* )? )" + +# path-noscheme = segment-nz-nc *( "/" segment ) +path_noscheme = rf"( {segment_nz_nc} (?: / {segment} )* )" + +# path-rootless = segment-nz *( "/" segment ) +path_rootless = rf"( {segment_nz} (?: / {segment} )* )" + +# path-empty = 0 +path_empty = r"" + +# path = path-abempty ; begins with "/" or is empty +# / path-absolute ; begins with "/" but not "//" +# / path-noscheme ; begins with a non-colon segment +# / path-rootless ; begins with a segment +# / path-empty ; zero characters +path = rf"""(?: + {path_abempty} | + {path_absolute} | + {path_noscheme} | + {path_rootless} | + {path_empty} + ) +""" + +# ----------------------------------------------------------------------------- +# +# Define QUERY according RFC3986 section 3.4: +# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.4 +# + +# query = *( pchar / "/" / "?" ) +query = rf"(?P (?: {pchar} | / | \? )* )" + +# ----------------------------------------------------------------------------- +# +# Define FRAGMENT according RFC3986 section 3.5: +# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.5 +# + +# fragment = *( pchar / "/" / "?" ) +fragment = rf"(?P (?: {pchar} | / | \? 
)* )" + +# ----------------------------------------------------------------------------- +# +# Define URI and HIERARCHICAL PATH according RFC3986 section 3: +# https://datatracker.ietf.org/doc/html/rfc3986/#section-3 +# + +# hier-part = "//" authority path-abempty +# / path-absolute +# / path-rootless +# / path-empty +hier_part = rf"""(?P + (?: // {authority} {path_abempty} ) | + {path_absolute} | + {path_rootless} | + {path_empty} + ) +""" + +# URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] +URI = rf"""(?P + {scheme} : {hier_part} (?: \? {query} )? (?: \# {fragment} )? + ) +""" + +# ----------------------------------------------------------------------------- +# +# Define RELATIVE REFERENCE according RFC3986 section 4.2: +# https://datatracker.ietf.org/doc/html/rfc3986/#section-4.2 +# + +# relative-part = "//" authority path-abempty +# / path-absolute +# / path-noscheme +# / path-empty +# relative-ref = relative-part [ "?" query ] [ "#" fragment ] +relative_ref = rf"""(?P + (?: + (?: // + {authority} + (?P {path_abempty} ) + ) | + (?P {path_absolute} ) | + (?P {path_noscheme} ) | + (?P {path_empty} ) + ) + (?: \? {query} )? + (?: \# {fragment} )? + ) +""" + +# ----------------------------------------------------------------------------- +# +# Define CURIE according W3C CURIE Syntax 1.0 +# https://www.w3.org/TR/curie/#s_syntax +# + +# NCNameChar ::= Letter | Digit | '.' | '-' | '_' | CombiningChar | Extender +# !! IMPORTANT NOTE !! +# As of now this module doesn't support NCNameChar IRI, but +# relative-refs as defined in URI, +# NCNameChar ::= Letter | Digit | '.' | '-' | '_' +NCNameChar = rf"(?: {ALPHA} | {DIGIT} | \. | \- | _ )" + +# prefix := NCName +# NCName := (Letter | '_') (NCNameChar)* +prefix = rf"(?: {ALPHA} | _ ) (?: {NCNameChar} )*" + +# reference := irelative-ref (as defined in IRI) +# !! IMPORTANT NOTE !! 
+# As of now this module don't support irelative-refs as defined in IRI, but +# relative-refs as defined in URI +# curie := [ [ prefix ] ':' ] reference +# reference := relative-ref (as defined in URI) +CURIE = rf"""(?P (?: (?P {prefix} )? : )? {relative_ref}) +""" + +PREFIX_RE = re.compile(f"^{prefix}$", re.VERBOSE) +CURIE_RE = re.compile(f"^{CURIE}$", re.VERBOSE) diff --git a/tests/test_api.py b/tests/test_api.py index b404f4e..b361f41 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -41,6 +41,36 @@ GO_URI_PREFIX = "http://purl.obolibrary.org/obo/GO_" +class TestRecord(unittest.TestCase): + """Tests for the record data structure.""" + + def test_w3c_prefix(self): + """Test CURIE prefix correctness.""" + valid_prefixes = [ + "go", + "GO", + "NCBITaxon", + "ncbi.taxon", + "ncbi_taxon", + "_", + "_secret", + "secret_", + "_secret", + ] + invalid_prefixes = ["", "4dn", "GO:GO:", "GO:"] + examples = [ + *((prefix, True) for prefix in valid_prefixes), + *((prefix, False) for prefix in invalid_prefixes), + ] + for prefix, value in examples: + uri_prefix = f"https://example.com/{prefix}" + r1 = Record(prefix=prefix, uri_prefix=uri_prefix) + r2 = Record(prefix="prefix", prefix_synonyms=[prefix], uri_prefix=uri_prefix) + with self.subTest(prefix=prefix): + self.assertEqual(value, r1.is_w3c_compliant()) + self.assertEqual(value, r2.is_w3c_compliant()) + + class TestAddRecord(unittest.TestCase): """Test adding records.""" diff --git a/tests/test_w3c.py b/tests/test_w3c.py index 1dd5b66..d0e6172 100644 --- a/tests/test_w3c.py +++ b/tests/test_w3c.py @@ -3,7 +3,7 @@ import unittest from pathlib import Path -from curies.w3c import curie_is_w3c +from curies.api import curie_is_w3c HERE = Path(__file__).parent.resolve() RESOURCES = HERE.joinpath("resources") From 22934583e24ab49397450f460fe25003c5c7f21a Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 11 Mar 2024 10:18:31 +0100 Subject: [PATCH 04/19] Update --- src/curies/api.py | 2 + src/curies/xx.py | 132 
++++++++++++++++------------- tests/resources/invalid_curies.txt | 1 + tests/test_api.py | 12 +++ 4 files changed, 87 insertions(+), 60 deletions(-) diff --git a/src/curies/api.py b/src/curies/api.py index e6dbfec..704aa51 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -2359,6 +2359,8 @@ def curie_is_w3c(s: str) -> bool: >>> curie_is_w3c("") False """ + if "[" in s or "]" in s: + return False return bool(CURIE_RE.match(s)) diff --git a/src/curies/xx.py b/src/curies/xx.py index 31b3043..a3a2dc9 100644 --- a/src/curies/xx.py +++ b/src/curies/xx.py @@ -1,40 +1,51 @@ -# Copyright Siemens 2023 -# SPDX-License-Identifier: CC0-1.0 +"""A regular expression implementation of the W3C CURIEs Syntax. - -""" -Regular-expression-based URI and CURIE validation functions - -These regex are directly derived from the official sources mentioned in each +These regular expressions are directly derived from the official sources mentioned in each section. -They should be processed with re.VERBOSE. +They should be processed with :data:`re.VERBOSE` to remove comments and other +non-essential annotations. Python named regular expression groups are being used to better understand the URI/CURIE parsing. + +adapted from https://github.com/linkml/linkml-runtime/blob/main/linkml_runtime/utils/uri_validator.py, which +was originally distributed under the CC-0 license + +Relevant documents: + +1. W3C CURIES Syntax 1.0 in https://www.w3.org/TR/2010/NOTE-curie-20101216/ +2. NCName definition (i.e., prefix) in https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName +2. 
IRI definition in https://www.ietf.org/rfc/rfc3987.txt """ import re #: Define DIGIT according RFC2234 section 3.4: #: https://datatracker.ietf.org/doc/html/rfc2234/#section-3.4 -DIGIT = r"[0-9]" +DIGIT = "[0-9]" #: Define ALPHA (i.e., Letter) according RFC2234 section 6.1: #: https://datatracker.ietf.org/doc/html/rfc2234/#section-6.1 -ALPHA = r"[A-Za-z]" +ALPHA = "[A-Za-z]" #: Define HEXDIG according RFC2234 section 6.1: #: https://datatracker.ietf.org/doc/html/rfc2234/#section-6.1 HEXDIG = "[0-9A-F]" # pct-encoded = "%" HEXDIG HEXDIG -pct_encoded = rf"% {HEXDIG}{{2}}" +pct_encoded = f"%{HEXDIG}{{2}}" + +# unreserved = rf"(?: {ALPHA} | {DIGIT} | \- | \. | _ | ~ )" +unreserved = r"[A-Za-z0-9\-\._~]" +"""Defined in page 8 of https://www.ietf.org/rfc/rfc3987.txt as: -# unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" -unreserved = rf"(?: {ALPHA} | {DIGIT} | \- | \. | _ | ~ )" +.. code-block:: -# gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" + unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" +""" + +# gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" gen_delims = r"(?: : | / | \? | \# | \[ | \] | @ )" # sub-delims = "!" / "$" / "&" / "'" / "(" @@ -46,23 +57,20 @@ # reserved = gen-delims / sub-delims reserved = rf"(?: {gen_delims} | {sub_delims} )" -### required for Authority - -# dec-octet = DIGIT ; 0-9 -# / %x31-39 DIGIT ; 10-99 -# / "1" 2DIGIT ; 100-199 -# / "2" %x30-34 DIGIT ; 200-249 -# / "25" %x30-35 ; 250-255 -dec_octet = rf"""(?: {DIGIT} | - [1-9] {DIGIT} | - 1 {DIGIT}{{2}} | - 2 [0-4] {DIGIT} | - 25 [0-5] - ) +dec_octet = rf"(?: {DIGIT} | [1-9]{DIGIT} | 1{DIGIT}{{2}} | 2[0-4]{DIGIT} | 25[0-5])" +"""A definition of numbers between 1-255. + +.. code-block:: + + dec-octet = DIGIT ; 0-9 + / %x31-39 DIGIT ; 10-99 + / "1" 2DIGIT ; 100-199 + / "2" %x30-34 DIGIT ; 200-249 + / "25" %x30-35 ; 250-255 """ # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet -IPv4address = rf"{dec_octet} \. {dec_octet} \. {dec_octet} \. 
{dec_octet}" +IPv4address = rf"{dec_octet}\.{dec_octet}\.{dec_octet}\.{dec_octet}" # h16 = 1*4HEXDIG h16 = rf"(?: {HEXDIG} ){{1,4}}" @@ -185,12 +193,12 @@ # / path-rootless ; begins with a segment # / path-empty ; zero characters path = rf"""(?: - {path_abempty} | - {path_absolute} | - {path_noscheme} | - {path_rootless} | - {path_empty} - ) + {path_abempty} | + {path_absolute} | + {path_noscheme} | + {path_rootless} | + {path_empty} +) """ # ----------------------------------------------------------------------------- @@ -222,18 +230,15 @@ # / path-rootless # / path-empty hier_part = rf"""(?P - (?: // {authority} {path_abempty} ) | - {path_absolute} | - {path_rootless} | - {path_empty} - ) + (?: // {authority} {path_abempty} ) | + {path_absolute} | + {path_rootless} | + {path_empty} +) """ # URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] -URI = rf"""(?P - {scheme} : {hier_part} (?: \? {query} )? (?: \# {fragment} )? - ) -""" +URI = rf"(?P {scheme} : {hier_part} (?: \? {query} )? (?: \# {fragment} )?)" # ----------------------------------------------------------------------------- # @@ -246,19 +251,20 @@ # / path-noscheme # / path-empty # relative-ref = relative-part [ "?" query ] [ "#" fragment ] -relative_ref = rf"""(?P - (?: - (?: // - {authority} - (?P {path_abempty} ) - ) | - (?P {path_absolute} ) | - (?P {path_noscheme} ) | - (?P {path_empty} ) - ) - (?: \? {query} )? - (?: \# {fragment} )? - ) +relative_ref = rf"""\ +(?P + (?: + (?: // + {authority} + (?P {path_abempty} ) + ) | + (?P {path_absolute} ) | + (?P {path_noscheme} ) | + (?P {path_empty} ) + ) + (?: \? {query} )? + (?: \# {fragment} )? +) """ # ----------------------------------------------------------------------------- @@ -274,9 +280,16 @@ # NCNameChar ::= Letter | Digit | '.' | '-' | '_' NCNameChar = rf"(?: {ALPHA} | {DIGIT} | \. | \- | _ )" -# prefix := NCName -# NCName := (Letter | '_') (NCNameChar)* prefix = rf"(?: {ALPHA} | _ ) (?: {NCNameChar} )*" +"""The definition of a prefix. 
+ +.. seealso:: https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName + +.. code-block:: + + prefix := NCName + NCName := (Letter | '_') (NCNameChar)* +""" # reference := irelative-ref (as defined in IRI) # !! IMPORTANT NOTE !! @@ -284,8 +297,7 @@ # relative-refs as defined in URI # curie := [ [ prefix ] ':' ] reference # reference := relative-ref (as defined in URI) -CURIE = rf"""(?P (?: (?P {prefix} )? : )? {relative_ref}) -""" +CURIE = rf"(?P (?: (?P {prefix} )? : )? {relative_ref})" PREFIX_RE = re.compile(f"^{prefix}$", re.VERBOSE) CURIE_RE = re.compile(f"^{CURIE}$", re.VERBOSE) diff --git a/tests/resources/invalid_curies.txt b/tests/resources/invalid_curies.txt index 74633dc..75e5cd1 100644 --- a/tests/resources/invalid_curies.txt +++ b/tests/resources/invalid_curies.txt @@ -2,3 +2,4 @@ pfx://abc pfx:// :// / +smiles:CC(=O)NC([H])(C)C(=O)O diff --git a/tests/test_api.py b/tests/test_api.py index b361f41..dd32352 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -808,6 +808,18 @@ def test_rdflib(self): converter_2 = Converter.from_rdflib(graph.namespace_manager) self._assert_convert(converter_2) + def test_expand_w3c_invalid(self): + """Test that expanding a non-w3c-conformant CURIE can lead to errors.""" + converter = Converter.from_prefix_map( + { + "smiles": "https://bioregistry.io/smiles:", + } + ) + curie = "smiles:CC(=O)NC([H])(C)C(=O)O" + self.assertIsNotNone(converter.expand(curie)) + with self.assertRaises(ValueError): + converter.expand(curie, require_w3c_spec=True) + def test_expand_all(self): """Test expand all.""" priority_prefix_map = { From 4c3e12b3ac12998dcd6340deb22cb494d817cb5b Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 11 Mar 2024 10:29:34 +0100 Subject: [PATCH 05/19] Update --- src/curies/api.py | 24 ++++++++++++++++-------- tests/test_api.py | 6 +++--- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/curies/api.py b/src/curies/api.py index 704aa51..7ccdef0 100644 --- a/src/curies/api.py 
+++ b/src/curies/api.py @@ -307,7 +307,7 @@ def _key(self) -> RecordKey: ",".join(sorted(self.uri_prefix_synonyms)), ) - def is_w3c_compliant(self) -> bool: + def w3c_validate(self) -> bool: """Check if all prefixes in this record are w3c compliant.""" all_curie_prefixes_valid = all(curie_prefix_is_w3c(prefix) for prefix in self._all_prefixes) # TODO extend to check URI prefixes? @@ -480,7 +480,12 @@ class Converter: pattern_map: Dict[str, str] def __init__( - self, records: List[Record], *, delimiter: str = ":", strict: bool = True, w3c: bool = False + self, + records: List[Record], + *, + delimiter: str = ":", + strict: bool = True, + w3c_validation: bool = False, ) -> None: """Instantiate a converter. @@ -490,7 +495,7 @@ def __init__( If true, raises issues on duplicate URI prefixes :param delimiter: The delimiter used for CURIEs. Defaults to a colon. - :param w3c: + :param w3c_validation: If true, validate all records against the `W3C CURIE Syntax 1.0 `_. This includes the following: @@ -500,6 +505,7 @@ def __init__( :raises DuplicatePrefixes: if any records share any synonyms :raises DuplicateURIPrefixes: if any records share any URI prefixes + :rasies ValueError: If w3c validation is on and there are non-conformant records """ if strict: duplicate_uri_prefixes = _get_duplicate_uri_prefixes(records) @@ -509,8 +515,8 @@ def __init__( if duplicate_prefixes: raise DuplicatePrefixes(duplicate_prefixes) - if w3c: - broken = [record for record in records if not record.is_w3c_compliant()] + if w3c_validation: + broken = [record for record in records if not record.w3c_validate()] if broken: raise ValueError(f"Records not conforming to W3C: {broken}") @@ -1344,7 +1350,7 @@ def expand( *, strict: bool = False, passthrough: bool = False, - require_w3c_spec: bool = False, + w3c_validation: bool = False, ) -> Optional[str]: """Expand a CURIE to a URI, if possible. 
@@ -1354,12 +1360,14 @@ def expand( :param passthrough: If true, strict is false, and the CURIE can't be expanded, return the input. Defaults to false. If your strings can either be a CURIE _or_ a URI, consider using :meth:`Converter.expand_ambiguous` instead. - :param require_w3c_spec: If true, requires CURIEs to be valid against the + :param w3c_validation: If true, requires CURIEs to be valid against the `W3C CURIE specification `_. :returns: A URI if this converter contains a URI prefix for the prefix in this CURIE :raises ExpansionError: If strict is true and the CURIE can't be expanded + :raises ValueError: + If W3C validation is turned on and the CURIE is not valid under the CURIE specification >>> from curies import Converter >>> converter = Converter.from_prefix_map({ @@ -1380,7 +1388,7 @@ def expand( ``http://purl.obolibrary.org/obo/GO_0032571`` will return ``GO:0032571`` instead of ``OBO:GO_0032571``. """ - if require_w3c_spec and not curie_is_w3c(curie): + if w3c_validation and not curie_is_w3c(curie): raise ValueError(f"CURIE is not valid under W3C spec: {curie}") prefix, identifier = self.parse_curie(curie) rv = self.expand_pair(prefix, identifier) diff --git a/tests/test_api.py b/tests/test_api.py index dd32352..7391519 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -67,8 +67,8 @@ def test_w3c_prefix(self): r1 = Record(prefix=prefix, uri_prefix=uri_prefix) r2 = Record(prefix="prefix", prefix_synonyms=[prefix], uri_prefix=uri_prefix) with self.subTest(prefix=prefix): - self.assertEqual(value, r1.is_w3c_compliant()) - self.assertEqual(value, r2.is_w3c_compliant()) + self.assertEqual(value, r1.w3c_validate()) + self.assertEqual(value, r2.w3c_validate()) class TestAddRecord(unittest.TestCase): @@ -818,7 +818,7 @@ def test_expand_w3c_invalid(self): curie = "smiles:CC(=O)NC([H])(C)C(=O)O" self.assertIsNotNone(converter.expand(curie)) with self.assertRaises(ValueError): - converter.expand(curie, require_w3c_spec=True) + converter.expand(curie, 
w3c_validation=True) def test_expand_all(self): """Test expand all.""" From 011a9943e7390e77003c34490cbf73550e590a7a Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 11 Mar 2024 10:31:20 +0100 Subject: [PATCH 06/19] Flake --- .github/workflows/tests.yml | 2 +- src/curies/xx.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5e83b4c..f3ff47b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -43,7 +43,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - pip install tox + pip install tox tox-uv sudo apt-get install graphviz - name: Check RST conformity with doc8 run: tox run -e doc8 diff --git a/src/curies/xx.py b/src/curies/xx.py index a3a2dc9..21fe11a 100644 --- a/src/curies/xx.py +++ b/src/curies/xx.py @@ -108,7 +108,7 @@ # reg-name = *( unreserved / pct-encoded / sub-delims ) reg_name = rf"(?: {unreserved} | {pct_encoded} | {sub_delims} )*" -### required for Path +# required for Path # segment = *pchar segment = rf"{pchar}*" From e78078b021d27684b0be542576c0f33ea1bb3ecf Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 11 Mar 2024 10:33:18 +0100 Subject: [PATCH 07/19] Update api.py --- src/curies/api.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/curies/api.py b/src/curies/api.py index 7ccdef0..468da30 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -500,12 +500,12 @@ def __init__( `W3C CURIE Syntax 1.0 `_. This includes the following: - 1. Checking CURIE prefixes and CURIE prefix synonyms against the - W3C definition for `NCName `_ + 1. 
Checking CURIE prefixes and CURIE prefix synonyms against the + W3C definition for `NCName `_ :raises DuplicatePrefixes: if any records share any synonyms :raises DuplicateURIPrefixes: if any records share any URI prefixes - :rasies ValueError: If w3c validation is on and there are non-conformant records + :raises ValueError: If w3c validation is on and there are non-conformant records """ if strict: duplicate_uri_prefixes = _get_duplicate_uri_prefixes(records) From 02e48f191ff93d3d7e3cc114a2f90bfcf9c44adc Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 11 Mar 2024 10:34:14 +0100 Subject: [PATCH 08/19] Update tests.yml --- .github/workflows/tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f3ff47b..82d8d31 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -20,7 +20,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install dependencies - run: pip install tox + run: pip install tox tox-uv - name: Check manifest run: tox run -e manifest - name: Check code quality with flake8 @@ -66,7 +66,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install dependencies - run: pip install tox + run: pip install tox tox-uv - name: Test with pytest and generate coverage file run: tox run -e py-pydantic${{ matrix.pydantic }} From 8c16a7d5552462e2caa734142b8d8d1a58c8f871 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 11 Mar 2024 10:40:56 +0100 Subject: [PATCH 09/19] Clean --- src/curies/api.py | 3 +-- src/curies/w3c.py | 15 ++++++++++++++- src/curies/xx.py | 25 +------------------------ 3 files changed, 16 insertions(+), 27 deletions(-) diff --git a/src/curies/api.py b/src/curies/api.py index 468da30..e8ac76a 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -37,8 +37,7 @@ from pytrie import StringTrie from ._pydantic_compat import field_validator, get_field_validator_values -from .w3c 
import CURIE_RE -from .xx import PREFIX_RE +from .w3c import CURIE_RE, PREFIX_RE if TYPE_CHECKING: # pragma: no cover import pandas diff --git a/src/curies/w3c.py b/src/curies/w3c.py index e7c1ae2..bc374b0 100644 --- a/src/curies/w3c.py +++ b/src/curies/w3c.py @@ -8,7 +8,20 @@ import re -# Borrowed from https://gist.github.com/niklasl/2506955 +_PREFIX_RE = rf"[A-Za-z_][A-Za-z0-9\.\-_]*" +"""The definition of a prefix, from https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName. + +.. code-block:: + + prefix := NCName + NCName := (Letter | '_') (NCNameChar)* + NCNameChar ::= Letter | Digit | '.' | '-' | '_' +""" + +PREFIX_RE = re.compile(f"^{_PREFIX_RE}$") + + +#: Borrowed from https://gist.github.com/niklasl/2506955 CURIE_PATTERN = r"(([\i-[:]][\c-[:]]*)?:)?(/[^\s/][^\s]*|[^\s/][^\s]*|[^\s]?)" CURIE_PATTERN = CURIE_PATTERN.replace(r"\i-[:]", r"_A-Za-z").replace(r"\c-[:]", r"-._:A-Za-z0-9") CURIE_RE = re.compile(CURIE_PATTERN) diff --git a/src/curies/xx.py b/src/curies/xx.py index 21fe11a..8d886dd 100644 --- a/src/curies/xx.py +++ b/src/curies/xx.py @@ -267,29 +267,6 @@ ) """ -# ----------------------------------------------------------------------------- -# -# Define CURIE according W3C CURIE Syntax 1.0 -# https://www.w3.org/TR/curie/#s_syntax -# - -# NCNameChar ::= Letter | Digit | '.' | '-' | '_' | CombiningChar | Extender -# !! IMPORTANT NOTE !! -# As of now this module doesn't support NCNameChar IRI, but -# relative-refs as defined in URI, -# NCNameChar ::= Letter | Digit | '.' | '-' | '_' -NCNameChar = rf"(?: {ALPHA} | {DIGIT} | \. | \- | _ )" - -prefix = rf"(?: {ALPHA} | _ ) (?: {NCNameChar} )*" -"""The definition of a prefix. - -.. seealso:: https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName - -.. code-block:: - - prefix := NCName - NCName := (Letter | '_') (NCNameChar)* -""" # reference := irelative-ref (as defined in IRI) # !! IMPORTANT NOTE !! 
@@ -299,5 +276,5 @@ # reference := relative-ref (as defined in URI) CURIE = rf"(?P (?: (?P {prefix} )? : )? {relative_ref})" -PREFIX_RE = re.compile(f"^{prefix}$", re.VERBOSE) + CURIE_RE = re.compile(f"^{CURIE}$", re.VERBOSE) From 039f729ec67f748bd8d3a3a797c4982fcba081a7 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 11 Mar 2024 10:47:10 +0100 Subject: [PATCH 10/19] Cleanup --- src/curies/w3c.py | 2 +- src/curies/xx.py | 280 ---------------------------------------------- 2 files changed, 1 insertion(+), 281 deletions(-) delete mode 100644 src/curies/xx.py diff --git a/src/curies/w3c.py b/src/curies/w3c.py index bc374b0..2317c24 100644 --- a/src/curies/w3c.py +++ b/src/curies/w3c.py @@ -8,7 +8,7 @@ import re -_PREFIX_RE = rf"[A-Za-z_][A-Za-z0-9\.\-_]*" +_PREFIX_RE = r"[A-Za-z_][A-Za-z0-9\.\-_]*" """The definition of a prefix, from https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName. .. code-block:: diff --git a/src/curies/xx.py b/src/curies/xx.py deleted file mode 100644 index 8d886dd..0000000 --- a/src/curies/xx.py +++ /dev/null @@ -1,280 +0,0 @@ -"""A regular expression implementation of the W3C CURIEs Syntax. - -These regular expressions are directly derived from the official sources mentioned in each -section. - -They should be processed with :data:`re.VERBOSE` to remove comments and other -non-essential annotations. - -Python named regular expression groups are being used to better understand the -URI/CURIE parsing. - -adapted from https://github.com/linkml/linkml-runtime/blob/main/linkml_runtime/utils/uri_validator.py, which -was originally distributed under the CC-0 license - -Relevant documents: - -1. W3C CURIES Syntax 1.0 in https://www.w3.org/TR/2010/NOTE-curie-20101216/ -2. NCName definition (i.e., prefix) in https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName -2. 
IRI definition in https://www.ietf.org/rfc/rfc3987.txt -""" - -import re - -#: Define DIGIT according RFC2234 section 3.4: -#: https://datatracker.ietf.org/doc/html/rfc2234/#section-3.4 -DIGIT = "[0-9]" - -#: Define ALPHA (i.e., Letter) according RFC2234 section 6.1: -#: https://datatracker.ietf.org/doc/html/rfc2234/#section-6.1 -ALPHA = "[A-Za-z]" - -#: Define HEXDIG according RFC2234 section 6.1: -#: https://datatracker.ietf.org/doc/html/rfc2234/#section-6.1 -HEXDIG = "[0-9A-F]" - -# pct-encoded = "%" HEXDIG HEXDIG -pct_encoded = f"%{HEXDIG}{{2}}" - -# unreserved = rf"(?: {ALPHA} | {DIGIT} | \- | \. | _ | ~ )" -unreserved = r"[A-Za-z0-9\-\._~]" -"""Defined in page 8 of https://www.ietf.org/rfc/rfc3987.txt as: - -.. code-block:: - - unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" -""" - -# gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" -gen_delims = r"(?: : | / | \? | \# | \[ | \] | @ )" - -# sub-delims = "!" / "$" / "&" / "'" / "(" -sub_delims = r"(?: ! | \$ | & | ' | \( | \) | \* | \+ | , | ; | = )" - -# pchar = unreserved / pct-encoded / sub-delims / ":" / "@" -pchar = rf"(?: {unreserved} | {pct_encoded} | {sub_delims} | : | @ )" - -# reserved = gen-delims / sub-delims -reserved = rf"(?: {gen_delims} | {sub_delims} )" - -dec_octet = rf"(?: {DIGIT} | [1-9]{DIGIT} | 1{DIGIT}{{2}} | 2[0-4]{DIGIT} | 25[0-5])" -"""A definition of numbers between 1-255. - -.. code-block:: - - dec-octet = DIGIT ; 0-9 - / %x31-39 DIGIT ; 10-99 - / "1" 2DIGIT ; 100-199 - / "2" %x30-34 DIGIT ; 200-249 - / "25" %x30-35 ; 250-255 -""" - -# IPv4address = dec-octet "." dec-octet "." dec-octet "." 
dec-octet -IPv4address = rf"{dec_octet}\.{dec_octet}\.{dec_octet}\.{dec_octet}" - -# h16 = 1*4HEXDIG -h16 = rf"(?: {HEXDIG} ){{1,4}}" - -# ls32 = ( h16 ":" h16 ) / IPv4address -ls32 = rf"(?: (?: {h16} : {h16} ) | {IPv4address} )" - -# IPv6address = 6( h16 ":" ) ls32 -# / "::" 5( h16 ":" ) ls32 -# / [ h16 ] "::" 4( h16 ":" ) ls32 -# / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 -# / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 -# / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 -# / [ *4( h16 ":" ) h16 ] "::" ls32 -# / [ *5( h16 ":" ) h16 ] "::" h16 -# / [ *6( h16 ":" ) h16 ] "::" -IPv6address = rf"""(?: (?: {h16} : ){{6}} {ls32} | - :: (?: {h16} : ){{5}} {ls32} | - (?: {h16} )? :: (?: {h16} : ){{4}} {ls32} | - (?: (?: {h16} : ) {h16} )? :: (?: {h16} : ){{3}} {ls32} | - (?: (?: {h16} : ){{1,2}} {h16} )? :: (?: {h16} : ){{2}} {ls32} | - (?: (?: {h16} : ){{1,3}} {h16} )? :: {h16} : {ls32} | - (?: (?: {h16} : ){{1,4}} {h16} )? :: {ls32} | - (?: (?: {h16} : ){{1,5}} {h16} )? :: {h16} | - (?: (?: {h16} : ){{1,6}} {h16} )? :: - ) -""" - -# IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) -IPvFuture = rf"v {HEXDIG}+ \. (?: {unreserved} | {sub_delims} | : )+" - -# IP-literal = "[" ( IPv6address / IPvFuture ) "]" -IP_literal = rf"\[ (?: {IPv6address} | {IPvFuture} ) \]" - -# reg-name = *( unreserved / pct-encoded / sub-delims ) -reg_name = rf"(?: {unreserved} | {pct_encoded} | {sub_delims} )*" - -# required for Path - -# segment = *pchar -segment = rf"{pchar}*" - -# segment-nz = 1*pchar -segment_nz = rf"{pchar}+" - -# segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" ) -segment_nz_nc = rf"(?: {unreserved} | {pct_encoded} | {sub_delims} | @ )+" - -# ----------------------------------------------------------------------------- -# -# Define SCHEME according RFC3986 section 3.1: -# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.1 -# - -# scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." 
) -scheme = rf"(?P {ALPHA} (?: {ALPHA} | {DIGIT} | \+ | \- | \. )* )" - -# ----------------------------------------------------------------------------- -# -# Define AUTHORITY according RFC3986 section 3.2: - -# Define USER INFORMATION according RFC3986 section 3.2.1: -# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.1 - -# userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) -userinfo = rf"""(?P - (?: {unreserved} | {pct_encoded} | {sub_delims} | : )* - ) -""" - -# Define HOST according RFC3986 section 3.2.2: -# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2 - -# host = IP-literal / IPv4address / reg-name -host = rf"(?P {IP_literal} | {IPv4address} | {reg_name} )" - -# Define PORT according RFC3986 section 3.2.3: -# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.3 - -# port = *DIGIT -port = rf"(?P ( {DIGIT} )* )" - -# Define AUTHORITY according RFC3986 section 3.2: -# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2 -# - -# authority = [ userinfo "@" ] host [ ":" port ] -# authority = rf"""(?: (?P {userinfo} ) @)? -authority = rf"""(?P - (?: {userinfo} @)? - {host} - (?: : {port} )? - ) -""" - -# ----------------------------------------------------------------------------- -# -# Define different PATHs according RFC3986 section 3.3: -# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.3 -# - -# path-abempty = *( "/" segment ) -path_abempty = rf"( / {segment} )*" - -# path-absolute = "/" [ segment-nz *( "/" segment ) ] -path_absolute = rf"( / (?: {segment_nz} (?: / {segment} )* )? 
)" - -# path-noscheme = segment-nz-nc *( "/" segment ) -path_noscheme = rf"( {segment_nz_nc} (?: / {segment} )* )" - -# path-rootless = segment-nz *( "/" segment ) -path_rootless = rf"( {segment_nz} (?: / {segment} )* )" - -# path-empty = 0 -path_empty = r"" - -# path = path-abempty ; begins with "/" or is empty -# / path-absolute ; begins with "/" but not "//" -# / path-noscheme ; begins with a non-colon segment -# / path-rootless ; begins with a segment -# / path-empty ; zero characters -path = rf"""(?: - {path_abempty} | - {path_absolute} | - {path_noscheme} | - {path_rootless} | - {path_empty} -) -""" - -# ----------------------------------------------------------------------------- -# -# Define QUERY according RFC3986 section 3.4: -# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.4 -# - -# query = *( pchar / "/" / "?" ) -query = rf"(?P (?: {pchar} | / | \? )* )" - -# ----------------------------------------------------------------------------- -# -# Define FRAGMENT according RFC3986 section 3.5: -# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.5 -# - -# fragment = *( pchar / "/" / "?" ) -fragment = rf"(?P (?: {pchar} | / | \? )* )" - -# ----------------------------------------------------------------------------- -# -# Define URI and HIERARCHICAL PATH according RFC3986 section 3: -# https://datatracker.ietf.org/doc/html/rfc3986/#section-3 -# - -# hier-part = "//" authority path-abempty -# / path-absolute -# / path-rootless -# / path-empty -hier_part = rf"""(?P - (?: // {authority} {path_abempty} ) | - {path_absolute} | - {path_rootless} | - {path_empty} -) -""" - -# URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] -URI = rf"(?P {scheme} : {hier_part} (?: \? {query} )? 
(?: \# {fragment} )?)" - -# ----------------------------------------------------------------------------- -# -# Define RELATIVE REFERENCE according RFC3986 section 4.2: -# https://datatracker.ietf.org/doc/html/rfc3986/#section-4.2 -# - -# relative-part = "//" authority path-abempty -# / path-absolute -# / path-noscheme -# / path-empty -# relative-ref = relative-part [ "?" query ] [ "#" fragment ] -relative_ref = rf"""\ -(?P - (?: - (?: // - {authority} - (?P {path_abempty} ) - ) | - (?P {path_absolute} ) | - (?P {path_noscheme} ) | - (?P {path_empty} ) - ) - (?: \? {query} )? - (?: \# {fragment} )? -) -""" - - -# reference := irelative-ref (as defined in IRI) -# !! IMPORTANT NOTE !! -# As of now this module don't support irelative-refs as defined in IRI, but -# relative-refs as defined in URI -# curie := [ [ prefix ] ':' ] reference -# reference := relative-ref (as defined in URI) -CURIE = rf"(?P (?: (?P {prefix} )? : )? {relative_ref})" - - -CURIE_RE = re.compile(f"^{CURIE}$", re.VERBOSE) From 6bc348ef022984c7a68e88612a270714081e1631 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 11 Mar 2024 10:48:41 +0100 Subject: [PATCH 11/19] Update tests.yml --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 82d8d31..4be0327 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -20,7 +20,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install dependencies - run: pip install tox tox-uv + run: pip install tox - name: Check manifest run: tox run -e manifest - name: Check code quality with flake8 From 62c6e6e501ae218cf3b10b18eebbbdd06a95539a Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 11 Mar 2024 10:56:10 +0100 Subject: [PATCH 12/19] Add explicit error --- src/curies/api.py | 14 ++++++++++---- tests/resources/invalid_curies.txt | 4 ---- tests/test_api.py | 3 ++- tests/test_w3c.py | 15 
++++++++++++++- 4 files changed, 26 insertions(+), 10 deletions(-) diff --git a/src/curies/api.py b/src/curies/api.py index e8ac76a..fddb1a4 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -48,9 +48,11 @@ "Reference", "ReferenceTuple", "Record", + # Exceptions "DuplicateValueError", "DuplicatePrefixes", "DuplicateURIPrefixes", + "W3CValidationError", # Utilities "chain", "upgrade_prefix_map", @@ -377,6 +379,10 @@ class URIStandardizationError(StandardizationError): """An error raise when a URI can't be standardized.""" +class W3CValidationError(ValueError): + """An error when W3C validation fails.""" + + def _get_duplicate_uri_prefixes(records: List[Record]) -> List[DuplicateSummary]: return [ DuplicateSummary(record_1, record_2, uri_prefix) @@ -504,7 +510,7 @@ def __init__( :raises DuplicatePrefixes: if any records share any synonyms :raises DuplicateURIPrefixes: if any records share any URI prefixes - :raises ValueError: If w3c validation is on and there are non-conformant records + :raises W3CValidationError: If w3c validation is on and there are non-conformant records """ if strict: duplicate_uri_prefixes = _get_duplicate_uri_prefixes(records) @@ -517,7 +523,7 @@ def __init__( if w3c_validation: broken = [record for record in records if not record.w3c_validate()] if broken: - raise ValueError(f"Records not conforming to W3C: {broken}") + raise W3CValidationError(f"Records not conforming to W3C: {broken}") self.delimiter = delimiter self.records = sorted(records, key=lambda r: r.prefix) @@ -1365,7 +1371,7 @@ def expand( A URI if this converter contains a URI prefix for the prefix in this CURIE :raises ExpansionError: If strict is true and the CURIE can't be expanded - :raises ValueError: + :raises W3CValidationError: If W3C validation is turned on and the CURIE is not valid under the CURIE specification >>> from curies import Converter @@ -1388,7 +1394,7 @@ def expand( instead of ``OBO:GO_0032571``. 
""" if w3c_validation and not curie_is_w3c(curie): - raise ValueError(f"CURIE is not valid under W3C spec: {curie}") + raise W3CValidationError(f"CURIE is not valid under W3C spec: {curie}") prefix, identifier = self.parse_curie(curie) rv = self.expand_pair(prefix, identifier) if rv: diff --git a/tests/resources/invalid_curies.txt b/tests/resources/invalid_curies.txt index 75e5cd1..696858d 100644 --- a/tests/resources/invalid_curies.txt +++ b/tests/resources/invalid_curies.txt @@ -1,5 +1 @@ -pfx://abc -pfx:// -:// -/ smiles:CC(=O)NC([H])(C)C(=O)O diff --git a/tests/test_api.py b/tests/test_api.py index 7391519..3c1d192 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -24,6 +24,7 @@ Reference, ReferenceTuple, URIStandardizationError, + W3CValidationError, chain, upgrade_prefix_map, ) @@ -817,7 +818,7 @@ def test_expand_w3c_invalid(self): ) curie = "smiles:CC(=O)NC([H])(C)C(=O)O" self.assertIsNotNone(converter.expand(curie)) - with self.assertRaises(ValueError): + with self.assertRaises(W3CValidationError): converter.expand(curie, w3c_validation=True) def test_expand_all(self): diff --git a/tests/test_w3c.py b/tests/test_w3c.py index d0e6172..8e9fed4 100644 --- a/tests/test_w3c.py +++ b/tests/test_w3c.py @@ -14,11 +14,24 @@ class TestW3C(unittest.TestCase): """Tests for W3C utilities.""" - def test_validating_curies(self): + def test_valid_curies(self): """Test validating CURIEs.""" for curie in VALID_CURIES_PATH.read_text().splitlines(): with self.subTest(curie=curie): self.assertTrue(curie_is_w3c(curie)) + + def test_invalid_curies(self): + """Test validating CURIEs. + + .. todo:: + + Later, extend this to the following: + + 1. ``pfx://abc`` + 2. ``pfx://`` + 3. ``://`` + 4. 
``/`` + """ for curie in INVALID_CURIES_PATH.read_text().splitlines(): with self.subTest(curie=curie): self.assertFalse(curie_is_w3c(curie)) From 0398a67669032d8e1f7514feeab83a959039862e Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 11 Mar 2024 11:07:18 +0100 Subject: [PATCH 13/19] cleanup --- src/curies/api.py | 4 ++++ tests/resources/invalid_curies.txt | 2 ++ tests/resources/valid_curies.txt | 1 - 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/curies/api.py b/src/curies/api.py index fddb1a4..f6fcbaa 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -2374,6 +2374,10 @@ def curie_is_w3c(s: str) -> bool: """ if "[" in s or "]" in s: return False + if not s.strip(): + return False + if s[0].isdigit(): + return False # TODO get that into the regex return bool(CURIE_RE.match(s)) diff --git a/tests/resources/invalid_curies.txt b/tests/resources/invalid_curies.txt index 696858d..d5d9efd 100644 --- a/tests/resources/invalid_curies.txt +++ b/tests/resources/invalid_curies.txt @@ -1 +1,3 @@ smiles:CC(=O)NC([H])(C)C(=O)O +4cdn:test + diff --git a/tests/resources/valid_curies.txt b/tests/resources/valid_curies.txt index 1066d8d..8ed858c 100644 --- a/tests/resources/valid_curies.txt +++ b/tests/resources/valid_curies.txt @@ -3,7 +3,6 @@ pfx:abc pfx: abc :abc - pfx:/abc pfx:/ :/ From 80c3038c6b25ca0b5204d05c3c786df1d23d36ea Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 12 Mar 2024 07:33:49 +0100 Subject: [PATCH 14/19] Update --- src/curies/api.py | 4 ++-- src/curies/w3c.py | 15 ++++++++++----- tests/resources/invalid_curies.txt | 2 +- tests/resources/valid_curies.txt | 3 +++ 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/curies/api.py b/src/curies/api.py index f6fcbaa..b256925 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -37,7 +37,7 @@ from pytrie import StringTrie from ._pydantic_compat import field_validator, get_field_validator_values -from .w3c import CURIE_RE, PREFIX_RE +from .w3c 
import CURIE_PREFIX_RE, CURIE_RE, URI_PREFIX_RE if TYPE_CHECKING: # pragma: no cover import pandas @@ -2383,4 +2383,4 @@ def curie_is_w3c(s: str) -> bool: def curie_prefix_is_w3c(s: str) -> bool: """Return if the CURIE prefix is valid under the W3C specification.""" - return bool(PREFIX_RE.match(s)) + return bool(CURIE_PREFIX_RE.match(s)) diff --git a/src/curies/w3c.py b/src/curies/w3c.py index 2317c24..ed3a652 100644 --- a/src/curies/w3c.py +++ b/src/curies/w3c.py @@ -8,7 +8,7 @@ import re -_PREFIX_RE = r"[A-Za-z_][A-Za-z0-9\.\-_]*" +_CURIE_PREFIX_RE = r"[A-Za-z_][A-Za-z0-9\.\-_]*" """The definition of a prefix, from https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName. .. code-block:: @@ -18,10 +18,15 @@ NCNameChar ::= Letter | Digit | '.' | '-' | '_' """ -PREFIX_RE = re.compile(f"^{_PREFIX_RE}$") +CURIE_PREFIX_RE = re.compile(f"^{_CURIE_PREFIX_RE}$") +#: Borrowed from https://github.com/linkml/prefixmaps/blob/82bfdbc/src/prefixmaps/datamodel/context.py#L26C1-L26C60 +#: Still needs adapting to see if there's an actual standard to match this to, +#: or if this is an opinionated implementation +URI_PREFIX_RE = re.compile(r"http[s]?://[\w\.\-\/]+[#/_:]$") -#: Borrowed from https://gist.github.com/niklasl/2506955 -CURIE_PATTERN = r"(([\i-[:]][\c-[:]]*)?:)?(/[^\s/][^\s]*|[^\s/][^\s]*|[^\s]?)" -CURIE_PATTERN = CURIE_PATTERN.replace(r"\i-[:]", r"_A-Za-z").replace(r"\c-[:]", r"-._:A-Za-z0-9") +#: Adapted from https://gist.github.com/niklasl/2506955 +_IDENTIFIER_RE = r"(/[^\s/][^\s]*|[^\s/][^\s]*|[^\s]?)" + +CURIE_PATTERN = rf"({_CURIE_PREFIX_RE}?:)?{_IDENTIFIER_RE}" CURIE_RE = re.compile(CURIE_PATTERN) diff --git a/tests/resources/invalid_curies.txt b/tests/resources/invalid_curies.txt index d5d9efd..b77661c 100644 --- a/tests/resources/invalid_curies.txt +++ b/tests/resources/invalid_curies.txt @@ -1,3 +1,3 @@ + smiles:CC(=O)NC([H])(C)C(=O)O 4cdn:test - diff --git a/tests/resources/valid_curies.txt b/tests/resources/valid_curies.txt index 8ed858c..7125989 100644 
--- a/tests/resources/valid_curies.txt +++ b/tests/resources/valid_curies.txt @@ -6,3 +6,6 @@ abc pfx:/abc pfx:/ :/ +bioregistry:bioregistry +GO:0000012 +go:0123456 From c15a08e4af7db08efd6dbbaddd57ca7aaf9cefb2 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 12 Mar 2024 07:42:52 +0100 Subject: [PATCH 15/19] Update type checking --- src/curies/api.py | 29 +++++++++++++++++++++++------ src/curies/w3c.py | 4 ++-- tests/resources/valid_curies.txt | 1 + tox.ini | 4 +++- 4 files changed, 29 insertions(+), 9 deletions(-) diff --git a/src/curies/api.py b/src/curies/api.py index b256925..5e99790 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -37,7 +37,7 @@ from pytrie import StringTrie from ._pydantic_compat import field_validator, get_field_validator_values -from .w3c import CURIE_PREFIX_RE, CURIE_RE, URI_PREFIX_RE +from .w3c import CURIE_PREFIX_RE, CURIE_RE if TYPE_CHECKING: # pragma: no cover import pandas @@ -1224,10 +1224,12 @@ def parse_uri(self, uri: str) -> Union[ReferenceTuple, Tuple[None, None]]: else: return ReferenceTuple(prefix, uri[len(value) :]) - def is_curie(self, s: str) -> bool: + def is_curie(self, s: str, *, w3c_validation: bool = False) -> bool: """Check if the string can be parsed as a CURIE by this converter. :param s: A string that might be a CURIE + :param w3c_validation: If true, requires CURIEs to be valid against the + `W3C CURIE specification `_. :returns: If the string can be parsed as a CURIE by this converter. 
Note that some valid CURIEs, when passed to this function, will result in False if their prefixes are not registered with this @@ -1248,7 +1250,7 @@ def is_curie(self, s: str) -> bool: False """ try: - return self.expand(s) is not None + return self.expand(s, w3c_validation=w3c_validation) is not None except ValueError: return False @@ -1334,19 +1336,34 @@ def expand_strict(self, curie: str) -> str: # docstr-coverage:excused `overload` @overload def expand( - self, curie: str, *, strict: Literal[True] = True, passthrough: bool = False + self, + curie: str, + *, + strict: Literal[True] = True, + passthrough: bool = ..., + w3c_validation: bool = ..., ) -> str: ... # docstr-coverage:excused `overload` @overload def expand( - self, curie: str, *, strict: Literal[False] = False, passthrough: Literal[True] = True + self, + curie: str, + *, + strict: Literal[False] = False, + passthrough: Literal[True] = True, + w3c_validation: bool = ..., ) -> str: ... # docstr-coverage:excused `overload` @overload def expand( - self, curie: str, *, strict: Literal[False] = False, passthrough: Literal[False] = False + self, + curie: str, + *, + strict: Literal[False] = False, + passthrough: Literal[False] = False, + w3c_validation: bool = ..., ) -> Optional[str]: ... 
def expand( diff --git a/src/curies/w3c.py b/src/curies/w3c.py index ed3a652..2d92ade 100644 --- a/src/curies/w3c.py +++ b/src/curies/w3c.py @@ -23,10 +23,10 @@ #: Borrowed from https://github.com/linkml/prefixmaps/blob/82bfdbc/src/prefixmaps/datamodel/context.py#L26C1-L26C60 #: Still needs adapting to see if there's an actual standard to match this to, #: or if this is an opinionated implementation -URI_PREFIX_RE = re.compile(r"http[s]?://[\w\.\-\/]+[#/_:]$") +URI_PREFIX_RE = re.compile(r"^http[s]?://[\w\.\-\/]+[#/_:]$") #: Adapted from https://gist.github.com/niklasl/2506955 _IDENTIFIER_RE = r"(/[^\s/][^\s]*|[^\s/][^\s]*|[^\s]?)" -CURIE_PATTERN = rf"({_CURIE_PREFIX_RE}?:)?{_IDENTIFIER_RE}" +CURIE_PATTERN = rf"^({_CURIE_PREFIX_RE}?:)?{_IDENTIFIER_RE}$" CURIE_RE = re.compile(CURIE_PATTERN) diff --git a/tests/resources/valid_curies.txt b/tests/resources/valid_curies.txt index 7125989..d919dd8 100644 --- a/tests/resources/valid_curies.txt +++ b/tests/resources/valid_curies.txt @@ -3,6 +3,7 @@ pfx:abc pfx: abc :abc +_:abc pfx:/abc pfx:/ :/ diff --git a/tox.ini b/tox.ini index 68f4354..620ffcb 100644 --- a/tox.ini +++ b/tox.ini @@ -110,7 +110,9 @@ commands = pyroma --min=10 . description = Run the pyroma tool to check the package friendliness of the project. [testenv:mypy] -deps = mypy +deps = + mypy + types-requests skip_install = true commands = mypy --install-types --non-interactive --ignore-missing-imports --strict src/ description = Run the mypy tool to check static typing on the project. 
From 291836aa562e23006f586dfb4738b8de3b83001d Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 12 Mar 2024 07:59:46 +0100 Subject: [PATCH 16/19] Add more docs --- docs/source/index.rst | 1 + docs/source/w3c.rst | 39 +++++++++++++++++++++++++++++++++++++++ src/curies/api.py | 1 - 3 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 docs/source/w3c.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 8bbe6ba..389e33f 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -66,4 +66,5 @@ for updating your code. discovery struct api + w3c services/index diff --git a/docs/source/w3c.rst b/docs/source/w3c.rst new file mode 100644 index 0000000..17561c8 --- /dev/null +++ b/docs/source/w3c.rst @@ -0,0 +1,39 @@ +W3C Compliance +============== +The Worldwide Web Consortium (W3C) provides standards for +`prefixes (i.e., NCName) `_), +`CURIEs `_, and +`IRIs `_, but they are +highly obfuscated and spread across many documents. + +In practice, some usages do not conform to these standards, often due +to encoding things that aren't _really_ supposed to be CURIEs, such as +SMILES strings for molecules, UCUM codes for units, +or other language-like "identifiers". + +Therefore, it's on the roadmap for the ``curies`` package to support +operations for validating against the W3C standards and mapping +between "loose" (i.e., un-URL-encoded) and strict (i.e., URL-encoded) +CURIEs and IRIs. In practice, this will often solve issues with special +characters like square brackets (``[`` and ``]``). + +.. code-block:: + + looseCURIE <-> strictCURIE + ^. \./. ^ + | X | + v / \. v + looseURI <-> strictURI + +A first step towards accomplishing this was implemented in https://github.com/biopragmatics/curies/pull/104 +by adding a ``w3c_validation`` flag to both the initialization of a :mod:`curies.Converter` as well as in the +:meth:`curies.Converter.expand` function. + +.. seealso:: + + 1. 
Discussion on the ``curies`` issue tracker about handling CURIEs that include e.g. square brackets + and therefore don't conform to the W3C specification: https://github.com/biopragmatics/curies/issues/103 + 2. Discussion on languages that shouldn't really get encoded in CURIEs, but still do: + https://github.com/biopragmatics/bioregistry/issues/460 + 3. Related to (2) - discussion on how to properly encode UCUM in CURIEs: + https://github.com/biopragmatics/bioregistry/issues/648 diff --git a/src/curies/api.py b/src/curies/api.py index 5e99790..b9671e2 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -2363,7 +2363,6 @@ def curie_is_w3c(s: str) -> bool: :param s: A string to check if it is a valid CURIE under the W3C specification. :return: True if the string is a valid CURIE under the W3C specification. - If no prefix is given, the host language chooses how to assign a default prefix. From 9d8fd8a70fa80866cf3603d04edb1cadb50980fa Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 12 Mar 2024 08:05:57 +0100 Subject: [PATCH 17/19] Update docs --- docs/source/w3c.rst | 2 +- src/curies/__init__.py | 2 ++ src/curies/api.py | 30 ++++++++++++++++-------------- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/docs/source/w3c.rst b/docs/source/w3c.rst index 17561c8..b19a9df 100644 --- a/docs/source/w3c.rst +++ b/docs/source/w3c.rst @@ -1,7 +1,7 @@ W3C Compliance ============== The Worldwide Web Consortium (W3C) provides standards for -`prefixes (i.e., NCName) `_), +`prefixes `_ (i.e., ``NCName``), `CURIEs `_, and `IRIs `_, but they are highly obfuscated and spread across many documents. 
diff --git a/src/curies/__init__.py b/src/curies/__init__.py index 718f198..efc4cf1 100644 --- a/src/curies/__init__.py +++ b/src/curies/__init__.py @@ -10,6 +10,7 @@ Record, Reference, ReferenceTuple, + W3CValidationError, chain, load_extended_prefix_map, load_jsonld_context, @@ -39,6 +40,7 @@ "DuplicateValueError", "DuplicateURIPrefixes", "DuplicatePrefixes", + "W3CValidationError", "chain", "remap_curie_prefixes", "remap_uri_prefixes", diff --git a/src/curies/api.py b/src/curies/api.py index b9671e2..2344aaf 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -1178,16 +1178,27 @@ def compress( :raises CompressionError: If strict is set to true and the URI can't be compressed - >>> from curies import Converter >>> converter = Converter.from_prefix_map({ ... "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_", ... "MONDO": "http://purl.obolibrary.org/obo/MONDO_", ... "GO": "http://purl.obolibrary.org/obo/GO_", + ... "OBO": "http://purl.obolibrary.org/obo/", ... }) - >>> converter.compress("http://purl.obolibrary.org/obo/CHEBI_138488") - 'CHEBI:138488' + >>> converter.compress("http://purl.obolibrary.org/obo/GO_0032571") + 'GO:0032571' + >>> converter.compress("http://purl.obolibrary.org/obo/go.owl") + 'OBO:go.owl' >>> converter.compress("http://example.org/missing:0000000") + + .. note:: + + If there are partially overlapping *URI prefixes* in this converter + (e.g., ``http://purl.obolibrary.org/obo/GO_`` for the prefix ``GO`` and + ``http://purl.obolibrary.org/obo/`` for the prefix ``OBO``), the longest + URI prefix will always be matched. For example, parsing + ``http://purl.obolibrary.org/obo/GO_0032571`` will return ``GO:0032571`` + instead of ``OBO:GO_0032571``. """ prefix, identifier = self.parse_uri(uri) if prefix and identifier: @@ -1380,8 +1391,8 @@ def expand( A string representing a compact URI (CURIE) :param strict: If true and the CURIE can't be expanded, returns an error. Defaults to false. 
:param passthrough: If true, strict is false, and the CURIE can't be expanded, return the input. - Defaults to false. If your strings can either be a CURIE _or_ a URI, consider using - :meth:`Converter.expand_ambiguous` instead. + Defaults to false. If your strings can either be a CURIE or a URI, consider using + :meth:`Converter.expand_or_standardize` instead. :param w3c_validation: If true, requires CURIEs to be valid against the `W3C CURIE specification `_. :returns: @@ -1400,15 +1411,6 @@ def expand( >>> converter.expand("CHEBI:138488") 'http://purl.obolibrary.org/obo/CHEBI_138488' >>> converter.expand("missing:0000000") - - .. note:: - - If there are partially overlapping *URI prefixes* in this converter - (e.g., ``http://purl.obolibrary.org/obo/GO_`` for the prefix ``GO`` and - ``http://purl.obolibrary.org/obo/`` for the prefix ``OBO``), the longest - URI prefix will always be matched. For example, parsing - ``http://purl.obolibrary.org/obo/GO_0032571`` will return ``GO:0032571`` - instead of ``OBO:GO_0032571``. """ if w3c_validation and not curie_is_w3c(curie): raise W3CValidationError(f"CURIE is not valid under W3C spec: {curie}") From 24a15b67430e17057733455291b49d845de228dd Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 12 Mar 2024 09:21:23 +0100 Subject: [PATCH 18/19] Add specific code examples --- docs/source/w3c.rst | 57 +++++++++++++++++++++++++++++++++++++++++++++ src/curies/api.py | 3 ++- 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/docs/source/w3c.rst b/docs/source/w3c.rst index b19a9df..2c985ba 100644 --- a/docs/source/w3c.rst +++ b/docs/source/w3c.rst @@ -29,6 +29,63 @@ A first step towards accomplishing this was implemented in https://github.com/bi by adding a ``w3c_validation`` flag to both the initialization of a :mod:`curies.Converter` as well as in the :meth:`curies.Converter.expand` function. +Here's an example of using W3C validation during expansion: + +.. 
code-block:: + + import curies + + converter = curies.Converter.from_prefix_map({ + "smiles": "https://bioregistry.io/smiles:", + }) + + >>> converter.expand("smiles:CC(=O)NC([H])(C)C(=O)O") + https://bioregistry.io/smiles:CC(=O)NC([H])(C)C(=O)O + + >>> converter.expand("smiles:CC(=O)NC([H])(C)C(=O)O", w3c_validation=True) + Traceback (most recent call last): + File "<stdin>", line 1, in <module> + File "/Users/cthoyt/dev/curies/src/curies/api.py", line 1362, in expand + raise W3CValidationError(f"CURIE is not valid under W3C spec: {curie}") + W3CValidationError: CURIE is not valid under W3C spec: smiles:CC(=O)NC([H])(C)C(=O)O + +This can also be used to extend :meth:`curies.Converter.is_curie`: + +.. code-block:: + + import curies + + converter = curies.Converter.from_prefix_map({ + "smiles": "https://bioregistry.io/smiles:", + }) + + >>> converter.is_curie("smiles:CC(=O)NC([H])(C)C(=O)O") + True + >>> converter.is_curie("smiles:CC(=O)NC([H])(C)C(=O)O", w3c_validation=True) + False + +Finally, this can be used during instantiation of a converter: + +.. code-block:: + + import curies + + >>> curies.Converter.from_prefix_map( + ... {"4dn.biosource": "https://data.4dnucleome.org/biosources/"}, + ... w3c_validation=True, + ... ) + Traceback (most recent call last): + File "<stdin>", line 1, in <module> + File "/Users/cthoyt/dev/curies/src/curies/api.py", line 816, in from_prefix_map + return cls( + ^^^^ + File "/Users/cthoyt/dev/curies/src/curies/api.py", line 527, in __init__ + raise W3CValidationError(f"Records not conforming to W3C:\n\n{msg}") + curies.api.W3CValidationError: Records not conforming to W3C: + + - Record(prefix='4dn.biosource', uri_prefix='https://data.4dnucleome.org/biosources/', prefix_synonyms=[], uri_prefix_synonyms=[], pattern=None) + + .. seealso:: 1. Discussion on the ``curies`` issue tracker about handling CURIEs that include e.g. 
square brackets diff --git a/src/curies/api.py b/src/curies/api.py index e397ee6..65f924e 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -523,7 +523,8 @@ def __init__( if w3c_validation: broken = [record for record in records if not record.w3c_validate()] if broken: - raise W3CValidationError(f"Records not conforming to W3C: {broken}") + msg = "\n".join(f" - {record!r}" for record in broken) + raise W3CValidationError(f"Records not conforming to W3C:\n\n{msg}") self.delimiter = delimiter self.records = sorted(records, key=lambda r: r.prefix) From 483736a9829667fa57258ee4206a413f98ecd6f8 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 12 Mar 2024 09:28:39 +0100 Subject: [PATCH 19/19] Hide internal code --- src/curies/api.py | 4 ++-- tests/test_api.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/curies/api.py b/src/curies/api.py index 65f924e..46c83a2 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -308,7 +308,7 @@ def _key(self) -> RecordKey: ",".join(sorted(self.uri_prefix_synonyms)), ) - def w3c_validate(self) -> bool: + def _w3c_validate(self) -> bool: """Check if all prefixes in this record are w3c compliant.""" all_curie_prefixes_valid = all(curie_prefix_is_w3c(prefix) for prefix in self._all_prefixes) # TODO extend to check URI prefixes? 
@@ -521,7 +521,7 @@ def __init__( raise DuplicatePrefixes(duplicate_prefixes) if w3c_validation: - broken = [record for record in records if not record.w3c_validate()] + broken = [record for record in records if not record._w3c_validate()] if broken: msg = "\n".join(f" - {record!r}" for record in broken) raise W3CValidationError(f"Records not conforming to W3C:\n\n{msg}") diff --git a/tests/test_api.py b/tests/test_api.py index 3c1d192..1986454 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -68,8 +68,8 @@ def test_w3c_prefix(self): r1 = Record(prefix=prefix, uri_prefix=uri_prefix) r2 = Record(prefix="prefix", prefix_synonyms=[prefix], uri_prefix=uri_prefix) with self.subTest(prefix=prefix): - self.assertEqual(value, r1._w3c_validate()) - self.assertEqual(value, r2._w3c_validate()) + self.assertEqual(value, r1._w3c_validate()) + self.assertEqual(value, r2._w3c_validate()) class TestAddRecord(unittest.TestCase):