From bb6cf6ecc545cf587357794cd340cc2207aa9688 Mon Sep 17 00:00:00 2001 From: Taneli Hukkinen <3275109+hukkin@users.noreply.github.com> Date: Wed, 1 Dec 2021 19:13:00 +0100 Subject: [PATCH] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20REFACTOR:=20Port=20`mdurl`?= =?UTF-8?q?=20and=20`punycode`=20for=20URL=20normalisation=20(#171)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This port brings markdown-it-py closer inline with markdown-it, and fixes the outstanding CommonMark compliance tests. --- markdown_it/_punycode.py | 66 +++++++++ markdown_it/common/normalize_url.py | 177 ++++++------------------ markdown_it/common/utils.py | 15 +- markdown_it/helpers/parse_link_title.py | 4 +- markdown_it/rules_block/fence.py | 3 +- setup.cfg | 1 + tests/test_port/test_fixtures.py | 10 -- 7 files changed, 120 insertions(+), 156 deletions(-) create mode 100644 markdown_it/_punycode.py diff --git a/markdown_it/_punycode.py b/markdown_it/_punycode.py new file mode 100644 index 00000000..9ad24421 --- /dev/null +++ b/markdown_it/_punycode.py @@ -0,0 +1,66 @@ +# Copyright 2014 Mathias Bynens +# Copyright 2021 Taneli Hukkinen +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import codecs +import re + +REGEX_SEPARATORS = re.compile(r"[\x2E\u3002\uFF0E\uFF61]") +REGEX_NON_ASCII = re.compile(r"[^\0-\x7E]") + + +def encode(uni: str) -> str: + return codecs.encode(uni, encoding="punycode").decode() + + +def decode(ascii: str) -> str: + return codecs.decode(ascii, encoding="punycode") # type: ignore[call-overload] + + +def map_domain(string, fn): + parts = string.split("@") + result = "" + if len(parts) > 1: + # In email addresses, only the domain name should be punycoded. Leave + # the local part (i.e. everything up to `@`) intact. 
+ result = parts[0] + "@" + string = parts[1] + labels = REGEX_SEPARATORS.split(string) + encoded = ".".join(fn(label) for label in labels) + return result + encoded + + +def to_unicode(obj: str) -> str: + def mapping(obj: str) -> str: + if obj.startswith("xn--"): + return decode(obj[4:].lower()) + return obj + + return map_domain(obj, mapping) + + +def to_ascii(obj: str) -> str: + def mapping(obj: str) -> str: + if REGEX_NON_ASCII.search(obj): + return "xn--" + encode(obj) + return obj + + return map_domain(obj, mapping) diff --git a/markdown_it/common/normalize_url.py b/markdown_it/common/normalize_url.py index d150ca36..d1ab85e3 100644 --- a/markdown_it/common/normalize_url.py +++ b/markdown_it/common/normalize_url.py @@ -1,70 +1,13 @@ -import html import re from typing import Callable, Optional from urllib.parse import urlparse, urlunparse, quote, unquote # noqa: F401 -from .utils import ESCAPABLE +import mdurl -# TODO below we port the use of the JS packages: -# var mdurl = require('mdurl') -# var punycode = require('punycode') -# -# e.g. mdurl: parsed = mdurl.parse(url, True) -# -# but need to check these fixes from https://www.npmjs.com/package/mdurl: -# -# Parse url string. Similar to node's url.parse, -# but without any normalizations and query string parse. -# url - input url (string) -# slashesDenoteHost - if url starts with //, expect a hostname after it. Optional, false. -# Difference with node's url: +from .. import _punycode -# No leading slash in paths, e.g. in url.parse('http://foo?bar') pathname is ``, not / -# Backslashes are not replaced with slashes, so http:\\example.org\ is treated like a relative path -# Trailing colon is treated like a part of the path, i.e. in http://example.org:foo pathname is :foo -# Nothing is URL-encoded in the resulting object, -# (in joyent/node some chars in auth and paths are encoded) -# url.parse() does not have parseQueryString argument -# Removed extraneous result properties: host, path, query, etc., -# which can be constructed using other parts of the url. 
- -# ################# Copied from Commonmark.py ################# - -ENTITY = "&(?:#x[a-f0-9]{1,6}|#[0-9]{1,7}|[a-z][a-z0-9]{1,31});" -reBackslashOrAmp = re.compile(r"[\\&]") -reEntityOrEscapedChar = re.compile( - "\\\\" + "[" + ESCAPABLE + "]|" + ENTITY, re.IGNORECASE -) - - -def unescape_char(s: str) -> str: - if s[0] == "\\": - return s[1] - else: - return html.unescape(s) - - -def unescape_string(s: str) -> str: - """Replace entities and backslash escapes with literal characters.""" - if re.search(reBackslashOrAmp, s): - return re.sub(reEntityOrEscapedChar, lambda m: unescape_char(m.group()), s) - else: - return s - - -def normalize_uri(uri: str) -> str: - return quote(uri, safe="/@:+?=&()%#*,") - - -################## - - -RECODE_HOSTNAME_FOR = ("http", "https", "mailto") - - -def unescape_normalize_uri(x: str) -> str: - return normalize_uri(unescape_string(x)) +RECODE_HOSTNAME_FOR = ("http:", "https:", "mailto:") def normalizeLink(url: str) -> str: @@ -75,49 +18,25 @@ def normalizeLink(url: str) -> str: [label]: destination 'title' ^^^^^^^^^^^ """ - (scheme, netloc, path, params, query, fragment) = urlparse(url) - if scheme in RECODE_HOSTNAME_FOR: - url = urlunparse( - ( - scheme, - unescape_normalize_uri(netloc), - normalize_uri(path), - unescape_normalize_uri(params), - normalize_uri(query), - unescape_normalize_uri(fragment), - ) - ) - else: - url = unescape_normalize_uri(url) - - return url - - # TODO the selective encoding below should probably be done here, - # something like: - # url_check = urllib.parse.urlparse(destination) - # if url_check.scheme in RECODE_HOSTNAME_FOR: ... - - # parsed = urlparse(url) - # if parsed.hostname: - # # Encode hostnames in urls like: - # # `http:#host/`, `https:#host/`, `mailto:user@host`, `#host/` - # # - # # We don't encode unknown schemas, because it's likely that we encode - # # something we shouldn't (e.g. `skype:name` treated as `skype:host`) - # # - # if (not parsed.scheme) or parsed.scheme in RECODE_HOSTNAME_FOR: - # try: - # parsed.hostname = punycode.toASCII(parsed.hostname) - # except Exception: - # pass - # return quote(urlunparse(parsed)) - - -def unescape_unquote(x: str) -> str: - return unquote(unescape_string(x)) - - -def normalizeLinkText(link: str) -> str: + parsed = mdurl.parse(url, slashes_denote_host=True) + + if parsed.hostname: + # Encode hostnames in urls like: + # `http://host/`, `https://host/`, `mailto:user@host`, `//host/` + # + # We don't encode unknown schemas, because it's likely that we encode + # something we shouldn't (e.g. `skype:name` treated as `skype:host`) + # + if not parsed.protocol or parsed.protocol in RECODE_HOSTNAME_FOR: + try: + parsed = parsed._replace(hostname=_punycode.to_ascii(parsed.hostname)) + except Exception: + pass + + return mdurl.encode(mdurl.format(parsed)) + + +def normalizeLinkText(url: str) -> str: """Normalize autolink content :: @@ -125,41 +44,23 @@ def normalizeLinkText(link: str) -> str: ~~~~~~~~~~~ """ - (scheme, netloc, path, params, query, fragment) = urlparse(link) - if scheme in RECODE_HOSTNAME_FOR: - url = urlunparse( - ( - scheme, - unescape_unquote(netloc), - unquote(path), - unescape_unquote(params), - unquote(query), - unescape_unquote(fragment), - ) - ) - else: - url = unescape_unquote(link) - return url - - # TODO the selective encoding below should probably be done here, - # something like: - # url_check = urllib.parse.urlparse(destination) - # if url_check.scheme in RECODE_HOSTNAME_FOR: ... 
- - # parsed = urlparse(url) - # if parsed.hostname: - # # Encode hostnames in urls like: - # # `http:#host/`, `https:#host/`, `mailto:user@host`, `#host/` - # # - # # We don't encode unknown schemas, because it's likely that we encode - # # something we shouldn't (e.g. `skype:name` treated as `skype:host`) - # # - # if (not parsed.protocol) or parsed.protocol in RECODE_HOSTNAME_FOR: - # try: - # parsed.hostname = punycode.toUnicode(parsed.hostname) - # except Exception: - # pass - # return unquote(urlunparse(parsed)) + parsed = mdurl.parse(url, slashes_denote_host=True) + + if parsed.hostname: + # Encode hostnames in urls like: + # `http://host/`, `https://host/`, `mailto:user@host`, `//host/` + # + # We don't encode unknown schemas, because it's likely that we encode + # something we shouldn't (e.g. `skype:name` treated as `skype:host`) + # + if not parsed.protocol or parsed.protocol in RECODE_HOSTNAME_FOR: + try: + parsed = parsed._replace(hostname=_punycode.to_unicode(parsed.hostname)) + except Exception: + pass + + # add '%' to exclude list because of https://github.com/markdown-it/markdown-it/issues/720 + return mdurl.decode(mdurl.format(parsed), mdurl.DECODE_DEFAULT_CHARS + "%") BAD_PROTO_RE = re.compile(r"^(vbscript|javascript|file|data):") diff --git a/markdown_it/common/utils.py b/markdown_it/common/utils.py index 8b2c4a7f..e2abc368 100644 --- a/markdown_it/common/utils.py +++ b/markdown_it/common/utils.py @@ -6,8 +6,6 @@ from .entities import entities -# from .normalize_url import unescape_string - def charCodeAt(src: str, pos: int) -> Any: """ @@ -105,7 +103,7 @@ def fromCodePoint(c: int) -> str: UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])') # ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE) UNESCAPE_ALL_RE = re.compile( - r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31})", + r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});", re.IGNORECASE, ) DIGITAL_ENTITY_TEST_RE = re.compile(r"^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))", re.IGNORECASE) @@ -146,7 +144,16 @@ def unescapeMd(string: str) -> str: def unescapeAll(string: str) -> str: - return html.unescape(string) + def replacer_func(match): + escaped = match.group(1) + if escaped: + return escaped + entity = match.group(2) + return replaceEntityPattern(match.group(), entity) + + if "\\" not in string and "&" not in string: + return string + return UNESCAPE_ALL_RE.sub(replacer_func, string) ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-""" diff --git a/markdown_it/helpers/parse_link_title.py b/markdown_it/helpers/parse_link_title.py index 4aa67e88..0cb1365b 100644 --- a/markdown_it/helpers/parse_link_title.py +++ b/markdown_it/helpers/parse_link_title.py @@ -1,6 +1,6 @@ """Parse link title """ -from ..common.utils import unescapeAll, charCodeAt, stripEscape +from ..common.utils import unescapeAll, charCodeAt class _Result: @@ -40,7 +40,7 @@ def parseLinkTitle(string: str, pos: int, maximum: int) -> _Result: code = charCodeAt(string, pos) if code == marker: title = string[start + 1 : pos] - title = unescapeAll(stripEscape(title)) + title = unescapeAll(title) result.pos = pos + 1 result.lines = lines result.str = title diff --git a/markdown_it/rules_block/fence.py b/markdown_it/rules_block/fence.py index bacf54a2..c4f5275d 100644 --- a/markdown_it/rules_block/fence.py +++ b/markdown_it/rules_block/fence.py @@ -1,7 +1,6 @@ # fences (``` lang, ~~~ lang) import logging -from ..common.utils import stripEscape from .state_block 
import StateBlock LOGGER = logging.getLogger(__name__) @@ -97,7 +96,7 @@ def fence(state: StateBlock, startLine: int, endLine: int, silent: bool): state.line = nextLine + (1 if haveEndMarker else 0) token = state.push("fence", "code", 0) - token.info = stripEscape(params) + token.info = params token.content = state.getLines(startLine + 1, nextLine, length, True) token.markup = markup token.map = [startLine, state.line] diff --git a/setup.cfg b/setup.cfg index 5655f15b..04230826 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,6 +30,7 @@ project_urls = [options] packages = find: install_requires = + mdurl attrs>=19,<22 typing_extensions>=3.7.4;python_version<'3.8' python_requires = ~=3.6 diff --git a/tests/test_port/test_fixtures.py b/tests/test_port/test_fixtures.py index 5e94a149..a789fb1f 100644 --- a/tests/test_port/test_fixtures.py +++ b/tests/test_port/test_fixtures.py @@ -64,13 +64,6 @@ def test_table(line, title, input, expected): read_fixture_file(FIXTURE_PATH.joinpath("commonmark_extras.md")), ) def test_commonmark_extras(line, title, input, expected): - if title in { - "Escaping entities in links:", - "Checking combination of replaceEntities and unescapeMd:", - }: - # TODO fix failing escaping tests - # probably requires a fix of common.utils.stripEscape - pytest.xfail("escaping entities in link titles / fence.info") md = MarkdownIt("commonmark") md.options["langPrefix"] = "" text = md.render(input) @@ -99,9 +92,6 @@ def test_normalize_url(line, title, input, expected): "line,title,input,expected", read_fixture_file(FIXTURE_PATH.joinpath("fatal.md")) ) def test_fatal(line, title, input, expected): - if line in [1, 17]: - # TODO fix failing url escaping tests - pytest.xfail("url normalisation") md = MarkdownIt("commonmark").enable("replacements") md.options["typographer"] = True text = md.render(input)
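
Reviewer note (not part of the patch): a minimal sketch of the new URL-normalisation behaviour, assuming this branch is installed together with the new `mdurl` dependency. It only uses names introduced by the diff (`markdown_it._punycode.to_ascii` / `to_unicode` and the rewritten `normalizeLink` / `normalizeLinkText`). The outputs shown in the comments are indicative; the exact percent-encoding of full URLs is delegated to `mdurl`.

    # Sketch only -- assumes markdown-it-py with this patch applied and `mdurl` installed.
    from markdown_it._punycode import to_ascii, to_unicode
    from markdown_it.common.normalize_url import normalizeLink, normalizeLinkText

    # Hostnames are punycoded label by label; for email-style addresses the
    # local part (everything before `@`) is left untouched (see `map_domain`).
    print(to_ascii("mañana.com"))             # expected: xn--maana-pta.com
    print(to_unicode("xn--maana-pta.com"))    # expected: mañana.com
    print(to_ascii("user@bücher.example"))    # expected: user@xn--bcher-kva.example

    # Link destinations get an ASCII (punycoded) hostname and are encoded by
    # mdurl; autolink text goes the other way, back to a readable Unicode host.
    print(normalizeLink("https://mañana.com/"))              # expected: https://xn--maana-pta.com/
    print(normalizeLinkText("https://xn--maana-pta.com/"))   # expected: https://mañana.com/

Only `http:`, `https:` and `mailto:` (plus protocol-relative `//host/` URLs) have their hostnames recoded, mirroring upstream markdown-it; unknown schemes such as `skype:name` are passed through untouched.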