From 4a45826897af1b248b4a0650908f6b23178ee593 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 25 Sep 2020 09:37:06 +0300 Subject: [PATCH 1/8] Fix inconsistency between Python and C http request parsers. They both now correctly parse URLs containing pct-encoded sequences and work with yarl 1.6.0. Yarl >= 1.6 is now required. --- CHANGES/4972.bugfix | 1 + CONTRIBUTORS.txt | 1 + aiohttp/_http_parser.pyx | 2 +- requirements/ci-wheel.txt | 2 +- setup.py | 2 +- tests/test_http_parser.py | 65 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 70 insertions(+), 3 deletions(-) create mode 100644 CHANGES/4972.bugfix diff --git a/CHANGES/4972.bugfix b/CHANGES/4972.bugfix new file mode 100644 index 00000000000..caab9b7fbc7 --- /dev/null +++ b/CHANGES/4972.bugfix @@ -0,0 +1 @@ +Fix inconsistency between Python and C http request parsers in parsing pct-encoded URL. Yarl >=1.6 is now required. \ No newline at end of file diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt index b40c107f6cd..c39f8d62937 100644 --- a/CONTRIBUTORS.txt +++ b/CONTRIBUTORS.txt @@ -235,6 +235,7 @@ SeongSoo Cho Sergey Ninua Sergey Skripnick Serhii Kostel +Serhiy Storchaka Simon Kennedy Sin-Woo Bang Stanislas Plum diff --git a/aiohttp/_http_parser.pyx b/aiohttp/_http_parser.pyx index 153d9529b0d..2b195f3dfad 100644 --- a/aiohttp/_http_parser.pyx +++ b/aiohttp/_http_parser.pyx @@ -851,7 +851,7 @@ cdef _parse_url(char* buf_data, size_t length): return URL_build(scheme=schema, user=user, password=password, host=host, port=port, - path=path, query=query, fragment=fragment) + path=path, query_string=query, fragment=fragment, encoded=True) else: raise InvalidURLError("invalid url {!r}".format(buf_data)) finally: diff --git a/requirements/ci-wheel.txt b/requirements/ci-wheel.txt index fe29a5a3090..69e282c4f81 100644 --- a/requirements/ci-wheel.txt +++ b/requirements/ci-wheel.txt @@ -11,7 +11,7 @@ pytest==5.4.2 pytest-cov==2.8.1 pytest-mock==3.1.0 typing_extensions==3.7.4.2 -yarl==1.4.2 +yarl==1.6.0 # Using PEP 508 env markers to control dependency on runtimes: diff --git a/setup.py b/setup.py index 99a969e2131..5b70cd310c4 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ 'chardet>=2.0,<4.0', 'multidict>=4.5,<5.0', 'async_timeout>=4.0a2,<5.0', - 'yarl>=1.0,<2.0', + 'yarl>=1.6,<2.0', 'idna-ssl>=1.0; python_version<"3.7"', 'typing_extensions>=3.6.5', ] diff --git a/tests/test_http_parser.py b/tests/test_http_parser.py index af2e8094706..f7041e4d18a 100644 --- a/tests/test_http_parser.py +++ b/tests/test_http_parser.py @@ -2,6 +2,7 @@ import asyncio from unittest import mock +from urllib.parse import quote import pytest from multidict import CIMultiDict @@ -787,6 +788,70 @@ def test_url_parse_non_strict_mode(parser) -> None: assert payload.is_eof() +@pytest.mark.parametrize( + 'uri,path,query,fragment', + [ + ('/path%23frag', '/path#frag', {}, ''), + ('/path%2523frag', '/path%23frag', {}, ''), + ('/path?key=value%23frag', '/path', {'key': 'value#frag'}, ''), + ('/path?key=value%2523frag', '/path', {'key': 'value%23frag'}, ''), + ('/path#frag%20', '/path', {}, 'frag '), + ('/path#frag%2520', '/path', {}, 'frag%20'), + ] +) +def test_parse_uri_percent_encoded(parser, uri, path, query, fragment) -> None: + text = ('GET %s HTTP/1.1\r\n\r\n' % (uri,)).encode() + messages, upgrade, tail = parser.feed_data(text) + msg = messages[0][0] + + assert msg.path == uri + assert msg.url == URL(uri) + assert msg.url.path == path + assert msg.url.query == query + assert msg.url.fragment == fragment + + +def test_parse_uri_utf8(parser) -> None: + text = ('GET /путь?ключ=знач#фраг HTTP/1.1\r\n\r\n').encode() + messages, upgrade, tail = parser.feed_data(text) + msg = messages[0][0] + + assert msg.path == '/путь?ключ=знач#фраг' + assert msg.url.path == '/путь' + assert msg.url.query == {'ключ': 'знач'} + assert msg.url.fragment == 'фраг' + + +def test_parse_uri_utf8_percent_encoded(parser) -> None: + text = ( + 'GET %s HTTP/1.1\r\n\r\n' % + quote('/путь?ключ=знач#фраг', safe='/?=#') + ).encode() + messages, upgrade, tail = parser.feed_data(text) + msg = messages[0][0] + + assert msg.path == quote('/путь?ключ=знач#фраг', safe='/?=#') + assert msg.url == URL('/путь?ключ=знач#фраг') + assert msg.url.path == '/путь' + assert msg.url.query == {'ключ': 'знач'} + assert msg.url.fragment == 'фраг' + + +def test_parse_uri_non_utf8(parser) -> None: + text = ('GET /путь?ключ=знач#фраг HTTP/1.1\r\n\r\n').encode('cp1251') + messages, upgrade, tail = parser.feed_data(text) + msg = messages[0][0] + + def recode(s): + return s.encode('cp1251').decode('utf-8', 'surrogateescape') + + assert msg.path == recode('/путь?ключ=знач#фраг') + assert msg.url == URL(msg.path) + assert msg.url.path == recode('/путь') + assert msg.url.query == {recode('ключ'): recode('знач')} + assert msg.url.fragment == recode('фраг') + + class TestParsePayload: async def test_parse_eof_payload(self, stream) -> None: From 19b7c54fc6045070b0d9dcc74754cef924fdb493 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 26 Sep 2020 11:22:57 +0300 Subject: [PATCH 2/8] Fix web_urldispatcher.py. --- aiohttp/web_urldispatcher.py | 40 +++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/aiohttp/web_urldispatcher.py b/aiohttp/web_urldispatcher.py index 7712ee3c6d5..a69c79b7b57 100644 --- a/aiohttp/web_urldispatcher.py +++ b/aiohttp/web_urldispatcher.py @@ -400,9 +400,9 @@ def __init__(self, path: str, *, name: Optional[str]=None) -> None: if '{' in part or '}' in part: raise ValueError("Invalid path '{}'['{}']".format(path, part)) - path = URL.build(path=part).raw_path - formatter += path - pattern += re.escape(path) + part = _requote_path(part) + formatter += part + pattern += re.escape(part) try: compiled = re.compile(pattern) @@ -430,7 +430,7 @@ def _match(self, path: str) -> Optional[Dict[str, str]]: if match is None: return None else: - return {key: URL.build(path=value, encoded=True).path + return {key: _unquote_path(value) for key, value in match.groupdict().items()} def raw_match(self, path: str) -> bool: @@ -441,9 +441,9 @@ def get_info(self) -> Dict[str, Any]: 'pattern': self._pattern} def url_for(self, **parts: str) -> URL: - url = self._formatter.format_map({k: URL.build(path=v).raw_path + url = self._formatter.format_map({k: _quote_path(v) for k, v in parts.items()}) - return URL.build(path=url) + return URL.build(path=url, encoded=True) def __repr__(self) -> str: name = "'" + self.name + "' " if self.name is not None else "" @@ -457,7 +457,7 @@ def __init__(self, prefix: str, *, name: Optional[str]=None) -> None: assert not prefix or prefix.startswith('/'), prefix assert prefix in ('', '/') or not prefix.endswith('/'), prefix super().__init__(name=name) - self._prefix = URL.build(path=prefix).raw_path + self._prefix = _requote_path(prefix) @property def canonical(self) -> str: @@ -514,17 +514,13 @@ def url_for(self, *, filename: Union[str, Path], # type: ignore append_version = self._append_version if isinstance(filename, Path): filename = str(filename) - while filename.startswith('/'): - filename = filename[1:] - filename = '/' + filename + filename = filename.lstrip('/') # filename is not encoded - url = URL.build(path=self._prefix + filename) + url = URL.build(path=self._prefix, encoded=True) / filename if append_version: try: - if filename.startswith('/'): - filename = filename[1:] filepath = self._directory.joinpath(filename).resolve() if not self._follow_symlinks: filepath.relative_to(self._directory) @@ -571,8 +567,7 @@ async def resolve(self, request: Request) -> _Resolve: if method not in allowed_methods: return None, allowed_methods - match_dict = {'filename': URL.build(path=path[len(self._prefix)+1:], - encoded=True).path} + match_dict = {'filename': _unquote_path(path[len(self._prefix)+1:])} return (UrlMappingMatchInfo(match_dict, self._routes[method]), allowed_methods) @@ -1001,8 +996,7 @@ def add_resource(self, path: str, *, if resource.name == name and resource.raw_match(path): return cast(Resource, resource) if not ('{' in path or '}' in path or ROUTE_RE.search(path)): - url = URL.build(path=path) - resource = PlainResource(url.raw_path, name=name) + resource = PlainResource(_requote_path(path), name=name) self.register_resource(resource) return resource resource = DynamicResource(path, name=name) @@ -1121,3 +1115,15 @@ def add_routes(self, for route_def in routes: registered_routes.extend(route_def.register(self)) return registered_routes + + +def _quote_path(value: str) -> str: + return URL.build(path=value).raw_path + + +def _unquote_path(value: str) -> str: + return URL.build(path=value, encoded=True).path + + +def _requote_path(value: str) -> str: + return _quote_path(value).replace('%25', '%') From 8beb8a2c2b1ec317ddc89981b1164546405c89ef Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 26 Sep 2020 12:25:47 +0300 Subject: [PATCH 3/8] Add new tests. --- tests/test_urldispatch.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/tests/test_urldispatch.py b/tests/test_urldispatch.py index 0f548a4417f..331ed1dea9e 100644 --- a/tests/test_urldispatch.py +++ b/tests/test_urldispatch.py @@ -467,6 +467,20 @@ def test_add_static_append_version_not_follow_symlink(router, assert '/st/append_version_symlink/data.unknown_mime_type' == str(url) +def test_add_static_quoting(router) -> None: + resource = router.add_static('/пре %2Fфикс', + pathlib.Path(aiohttp.__file__).parent, + name='static') + assert router['static'] is resource + url = resource.url_for(filename='/1 2/файл%2F.txt') + assert url.path == '/пре /фикс/1 2/файл%2F.txt' + assert str(url) == ( + '/%D0%BF%D1%80%D0%B5%20%2F%D1%84%D0%B8%D0%BA%D1%81' + '/1%202/%D1%84%D0%B0%D0%B9%D0%BB%252F.txt' + ) + assert len(resource) == 2 + + def test_plain_not_match(router) -> None: handler = make_handler() router.add_route('GET', '/get/path', handler, name='name') @@ -629,10 +643,14 @@ def test_route_dynamic_with_regex(router) -> None: def test_route_dynamic_quoting(router) -> None: handler = make_handler() - route = router.add_route('GET', r'/{arg}', handler) - - url = route.url_for(arg='1 2/текст') - assert '/1%202/%D1%82%D0%B5%D0%BA%D1%81%D1%82' == str(url) + route = router.add_route('GET', r'/пре %2Fфикс/{arg}', handler) + + url = route.url_for(arg='1 2/текст%2F') + assert url.path == '/пре /фикс/1 2/текст%2F' + assert str(url) == ( + '/%D0%BF%D1%80%D0%B5%20%2F%D1%84%D0%B8%D0%BA%D1%81' + '/1%202/%D1%82%D0%B5%D0%BA%D1%81%D1%82%252F' + ) async def test_regular_match_info(router) -> None: From 5c8b948eb0adffb899a6f18ae3d7b56b21401121 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 26 Sep 2020 12:26:55 +0300 Subject: [PATCH 4/8] Update tests/test_http_parser.py Co-authored-by: Sviatoslav Sydorenko --- tests/test_http_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_http_parser.py b/tests/test_http_parser.py index f7041e4d18a..52df0b474fa 100644 --- a/tests/test_http_parser.py +++ b/tests/test_http_parser.py @@ -789,7 +789,7 @@ def test_url_parse_non_strict_mode(parser) -> None: @pytest.mark.parametrize( - 'uri,path,query,fragment', + ('uri', 'path', 'query', 'fragment'), [ ('/path%23frag', '/path#frag', {}, ''), ('/path%2523frag', '/path%23frag', {}, ''), From 5d4828ca2faac9245a51171cd95a0c9e181cd8fa Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 26 Sep 2020 13:02:23 +0300 Subject: [PATCH 5/8] Optimize _requote_path(). --- aiohttp/web_urldispatcher.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/aiohttp/web_urldispatcher.py b/aiohttp/web_urldispatcher.py index a69c79b7b57..4ee1d7d81f7 100644 --- a/aiohttp/web_urldispatcher.py +++ b/aiohttp/web_urldispatcher.py @@ -1126,4 +1126,7 @@ def _unquote_path(value: str) -> str: def _requote_path(value: str) -> str: - return _quote_path(value).replace('%25', '%') + result = _quote_path(value) + if '%' in value: + result = result.replace('%25', '%') + return result From c996e4e86710dcab54956ae330775c76dadffc8c Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 12 Oct 2020 12:12:57 +0300 Subject: [PATCH 6/8] Address review comments. --- aiohttp/web_urldispatcher.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/aiohttp/web_urldispatcher.py b/aiohttp/web_urldispatcher.py index 4ee1d7d81f7..8fd77f1f9fb 100644 --- a/aiohttp/web_urldispatcher.py +++ b/aiohttp/web_urldispatcher.py @@ -1118,7 +1118,7 @@ def add_routes(self, def _quote_path(value: str) -> str: - return URL.build(path=value).raw_path + return URL.build(path=value, encoded=False).raw_path def _unquote_path(value: str) -> str: @@ -1126,6 +1126,8 @@ def _unquote_path(value: str) -> str: def _requote_path(value: str) -> str: + # Quote non-ascii characters and other characters which must be quoted, + # but preserve existing %-sequences. result = _quote_path(value) if '%' in value: result = result.replace('%25', '%') From 516e91b60d8b965b0ef9f20657c1fe6edd6d7060 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 16 Oct 2020 13:01:36 +0300 Subject: [PATCH 7/8] Remove test_parse_uri_non_utf8. --- tests/test_http_parser.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/tests/test_http_parser.py b/tests/test_http_parser.py index 52df0b474fa..8b555d7ef74 100644 --- a/tests/test_http_parser.py +++ b/tests/test_http_parser.py @@ -837,21 +837,6 @@ def test_parse_uri_utf8_percent_encoded(parser) -> None: assert msg.url.fragment == 'фраг' -def test_parse_uri_non_utf8(parser) -> None: - text = ('GET /путь?ключ=знач#фраг HTTP/1.1\r\n\r\n').encode('cp1251') - messages, upgrade, tail = parser.feed_data(text) - msg = messages[0][0] - - def recode(s): - return s.encode('cp1251').decode('utf-8', 'surrogateescape') - - assert msg.path == recode('/путь?ключ=знач#фраг') - assert msg.url == URL(msg.path) - assert msg.url.path == recode('/путь') - assert msg.url.query == {recode('ключ'): recode('знач')} - assert msg.url.fragment == recode('фраг') - - class TestParsePayload: async def test_parse_eof_payload(self, stream) -> None: From b2183ace13109ef32dc6f46f49aa809e39e93d1b Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 19 Oct 2020 14:25:02 +0300 Subject: [PATCH 8/8] Support yarl <1.6. --- CHANGES/4972.bugfix | 2 +- aiohttp/web_urldispatcher.py | 11 ++++++++++- setup.py | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/CHANGES/4972.bugfix b/CHANGES/4972.bugfix index caab9b7fbc7..6654f8a645d 100644 --- a/CHANGES/4972.bugfix +++ b/CHANGES/4972.bugfix @@ -1 +1 @@ -Fix inconsistency between Python and C http request parsers in parsing pct-encoded URL. Yarl >=1.6 is now required. \ No newline at end of file +Fix inconsistency between Python and C http request parsers in parsing pct-encoded URL. diff --git a/aiohttp/web_urldispatcher.py b/aiohttp/web_urldispatcher.py index 56ee8c15b13..08b989e8022 100644 --- a/aiohttp/web_urldispatcher.py +++ b/aiohttp/web_urldispatcher.py @@ -31,6 +31,7 @@ from typing_extensions import TypedDict from yarl import URL +from yarl import __version__ as yarl_version # type: ignore from . import hdrs from .abc import AbstractMatchInfo, AbstractRouter, AbstractView @@ -61,6 +62,8 @@ else: BaseDict = dict +YARL_VERSION = tuple(map(int, yarl_version.split('.')[:2])) + HTTP_METHOD_RE = re.compile(r"^[0-9A-Za-z!#\$%&'\*\+\-\.\^_`\|~]+$") ROUTE_RE = re.compile(r'(\{[_a-zA-Z][^{}]*(?:\{[^{}]*\}[^{}]*)*\})') PATH_SEP = re.escape('/') @@ -537,8 +540,12 @@ def url_for(self, *, filename: Union[str, Path], # type: ignore filename = str(filename) filename = filename.lstrip('/') + url = URL.build(path=self._prefix, encoded=True) # filename is not encoded - url = URL.build(path=self._prefix, encoded=True) / filename + if YARL_VERSION < (1, 6): + url = url / filename.replace('%', '%25') + else: + url = url / filename if append_version: try: @@ -1149,6 +1156,8 @@ def add_routes(self, def _quote_path(value: str) -> str: + if YARL_VERSION < (1, 6): + value = value.replace('%', '%25') return URL.build(path=value, encoded=False).raw_path diff --git a/setup.py b/setup.py index 7ab479eff97..a27c34257cc 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ 'chardet>=2.0,<4.0', 'multidict>=4.5,<7.0', 'async_timeout>=4.0a2,<5.0', - 'yarl>=1.6,<2.0', + 'yarl>=1.0,<2.0', 'idna-ssl>=1.0; python_version<"3.7"', 'typing_extensions>=3.6.5', ]