Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bpo-42967: only use '&' as a query string separator #24297

Merged
merged 13 commits into from
Feb 14, 2021
23 changes: 14 additions & 9 deletions Lib/cgi.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,8 @@ def closelog():
# 0 ==> unlimited input
maxlen = 0

def parse(fp=None, environ=os.environ, keep_blank_values=0, strict_parsing=0):
def parse(fp=None, environ=os.environ, keep_blank_values=0,
AdamGold marked this conversation as resolved.
Show resolved Hide resolved
strict_parsing=0, separator='&'):
"""Parse a query in the environment or from a file (default stdin)

Arguments, all optional:
Expand All @@ -134,6 +135,9 @@ def parse(fp=None, environ=os.environ, keep_blank_values=0, strict_parsing=0):
strict_parsing: flag indicating what to do with parsing errors.
If false (the default), errors are silently ignored.
If true, errors raise a ValueError exception.

separator: str. The symbol to use for separating the query arguments.
Defaults to &.
"""
if fp is None:
fp = sys.stdin
Expand All @@ -154,7 +158,7 @@ def parse(fp=None, environ=os.environ, keep_blank_values=0, strict_parsing=0):
if environ['REQUEST_METHOD'] == 'POST':
ctype, pdict = parse_header(environ['CONTENT_TYPE'])
if ctype == 'multipart/form-data':
return parse_multipart(fp, pdict)
return parse_multipart(fp, pdict, separator=separator)
elif ctype == 'application/x-www-form-urlencoded':
clength = int(environ['CONTENT_LENGTH'])
if maxlen and clength > maxlen:
Expand All @@ -178,10 +182,10 @@ def parse(fp=None, environ=os.environ, keep_blank_values=0, strict_parsing=0):
qs = ""
environ['QUERY_STRING'] = qs # XXX Shouldn't, really
return urllib.parse.parse_qs(qs, keep_blank_values, strict_parsing,
encoding=encoding)
encoding=encoding, separator=separator)


def parse_multipart(fp, pdict, encoding="utf-8", errors="replace"):
def parse_multipart(fp, pdict, encoding="utf-8", errors="replace", separator='&'):
AdamGold marked this conversation as resolved.
Show resolved Hide resolved
"""Parse multipart input.

Arguments:
Expand All @@ -205,7 +209,7 @@ def parse_multipart(fp, pdict, encoding="utf-8", errors="replace"):
except KeyError:
pass
fs = FieldStorage(fp, headers=headers, encoding=encoding, errors=errors,
environ={'REQUEST_METHOD': 'POST'})
environ={'REQUEST_METHOD': 'POST'}, separator=separator)
return {k: fs.getlist(k) for k in fs}

def _parseparam(s):
Expand Down Expand Up @@ -315,7 +319,7 @@ class FieldStorage:
def __init__(self, fp=None, headers=None, outerboundary=b'',
environ=os.environ, keep_blank_values=0, strict_parsing=0,
limit=None, encoding='utf-8', errors='replace',
max_num_fields=None):
max_num_fields=None, separator='&'):
AdamGold marked this conversation as resolved.
Show resolved Hide resolved
"""Constructor. Read multipart/* until last part.

Arguments, all optional:
Expand Down Expand Up @@ -363,6 +367,7 @@ def __init__(self, fp=None, headers=None, outerboundary=b'',
self.keep_blank_values = keep_blank_values
self.strict_parsing = strict_parsing
self.max_num_fields = max_num_fields
self.separator = separator
if 'REQUEST_METHOD' in environ:
method = environ['REQUEST_METHOD'].upper()
self.qs_on_post = None
Expand Down Expand Up @@ -589,7 +594,7 @@ def read_urlencoded(self):
query = urllib.parse.parse_qsl(
qs, self.keep_blank_values, self.strict_parsing,
encoding=self.encoding, errors=self.errors,
max_num_fields=self.max_num_fields)
max_num_fields=self.max_num_fields, separator=self.separator)
self.list = [MiniFieldStorage(key, value) for key, value in query]
self.skip_lines()

Expand All @@ -605,7 +610,7 @@ def read_multi(self, environ, keep_blank_values, strict_parsing):
query = urllib.parse.parse_qsl(
self.qs_on_post, self.keep_blank_values, self.strict_parsing,
encoding=self.encoding, errors=self.errors,
max_num_fields=self.max_num_fields)
max_num_fields=self.max_num_fields, separator=self.separator)
self.list.extend(MiniFieldStorage(key, value) for key, value in query)

klass = self.FieldStorageClass or self.__class__
Expand Down Expand Up @@ -649,7 +654,7 @@ def read_multi(self, environ, keep_blank_values, strict_parsing):
else self.limit - self.bytes_read
part = klass(self.fp, headers, ib, environ, keep_blank_values,
strict_parsing, limit,
self.encoding, self.errors, max_num_fields)
self.encoding, self.errors, max_num_fields, self.separator)

if max_num_fields is not None:
max_num_fields -= 1
Expand Down
5 changes: 0 additions & 5 deletions Lib/test/test_cgi.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,9 @@ def do_test(buf, method):
("", ValueError("bad query field: ''")),
("&", ValueError("bad query field: ''")),
("&&", ValueError("bad query field: ''")),
(";", ValueError("bad query field: ''")),
(";&;", ValueError("bad query field: ''")),
# Should the next few really be valid?
("=", {}),
("=&=", {}),
("=;=", {}),
# The rest seem to make sense
("=a", {'': ['a']}),
("&=a", ValueError("bad query field: ''")),
Expand All @@ -73,8 +70,6 @@ def do_test(buf, method):
("a=a+b&b=b+c", {'a': ['a b'], 'b': ['b c']}),
("a=a+b&a=b+a", {'a': ['a b', 'b a']}),
("x=1&y=2.0&z=2-3.%2b0", {'x': ['1'], 'y': ['2.0'], 'z': ['2-3.+0']}),
("x=1;y=2.0&z=2-3.%2b0", {'x': ['1'], 'y': ['2.0'], 'z': ['2-3.+0']}),
("x=1;y=2.0;z=2-3.%2b0", {'x': ['1'], 'y': ['2.0'], 'z': ['2-3.+0']}),
("Hbc5161168c542333633315dee1182227:key_store_seqid=400006&cuyer=r&view=bustomer&order_id=0bb2e248638833d48cb7fed300000f1b&expire=964546263&lobale=en-US&kid=130003.300038&ss=env",
{'Hbc5161168c542333633315dee1182227:key_store_seqid': ['400006'],
'cuyer': ['r'],
Expand Down
68 changes: 46 additions & 22 deletions Lib/test/test_urlparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,10 @@
(b"&a=b", [(b'a', b'b')]),
(b"a=a+b&b=b+c", [(b'a', b'a b'), (b'b', b'b c')]),
(b"a=1&a=2", [(b'a', b'1'), (b'a', b'2')]),
(";", []),
(";;", []),
(";a=b", [('a', 'b')]),
("a=a+b;b=b+c", [('a', 'a b'), ('b', 'b c')]),
("a=1;a=2", [('a', '1'), ('a', '2')]),
(b";", []),
(b";;", []),
(b";a=b", [(b'a', b'b')]),
(b"a=a+b;b=b+c", [(b'a', b'a b'), (b'b', b'b c')]),
(b"a=1;a=2", [(b'a', b'1'), (b'a', b'2')]),
(";a=b", [(';a', 'b')]),
("a=a+b;b=b+c", [('a', 'a b;b=b c')]),
(b";a=b", [(b';a', b'b')]),
(b"a=a+b;b=b+c", [(b'a', b'a b;b=b c')]),
]

# Each parse_qs testcase is a two-tuple that contains
Expand All @@ -68,16 +62,10 @@
(b"&a=b", {b'a': [b'b']}),
(b"a=a+b&b=b+c", {b'a': [b'a b'], b'b': [b'b c']}),
(b"a=1&a=2", {b'a': [b'1', b'2']}),
(";", {}),
(";;", {}),
(";a=b", {'a': ['b']}),
("a=a+b;b=b+c", {'a': ['a b'], 'b': ['b c']}),
("a=1;a=2", {'a': ['1', '2']}),
(b";", {}),
(b";;", {}),
(b";a=b", {b'a': [b'b']}),
(b"a=a+b;b=b+c", {b'a': [b'a b'], b'b': [b'b c']}),
(b"a=1;a=2", {b'a': [b'1', b'2']}),
(";a=b", {';a': ['b']}),
("a=a+b;b=b+c", {'a': ['a b;b=b c']}),
(b";a=b", {b';a': [b'b']}),
(b"a=a+b;b=b+c", {b'a':[ b'a b;b=b c']}),
]

class UrlParseTestCase(unittest.TestCase):
Expand Down Expand Up @@ -886,10 +874,46 @@ def test_parse_qsl_encoding(self):
def test_parse_qsl_max_num_fields(self):
with self.assertRaises(ValueError):
urllib.parse.parse_qs('&'.join(['a=a']*11), max_num_fields=10)
with self.assertRaises(ValueError):
urllib.parse.parse_qs(';'.join(['a=a']*11), max_num_fields=10)
AdamGold marked this conversation as resolved.
Show resolved Hide resolved
urllib.parse.parse_qs('&'.join(['a=a']*10), max_num_fields=10)

def test_parse_qs_separator(self):
# Verify parse_qs() when ';' is passed explicitly as the separator.
# Each case is (query, expected dict); str and bytes queries are both
# covered, including queries made up only of separators (expected {})
# and a repeated key (values collected into one list).
parse_qs_semicolon_cases = [
(";", {}),
(";;", {}),
(";a=b", {'a': ['b']}),
("a=a+b;b=b+c", {'a': ['a b'], 'b': ['b c']}),
("a=1;a=2", {'a': ['1', '2']}),
(b";", {}),
(b";;", {}),
(b";a=b", {b'a': [b'b']}),
(b"a=a+b;b=b+c", {b'a': [b'a b'], b'b': [b'b c']}),
(b"a=1;a=2", {b'a': [b'1', b'2']}),
]
for orig, expect in parse_qs_semicolon_cases:
# One subTest per case, labelled with the input and the expectation.
with self.subTest(f"Original: {orig!r}, Expected: {expect!r}"):
result = urllib.parse.parse_qs(orig, separator=';')
self.assertEqual(result, expect, "Error parsing %r" % orig)


def test_parse_qsl_separator(self):
# Verify parse_qsl() when ';' is passed explicitly as the separator.
# Each case is (query, expected list of (name, value) pairs); str and
# bytes queries are both covered, including queries made up only of
# separators (expected []) and a repeated key (one pair per occurrence).
parse_qsl_semicolon_cases = [
(";", []),
(";;", []),
(";a=b", [('a', 'b')]),
("a=a+b;b=b+c", [('a', 'a b'), ('b', 'b c')]),
("a=1;a=2", [('a', '1'), ('a', '2')]),
(b";", []),
(b";;", []),
(b";a=b", [(b'a', b'b')]),
(b"a=a+b;b=b+c", [(b'a', b'a b'), (b'b', b'b c')]),
(b"a=1;a=2", [(b'a', b'1'), (b'a', b'2')]),
]
for orig, expect in parse_qsl_semicolon_cases:
# One subTest per case, labelled with the input and the expectation.
with self.subTest(f"Original: {orig!r}, Expected: {expect!r}"):
result = urllib.parse.parse_qsl(orig, separator=';')
self.assertEqual(result, expect, "Error parsing %r" % orig)


AdamGold marked this conversation as resolved.
Show resolved Hide resolved
def test_urlencode_sequences(self):
# Other tests incidentally urlencode things; test non-covered cases:
# Sequence and object values.
Expand Down
20 changes: 15 additions & 5 deletions Lib/urllib/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -662,7 +662,7 @@ def unquote(string, encoding='utf-8', errors='replace'):


def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
encoding='utf-8', errors='replace', max_num_fields=None):
encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
AdamGold marked this conversation as resolved.
Show resolved Hide resolved
"""Parse a query given as a string argument.

Arguments:
Expand All @@ -686,12 +686,15 @@ def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
max_num_fields: int. If set, then throws a ValueError if there
are more than n fields read by parse_qsl().

separator: str. The symbol to use for separating the query arguments.
Defaults to &.

Returns a dictionary.
"""
parsed_result = {}
pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
encoding=encoding, errors=errors,
max_num_fields=max_num_fields)
max_num_fields=max_num_fields, separator=separator)
for name, value in pairs:
if name in parsed_result:
parsed_result[name].append(value)
Expand All @@ -701,7 +704,7 @@ def parse_qs(qs, keep_blank_values=False, strict_parsing=False,


def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
encoding='utf-8', errors='replace', max_num_fields=None):
encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
AdamGold marked this conversation as resolved.
Show resolved Hide resolved
"""Parse a query given as a string argument.

Arguments:
Expand All @@ -724,19 +727,26 @@ def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
max_num_fields: int. If set, then throws a ValueError
if there are more than n fields read by parse_qsl().

separator: str. The symbol to use for separating the query arguments.
Defaults to &.

Returns a list, as G-d intended.
"""
qs, _coerce_result = _coerce_args(qs)

if not separator or (not isinstance(separator, str)
and not isinstance(separator, bytes)):
raise ValueError("Separator must be of type string or bytes.")

# If max_num_fields is defined then check that the number of fields
# is less than max_num_fields. This prevents a memory exhaustion DOS
# attack via post bodies with many fields.
if max_num_fields is not None:
num_fields = 1 + qs.count('&') + qs.count(';')
num_fields = 1 + qs.count(separator)
if max_num_fields < num_fields:
raise ValueError('Max number of fields exceeded')

pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
pairs = [s1 for s1 in qs.split(separator)]
r = []
for name_value in pairs:
if not name_value and not strict_parsing:
Expand Down