From f19539a06cac628e93621625b656841f5c6a7a1a Mon Sep 17 00:00:00 2001 From: Jakub Stasiak Date: Sat, 14 May 2016 08:02:37 +0200 Subject: [PATCH] Treat binary and string fields differently I believe this improves [1] which addresses a bytes/str inconsistency issue reported some time ago[2]. The goal of this patch is to have an option to always deseriaize binary fields as bytes and string fields as str (on Python 2, respectively: str and unicode). It achieves it by adding a third option to previously purely boolean decode_response parameter (I admit I'm not a fan of mixing types like this but I'm doing this strictly for backwards compatibility and I'd argue the "auto" behavior should become the default in the future and decode_response should be removed altogether). I'm not intimately familiar with Thrift specification so adding a separate ttype like this may be stupid - let me know if that is so. I've renamed and reorganized parts of the code in order for the names to be reasonably truthful and to avoid too much code redundancy connected to the fact that I've added more tests. Ideally some of the test code would be shared so reduce the redundancy even further but I feel that it's out of scope here. (Similarly the actual non-test code - I'm not entirely comfortable with the redundancy this commit introduces but there's quite a bit of redundancy in the code already so improving this is likely a whole separate task. Also extracting the logic to a separate function would negatively impact the performance /possibly not important/ but it could also make the code less readable, I'm conflicted here). [1] https://github.com/eleme/thriftpy/commit/00c65539035953a79fafff90b1dbeb0f591e0fb2 [2] https://github.com/eleme/thriftpy/issues/190 --- docs/index.rst | 38 ++++++++++++++++++++++ tests/test_protocol_binary.py | 54 +++++++++++++++++++------------ tests/test_protocol_compact.py | 49 ++++++++++++++++++---------- tests/test_protocol_cybinary.py | 53 ++++++++++++++++++------------ thriftpy/protocol/binary.py | 27 ++++++++-------- thriftpy/protocol/compact.py | 30 +++++++++-------- thriftpy/protocol/cybin/cybin.pyx | 18 ++++++++--- thriftpy/thrift.py | 2 +- 8 files changed, 181 insertions(+), 90 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 737eb6c..c228522 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -173,6 +173,44 @@ You can also use `from ... import ...` style after a standard module load. >>> from addressbook_thrift import * +Binaries/bytes vs. strings/text +=============================== + +It's important to distinguish between arbitrary arrays of bytes (str +in Python 2, bytes in Python 3) and text (unicode in Python 2, str in +python 3). ThriftPy deals with this as follows: + +When serializing - both string and binary fields accept bytes and unicode. +If unicode is passed it will be converted to bytes using UTF-8 encoding. + +When deserializing - there's `decode_response` constructor parameter +present in protocols and protocol factories that controls ThriftPy's +behavior in this regard. The default `decode_respone` value is `True`. +The complete list of field type and `decode_response` value combinations +is listed here: + +string fields +````````````` + +* `decode_response` set to True - the value is returned as text if it's + possible to decode it using UTF-8 or as bytes otherwise + +* `decode_response` set to False - the value is always returned as bytes + +* `decode_response` set to 'auto' - the value is decoded using UTF-8 and + returned as text, if decoding fails an exception is raised + +binary fields +````````````` + +* `decode_response` set to True - the value is returned as text if it's + possible to decode it using UTF-8 or as bytes otherwise + +* `decode_response` set to False - the value is always returned as bytes + +* `decode_response` set to 'auto' - the value is always returned as bytes + + Benchmarks ========== diff --git a/tests/test_protocol_binary.py b/tests/test_protocol_binary.py index beb03cc..a03f440 100644 --- a/tests/test_protocol_binary.py +++ b/tests/test_protocol_binary.py @@ -2,6 +2,8 @@ from io import BytesIO +import pytest + from thriftpy._compat import u from thriftpy.thrift import TType, TPayload from thriftpy.utils import hexlify @@ -71,29 +73,41 @@ def test_unpack_double(): assert 1234567890.1234567890 == proto.read_val(b, TType.DOUBLE) -def test_pack_string(): +@pytest.mark.parametrize( + "ttype,data,expected_result", + [ + (TType.STRING, "hello world!", + "00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"), + (TType.BINARY, b"hello world!", + "00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"), + (TType.STRING, u("你好世界"), + "00 00 00 0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c"), + ], +) +def test_pack_string_or_binary(ttype, data, expected_result): b = BytesIO() - proto.write_val(b, TType.STRING, "hello world!") - assert "00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21" == \ - hexlify(b.getvalue()) - - b = BytesIO() - proto.write_val(b, TType.STRING, u("你好世界")) - assert "00 00 00 0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c" == \ - hexlify(b.getvalue()) - - -def test_unpack_string(): + proto.write_val(b, ttype, data) + assert expected_result == hexlify(b.getvalue()) + + +@pytest.mark.parametrize( + "ttype,decode_response,expecting_text", + [ + (TType.STRING, True, True), + (TType.STRING, False, False), + (TType.STRING, "auto", True), + (TType.BINARY, True, True), + (TType.BINARY, False, False), + (TType.BINARY, "auto", False), + ], +) +def test_read_string_or_binary(ttype, decode_response, expecting_text): b = BytesIO(b"\x00\x00\x00\x0c" b"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c") - assert u("你好世界") == proto.read_val(b, TType.STRING) - - -def test_unpack_binary(): - bs = BytesIO(b"\x00\x00\x00\x0c" - b"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c") - assert u("你好世界").encode("utf-8") == proto.read_val( - bs, TType.STRING, decode_response=False) + text = u("你好世界") + expected = text if expecting_text else text.encode('utf-8') + assert expected == proto.read_val( + b, ttype, decode_response=decode_response) def test_write_message_begin(): diff --git a/tests/test_protocol_compact.py b/tests/test_protocol_compact.py index 21ae4a0..8676935 100644 --- a/tests/test_protocol_compact.py +++ b/tests/test_protocol_compact.py @@ -2,6 +2,8 @@ from io import BytesIO +import pytest + from thriftpy._compat import u from thriftpy.thrift import TType, TPayload from thriftpy.utils import hexlify @@ -85,16 +87,21 @@ def test_unpack_double(): assert 1234567890.1234567890 == proto.read_val(TType.DOUBLE) -def test_pack_string(): +@pytest.mark.parametrize( + "ttype,data,expected_result", + [ + (TType.STRING, "hello world!", + "0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"), + (TType.BINARY, b"hello world!", + "0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"), + (TType.STRING, u("你好世界"), + "0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c"), + ], +) +def test_pack_string_or_binary(ttype, data, expected_result): b, proto = gen_proto() - proto.write_val(TType.STRING, "hello world!") - assert "0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21" == \ - hexlify(b.getvalue()) - - b1, proto1 = gen_proto() - proto1.write_val(TType.STRING, "你好世界") - assert "0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c" == \ - hexlify(b1.getvalue()) + proto.write_val(ttype, data) + assert expected_result == hexlify(b.getvalue()) def test_unpack_string(): @@ -102,17 +109,25 @@ def test_unpack_string(): b"\x20\x77\x6f\x72\x6c\x64\x21") assert u('hello world!') == proto.read_val(TType.STRING) - b, proto = gen_proto(b'\x0c\xe4\xbd\xa0\xe5\xa5' - b'\xbd\xe4\xb8\x96\xe7\x95\x8c') - assert u('你好世界') == proto.read_val(TType.STRING) - -def test_unpack_binary(): +@pytest.mark.parametrize( + "ttype,decode_response,expecting_text", + [ + (TType.STRING, True, True), + (TType.STRING, False, False), + (TType.STRING, 'auto', True), + (TType.BINARY, True, True), + (TType.BINARY, False, False), + (TType.BINARY, 'auto', False), + ], +) +def test_unpack_string_or_binary(ttype, decode_response, expecting_text): b, proto = gen_proto(b'\x0c\xe4\xbd\xa0\xe5\xa5' b'\xbd\xe4\xb8\x96\xe7\x95\x8c') - proto.decode_response = False - - assert u('你好世界').encode("utf-8") == proto.read_val(TType.STRING) + proto.decode_response = decode_response + text = u('你好世界') + expected = text if expecting_text else text.encode('utf-8') + assert expected == proto.read_val(ttype) def test_pack_bool(): diff --git a/tests/test_protocol_cybinary.py b/tests/test_protocol_cybinary.py index 3f5a2ac..bb874a8 100644 --- a/tests/test_protocol_cybinary.py +++ b/tests/test_protocol_cybinary.py @@ -120,31 +120,42 @@ def test_read_double(): assert 1234567890.1234567890 == proto.read_val(b, TType.DOUBLE) -def test_write_string(): +@pytest.mark.parametrize( + "ttype,data,expected_result", + [ + (TType.STRING, "hello world!", + "00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"), + (TType.BINARY, b"hello world!", + "00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"), + (TType.STRING, u("你好世界"), + "00 00 00 0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c"), + ], +) +def test_write_string_or_binary(ttype, data, expected_result): b = TCyMemoryBuffer() - proto.write_val(b, TType.STRING, "hello world!") + proto.write_val(b, ttype, data) b.flush() - assert "00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21" == \ - hexlify(b.getvalue()) - - b = TCyMemoryBuffer() - proto.write_val(b, TType.STRING, u("你好世界")) - b.flush() - assert "00 00 00 0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c" == \ - hexlify(b.getvalue()) - - -def test_read_string(): - b = TCyMemoryBuffer(b"\x00\x00\x00\x0c" - b"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c") - assert u("你好世界") == proto.read_val(b, TType.STRING) - - -def test_read_binary(): + assert expected_result == hexlify(b.getvalue()) + + +@pytest.mark.parametrize( + "ttype,decode_response,expecting_text", + [ + (TType.STRING, True, True), + (TType.STRING, False, False), + (TType.STRING, "auto", True), + (TType.BINARY, True, True), + (TType.BINARY, False, False), + (TType.BINARY, "auto", False), + ], +) +def test_read_string_or_binary(ttype, decode_response, expecting_text): b = TCyMemoryBuffer(b"\x00\x00\x00\x0c" b"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c") - assert u("你好世界").encode("utf-8") == proto.read_val( - b, TType.STRING, decode_response=False) + text = u("你好世界") + expected = text if expecting_text else text.encode('utf-8') + assert expected == proto.read_val( + b, ttype, decode_response=decode_response) def test_write_message_begin(): diff --git a/thriftpy/protocol/binary.py b/thriftpy/protocol/binary.py index 7ba21f7..1e40ed7 100644 --- a/thriftpy/protocol/binary.py +++ b/thriftpy/protocol/binary.py @@ -35,8 +35,8 @@ def pack_double(dub): return struct.pack("!d", dub) -def pack_string(string): - return struct.pack("!i%ds" % len(string), len(string), string) +def pack_binary(binary): + return struct.pack("!i%ds" % len(binary), len(binary), binary) def unpack_i8(buf): @@ -62,9 +62,9 @@ def unpack_double(buf): def write_message_begin(outbuf, name, ttype, seqid, strict=True): if strict: outbuf.write(pack_i32(VERSION_1 | ttype)) - outbuf.write(pack_string(name.encode('utf-8'))) + outbuf.write(pack_binary(name.encode('utf-8'))) else: - outbuf.write(pack_string(name.encode('utf-8'))) + outbuf.write(pack_binary(name.encode('utf-8'))) outbuf.write(pack_i8(ttype)) outbuf.write(pack_i32(seqid)) @@ -108,10 +108,10 @@ def write_val(outbuf, ttype, val, spec=None): elif ttype == TType.DOUBLE: outbuf.write(pack_double(val)) - elif ttype == TType.STRING: + elif ttype in [TType.STRING, TType.BINARY]: if not isinstance(val, bytes): val = val.encode('utf-8') - outbuf.write(pack_string(val)) + outbuf.write(pack_binary(val)) elif ttype == TType.SET or ttype == TType.LIST: if isinstance(spec, tuple): @@ -224,17 +224,18 @@ def read_val(inbuf, ttype, spec=None, decode_response=True): elif ttype == TType.DOUBLE: return unpack_double(inbuf.read(8)) - elif ttype == TType.STRING: + elif ttype in [TType.STRING, TType.BINARY]: sz = unpack_i32(inbuf.read(4)) byte_payload = inbuf.read(sz) - - # Since we cannot tell if we're getting STRING or BINARY - # if not asked not to decode, try both - if decode_response: + if ( + decode_response == 'auto' and ttype == TType.STRING or + decode_response != 'auto' and decode_response + ): try: return byte_payload.decode('utf-8') except UnicodeDecodeError: - pass + if decode_response == 'auto': + raise return byte_payload elif ttype == TType.SET or ttype == TType.LIST: @@ -331,7 +332,7 @@ def skip(inbuf, ftype): elif ftype == TType.DOUBLE: inbuf.read(8) - elif ftype == TType.STRING: + elif ftype == [TType.STRING, TType.BINARY]: inbuf.read(unpack_i32(inbuf.read(4))) elif ftype == TType.SET or ftype == TType.LIST: diff --git a/thriftpy/protocol/compact.py b/thriftpy/protocol/compact.py index 9f49e05..4d71b96 100644 --- a/thriftpy/protocol/compact.py +++ b/thriftpy/protocol/compact.py @@ -154,7 +154,7 @@ def read_message_begin(self): 'Bad version: %d (expect %d)' % (version, self.VERSION)) seqid = read_varint(self.trans) - name = self.read_string() + name = self.read_string_or_binary(TType.STRING) return name, type, seqid def read_message_end(self): @@ -226,15 +226,19 @@ def read_double(self): val, = unpack('(&n))[0] - elif ttype == T_STRING: + elif ttype in {T_BINARY, T_STRING}: size = read_i32(buf) - if decode_response: - return c_read_string(buf, size) + if ( + decode_response == "auto" and ttype == T_STRING or + decode_response != "auto" and decode_response + ): + raise_on_error = decode_response == "auto" + return c_read_string(buf, size, raise_on_error) else: return c_read_binary(buf, size) @@ -344,7 +352,7 @@ cdef c_write_val(CyTransportBase buf, TType ttype, val, spec=None): elif ttype == T_DOUBLE: write_double(buf, val) - elif ttype == T_STRING: + elif ttype in {T_STRING, T_BINARY}: if not isinstance(val, bytes): try: val = val.encode("utf-8") diff --git a/thriftpy/thrift.py b/thriftpy/thrift.py index 0cf0877..4acbe90 100644 --- a/thriftpy/thrift.py +++ b/thriftpy/thrift.py @@ -50,13 +50,13 @@ class TType(object): I64 = 10 STRING = 11 UTF7 = 11 - BINARY = 11 # This here just for parsing. For all purposes, it's a string STRUCT = 12 MAP = 13 SET = 14 LIST = 15 UTF8 = 16 UTF16 = 17 + BINARY = 18 _VALUES_TO_NAMES = { STOP: 'STOP',