Treat binary and string fields differently

I believe this improves [1] which addresses a bytes/str inconsistency issue reported some time ago[2]. The goal of this patch is to have an option to always deseriaize binary fields as bytes and string fields as str (on Python 2, respectively: str and unicode). It achieves it by adding a third option to previously purely boolean decode_response parameter (I admit I'm not a fan of mixing types like this but I'm doing this strictly for backwards compatibility and I'd argue the "auto" behavior should become the default in the future and decode_response should be removed altogether). I'm not intimately familiar with Thrift specification so adding a separate ttype like this may be stupid - let me know if that is so. I've renamed and reorganized parts of the code in order for the names to be reasonably truthful and to avoid too much code redundancy connected to the fact that I've added more tests. Ideally some of the test code would be shared so reduce the redundancy even further but I feel that it's out of scope here. (Similarly the actual non-test code - I'm not entirely comfortable with the redundancy this commit introduces but there's quite a bit of redundancy in the code already so improving this is likely a whole separate task. Also extracting the logic to a separate function would negatively impact the performance /possibly not important/ but it could also make the code less readable, I'm conflicted here). [1] Thriftpy@00c6553 [2] Thriftpy#190
jstasiak · May 14, 2016 · f19539a · f19539a
1 parent 5aa0b90
commit f19539a
Show file tree

Hide file tree

Showing 8 changed files with 181 additions and 90 deletions.
diff --git a/docs/index.rst b/docs/index.rst
@@ -173,6 +173,44 @@ You can also use `from ... import ...` style after a standard module load.
     >>> from addressbook_thrift import *
 
 
+Binaries/bytes vs. strings/text
+===============================
+
+It's important to distinguish between arbitrary arrays of bytes (str
+in Python 2, bytes in Python 3) and text (unicode in Python 2, str in
+python 3). ThriftPy deals with this as follows:
+
+When serializing - both string and binary fields accept bytes and unicode.
+If unicode is passed it will be converted to bytes using UTF-8 encoding.
+
+When deserializing - there's `decode_response` constructor parameter
+present in protocols and protocol factories that controls ThriftPy's
+behavior in this regard. The default `decode_respone` value is `True`.
+The complete list of field type and `decode_response` value combinations
+is listed here:
+
+string fields
+`````````````
+
+* `decode_response` set to True - the value is returned as text if it's
+  possible to decode it using UTF-8 or as bytes otherwise
+
+* `decode_response` set to False - the value is always returned as bytes
+
+* `decode_response` set to 'auto' - the value is decoded using UTF-8 and
+  returned as text, if decoding fails an exception is raised
+
+binary fields
+`````````````
+
+* `decode_response` set to True - the value is returned as text if it's
+  possible to decode it using UTF-8 or as bytes otherwise
+
+* `decode_response` set to False - the value is always returned as bytes
+
+* `decode_response` set to 'auto' - the value is always returned as bytes
+
+
 Benchmarks
 ==========
 

diff --git a/tests/test_protocol_binary.py b/tests/test_protocol_binary.py
@@ -2,6 +2,8 @@
 
 from io import BytesIO
 
+import pytest
+
 from thriftpy._compat import u
 from thriftpy.thrift import TType, TPayload
 from thriftpy.utils import hexlify
@@ -71,29 +73,41 @@ def test_unpack_double():
     assert 1234567890.1234567890 == proto.read_val(b, TType.DOUBLE)
 
 
-def test_pack_string():
+@pytest.mark.parametrize(
+    "ttype,data,expected_result",
+    [
+        (TType.STRING, "hello world!",
+         "00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"),
+        (TType.BINARY, b"hello world!",
+         "00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"),
+        (TType.STRING, u("你好世界"),
+         "00 00 00 0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c"),
+    ],
+)
+def test_pack_string_or_binary(ttype, data, expected_result):
     b = BytesIO()
-    proto.write_val(b, TType.STRING, "hello world!")
-    assert "00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21" == \
-        hexlify(b.getvalue())
-
-    b = BytesIO()
-    proto.write_val(b, TType.STRING, u("你好世界"))
-    assert "00 00 00 0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c" == \
-        hexlify(b.getvalue())
-
-
-def test_unpack_string():
+    proto.write_val(b, ttype, data)
+    assert expected_result == hexlify(b.getvalue())
+
+
+@pytest.mark.parametrize(
+    "ttype,decode_response,expecting_text",
+    [
+        (TType.STRING, True, True),
+        (TType.STRING, False, False),
+        (TType.STRING, "auto", True),
+        (TType.BINARY, True, True),
+        (TType.BINARY, False, False),
+        (TType.BINARY, "auto", False),
+    ],
+)
+def test_read_string_or_binary(ttype, decode_response, expecting_text):
     b = BytesIO(b"\x00\x00\x00\x0c"
                 b"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c")
-    assert u("你好世界") == proto.read_val(b, TType.STRING)
-
-
-def test_unpack_binary():
-    bs = BytesIO(b"\x00\x00\x00\x0c"
-                 b"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c")
-    assert u("你好世界").encode("utf-8") == proto.read_val(
-        bs, TType.STRING, decode_response=False)
+    text = u("你好世界")
+    expected = text if expecting_text else text.encode('utf-8')
+    assert expected == proto.read_val(
+        b, ttype, decode_response=decode_response)
 
 
 def test_write_message_begin():

diff --git a/tests/test_protocol_compact.py b/tests/test_protocol_compact.py
@@ -2,6 +2,8 @@
 
 from io import BytesIO
 
+import pytest
+
 from thriftpy._compat import u
 from thriftpy.thrift import TType, TPayload
 from thriftpy.utils import hexlify
@@ -85,34 +87,47 @@ def test_unpack_double():
     assert 1234567890.1234567890 == proto.read_val(TType.DOUBLE)
 
 
-def test_pack_string():
+@pytest.mark.parametrize(
+    "ttype,data,expected_result",
+    [
+        (TType.STRING, "hello world!",
+         "0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"),
+        (TType.BINARY, b"hello world!",
+         "0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"),
+        (TType.STRING, u("你好世界"),
+         "0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c"),
+    ],
+)
+def test_pack_string_or_binary(ttype, data, expected_result):
     b, proto = gen_proto()
-    proto.write_val(TType.STRING, "hello world!")
-    assert "0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21" == \
-           hexlify(b.getvalue())
-
-    b1, proto1 = gen_proto()
-    proto1.write_val(TType.STRING, "你好世界")
-    assert "0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c" == \
-           hexlify(b1.getvalue())
+    proto.write_val(ttype, data)
+    assert expected_result == hexlify(b.getvalue())
 
 
 def test_unpack_string():
     b, proto = gen_proto(b"\x0c\x68\x65\x6c\x6c\x6f"
                          b"\x20\x77\x6f\x72\x6c\x64\x21")
     assert u('hello world!') == proto.read_val(TType.STRING)
 
-    b, proto = gen_proto(b'\x0c\xe4\xbd\xa0\xe5\xa5'
-                         b'\xbd\xe4\xb8\x96\xe7\x95\x8c')
-    assert u('你好世界') == proto.read_val(TType.STRING)
-
 
-def test_unpack_binary():
+@pytest.mark.parametrize(
+    "ttype,decode_response,expecting_text",
+    [
+        (TType.STRING, True, True),
+        (TType.STRING, False, False),
+        (TType.STRING, 'auto', True),
+        (TType.BINARY, True, True),
+        (TType.BINARY, False, False),
+        (TType.BINARY, 'auto', False),
+    ],
+)
+def test_unpack_string_or_binary(ttype, decode_response, expecting_text):
     b, proto = gen_proto(b'\x0c\xe4\xbd\xa0\xe5\xa5'
                          b'\xbd\xe4\xb8\x96\xe7\x95\x8c')
-    proto.decode_response = False
-
-    assert u('你好世界').encode("utf-8") == proto.read_val(TType.STRING)
+    proto.decode_response = decode_response
+    text = u('你好世界')
+    expected = text if expecting_text else text.encode('utf-8')
+    assert expected == proto.read_val(ttype)
 
 
 def test_pack_bool():

diff --git a/tests/test_protocol_cybinary.py b/tests/test_protocol_cybinary.py
@@ -120,31 +120,42 @@ def test_read_double():
     assert 1234567890.1234567890 == proto.read_val(b, TType.DOUBLE)
 
 
-def test_write_string():
+@pytest.mark.parametrize(
+    "ttype,data,expected_result",
+    [
+        (TType.STRING, "hello world!",
+         "00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"),
+        (TType.BINARY, b"hello world!",
+         "00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"),
+        (TType.STRING, u("你好世界"),
+         "00 00 00 0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c"),
+    ],
+)
+def test_write_string_or_binary(ttype, data, expected_result):
     b = TCyMemoryBuffer()
-    proto.write_val(b, TType.STRING, "hello world!")
+    proto.write_val(b, ttype, data)
     b.flush()
-    assert "00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21" == \
-        hexlify(b.getvalue())
-
-    b = TCyMemoryBuffer()
-    proto.write_val(b, TType.STRING, u("你好世界"))
-    b.flush()
-    assert "00 00 00 0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c" == \
-        hexlify(b.getvalue())
-
-
-def test_read_string():
-    b = TCyMemoryBuffer(b"\x00\x00\x00\x0c"
-                        b"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c")
-    assert u("你好世界") == proto.read_val(b, TType.STRING)
-
-
-def test_read_binary():
+    assert expected_result == hexlify(b.getvalue())
+
+
+@pytest.mark.parametrize(
+    "ttype,decode_response,expecting_text",
+    [
+        (TType.STRING, True, True),
+        (TType.STRING, False, False),
+        (TType.STRING, "auto", True),
+        (TType.BINARY, True, True),
+        (TType.BINARY, False, False),
+        (TType.BINARY, "auto", False),
+    ],
+)
+def test_read_string_or_binary(ttype, decode_response, expecting_text):
     b = TCyMemoryBuffer(b"\x00\x00\x00\x0c"
                         b"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c")
-    assert u("你好世界").encode("utf-8") == proto.read_val(
-        b, TType.STRING, decode_response=False)
+    text = u("你好世界")
+    expected = text if expecting_text else text.encode('utf-8')
+    assert expected == proto.read_val(
+        b, ttype, decode_response=decode_response)
 
 
 def test_write_message_begin():

diff --git a/thriftpy/protocol/binary.py b/thriftpy/protocol/binary.py
@@ -35,8 +35,8 @@ def pack_double(dub):
     return struct.pack("!d", dub)
 
 
-def pack_string(string):
-    return struct.pack("!i%ds" % len(string), len(string), string)
+def pack_binary(binary):
+    return struct.pack("!i%ds" % len(binary), len(binary), binary)
 
 
 def unpack_i8(buf):
@@ -62,9 +62,9 @@ def unpack_double(buf):
 def write_message_begin(outbuf, name, ttype, seqid, strict=True):
     if strict:
         outbuf.write(pack_i32(VERSION_1 | ttype))
-        outbuf.write(pack_string(name.encode('utf-8')))
+        outbuf.write(pack_binary(name.encode('utf-8')))
     else:
-        outbuf.write(pack_string(name.encode('utf-8')))
+        outbuf.write(pack_binary(name.encode('utf-8')))
         outbuf.write(pack_i8(ttype))
 
     outbuf.write(pack_i32(seqid))
@@ -108,10 +108,10 @@ def write_val(outbuf, ttype, val, spec=None):
     elif ttype == TType.DOUBLE:
         outbuf.write(pack_double(val))
 
-    elif ttype == TType.STRING:
+    elif ttype in [TType.STRING, TType.BINARY]:
         if not isinstance(val, bytes):
             val = val.encode('utf-8')
-        outbuf.write(pack_string(val))
+        outbuf.write(pack_binary(val))
 
     elif ttype == TType.SET or ttype == TType.LIST:
         if isinstance(spec, tuple):
@@ -224,17 +224,18 @@ def read_val(inbuf, ttype, spec=None, decode_response=True):
     elif ttype == TType.DOUBLE:
         return unpack_double(inbuf.read(8))
 
-    elif ttype == TType.STRING:
+    elif ttype in [TType.STRING, TType.BINARY]:
         sz = unpack_i32(inbuf.read(4))
         byte_payload = inbuf.read(sz)
-
-        # Since we cannot tell if we're getting STRING or BINARY
-        # if not asked not to decode, try both
-        if decode_response:
+        if (
+            decode_response == 'auto' and ttype == TType.STRING or
+            decode_response != 'auto' and decode_response
+        ):
             try:
                 return byte_payload.decode('utf-8')
             except UnicodeDecodeError:
-                pass
+                if decode_response == 'auto':
+                    raise
         return byte_payload
 
     elif ttype == TType.SET or ttype == TType.LIST:
@@ -331,7 +332,7 @@ def skip(inbuf, ftype):
     elif ftype == TType.DOUBLE:
         inbuf.read(8)
 
-    elif ftype == TType.STRING:
+    elif ftype == [TType.STRING, TType.BINARY]:
         inbuf.read(unpack_i32(inbuf.read(4)))
 
     elif ftype == TType.SET or ftype == TType.LIST: