Skip to content

Commit

Permalink
Treat binary and string fields differently
Browse files Browse the repository at this point in the history
I believe this improves [1] which addresses a bytes/str inconsistency issue
reported some time ago[2]. The goal of this patch is to have an option
to always deseriaize binary fields as bytes and string fields as str (on
Python 2, respectively: str and unicode). It achieves it by adding a
third option to previously purely boolean decode_response parameter (I
admit I'm not a fan of mixing types like this but I'm doing this
strictly for backwards compatibility and I'd argue the "auto" behavior
should become the default in the future and decode_response should be
removed altogether).

I'm not intimately familiar with Thrift specification so adding a
separate ttype like this may be stupid - let me know if that is so.

I've renamed and reorganized parts of the code in order for the names to
be reasonably truthful and to avoid too much code redundancy connected
to the fact that I've added more tests. Ideally some of the test code
would be shared so reduce the redundancy even further but I feel that
it's out of scope here. (Similarly the actual non-test code - I'm not
entirely comfortable with the redundancy this commit introduces but
there's quite a bit of redundancy in the code already so improving this
is likely a whole separate task. Also extracting the logic to a separate
function would negatively impact the performance /possibly not
important/ but it could also make the code less readable, I'm conflicted
here).

[1] Thriftpy@00c6553
[2] Thriftpy#190
  • Loading branch information
jstasiak committed May 14, 2016
1 parent 5aa0b90 commit f19539a
Show file tree
Hide file tree
Showing 8 changed files with 181 additions and 90 deletions.
38 changes: 38 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,44 @@ You can also use `from ... import ...` style after a standard module load.
>>> from addressbook_thrift import *
Binaries/bytes vs. strings/text
===============================

It's important to distinguish between arbitrary arrays of bytes (str
in Python 2, bytes in Python 3) and text (unicode in Python 2, str in
python 3). ThriftPy deals with this as follows:

When serializing - both string and binary fields accept bytes and unicode.
If unicode is passed it will be converted to bytes using UTF-8 encoding.

When deserializing - there's `decode_response` constructor parameter
present in protocols and protocol factories that controls ThriftPy's
behavior in this regard. The default `decode_respone` value is `True`.
The complete list of field type and `decode_response` value combinations
is listed here:

string fields
`````````````

* `decode_response` set to True - the value is returned as text if it's
possible to decode it using UTF-8 or as bytes otherwise

* `decode_response` set to False - the value is always returned as bytes

* `decode_response` set to 'auto' - the value is decoded using UTF-8 and
returned as text, if decoding fails an exception is raised

binary fields
`````````````

* `decode_response` set to True - the value is returned as text if it's
possible to decode it using UTF-8 or as bytes otherwise

* `decode_response` set to False - the value is always returned as bytes

* `decode_response` set to 'auto' - the value is always returned as bytes


Benchmarks
==========

Expand Down
54 changes: 34 additions & 20 deletions tests/test_protocol_binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from io import BytesIO

import pytest

from thriftpy._compat import u
from thriftpy.thrift import TType, TPayload
from thriftpy.utils import hexlify
Expand Down Expand Up @@ -71,29 +73,41 @@ def test_unpack_double():
assert 1234567890.1234567890 == proto.read_val(b, TType.DOUBLE)


def test_pack_string():
@pytest.mark.parametrize(
"ttype,data,expected_result",
[
(TType.STRING, "hello world!",
"00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"),
(TType.BINARY, b"hello world!",
"00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"),
(TType.STRING, u("你好世界"),
"00 00 00 0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c"),
],
)
def test_pack_string_or_binary(ttype, data, expected_result):
b = BytesIO()
proto.write_val(b, TType.STRING, "hello world!")
assert "00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21" == \
hexlify(b.getvalue())

b = BytesIO()
proto.write_val(b, TType.STRING, u("你好世界"))
assert "00 00 00 0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c" == \
hexlify(b.getvalue())


def test_unpack_string():
proto.write_val(b, ttype, data)
assert expected_result == hexlify(b.getvalue())


@pytest.mark.parametrize(
"ttype,decode_response,expecting_text",
[
(TType.STRING, True, True),
(TType.STRING, False, False),
(TType.STRING, "auto", True),
(TType.BINARY, True, True),
(TType.BINARY, False, False),
(TType.BINARY, "auto", False),
],
)
def test_read_string_or_binary(ttype, decode_response, expecting_text):
b = BytesIO(b"\x00\x00\x00\x0c"
b"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c")
assert u("你好世界") == proto.read_val(b, TType.STRING)


def test_unpack_binary():
bs = BytesIO(b"\x00\x00\x00\x0c"
b"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c")
assert u("你好世界").encode("utf-8") == proto.read_val(
bs, TType.STRING, decode_response=False)
text = u("你好世界")
expected = text if expecting_text else text.encode('utf-8')
assert expected == proto.read_val(
b, ttype, decode_response=decode_response)


def test_write_message_begin():
Expand Down
49 changes: 32 additions & 17 deletions tests/test_protocol_compact.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from io import BytesIO

import pytest

from thriftpy._compat import u
from thriftpy.thrift import TType, TPayload
from thriftpy.utils import hexlify
Expand Down Expand Up @@ -85,34 +87,47 @@ def test_unpack_double():
assert 1234567890.1234567890 == proto.read_val(TType.DOUBLE)


def test_pack_string():
@pytest.mark.parametrize(
"ttype,data,expected_result",
[
(TType.STRING, "hello world!",
"0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"),
(TType.BINARY, b"hello world!",
"0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"),
(TType.STRING, u("你好世界"),
"0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c"),
],
)
def test_pack_string_or_binary(ttype, data, expected_result):
b, proto = gen_proto()
proto.write_val(TType.STRING, "hello world!")
assert "0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21" == \
hexlify(b.getvalue())

b1, proto1 = gen_proto()
proto1.write_val(TType.STRING, "你好世界")
assert "0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c" == \
hexlify(b1.getvalue())
proto.write_val(ttype, data)
assert expected_result == hexlify(b.getvalue())


def test_unpack_string():
b, proto = gen_proto(b"\x0c\x68\x65\x6c\x6c\x6f"
b"\x20\x77\x6f\x72\x6c\x64\x21")
assert u('hello world!') == proto.read_val(TType.STRING)

b, proto = gen_proto(b'\x0c\xe4\xbd\xa0\xe5\xa5'
b'\xbd\xe4\xb8\x96\xe7\x95\x8c')
assert u('你好世界') == proto.read_val(TType.STRING)


def test_unpack_binary():
@pytest.mark.parametrize(
"ttype,decode_response,expecting_text",
[
(TType.STRING, True, True),
(TType.STRING, False, False),
(TType.STRING, 'auto', True),
(TType.BINARY, True, True),
(TType.BINARY, False, False),
(TType.BINARY, 'auto', False),
],
)
def test_unpack_string_or_binary(ttype, decode_response, expecting_text):
b, proto = gen_proto(b'\x0c\xe4\xbd\xa0\xe5\xa5'
b'\xbd\xe4\xb8\x96\xe7\x95\x8c')
proto.decode_response = False

assert u('你好世界').encode("utf-8") == proto.read_val(TType.STRING)
proto.decode_response = decode_response
text = u('你好世界')
expected = text if expecting_text else text.encode('utf-8')
assert expected == proto.read_val(ttype)


def test_pack_bool():
Expand Down
53 changes: 32 additions & 21 deletions tests/test_protocol_cybinary.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,31 +120,42 @@ def test_read_double():
assert 1234567890.1234567890 == proto.read_val(b, TType.DOUBLE)


def test_write_string():
@pytest.mark.parametrize(
"ttype,data,expected_result",
[
(TType.STRING, "hello world!",
"00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"),
(TType.BINARY, b"hello world!",
"00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"),
(TType.STRING, u("你好世界"),
"00 00 00 0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c"),
],
)
def test_write_string_or_binary(ttype, data, expected_result):
b = TCyMemoryBuffer()
proto.write_val(b, TType.STRING, "hello world!")
proto.write_val(b, ttype, data)
b.flush()
assert "00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21" == \
hexlify(b.getvalue())

b = TCyMemoryBuffer()
proto.write_val(b, TType.STRING, u("你好世界"))
b.flush()
assert "00 00 00 0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c" == \
hexlify(b.getvalue())


def test_read_string():
b = TCyMemoryBuffer(b"\x00\x00\x00\x0c"
b"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c")
assert u("你好世界") == proto.read_val(b, TType.STRING)


def test_read_binary():
assert expected_result == hexlify(b.getvalue())


@pytest.mark.parametrize(
"ttype,decode_response,expecting_text",
[
(TType.STRING, True, True),
(TType.STRING, False, False),
(TType.STRING, "auto", True),
(TType.BINARY, True, True),
(TType.BINARY, False, False),
(TType.BINARY, "auto", False),
],
)
def test_read_string_or_binary(ttype, decode_response, expecting_text):
b = TCyMemoryBuffer(b"\x00\x00\x00\x0c"
b"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c")
assert u("你好世界").encode("utf-8") == proto.read_val(
b, TType.STRING, decode_response=False)
text = u("你好世界")
expected = text if expecting_text else text.encode('utf-8')
assert expected == proto.read_val(
b, ttype, decode_response=decode_response)


def test_write_message_begin():
Expand Down
27 changes: 14 additions & 13 deletions thriftpy/protocol/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ def pack_double(dub):
return struct.pack("!d", dub)


def pack_string(string):
return struct.pack("!i%ds" % len(string), len(string), string)
def pack_binary(binary):
return struct.pack("!i%ds" % len(binary), len(binary), binary)


def unpack_i8(buf):
Expand All @@ -62,9 +62,9 @@ def unpack_double(buf):
def write_message_begin(outbuf, name, ttype, seqid, strict=True):
if strict:
outbuf.write(pack_i32(VERSION_1 | ttype))
outbuf.write(pack_string(name.encode('utf-8')))
outbuf.write(pack_binary(name.encode('utf-8')))
else:
outbuf.write(pack_string(name.encode('utf-8')))
outbuf.write(pack_binary(name.encode('utf-8')))
outbuf.write(pack_i8(ttype))

outbuf.write(pack_i32(seqid))
Expand Down Expand Up @@ -108,10 +108,10 @@ def write_val(outbuf, ttype, val, spec=None):
elif ttype == TType.DOUBLE:
outbuf.write(pack_double(val))

elif ttype == TType.STRING:
elif ttype in [TType.STRING, TType.BINARY]:
if not isinstance(val, bytes):
val = val.encode('utf-8')
outbuf.write(pack_string(val))
outbuf.write(pack_binary(val))

elif ttype == TType.SET or ttype == TType.LIST:
if isinstance(spec, tuple):
Expand Down Expand Up @@ -224,17 +224,18 @@ def read_val(inbuf, ttype, spec=None, decode_response=True):
elif ttype == TType.DOUBLE:
return unpack_double(inbuf.read(8))

elif ttype == TType.STRING:
elif ttype in [TType.STRING, TType.BINARY]:
sz = unpack_i32(inbuf.read(4))
byte_payload = inbuf.read(sz)

# Since we cannot tell if we're getting STRING or BINARY
# if not asked not to decode, try both
if decode_response:
if (
decode_response == 'auto' and ttype == TType.STRING or
decode_response != 'auto' and decode_response
):
try:
return byte_payload.decode('utf-8')
except UnicodeDecodeError:
pass
if decode_response == 'auto':
raise
return byte_payload

elif ttype == TType.SET or ttype == TType.LIST:
Expand Down Expand Up @@ -331,7 +332,7 @@ def skip(inbuf, ftype):
elif ftype == TType.DOUBLE:
inbuf.read(8)

elif ftype == TType.STRING:
elif ftype == [TType.STRING, TType.BINARY]:
inbuf.read(unpack_i32(inbuf.read(4)))

elif ftype == TType.SET or ftype == TType.LIST:
Expand Down
Loading

0 comments on commit f19539a

Please sign in to comment.