Skip to content

Commit

Permalink
Fix symbolize_keys: true to properly create UTF-8 symbols
Browse files Browse the repository at this point in the history
When `symbolize_keys: true` was set, we'd already read the string
as binary, even if the native type is UTF-8. This was causing non-ASCII
keys to be improperly deserialized.

`rb_str_intern` will still notice strings that are ASCII only, and return
ASCII symbol for them. So there's no backward compatibility concerns.
  • Loading branch information
byroot committed Feb 8, 2022
1 parent 122a557 commit 01b57bd
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 6 deletions.
4 changes: 2 additions & 2 deletions ext/msgpack/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -494,9 +494,9 @@ static inline VALUE msgpack_buffer_read_top_as_string(msgpack_buffer_t* b, size_
#endif // HAVE_RB_ENC_INTERNED_STR
}

static inline VALUE msgpack_buffer_read_top_as_symbol(msgpack_buffer_t* b, size_t length)
static inline VALUE msgpack_buffer_read_top_as_symbol(msgpack_buffer_t* b, size_t length, bool utf8)
{
return rb_str_intern(msgpack_buffer_read_top_as_string(b, length, true, false));
return rb_str_intern(msgpack_buffer_read_top_as_string(b, length, true, utf8));
}

#endif
2 changes: 1 addition & 1 deletion ext/msgpack/unpacker.c
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ static inline int read_raw_body_begin(msgpack_unpacker_t* uk, int raw_type)
if(length <= msgpack_buffer_top_readable_size(UNPACKER_BUFFER_(uk))) {
int ret;
if ((uk->optimized_symbol_ext_type && uk->symbol_ext_type == raw_type) || (uk->symbolize_keys && is_reading_map_key(uk))) {
VALUE symbol = msgpack_buffer_read_top_as_symbol(UNPACKER_BUFFER_(uk), length);
VALUE symbol = msgpack_buffer_read_top_as_symbol(UNPACKER_BUFFER_(uk), length, raw_type == RAW_TYPE_STRING);
ret = object_complete_symbol(uk, symbol);
} else {
/* don't use zerocopy for hash keys but get a frozen string directly
Expand Down
25 changes: 22 additions & 3 deletions spec/unpacker_spec.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# encoding: ascii-8bit

require 'stringio'
require 'tempfile'
require 'zlib'
Expand Down Expand Up @@ -302,6 +300,21 @@ def flush
MessagePack.unpack(MessagePack.pack(symbolized_hash), :symbolize_keys => true).should == symbolized_hash
end

it 'MessagePack.unpack symbolize_keys preserve encoding' do
hash = { :ascii => 1, :utf8_é => 2}
loaded_hash = MessagePack.load(MessagePack.pack(hash), :symbolize_keys => true)

hash.keys[0].encoding.should == Encoding::US_ASCII # Ruby coerce symbols to US-ASCII when possible.
loaded_hash.keys[0].should == hash.keys[0]
loaded_hash.keys[0].encoding.should == hash.keys[0].encoding

hash.keys[1].encoding.should == Encoding::UTF_8
loaded_hash.keys[1].should == hash.keys[1]
loaded_hash.keys[1].encoding.should == hash.keys[1].encoding

MessagePack.unpack(MessagePack.pack(hash), :symbolize_keys => true).should == hash
end

it 'Unpacker#unpack symbolize_keys' do
unpacker = MessagePack::Unpacker.new(:symbolize_keys => true)
symbolized_hash = {:a => 'b', :c => 'd'}
Expand Down Expand Up @@ -765,7 +778,13 @@ def flatten(struct, results = [])

context 'binary encoding', :encodings do
let :buffer do
MessagePack.pack({'hello' => 'world', 'nested' => ['object', {'structure' => true}]})
MessagePack.pack({
'hello'.b => 'world'.b,
'nested'.b => [
'object'.b,
{'structure'.b => true},
]
})
end

let :unpacker do
Expand Down

0 comments on commit 01b57bd

Please sign in to comment.