Skip to content

Commit

Permalink
Use protobuf encoding to serialize variable-length integers
Browse files Browse the repository at this point in the history
* The ratio of serialized size to source size, over all .rb files of the
  top 100 gems, was 1.65x before this change and is 0.89x after, so the
  serialized form is now smaller than the source for those files.
* This compresses the best and is still fast to encode and decode,
  see #741
  and #836
* Use io.getbyte instead of io.read(1).unpack1("C") as it is much faster.
  • Loading branch information
eregon committed May 23, 2023
1 parent fabcc02 commit 1be84ff
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 27 deletions.
40 changes: 27 additions & 13 deletions bin/templates/java/org/yarp/Loader.java.erb
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public class Loader {
}

private byte[] loadString() {
int length = buffer.getInt();
int length = loadVarInt();
byte[] string = new byte[length];
buffer.get(string);
return string;
Expand All @@ -65,7 +65,7 @@ public class Loader {
}

private Nodes.Token[] loadTokens() {
int length = buffer.getInt();
int length = loadVarInt();
Nodes.Token[] tokens = new Nodes.Token[length];
for (int i = 0; i < length; i++) {
tokens[i] = loadToken();
Expand All @@ -74,7 +74,7 @@ public class Loader {
}

private Nodes.Node[] loadNodes() {
int length = buffer.getInt();
int length = loadVarInt();
Nodes.Node[] nodes = new Nodes.Node[length];
for (int i = 0; i < length; i++) {
nodes[i] = loadNode();
Expand All @@ -84,17 +84,17 @@ public class Loader {

private Nodes.Token loadToken() {
int type = buffer.get() & 0xFF;
int startOffset = buffer.getInt();
int length = buffer.getInt();
int startOffset = loadVarInt();
int length = loadVarInt();
int endOffset = startOffset + length;

final Nodes.TokenType tokenType = Nodes.TOKEN_TYPES[type];
return new Nodes.Token(tokenType, startOffset, endOffset);
}

private Nodes.Location loadLocation() {
int startOffset = buffer.getInt();
int length = buffer.getInt();
int startOffset = loadVarInt();
int length = loadVarInt();
int endOffset = startOffset + length;
return new Nodes.Location(startOffset, endOffset);
}
Expand All @@ -107,21 +107,35 @@ public class Loader {
}
}

private int loadInteger() {
return buffer.getInt();
// From https://github.com/protocolbuffers/protobuf/blob/v23.1/java/core/src/main/java/com/google/protobuf/BinaryReader.java#L1507
//
// Decodes one variable-length integer from `buffer`: each byte contributes its
// low 7 bits (least-significant group first) and a set high bit means another
// byte follows. Reads between 1 and 5 bytes and returns the assembled int.
//
// The implementation relies on buffer.get() yielding a *signed* byte, so a byte
// with its high (continuation) bit set is negative and sign-extends to all-ones
// in the upper bits of the int. NOTE(review): presumably `buffer` is a
// java.nio.ByteBuffer; confirm against its declaration.
private int loadVarInt() {
int x;
// Fast path: a non-negative first byte has no continuation bit and is the value itself.
if ((x = buffer.get()) >= 0) {
return x;
// From here on, the sign of x after XOR-ing in the next byte tells whether that
// byte was the last one. The expected sign alternates at each step because each
// continuation byte's sign-extension flips the upper bits of x once more.
} else if ((x ^= (buffer.get() << 7)) < 0) {
// Second byte was the last: clear the sign-extension ones left by the first byte.
x ^= (~0 << 7);
} else if ((x ^= (buffer.get() << 14)) >= 0) {
// Third byte was the last: undo the sign-extension contributed by bytes one and two.
x ^= (~0 << 7) ^ (~0 << 14);
} else if ((x ^= (buffer.get() << 21)) < 0) {
// Fourth byte was the last.
x ^= (~0 << 7) ^ (~0 << 14) ^ (~0 << 21);
} else {
// Fifth byte: always treated as the last one; its continuation bit is not checked here.
x ^= buffer.get() << 28;
x ^= (~0 << 7) ^ (~0 << 14) ^ (~0 << 21) ^ (~0 << 28);
}
return x;
}

private Nodes.Node loadNode() {
int type = buffer.get() & 0xFF;
int startOffset = buffer.getInt();
int length = buffer.getInt();
int startOffset = loadVarInt();
int length = loadVarInt();
int endOffset = startOffset + length;

switch (type) {
<%- nodes.each_with_index do |node, index| -%>
case <%= index + 1 %>:
<%-
params = node.needs_length? ? ["loadInteger()"] : []
params = node.needs_length? ? ["buffer.getInt()"] : []
params.concat node.params.map { |param|
case param
when NodeParam then "#{param.java_cast}loadNode()"
Expand All @@ -133,7 +147,7 @@ public class Loader {
when OptionalTokenParam then "loadOptionalToken()"
when LocationParam then "loadLocation()"
when OptionalLocationParam then "loadOptionalLocation()"
when UInt32Param then "loadInteger()"
when UInt32Param then "loadVarInt()"
else raise
end
}
Expand Down
42 changes: 31 additions & 11 deletions bin/templates/lib/yarp/serialize.rb.erb
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,28 @@ module YARP

private

# Decodes one unsigned variable-length integer (LEB128, the same scheme
# protobuf uses for varints) from +io+ and returns it.
# See https://en.wikipedia.org/wiki/LEB128 and
# https://protobuf.dev/programming-guides/encoding/#varints
#
# Each byte carries 7 payload bits, least-significant group first; a byte
# >= 128 (high bit set) means that more bytes follow.
def load_varint
  byte = io.getbyte
  # Fast path: a single byte below 128 is the value itself.
  return byte if byte < 128

  value = byte & 0x7f
  shift = 7
  # Fold in every continuation byte, stripping its high bit.
  while (byte = io.getbyte) >= 128
    value |= (byte & 0x7f) << shift
    shift += 7
  end
  # The terminating byte has no high bit, so it is merged in unmasked.
  value | (byte << shift)
end

# Reads the next 4 bytes from +io+ and returns them decoded as a single
# native-endian unsigned 32-bit integer (the "L" unpack directive).
# Unlike the varint-encoded fields, this value always occupies 4 bytes.
def load_serialized_length
  raw = io.read(4)
  raw.unpack1("L")
end

def load_token
number = io.read(1).unpack1("C")
number = io.getbyte
location = load_location

type =
Expand All @@ -43,56 +63,56 @@ module YARP
end
def load_optional_node
if io.read(1).unpack1("C") != 0
if io.getbyte != 0
io.pos -= 1
load_node
end
end
def load_optional_token
if io.read(1).unpack1("C") != 0
if io.getbyte != 0
io.pos -= 1
load_token
end
end
def load_string
length = io.read(4).unpack1("L")
length = load_varint
io.read(length).force_encoding(encoding)
end
def load_location
start_offset, length = io.read(8).unpack("LL")
start_offset, length = load_varint, load_varint
end_offset = start_offset + length
Location.new(start_offset, end_offset)
end
def load_optional_location
load_location if io.read(1).unpack1("C") != 0
load_location if io.getbyte != 0
end
def load_node
type = io.read(1).unpack1("C")
type = io.getbyte
location = load_location
case type
<%- nodes.each_with_index do |node, index| -%>
when <%= index + 1 %> then
<%- if node.needs_length? -%>
io.read(4)
load_serialized_length
<%- end -%>
<%= node.name %>.new(<%= (node.params.map { |param|
case param
when NodeParam then "load_node"
when OptionalNodeParam then "load_optional_node"
when StringParam then "load_string"
when NodeListParam then "io.read(4).unpack1(\"L\").times.map { load_node }"
when NodeListParam then "load_varint.times.map { load_node }"
when TokenParam then "load_token"
when TokenListParam then "io.read(4).unpack1(\"L\").times.map { load_token }"
when TokenListParam then "load_varint.times.map { load_token }"
when OptionalTokenParam then "load_optional_token"
when LocationParam then "load_location"
when OptionalLocationParam then "load_optional_location"
when UInt32Param then 'io.read(4).unpack1("L")'
when UInt32Param then 'load_varint'
else raise
end
} + ["location"]).join(", ") -%>)
Expand Down
2 changes: 1 addition & 1 deletion bin/templates/src/serialize.c.erb
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ yp_serialize_node(yp_parser_t *parser, yp_node_t *node, yp_buffer_t *buffer) {
// encoding of location u32s make us need to save this offset.
size_t length_offset = buffer->length;

yp_buffer_append_u32(buffer, 0); /* Updated below */
yp_buffer_append_str(buffer, "\0\0\0\0", 4); /* consume 4 bytes, updated below */
<%- end -%>
<%- node.params.each do |param| -%>
<%- case param -%>
Expand Down
1 change: 1 addition & 0 deletions include/yarp/util/yp_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "yarp/defines.h"

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
Expand Down
12 changes: 10 additions & 2 deletions src/util/yp_buffer.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,16 @@ yp_buffer_append_u8(yp_buffer_t *buffer, uint8_t value) {
// Append a 32-bit unsigned integer to the buffer.
void
yp_buffer_append_u32(yp_buffer_t *buffer, uint32_t value) {
const void *source = &value;
yp_buffer_append(buffer, source, sizeof(uint32_t));
if (value < 128) {
yp_buffer_append_u8(buffer, (uint8_t) value);
} else {
uint32_t n = value;
while (n >= 128) {
yp_buffer_append_u8(buffer, (uint8_t) (n | 128));
n >>= 7;
}
yp_buffer_append_u8(buffer, (uint8_t) n);
}
}

// Free the memory associated with the buffer.
Expand Down

0 comments on commit 1be84ff

Please sign in to comment.