Skip to content

Commit

Permalink
More efficient encoding of range fields. (#26470)
Browse files Browse the repository at this point in the history
This PR removes the vInt that preceded every value to record its length.
Instead, the query takes an enum that tells it how to compute the length of
values: for fixed-length data (IP addresses, doubles, floats) the length is a
constant, while longs and integers use a variable-length representation that
allows the length to be computed from the encoded value itself.

Also the encoding of ints/longs was made a bit more efficient in order not to
waste 3 bits in the header. As a consequence, values between -8 and 7 can now
be encoded on 1 byte and values between -2048 and 2047 can now be encoded on 2
bytes or less.

Closes #26443
  • Loading branch information
jpountz committed Sep 13, 2017
1 parent a1ae895 commit c7bcc9d
Show file tree
Hide file tree
Showing 4 changed files with 273 additions and 131 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,18 @@ public final class BinaryDocValuesRangeQuery extends Query {

private final String fieldName;
private final QueryType queryType;
private final LengthType lengthType;
private final BytesRef from;
private final BytesRef to;
private final Object originalFrom;
private final Object originalTo;

public BinaryDocValuesRangeQuery(String fieldName, QueryType queryType, BytesRef from, BytesRef to,
public BinaryDocValuesRangeQuery(String fieldName, QueryType queryType, LengthType lengthType,
BytesRef from, BytesRef to,
Object originalFrom, Object originalTo) {
this.fieldName = fieldName;
this.queryType = queryType;
this.lengthType = lengthType;
this.from = from;
this.to = to;
this.originalFrom = originalFrom;
Expand All @@ -66,29 +69,34 @@ public Scorer scorer(LeafReaderContext context) throws IOException {
final TwoPhaseIterator iterator = new TwoPhaseIterator(values) {

ByteArrayDataInput in = new ByteArrayDataInput();
BytesRef otherFrom = new BytesRef(16);
BytesRef otherTo = new BytesRef(16);
BytesRef otherFrom = new BytesRef();
BytesRef otherTo = new BytesRef();

@Override
public boolean matches() throws IOException {
BytesRef encodedRanges = values.binaryValue();
in.reset(encodedRanges.bytes, encodedRanges.offset, encodedRanges.length);
int numRanges = in.readVInt();
final byte[] bytes = encodedRanges.bytes;
otherFrom.bytes = bytes;
otherTo.bytes = bytes;
int offset = in.getPosition();
for (int i = 0; i < numRanges; i++) {
otherFrom.length = in.readVInt();
otherFrom.bytes = encodedRanges.bytes;
otherFrom.offset = in.getPosition();
in.skipBytes(otherFrom.length);
int length = lengthType.readLength(bytes, offset);
otherFrom.offset = offset;
otherFrom.length = length;
offset += length;

otherTo.length = in.readVInt();
otherTo.bytes = encodedRanges.bytes;
otherTo.offset = in.getPosition();
in.skipBytes(otherTo.length);
length = lengthType.readLength(bytes, offset);
otherTo.offset = offset;
otherTo.length = length;
offset += length;

if (queryType.matches(from, to, otherFrom, otherTo)) {
return true;
}
}
assert offset == encodedRanges.offset + encodedRanges.length;
return false;
}

Expand All @@ -114,13 +122,14 @@ public boolean equals(Object o) {
BinaryDocValuesRangeQuery that = (BinaryDocValuesRangeQuery) o;
return Objects.equals(fieldName, that.fieldName) &&
queryType == that.queryType &&
lengthType == that.lengthType &&
Objects.equals(from, that.from) &&
Objects.equals(to, that.to);
}

@Override
public int hashCode() {
return Objects.hash(getClass(), fieldName, queryType, from, to);
return Objects.hash(getClass(), fieldName, queryType, lengthType, from, to);
}

public enum QueryType {
Expand Down Expand Up @@ -161,4 +170,42 @@ boolean matches(BytesRef from, BytesRef to, BytesRef otherFrom, BytesRef otherTo

}

/**
 * Strategy used to find out how many bytes an encoded value occupies.
 */
public enum LengthType {
    /** Values always occupy 4 bytes. */
    FIXED_4 {
        @Override
        int readLength(byte[] bytes, int offset) {
            return Integer.BYTES;
        }
    },
    /** Values always occupy 8 bytes. */
    FIXED_8 {
        @Override
        int readLength(byte[] bytes, int offset) {
            return Long.BYTES;
        }
    },
    /** Values always occupy 16 bytes. */
    FIXED_16 {
        @Override
        int readLength(byte[] bytes, int offset) {
            return 2 * Long.BYTES;
        }
    },
    /** The length is derived from the header byte of the value. */
    VARIABLE {
        @Override
        int readLength(byte[] bytes, int offset) {
            // Header layout, most significant bit first: 1 sign bit followed by
            // 4 bits holding the number of additional bytes. When the sign bit is
            // clear, those 4 bits hold (15 - count) instead, so undo that here.
            final int header = bytes[offset] & 0xFF;
            final int stored = (header >>> 3) & 0x0F;
            final boolean signBitSet = (header & 0x80) != 0;
            final int additionalBytes = signBitSet ? stored : 0x0F - stored;
            return additionalBytes + 1;
        }
    };

    /**
     * Computes the length of the value whose first byte is at {@code offset}
     * in {@code bytes}.
     *
     * @param bytes  the array holding the encoded value
     * @param offset index of the first byte of the value
     * @return the number of bytes occupied by the value, header byte included
     */
    abstract int readLength(byte[] bytes, int offset);
}
}
153 changes: 84 additions & 69 deletions core/src/main/java/org/elasticsearch/index/mapper/BinaryRangeUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,14 @@
*/
package org.elasticsearch.index.mapper;

import org.apache.lucene.document.HalfFloatPoint;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Set;

Expand All @@ -32,67 +35,77 @@ enum BinaryRangeUtil {

static BytesRef encodeLongRanges(Set<RangeFieldMapper.Range> ranges) throws IOException {
List<RangeFieldMapper.Range> sortedRanges = new ArrayList<>(ranges);
sortedRanges.sort((r1, r2) -> {
long r1From = ((Number) r1.from).longValue();
long r2From = ((Number) r2.from).longValue();
int cmp = Long.compare(r1From, r2From);
if (cmp != 0) {
return cmp;
} else {
long r1To = ((Number) r1.from).longValue();
long r2To = ((Number) r2.from).longValue();
return Long.compare(r1To, r2To);
}
});
Comparator<RangeFieldMapper.Range> fromComparator = Comparator.comparingLong(range -> ((Number) range.from).longValue());
Comparator<RangeFieldMapper.Range> toComparator = Comparator.comparingLong(range -> ((Number) range.to).longValue());
sortedRanges.sort(fromComparator.thenComparing(toComparator));

final byte[] encoded = new byte[5 + ((5 + 9) * 2) * sortedRanges.size()];
final byte[] encoded = new byte[5 + (9 * 2) * sortedRanges.size()];
ByteArrayDataOutput out = new ByteArrayDataOutput(encoded);
out.writeVInt(sortedRanges.size());
for (RangeFieldMapper.Range range : sortedRanges) {
byte[] encodedFrom = encode(((Number) range.from).longValue());
out.writeVInt(encodedFrom.length);
byte[] encodedFrom = encodeLong(((Number) range.from).longValue());
out.writeBytes(encodedFrom, encodedFrom.length);
byte[] encodedTo = encode(((Number) range.to).longValue());
out.writeVInt(encodedTo.length);
byte[] encodedTo = encodeLong(((Number) range.to).longValue());
out.writeBytes(encodedTo, encodedTo.length);
}
return new BytesRef(encoded, 0, out.getPosition());
}

static BytesRef encodeDoubleRanges(Set<RangeFieldMapper.Range> ranges) throws IOException {
List<RangeFieldMapper.Range> sortedRanges = new ArrayList<>(ranges);
sortedRanges.sort((r1, r2) -> {
double r1From = ((Number) r1.from).doubleValue();
double r2From = ((Number) r2.from).doubleValue();
int cmp = Double.compare(r1From, r2From);
if (cmp != 0) {
return cmp;
} else {
double r1To = ((Number) r1.from).doubleValue();
double r2To = ((Number) r2.from).doubleValue();
return Double.compare(r1To, r2To);
}
});
Comparator<RangeFieldMapper.Range> fromComparator = Comparator.comparingDouble(range -> ((Number) range.from).doubleValue());
Comparator<RangeFieldMapper.Range> toComparator = Comparator.comparingDouble(range -> ((Number) range.to).doubleValue());
sortedRanges.sort(fromComparator.thenComparing(toComparator));

final byte[] encoded = new byte[5 + ((5 + 9) * 2) * sortedRanges.size()];
final byte[] encoded = new byte[5 + (8 * 2) * sortedRanges.size()];
ByteArrayDataOutput out = new ByteArrayDataOutput(encoded);
out.writeVInt(sortedRanges.size());
for (RangeFieldMapper.Range range : sortedRanges) {
byte[] encodedFrom = BinaryRangeUtil.encode(((Number) range.from).doubleValue());
out.writeVInt(encodedFrom.length);
byte[] encodedFrom = encodeDouble(((Number) range.from).doubleValue());
out.writeBytes(encodedFrom, encodedFrom.length);
byte[] encodedTo = BinaryRangeUtil.encode(((Number) range.to).doubleValue());
out.writeVInt(encodedTo.length);
byte[] encodedTo = encodeDouble(((Number) range.to).doubleValue());
out.writeBytes(encodedTo, encodedTo.length);
}
return new BytesRef(encoded, 0, out.getPosition());
}

/**
 * Serializes a set of float ranges into a single binary value: a vInt holding
 * the number of ranges, followed by the encoded {@code from} and {@code to}
 * bounds of every range. Ranges are written ordered by {@code from}, ties
 * being broken on {@code to}.
 *
 * @param ranges the ranges to encode; bounds are read as {@link Number}s
 * @return the encoded ranges, trimmed to the bytes actually written
 */
static BytesRef encodeFloatRanges(Set<RangeFieldMapper.Range> ranges) throws IOException {
    final Comparator<RangeFieldMapper.Range> byFromThenTo = Comparator
        .comparingDouble((RangeFieldMapper.Range r) -> ((Number) r.from).floatValue())
        .thenComparingDouble(r -> ((Number) r.to).floatValue());
    final List<RangeFieldMapper.Range> ordered = new ArrayList<>(ranges);
    ordered.sort(byFromThenTo);

    // upper bound on the size: 5 bytes for the vInt count, 2 floats of 4 bytes per range
    final byte[] buffer = new byte[5 + 2 * Float.BYTES * ordered.size()];
    final ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
    output.writeVInt(ordered.size());
    for (RangeFieldMapper.Range r : ordered) {
        final byte[] lower = encodeFloat(((Number) r.from).floatValue());
        output.writeBytes(lower, lower.length);
        final byte[] upper = encodeFloat(((Number) r.to).floatValue());
        output.writeBytes(upper, upper.length);
    }
    return new BytesRef(buffer, 0, output.getPosition());
}

/**
 * Encodes a double on a fixed 8 bytes by converting it to a sortable long and
 * writing that long as sortable bytes, both via {@link NumericUtils}.
 *
 * @param number the value to encode
 * @return a new 8-byte array holding the encoded value
 */
static byte[] encodeDouble(double number) {
    final long sortableBits = NumericUtils.doubleToSortableLong(number);
    final byte[] result = new byte[Long.BYTES];
    NumericUtils.longToSortableBytes(sortableBits, result, 0);
    return result;
}

/**
 * Encodes a float on a fixed 4 bytes by converting it to a sortable int and
 * writing that int as sortable bytes, both via {@link NumericUtils}.
 *
 * @param number the value to encode
 * @return a new 4-byte array holding the encoded value
 */
static byte[] encodeFloat(float number) {
    final int sortableBits = NumericUtils.floatToSortableInt(number);
    final byte[] result = new byte[Integer.BYTES];
    NumericUtils.intToSortableBytes(sortableBits, result, 0);
    return result;
}

/**
* Encodes the specified number of type long in a variable-length byte format.
* The byte format preserves ordering, which means the returned byte array can be used for comparing as is.
* The first bit stores the sign and the 4 subsequent bits encode the number of bytes that are used to
* represent the long value, in addition to the first one.
*/
static byte[] encode(long number) {
static byte[] encodeLong(long number) {
int sign = 1; // means positive
if (number < 0) {
number = -1 - number;
Expand All @@ -101,46 +114,48 @@ static byte[] encode(long number) {
return encode(number, sign);
}

/**
* Encodes the specified number of type double in a variable-length byte format.
* The byte format preserves ordering, which means the returned byte array can be used for comparing as is.
*/
static byte[] encode(double number) {
long l;
int sign;
if (number < 0.0) {
l = Double.doubleToRawLongBits(-0d - number);
sign = 0;
} else {
l = Double.doubleToRawLongBits(number);
sign = 1; // means positive
}
return encode(l, sign);
}

private static byte[] encode(long l, int sign) {
assert l >= 0;
int bits = 64 - Long.numberOfLeadingZeros(l);

int numBytes = (bits + 7) / 8; // between 0 and 8
byte[] encoded = new byte[1 + numBytes];
// encode the sign first to make sure positive values compare greater than negative values
// and then the number of bytes, to make sure that large values compare greater than low values
if (sign > 0) {
encoded[0] = (byte) ((sign << 4) | numBytes);
} else {
encoded[0] = (byte) ((sign << 4) | (8 - numBytes));
// the header is formed of:
// - 1 bit for the sign
// - 4 bits for the number of additional bytes
// - up to 3 bits of the value
// additional bytes are data bytes

int numBits = 64 - Long.numberOfLeadingZeros(l);
int numAdditionalBytes = (numBits + 7 - 3) / 8;

byte[] encoded = new byte[1 + numAdditionalBytes];

// write data bytes
int i = encoded.length;
while (numBits > 0) {
int index = --i;
assert index > 0 || numBits <= 3; // byte 0 can't encode more than 3 bits
encoded[index] = (byte) l;
l >>>= 8;
numBits -= 8;
}
for (int b = 0; b < numBytes; ++b) {
if (sign == 1) {
encoded[encoded.length - 1 - b] = (byte) (l >>> (8 * b));
} else if (sign == 0) {
encoded[encoded.length - 1 - b] = (byte) (0xFF - ((l >>> (8 * b)) & 0xFF));
} else {
throw new AssertionError();
assert Byte.toUnsignedInt(encoded[0]) <= 0x07;
assert encoded.length == 1 || encoded[0] != 0 || Byte.toUnsignedInt(encoded[1]) > 0x07;

if (sign == 0) {
// reverse the order
for (int j = 0; j < encoded.length; ++j) {
encoded[j] = (byte) ~Byte.toUnsignedInt(encoded[j]);
}
// the first byte only uses 3 bits, we need the 5 upper bits for the header
encoded[0] &= 0x07;
}

// write the header
encoded[0] |= sign << 7;
if (sign > 0) {
encoded[0] |= numAdditionalBytes << 3;
} else {
encoded[0] |= (15 - numAdditionalBytes) << 3;
}
return encoded;
}

}
Loading

0 comments on commit c7bcc9d

Please sign in to comment.