Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HBASE-26566 Optimize encodeNumeric in OrderedBytes #3940

Merged
merged 6 commits into from
Dec 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -309,10 +309,6 @@ public class OrderedBytes {

public static final Charset UTF8 = Charset.forName("UTF-8");
private static final byte TERM = 0x00;
private static final BigDecimal E8 = BigDecimal.valueOf(1e8);
private static final BigDecimal E32 = BigDecimal.valueOf(1e32);
private static final BigDecimal EN2 = BigDecimal.valueOf(1e-2);
private static final BigDecimal EN10 = BigDecimal.valueOf(1e-10);

/**
* Max precision guaranteed to fit into a {@code long}.
Expand Down Expand Up @@ -632,7 +628,7 @@ private static int encodeNumericSmall(PositionedByteRange dst, BigDecimal val) {
byte[] a = dst.getBytes();
boolean isNeg = val.signum() == -1;
final int offset = dst.getOffset(), start = dst.getPosition();
int e = 0, d, startM;
int e = 0, startM;

if (isNeg) { /* Small negative number: 0x14, -E, ~M */
dst.put(NEG_SMALL);
Expand All @@ -641,21 +637,17 @@ private static int encodeNumericSmall(PositionedByteRange dst, BigDecimal val) {
}

// normalize abs(val) to determine E
while (abs.compareTo(EN10) < 0) { abs = abs.movePointRight(8); e += 4; }
while (abs.compareTo(EN2) < 0) { abs = abs.movePointRight(2); e++; }
int zerosBeforeFirstNonZero = abs.scale() - abs.precision();
int lengthToMoveRight = zerosBeforeFirstNonZero % 2 ==
0 ? zerosBeforeFirstNonZero : zerosBeforeFirstNonZero - 1;
e = lengthToMoveRight / 2;
abs = abs.movePointRight(lengthToMoveRight);

putVaruint64(dst, e, !isNeg); // encode appropriate E value.

// encode M by peeling off centimal digits, encoding x as 2x+1
startM = dst.getPosition();
// TODO: 18 is an arbitrary encoding limit. Reevaluate once we have a better handling of
// numeric scale.
for (int i = 0; i < 18 && abs.compareTo(BigDecimal.ZERO) != 0; i++) {
abs = abs.movePointRight(2);
d = abs.intValue();
dst.put((byte) ((2 * d + 1) & 0xff));
abs = abs.subtract(BigDecimal.valueOf(d));
}
encodeToCentimal(dst, abs);
// terminal digit should be 2x
a[offset + dst.getPosition() - 1] = (byte) (a[offset + dst.getPosition() - 1] & 0xfe);
if (isNeg) {
Expand Down Expand Up @@ -707,7 +699,7 @@ private static int encodeNumericLarge(PositionedByteRange dst, BigDecimal val) {
byte[] a = dst.getBytes();
boolean isNeg = val.signum() == -1;
final int start = dst.getPosition(), offset = dst.getOffset();
int e = 0, d, startM;
int e = 0, startM;

if (isNeg) { /* Large negative number: 0x08, ~E, ~M */
dst.put(NEG_LARGE);
Expand All @@ -716,9 +708,10 @@ private static int encodeNumericLarge(PositionedByteRange dst, BigDecimal val) {
}

// normalize abs(val) to determine E
while (abs.compareTo(E32) >= 0 && e <= 350) { abs = abs.movePointLeft(32); e +=16; }
while (abs.compareTo(E8) >= 0 && e <= 350) { abs = abs.movePointLeft(8); e+= 4; }
while (abs.compareTo(BigDecimal.ONE) >= 0 && e <= 350) { abs = abs.movePointLeft(2); e++; }
int integerDigits = abs.precision() - abs.scale();
int lengthToMoveLeft = integerDigits % 2 == 0 ? integerDigits : integerDigits + 1;
e = lengthToMoveLeft / 2;
abs = abs.movePointLeft(lengthToMoveLeft);

Copy link
Contributor Author

@YutSean YutSean Dec 17, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This old 350 limit is problematic, for value > 100^350, the abs will > 1, which will make the following steps have an incorrect encoding for M . And for value < 100^350, this limit is useless. This limit should be removed in my opinion.

// encode appropriate header byte and/or E value.
if (e > 10) { /* large number, write out {~,}E */
Expand All @@ -733,14 +726,7 @@ private static int encodeNumericLarge(PositionedByteRange dst, BigDecimal val) {

// encode M by peeling off centimal digits, encoding x as 2x+1
startM = dst.getPosition();
// TODO: 18 is an arbitrary encoding limit. Reevaluate once we have a better handling of
// numeric scale.
for (int i = 0; i < 18 && abs.compareTo(BigDecimal.ZERO) != 0; i++) {
abs = abs.movePointRight(2);
d = abs.intValue();
dst.put((byte) (2 * d + 1));
abs = abs.subtract(BigDecimal.valueOf(d));
}
encodeToCentimal(dst, abs);
// terminal digit should be 2x
a[offset + dst.getPosition() - 1] = (byte) (a[offset + dst.getPosition() - 1] & 0xfe);
if (isNeg) {
Expand All @@ -750,6 +736,32 @@ private static int encodeNumericLarge(PositionedByteRange dst, BigDecimal val) {
return dst.getPosition() - start;
}

/**
* Encode a value val in [0.01, 1.0) into Centimals.
* Util function for {@link this.encodeNumericLarge()} and {@link this.encodeNumericSmall()}
* @param dst The destination to which encoded digits are written.
* @param val A BigDecimal after the normalization. The value must be in [0.01, 1.0).
*/
private static void encodeToCentimal(PositionedByteRange dst, BigDecimal val) {
// The input value val must be in [0.01, 1.0)
String stringOfAbs = val.stripTrailingZeros().toPlainString();
String value = stringOfAbs.substring(stringOfAbs.indexOf('.') + 1);
int d;

// If the first float digit is 0, we will encode one digit more than MAX_PRECISION
// We encode at most MAX_PRECISION significant digits into centimals,
// because the input value, has been already normalized.
int maxPrecision = value.charAt(0) == '0' ? MAX_PRECISION + 1 : MAX_PRECISION;
maxPrecision = Math.min(maxPrecision, value.length());
for (int i = 0; i < maxPrecision; i += 2) {
d = (value.charAt(i) - '0') * 10;
if (i + 1 < maxPrecision) {
d += (value.charAt(i + 1) - '0');
}
dst.put((byte) (2 * d + 1));
}
}

/**
* Encode a numerical value using the variable-length encoding.
* @param dst The destination to which encoded digits are written.
Expand Down Expand Up @@ -790,6 +802,8 @@ public static int encodeNumeric(PositionedByteRange dst, double val, Order ord)

/**
* Encode a numerical value using the variable-length encoding.
* If the number of significant digits of the value exceeds the
* {@link OrderedBytes#MAX_PRECISION}, the exceeding part will be lost.
* @param dst The destination to which encoded digits are written.
* @param val The value to encode.
* @param ord The {@link Order} to respect while encoding {@code val}.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,14 @@ public class TestOrderedBytes {
static final BigDecimal[] BD_VALS =
{ null, BigDecimal.valueOf(Long.MAX_VALUE), BigDecimal.valueOf(Long.MIN_VALUE),
BigDecimal.valueOf(Double.MAX_VALUE), BigDecimal.valueOf(Double.MIN_VALUE),
BigDecimal.valueOf(Long.MAX_VALUE).multiply(BigDecimal.valueOf(100)) };
BigDecimal.valueOf(Long.MAX_VALUE).multiply(BigDecimal.valueOf(100)),
BigDecimal.valueOf(Long.MAX_VALUE).pow(64),
BigDecimal.valueOf(Long.MAX_VALUE).pow(64).negate(),
new BigDecimal("0." + String.join("", Collections.nCopies(500, "123"))),
new BigDecimal("-0." + String.join("", Collections.nCopies(500, "123")))
};
static final int[] BD_LENGTHS =
{ 1, 11, 11, 11, 4, 12 };
{ 1, 11, 11, 11, 4, 12, 19, 19, 18, 18 };

/*
* This is the smallest difference between two doubles in D_VALS
Expand Down Expand Up @@ -335,7 +340,11 @@ public void testNumericOther() {
if (null == BD_VALS[i]) {
assertEquals(BD_VALS[i], decoded);
} else {
assertEquals("Deserialization failed.", 0, BD_VALS[i].compareTo(decoded));
// The num will be rounded to a specific precision in the encoding phase.
// So that big value will lose precision here. Need to add a normalization here to
// make the test pass.
assertEquals("Deserialization failed.", 0,
OrderedBytes.normalize(BD_VALS[i]).compareTo(decoded));
}
assertEquals("Did not consume enough bytes.", BD_LENGTHS[i], buf1.getPosition() - 1);
}
Expand Down