From 09abde8afe115f0691062998ad37fef5721ff14b Mon Sep 17 00:00:00 2001 From: HuangXingBo Date: Tue, 22 Oct 2024 00:31:53 +0800 Subject: [PATCH] feat(java): Refactor String serialization and deserialization (#1890) ## What does this PR do? ## Related issues Closes #1868 Closes #1754 ## Does this PR introduce any user-facing change? - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? ## Benchmark --------- Co-authored-by: chaokunyang --- .../org/apache/fury/config/FuryBuilder.java | 2 +- .../org/apache/fury/memory/MemoryBuffer.java | 2 +- .../apache/fury/serializer/Serializers.java | 2 +- .../fury/serializer/StringSerializer.java | 577 ++++++++++++------ .../apache/fury/util/StringEncodingUtils.java | 381 ++++++++++++ .../org/apache/fury/util/StringUtils.java | 12 +- .../apache/fury/builder/JITContextTest.java | 6 + .../fury/serializer/StringSerializerTest.java | 5 +- .../fury/util/StringEncodingUtilsTest.java | 59 ++ 9 files changed, 868 insertions(+), 178 deletions(-) create mode 100644 java/fury-core/src/main/java/org/apache/fury/util/StringEncodingUtils.java create mode 100644 java/fury-core/src/test/java/org/apache/fury/util/StringEncodingUtilsTest.java diff --git a/java/fury-core/src/main/java/org/apache/fury/config/FuryBuilder.java b/java/fury-core/src/main/java/org/apache/fury/config/FuryBuilder.java index c27a612418..7e26d7226e 100644 --- a/java/fury-core/src/main/java/org/apache/fury/config/FuryBuilder.java +++ b/java/fury-core/src/main/java/org/apache/fury/config/FuryBuilder.java @@ -68,7 +68,7 @@ public final class FuryBuilder { ClassLoader classLoader; boolean compressInt = true; public LongEncoding longEncoding = LongEncoding.SLI; - boolean compressString = true; + boolean compressString = false; CompatibleMode compatibleMode = CompatibleMode.SCHEMA_CONSISTENT; boolean checkJdkClassSerializable = true; Class defaultJDKStreamSerializerType = ObjectStreamSerializer.class; diff --git 
a/java/fury-core/src/main/java/org/apache/fury/memory/MemoryBuffer.java b/java/fury-core/src/main/java/org/apache/fury/memory/MemoryBuffer.java index 79d6f2b7f8..87b56e6eb6 100644 --- a/java/fury-core/src/main/java/org/apache/fury/memory/MemoryBuffer.java +++ b/java/fury-core/src/main/java/org/apache/fury/memory/MemoryBuffer.java @@ -471,7 +471,7 @@ private int _unsafeGetInt32(int index) { } // CHECKSTYLE.OFF:MethodName - private void _unsafePutInt32(int index, int value) { + public void _unsafePutInt32(int index, int value) { // CHECKSTYLE.ON:MethodName if (!LITTLE_ENDIAN) { value = Integer.reverseBytes(value); diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java b/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java index 36870aa6d1..61e7574ef5 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java @@ -265,7 +265,7 @@ public void write(MemoryBuffer buffer, T value) { } else { char[] v = (char[]) GET_VALUE.apply(value); if (StringUtils.isLatin(v)) { - stringSerializer.writeCharsLatin(buffer, v, value.length()); + stringSerializer.writeCharsLatin1(buffer, v, value.length()); } else { stringSerializer.writeCharsUTF16(buffer, v, value.length()); } diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java index b0b67abcc7..a1161138d3 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java @@ -19,8 +19,8 @@ package org.apache.fury.serializer; -import static org.apache.fury.type.TypeUtils.PRIMITIVE_CHAR_ARRAY_TYPE; import static org.apache.fury.type.TypeUtils.STRING_TYPE; +import static org.apache.fury.util.StringUtils.MULTI_CHARS_NON_LATIN_MASK; import java.lang.invoke.CallSite; 
import java.lang.invoke.LambdaMetafactory; @@ -43,6 +43,7 @@ import org.apache.fury.type.Type; import org.apache.fury.util.MathUtils; import org.apache.fury.util.Preconditions; +import org.apache.fury.util.StringEncodingUtils; import org.apache.fury.util.StringUtils; import org.apache.fury.util.unsafe._JDKAccess; @@ -149,15 +150,19 @@ public void writeString(MemoryBuffer buffer, String value) { public Expression writeStringExpr(Expression strSerializer, Expression buffer, Expression str) { if (isJava) { if (STRING_VALUE_FIELD_IS_BYTES) { - return new StaticInvoke(StringSerializer.class, "writeBytesString", buffer, str); + if (compressString) { + return new Invoke(strSerializer, "writeCompressedBytesString", buffer, str); + } else { + return new StaticInvoke(StringSerializer.class, "writeBytesString", buffer, str); + } } else { if (!STRING_VALUE_FIELD_IS_CHARS) { throw new UnsupportedOperationException(); } if (compressString) { - return new Invoke(strSerializer, "writeCharsStringCompressed", buffer, str); + return new Invoke(strSerializer, "writeCompressedCharsString", buffer, str); } else { - return new Invoke(strSerializer, "writeCharsStringUncompressed", buffer, str); + return new Invoke(strSerializer, "writeCharsString", buffer, str); } } } else { @@ -165,23 +170,6 @@ public Expression writeStringExpr(Expression strSerializer, Expression buffer, E } } - // Invoked by jit - public void writeCharsStringCompressed(MemoryBuffer buffer, String value) { - final char[] chars = (char[]) Platform.getObject(value, STRING_VALUE_FIELD_OFFSET); - if (StringUtils.isLatin(chars)) { - writeCharsLatin(buffer, chars, chars.length); - } else { - writeCharsUTF16(buffer, chars, chars.length); - } - } - - // Invoked by jit - public void writeCharsStringUncompressed(MemoryBuffer buffer, String value) { - int numBytes = MathUtils.doubleExact(value.length()); - final char[] chars = (char[]) Platform.getObject(value, STRING_VALUE_FIELD_OFFSET); - 
buffer.writePrimitiveArrayWithSize(chars, Platform.CHAR_ARRAY_OFFSET, numBytes); - } - public String readString(MemoryBuffer buffer) { if (isJava) { return readJavaString(buffer); @@ -201,9 +189,7 @@ public Expression readStringExpr(Expression strSerializer, Expression buffer) { if (compressString) { return new Invoke(strSerializer, "readCompressedCharsString", STRING_TYPE, buffer); } else { - Expression chars = new Invoke(buffer, "readCharsAndSize", PRIMITIVE_CHAR_ARRAY_TYPE); - return new StaticInvoke( - StringSerializer.class, "newCharsStringZeroCopy", STRING_TYPE, chars); + return new Invoke(strSerializer, "readCharsString", STRING_TYPE, buffer); } } } else { @@ -216,17 +202,7 @@ public String readBytesString(MemoryBuffer buffer) { long header = buffer.readVarUint36Small(); byte coder = (byte) (header & 0b11); int numBytes = (int) (header >>> 2); - buffer.checkReadableBytes(numBytes); - byte[] bytes; - byte[] heapMemory = buffer.getHeapMemory(); - if (heapMemory != null) { - final int arrIndex = buffer._unsafeHeapReaderIndex(); - buffer.increaseReaderIndex(numBytes); - bytes = new byte[numBytes]; - System.arraycopy(heapMemory, arrIndex, bytes, 0, numBytes); - } else { - bytes = buffer.readBytes(numBytes); - } + byte[] bytes = readBytesUnCompressedUTF16(buffer, numBytes); if (coder != UTF8) { return newBytesStringZeroCopy(coder, bytes); } else { @@ -235,80 +211,130 @@ public String readBytesString(MemoryBuffer buffer) { } @CodegenInvoke - public String readCompressedCharsString(MemoryBuffer buffer) { + public String readCharsString(MemoryBuffer buffer) { long header = buffer.readVarUint36Small(); byte coder = (byte) (header & 0b11); int numBytes = (int) (header >>> 2); + char[] chars; if (coder == LATIN1) { - return newCharsStringZeroCopy(readLatinChars(buffer, numBytes)); + chars = readCharsLatin1(buffer, numBytes); } else if (coder == UTF16) { - return newCharsStringZeroCopy(readUTF16Chars(buffer, numBytes)); + chars = readCharsUTF16(buffer, numBytes); } else 
{ - return readUtf8(buffer, coder, numBytes); + throw new RuntimeException("Unknown coder type " + coder); } + return newCharsStringZeroCopy(chars); } - private String readUtf8(MemoryBuffer buffer, byte coder, int numBytes) { - Preconditions.checkArgument(coder == UTF8, UTF8); - byte[] bytes = buffer.readBytes(numBytes); - return new String(bytes, 0, numBytes, StandardCharsets.UTF_8); + @CodegenInvoke + public String readCompressedBytesString(MemoryBuffer buffer) { + long header = buffer.readVarUint36Small(); + byte coder = (byte) (header & 0b11); + int numBytes = (int) (header >>> 2); + if (coder == UTF8) { + return newBytesStringZeroCopy(UTF16, readBytesUTF8(buffer, numBytes)); + } else if (coder == LATIN1 || coder == UTF16) { + return newBytesStringZeroCopy(coder, readBytesUnCompressedUTF16(buffer, numBytes)); + } else { + throw new RuntimeException("Unknown coder type " + coder); + } } - private byte[] getByteArray(int numElements) { - byte[] byteArray = this.byteArray; - if (byteArray.length < numElements) { - byteArray = new byte[numElements]; - this.byteArray = byteArray; - } - if (byteArray.length > DEFAULT_BUFFER_SIZE) { - smoothByteArrayLength = - Math.max(((int) (smoothByteArrayLength * 0.9 + numElements * 0.1)), DEFAULT_BUFFER_SIZE); - if (smoothByteArrayLength <= DEFAULT_BUFFER_SIZE) { - this.byteArray = new byte[DEFAULT_BUFFER_SIZE]; - } + @CodegenInvoke + public String readCompressedCharsString(MemoryBuffer buffer) { + long header = buffer.readVarUint36Small(); + byte coder = (byte) (header & 0b11); + int numBytes = (int) (header >>> 2); + char[] chars; + if (coder == LATIN1) { + chars = readCharsLatin1(buffer, numBytes); + } else if (coder == UTF8) { + chars = readCharsUTF8(buffer, numBytes); + } else if (coder == UTF16) { + chars = readCharsUTF16(buffer, numBytes); + } else { + throw new RuntimeException("Unknown coder type " + coder); } - return byteArray; + return newCharsStringZeroCopy(chars); } // Invoked by fury JIT public void 
writeJavaString(MemoryBuffer buffer, String value) { if (STRING_VALUE_FIELD_IS_BYTES) { - writeBytesString(buffer, value); + if (compressString) { + writeCompressedBytesString(buffer, value); + } else { + writeBytesString(buffer, value); + } } else { assert STRING_VALUE_FIELD_IS_CHARS; - final char[] chars = (char[]) Platform.getObject(value, STRING_VALUE_FIELD_OFFSET); if (compressString) { - if (StringUtils.isLatin(chars)) { - writeCharsLatin(buffer, chars, chars.length); - } else { - writeCharsUTF16(buffer, chars, chars.length); - } + writeCompressedCharsString(buffer, value); } else { - int numBytes = MathUtils.doubleExact(value.length()); - buffer.writePrimitiveArrayWithSize(chars, Platform.CHAR_ARRAY_OFFSET, numBytes); + writeCharsString(buffer, value); } } } + @CodegenInvoke + public void writeUTF8String(MemoryBuffer buffer, String value) { + byte[] bytes = value.getBytes(StandardCharsets.UTF_8); + buffer.writeVarUint32(bytes.length); + buffer.writeBytes(bytes); + } + // Invoked by fury JIT public String readJavaString(MemoryBuffer buffer) { if (STRING_VALUE_FIELD_IS_BYTES) { - return readBytesString(buffer); + if (compressString) { + return readCompressedBytesString(buffer); + } else { + return readBytesString(buffer); + } } else { assert STRING_VALUE_FIELD_IS_CHARS; if (compressString) { return readCompressedCharsString(buffer); } else { - return newCharsStringZeroCopy(buffer.readCharsAndSize()); + return readCharsString(buffer); } } } + @CodegenInvoke + public void writeCompressedBytesString(MemoryBuffer buffer, String value) { + final byte[] bytes = (byte[]) Platform.getObject(value, STRING_VALUE_FIELD_OFFSET); + final byte coder = Platform.getByte(value, Offset.STRING_CODER_FIELD_OFFSET); + if (coder == LATIN1 || bestCoder(bytes) == UTF16) { + writeBytesString(buffer, coder, bytes); + } else { + writeBytesUTF8(buffer, bytes); + } + } + + @CodegenInvoke + public void writeCompressedCharsString(MemoryBuffer buffer, String value) { + final char[] chars = 
(char[]) Platform.getObject(value, STRING_VALUE_FIELD_OFFSET); + final byte coder = bestCoder(chars); + if (coder == LATIN1) { + writeCharsLatin1(buffer, chars, chars.length); + } else if (coder == UTF8) { + writeCharsUTF8(buffer, chars); + } else { + writeCharsUTF16(buffer, chars, chars.length); + } + } + + @CodegenInvoke public static void writeBytesString(MemoryBuffer buffer, String value) { byte[] bytes = (byte[]) Platform.getObject(value, STRING_VALUE_FIELD_OFFSET); + byte coder = Platform.getByte(value, Offset.STRING_CODER_FIELD_OFFSET); + writeBytesString(buffer, coder, bytes); + } + + public static void writeBytesString(MemoryBuffer buffer, byte coder, byte[] bytes) { int bytesLen = bytes.length; - long header = - ((long) bytesLen << 2) | Platform.getByte(value, Offset.STRING_CODER_FIELD_OFFSET); + long header = ((long) bytesLen << 2) | coder; int writerIndex = buffer.writerIndex(); // The `ensure` ensure next operations are safe without bound checks, // and inner heap buffer doesn't change. @@ -332,112 +358,94 @@ public static void writeBytesString(MemoryBuffer buffer, String value) { buffer._unsafeWriterIndex(writerIndex); } - public void writeCharsLatin(MemoryBuffer buffer, char[] chars, final int strLen) { - int writerIndex = buffer.writerIndex(); - // The `ensure` ensure next operations are safe without bound checks, - // and inner heap buffer doesn't change. 
- buffer.ensure(writerIndex + 9 + strLen); - long header = ((long) strLen << 2) | LATIN1; - final byte[] targetArray = buffer.getHeapMemory(); - if (targetArray != null) { - int arrIndex = buffer._unsafeHeapWriterIndex(); - int written = LittleEndian.putVarUint36Small(targetArray, arrIndex, header); - arrIndex += written; - writerIndex += written + strLen; - for (int i = 0; i < strLen; i++) { - targetArray[arrIndex + i] = (byte) chars[i]; - } - buffer._unsafeWriterIndex(writerIndex); + @CodegenInvoke + public void writeCharsString(MemoryBuffer buffer, String value) { + final char[] chars = (char[]) Platform.getObject(value, STRING_VALUE_FIELD_OFFSET); + if (StringUtils.isLatin(chars)) { + writeCharsLatin1(buffer, chars, chars.length); } else { - writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header); - final byte[] tmpArray = getByteArray(strLen); - // Write to heap memory then copy is 60% faster than unsafe write to direct memory. - for (int i = 0; i < strLen; i++) { - tmpArray[i] = (byte) chars[i]; - } - buffer.put(writerIndex, tmpArray, 0, strLen); - writerIndex += strLen; - buffer._unsafeWriterIndex(writerIndex); + writeCharsUTF16(buffer, chars, chars.length); } } - public void writeCharsUTF16(MemoryBuffer buffer, char[] chars, int strLen) { - int numBytes = MathUtils.doubleExact(strLen); - long header = ((long) numBytes << 2) | UTF16; - // The `ensure` ensure next operations are safe without bound checks, - // and inner heap buffer doesn't change. 
- int writerIndex = buffer.writerIndex(); - buffer.ensure(writerIndex + 9 + numBytes); - byte[] targetArray = buffer.getHeapMemory(); + @CodegenInvoke + public String readUTF8String(MemoryBuffer buffer) { + int numBytes = buffer.readVarUint32Small14(); + buffer.checkReadableBytes(numBytes); + final byte[] targetArray = buffer.getHeapMemory(); if (targetArray != null) { - int arrIndex = buffer._unsafeHeapWriterIndex(); - int written = LittleEndian.putVarUint36Small(targetArray, arrIndex, header); - arrIndex += written; - writerIndex += written + numBytes; - if (Platform.IS_LITTLE_ENDIAN) { - // FIXME JDK11 utf16 string uses little-endian order. - Platform.UNSAFE.copyMemory( - chars, - Platform.CHAR_ARRAY_OFFSET, - targetArray, - Platform.BYTE_ARRAY_OFFSET + arrIndex, - numBytes); - } else { - heapWriteCharsUTF16BE(chars, arrIndex, numBytes, targetArray); - } + String str = + new String( + targetArray, buffer._unsafeHeapReaderIndex(), numBytes, StandardCharsets.UTF_8); + buffer.increaseReaderIndex(numBytes); + return str; } else { - writerIndex = offHeapWriteCharsUTF16(buffer, chars, writerIndex, header, numBytes); - } - buffer._unsafeWriterIndex(writerIndex); - } - - private static void heapWriteCharsUTF16BE( - char[] chars, int arrIndex, int numBytes, byte[] targetArray) { - // Write to heap memory then copy is 250% faster than unsafe write to direct memory. 
- int charIndex = 0; - for (int i = arrIndex, end = i + numBytes; i < end; i += 2) { - char c = chars[charIndex++]; - targetArray[i] = (byte) (c >> StringUTF16.HI_BYTE_SHIFT); - targetArray[i + 1] = (byte) (c >> StringUTF16.LO_BYTE_SHIFT); - } - } - - private int offHeapWriteCharsUTF16( - MemoryBuffer buffer, char[] chars, int writerIndex, long header, int numBytes) { - writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header); - byte[] tmpArray = getByteArray(numBytes); - int charIndex = 0; - for (int i = 0; i < numBytes; i += 2) { - char c = chars[charIndex++]; - tmpArray[i] = (byte) (c >> StringUTF16.HI_BYTE_SHIFT); - tmpArray[i + 1] = (byte) (c >> StringUTF16.LO_BYTE_SHIFT); + final byte[] tmpArray = getByteArray(numBytes); + buffer.readBytes(tmpArray, 0, numBytes); + return new String(tmpArray, 0, numBytes, StandardCharsets.UTF_8); } - buffer.put(writerIndex, tmpArray, 0, numBytes); - writerIndex += numBytes; - return writerIndex; } - private char[] readLatinChars(MemoryBuffer buffer, int numBytes) { - char[] chars = new char[numBytes]; + public char[] readCharsLatin1(MemoryBuffer buffer, int numBytes) { + // int utf8AsciiBytes = buffer.readInt32(); buffer.checkReadableBytes(numBytes); - byte[] targetArray = buffer.getHeapMemory(); - if (targetArray != null) { + byte[] srcArray = buffer.getHeapMemory(); + char[] chars = new char[numBytes]; + if (srcArray != null) { int srcIndex = buffer._unsafeHeapReaderIndex(); for (int i = 0; i < numBytes; i++) { - chars[i] = (char) (targetArray[srcIndex++] & 0xff); + chars[i] = (char) (srcArray[srcIndex++] & 0xff); } buffer._increaseReaderIndexUnsafe(numBytes); } else { - byte[] byteArray = getByteArray(numBytes); - buffer.readBytes(byteArray, 0, numBytes); + byte[] tmpArray = getByteArray(numBytes); + buffer.readBytes(tmpArray, 0, numBytes); for (int i = 0; i < numBytes; i++) { - chars[i] = (char) (byteArray[i] & 0xff); + chars[i] = (char) (tmpArray[i] & 0xff); } } return chars; } - private char[] 
readUTF16Chars(MemoryBuffer buffer, int numBytes) { + public byte[] readBytesUTF8(MemoryBuffer buffer, int numBytes) { + int udf8Bytes = buffer.readInt32(); + byte[] bytes = new byte[numBytes]; + buffer.checkReadableBytes(udf8Bytes); + byte[] srcArray = buffer.getHeapMemory(); + if (srcArray != null) { + int srcIndex = buffer._unsafeHeapReaderIndex(); + int readLen = StringEncodingUtils.convertUTF8ToUTF16(srcArray, srcIndex, udf8Bytes, bytes); + if (readLen != numBytes) { + throw new RuntimeException("Decode UTF8 to UTF16 failed"); + } + buffer._increaseReaderIndexUnsafe(udf8Bytes); + } else { + byte[] tmpArray = getByteArray(udf8Bytes); + buffer.readBytes(tmpArray, 0, udf8Bytes); + int readLen = StringEncodingUtils.convertUTF8ToUTF16(tmpArray, 0, udf8Bytes, bytes); + if (readLen != numBytes) { + throw new RuntimeException("Decode UTF8 to UTF16 failed"); + } + } + return bytes; + } + + public byte[] readBytesUnCompressedUTF16(MemoryBuffer buffer, int numBytes) { + buffer.checkReadableBytes(numBytes); + byte[] bytes; + byte[] heapMemory = buffer.getHeapMemory(); + if (heapMemory != null) { + final int arrIndex = buffer._unsafeHeapReaderIndex(); + buffer.increaseReaderIndex(numBytes); + bytes = new byte[numBytes]; + System.arraycopy(heapMemory, arrIndex, bytes, 0, numBytes); + } else { + bytes = buffer.readBytes(numBytes); + } + return bytes; + } + + public char[] readCharsUTF16(MemoryBuffer buffer, int numBytes) { char[] chars = new char[numBytes >> 1]; if (Platform.IS_LITTLE_ENDIAN) { // FIXME JDK11 utf16 string uses little-endian order. 
@@ -471,6 +479,138 @@ private char[] readUTF16Chars(MemoryBuffer buffer, int numBytes) { return chars; } + public char[] readCharsUTF8(MemoryBuffer buffer, int numBytes) { + int udf16Chars = numBytes >> 1; + int udf8Bytes = buffer.readInt32(); + char[] chars = new char[udf16Chars]; + buffer.checkReadableBytes(udf8Bytes); + byte[] srcArray = buffer.getHeapMemory(); + if (srcArray != null) { + int srcIndex = buffer._unsafeHeapReaderIndex(); + int readLen = StringEncodingUtils.convertUTF8ToUTF16(srcArray, srcIndex, udf8Bytes, chars); + if (readLen != udf16Chars) { + throw new RuntimeException("Decode UTF8 to UTF16 failed"); + } + buffer._increaseReaderIndexUnsafe(udf8Bytes); + } else { + byte[] tmpArray = getByteArray(udf8Bytes); + buffer.readBytes(tmpArray, 0, udf8Bytes); + int readLen = StringEncodingUtils.convertUTF8ToUTF16(tmpArray, 0, udf8Bytes, chars); + if (readLen != udf16Chars) { + throw new RuntimeException("Decode UTF8 to UTF16 failed"); + } + } + return chars; + } + + public void writeCharsLatin1(MemoryBuffer buffer, char[] chars, int numBytes) { + int writerIndex = buffer.writerIndex(); + long header = ((long) numBytes << 2) | LATIN1; + buffer.ensure(writerIndex + 5 + numBytes); + byte[] targetArray = buffer.getHeapMemory(); + if (targetArray != null) { + final int targetIndex = buffer._unsafeHeapWriterIndex(); + int arrIndex = targetIndex; + arrIndex += LittleEndian.putVarUint36Small(targetArray, arrIndex, header); + writerIndex += arrIndex - targetIndex; + for (int i = 0; i < numBytes; i++) { + targetArray[arrIndex + i] = (byte) chars[i]; + } + } else { + writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header); + final byte[] tmpArray = getByteArray(numBytes); + for (int i = 0; i < numBytes; i++) { + tmpArray[i] = (byte) chars[i]; + } + buffer.put(writerIndex, tmpArray, 0, numBytes); + } + writerIndex += numBytes; + buffer._unsafeWriterIndex(writerIndex); + } + + public void writeCharsUTF16(MemoryBuffer buffer, char[] chars, int numChars) { 
+ int numBytes = MathUtils.doubleExact(numChars); + int writerIndex = buffer.writerIndex(); + long header = ((long) numBytes << 2) | UTF16; + buffer.ensure(writerIndex + 5 + numBytes); + final byte[] targetArray = buffer.getHeapMemory(); + if (targetArray != null) { + final int targetIndex = buffer._unsafeHeapWriterIndex(); + int arrIndex = targetIndex; + arrIndex += LittleEndian.putVarUint36Small(targetArray, arrIndex, header); + writerIndex += arrIndex - targetIndex + numBytes; + if (Platform.IS_LITTLE_ENDIAN) { + // FIXME JDK11 utf16 string uses little-endian order. + Platform.UNSAFE.copyMemory( + chars, + Platform.CHAR_ARRAY_OFFSET, + targetArray, + Platform.BYTE_ARRAY_OFFSET + arrIndex, + numBytes); + } else { + heapWriteCharsUTF16BE(chars, arrIndex, numBytes, targetArray); + } + } else { + writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header); + writerIndex = offHeapWriteCharsUTF16(buffer, chars, writerIndex, numBytes); + } + buffer._unsafeWriterIndex(writerIndex); + } + + public void writeCharsUTF8(MemoryBuffer buffer, char[] chars) { + int estimateMaxBytes = chars.length * 3; + int numBytes = MathUtils.doubleExact(chars.length); + int writerIndex = buffer.writerIndex(); + long header = ((long) numBytes << 2) | UTF8; + buffer.ensure(writerIndex + 9 + estimateMaxBytes); + byte[] targetArray = buffer.getHeapMemory(); + if (targetArray != null) { + int targetIndex = buffer._unsafeHeapWriterIndex(); + int arrIndex = targetIndex; + arrIndex += LittleEndian.putVarUint36Small(targetArray, arrIndex, header); + writerIndex += arrIndex - targetIndex; + targetIndex = StringEncodingUtils.convertUTF16ToUTF8(chars, targetArray, arrIndex + 4); + int written = targetIndex - arrIndex - 4; + buffer._unsafePutInt32(writerIndex, written); + buffer._unsafeWriterIndex(writerIndex + 4 + written); + } else { + final byte[] tmpArray = getByteArray(estimateMaxBytes); + int written = StringEncodingUtils.convertUTF16ToUTF8(chars, tmpArray, 0); + writerIndex += 
buffer._unsafePutVarUint36Small(writerIndex, header); + buffer._unsafePutInt32(writerIndex, written); + writerIndex += 4; + buffer.put(writerIndex, tmpArray, 0, written); + buffer._unsafeWriterIndex(writerIndex + written); + } + } + + public void writeBytesUTF8(MemoryBuffer buffer, byte[] bytes) { + int numBytes = bytes.length; + int estimateMaxBytes = bytes.length / 2 * 3; + int writerIndex = buffer.writerIndex(); + long header = ((long) numBytes << 2) | UTF8; + buffer.ensure(writerIndex + 9 + estimateMaxBytes); + byte[] targetArray = buffer.getHeapMemory(); + if (targetArray != null) { + int targetIndex = buffer._unsafeHeapWriterIndex(); + int arrIndex = targetIndex; + arrIndex += LittleEndian.putVarUint36Small(targetArray, arrIndex, header); + writerIndex += arrIndex - targetIndex; + targetIndex = StringEncodingUtils.convertUTF16ToUTF8(bytes, targetArray, arrIndex + 4); + int written = targetIndex - arrIndex - 4; + buffer._unsafePutInt32(writerIndex, written); + buffer._unsafeWriterIndex(writerIndex + 4 + written); + } else { + final byte[] tmpArray = getByteArray(estimateMaxBytes); + int written = StringEncodingUtils.convertUTF16ToUTF8(bytes, tmpArray, 0); + writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header); + buffer._unsafePutInt32(writerIndex, written); + writerIndex += 4; + buffer.put(writerIndex, tmpArray, 0, written); + buffer._unsafeWriterIndex(writerIndex + written); + } + } + private static final MethodHandles.Lookup STRING_LOOK_UP = _JDKAccess._trustedLookup(String.class); private static final BiFunction CHARS_STRING_ZERO_COPY_CTR = @@ -603,26 +743,121 @@ private static MethodHandle getJavaStringZeroCopyCtrHandle() { } } - public void writeUTF8String(MemoryBuffer buffer, String value) { - byte[] bytes = value.getBytes(StandardCharsets.UTF_8); - buffer.writeVarUint32(bytes.length); - buffer.writeBytes(bytes); + private static void heapWriteCharsUTF16BE( + char[] chars, int arrIndex, int numBytes, byte[] targetArray) { + // Write to 
heap memory then copy is 250% faster than unsafe write to direct memory. + int charIndex = 0; + for (int i = arrIndex, end = i + numBytes; i < end; i += 2) { + char c = chars[charIndex++]; + targetArray[i] = (byte) (c >> StringUTF16.HI_BYTE_SHIFT); + targetArray[i + 1] = (byte) (c >> StringUTF16.LO_BYTE_SHIFT); + } } - public String readUTF8String(MemoryBuffer buffer) { - int numBytes = buffer.readVarUint32Small14(); - buffer.checkReadableBytes(numBytes); - final byte[] targetArray = buffer.getHeapMemory(); - if (targetArray != null) { - String str = - new String( - targetArray, buffer._unsafeHeapReaderIndex(), numBytes, StandardCharsets.UTF_8); - buffer.increaseReaderIndex(numBytes); - return str; + private int offHeapWriteCharsUTF16( + MemoryBuffer buffer, char[] chars, int writerIndex, int numBytes) { + byte[] tmpArray = getByteArray(numBytes); + int charIndex = 0; + for (int i = 0; i < numBytes; i += 2) { + char c = chars[charIndex++]; + tmpArray[i] = (byte) (c >> StringUTF16.HI_BYTE_SHIFT); + tmpArray[i + 1] = (byte) (c >> StringUTF16.LO_BYTE_SHIFT); + } + buffer.put(writerIndex, tmpArray, 0, numBytes); + writerIndex += numBytes; + return writerIndex; + } + + private static byte bestCoder(char[] chars) { + int numChars = chars.length; + // sample 64 chars + int sampleNum = Math.min(64, numChars); + int vectorizedLen = sampleNum >> 2; + int vectorizedChars = vectorizedLen << 2; + int endOffset = Platform.CHAR_ARRAY_OFFSET + (vectorizedChars << 1); + int count = 0; + for (int offset = Platform.CHAR_ARRAY_OFFSET, charOffset = 0; + offset < endOffset; + offset += 8, charOffset += 4) { + long multiChars = Platform.getLong(chars, offset); + if ((multiChars & MULTI_CHARS_NON_LATIN_MASK) == 0) { + count += 4; + } else { + for (int i = 0; i < 4; ++i) { + if (chars[charOffset + i] < 0x80) { + count++; + } + } + } + } + + for (int i = vectorizedChars; i < sampleNum; i++) { + if (chars[i] < 0x80) { + count++; + } + } + + // ascii number > 50%, choose UTF-8 + if (count >= 
sampleNum * 0.5) { + if (count == numChars || (count == sampleNum && StringUtils.isLatin(chars, sampleNum))) { + return LATIN1; + } + return UTF8; } else { - final byte[] tmpArray = getByteArray(numBytes); - buffer.readBytes(tmpArray, 0, numBytes); - return new String(tmpArray, 0, numBytes, StandardCharsets.UTF_8); + return UTF16; + } + } + + private static byte bestCoder(byte[] bytes) { + int numBytes = bytes.length; + // sample 64 chars + int sampleNum = Math.min(64 << 1, numBytes); + int vectorizedLen = sampleNum >> 3; + int vectorizedBytes = vectorizedLen << 3; + int endOffset = Platform.BYTE_ARRAY_OFFSET + vectorizedBytes; + int count = 0; + for (int offset = Platform.BYTE_ARRAY_OFFSET, bytesOffset = 0; + offset < endOffset; + offset += 8, bytesOffset += 8) { + long multiChars = Platform.getLong(bytes, offset); + if ((multiChars & MULTI_CHARS_NON_LATIN_MASK) == 0) { + count += 4; + } else { + for (int i = Platform.IS_LITTLE_ENDIAN ? 1 : 0; i < 8; i += 2) { + if (bytes[bytesOffset + i] == 0) { + count++; + } + } + } } + for (int i = Platform.IS_LITTLE_ENDIAN ? 
vectorizedBytes + 1 : vectorizedBytes; + i < sampleNum; + ++i) { + if (bytes[i] == 0) { + count++; + } + } + // ascii number > 50%, choose UTF-8 + if (count >= sampleNum * 0.5) { + return UTF8; + } else { + return UTF16; + } + } + + private byte[] getByteArray(int numElements) { + byte[] byteArray = this.byteArray; + if (byteArray.length < numElements) { + byteArray = new byte[numElements]; + this.byteArray = byteArray; + } + if (byteArray.length > DEFAULT_BUFFER_SIZE) { + smoothByteArrayLength = + Math.max(((int) (smoothByteArrayLength * 0.9 + numElements * 0.1)), DEFAULT_BUFFER_SIZE); + if (smoothByteArrayLength <= DEFAULT_BUFFER_SIZE) { + this.byteArray = new byte[DEFAULT_BUFFER_SIZE]; + } + } + return byteArray; } } diff --git a/java/fury-core/src/main/java/org/apache/fury/util/StringEncodingUtils.java b/java/fury-core/src/main/java/org/apache/fury/util/StringEncodingUtils.java new file mode 100644 index 0000000000..d90b5412fe --- /dev/null +++ b/java/fury-core/src/main/java/org/apache/fury/util/StringEncodingUtils.java @@ -0,0 +1,381 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.fury.util; + +import static org.apache.fury.util.StringUtils.MULTI_CHARS_NON_LATIN_MASK; + +import org.apache.fury.memory.Platform; + +/** String Encoding Utils. */ +public class StringEncodingUtils { + + /** A fast convert algorithm to convert an utf16 char array into an utf8 byte array. */ + public static int convertUTF16ToUTF8(char[] src, byte[] dst, int dp) { + int numChars = src.length; + for (int charOffset = 0; charOffset < numChars; ) { + if (charOffset + 4 <= numChars + && (Platform.getLong(src, Platform.CHAR_ARRAY_OFFSET + charOffset * 2L) + & MULTI_CHARS_NON_LATIN_MASK) + == 0) { + // ascii only + dst[dp] = (byte) src[charOffset]; + dst[dp + 1] = (byte) src[charOffset + 1]; + dst[dp + 2] = (byte) src[charOffset + 2]; + dst[dp + 3] = (byte) src[charOffset + 3]; + dp += 4; + charOffset += 4; + } else { + char c = src[charOffset++]; + if (c < 0x80) { + dst[dp++] = (byte) c; + } else if (c < 0x800) { + dst[dp] = (byte) (0xc0 | (c >> 6)); + dst[dp + 1] = (byte) (0x80 | (c & 0x3f)); + dp += 2; + } else if (c >= '\uD800' && c <= Character.MAX_LOW_SURROGATE) { + utf8ToChar2(src, charOffset, c, dst, dp); + dp += 4; + charOffset++; + } else { + dst[dp] = (byte) (0xe0 | ((c >> 12))); + dst[dp + 1] = (byte) (0x80 | ((c >> 6) & 0x3f)); + dst[dp + 2] = (byte) (0x80 | (c & 0x3f)); + dp += 3; + } + } + } + return dp; + } + + /** A fast convert algorithm to convert an utf16 byte array into an utf8 byte array. 
*/
+  public static int convertUTF16ToUTF8(byte[] src, byte[] dst, int dp) {
+    // src is read two bytes at a time with Platform.getChar, i.e. as UTF-16 code units in the
+    // platform's byte order. Writing starts at dst[dp]; the return value is the index just past
+    // the last byte written. A malformed surrogate pair makes utf8ToChar2 throw.
+    //
+    // Each 16-bit lane of a natively loaded long is one code unit, so 0xff80 per lane is set only
+    // when a unit is >= 0x80. A Latin-1 mask is not enough here: units in U+0080..U+00FF need two
+    // UTF-8 bytes and would otherwise be copied as single, invalid bytes.
+    final long nonAsciiMask = 0xff80ff80ff80ff80L;
+    int numBytes = src.length;
+    for (int offset = 0; offset < numBytes; ) {
+      if (offset + 8 <= numBytes
+          && (Platform.getLong(src, Platform.BYTE_ARRAY_OFFSET + offset) & nonAsciiMask) == 0) {
+        // ascii only: copy the low byte of each of the four code units
+        if (Platform.IS_LITTLE_ENDIAN) {
+          dst[dp] = src[offset];
+          dst[dp + 1] = src[offset + 2];
+          dst[dp + 2] = src[offset + 4];
+          dst[dp + 3] = src[offset + 6];
+        } else {
+          dst[dp] = src[offset + 1];
+          dst[dp + 1] = src[offset + 3];
+          dst[dp + 2] = src[offset + 5];
+          dst[dp + 3] = src[offset + 7];
+        }
+        dp += 4;
+        offset += 8;
+      } else {
+        char c = Platform.getChar(src, Platform.BYTE_ARRAY_OFFSET + offset);
+        offset += 2;
+
+        if (c < 0x80) {
+          // 1 byte, 7 bits
+          dst[dp++] = (byte) c;
+        } else {
+          if (c < 0x800) {
+            // 2 bytes, 11 bits
+            dst[dp] = (byte) (0xc0 | (c >> 6));
+            dst[dp + 1] = (byte) (0x80 | (c & 0x3f));
+            dp += 2;
+          } else if (c >= '\uD800' && c <= Character.MAX_LOW_SURROGATE) {
+            // Surrogate range: the helper validates the pair and emits 4 bytes; skip the
+            // second code unit (2 source bytes) afterwards.
+            utf8ToChar2(src, offset, c, numBytes, dst, dp);
+            dp += 4;
+            offset += 2;
+          } else {
+            // 3 bytes, 16 bits
+            dst[dp] = (byte) (0xe0 | ((c >> 12)));
+            dst[dp + 1] = (byte) (0x80 | ((c >> 6) & 0x3f));
+            dst[dp + 2] = (byte) (0x80 | (c & 0x3f));
+            dp += 3;
+          }
+        }
+      }
+    }
+    return dp;
+  }
+
+  /**
+   * A fast convert algorithm to convert an utf8 encoded byte array into an utf16 encoded byte
+   * array.
+   */
+  public static int convertUTF8ToUTF16(byte[] src, int offset, int len, byte[] dst) {
+    // Decodes src[offset, offset + len) and writes UTF-16 code units to dst starting at index 0.
+    // Returns the number of bytes written, or -1 as soon as the input turns out to be truncated,
+    // malformed, overlong, or an encoded surrogate. Every code unit goes through putUtf16Char so
+    // that the ASCII fast path and the multi-byte paths agree on byte order and always write
+    // both bytes (dst does not have to be pre-zeroed).
+    final int end = offset + len;
+    int dp = 0;
+
+    while (offset < end) {
+      if (offset + 8 <= end
+          && (Platform.getLong(src, Platform.BYTE_ARRAY_OFFSET + offset) & 0x8080808080808080L)
+              == 0) {
+        // ascii only: no byte of the 8-byte word has its top bit set
+        for (int i = 0; i < 8; i++) {
+          putUtf16Char(dst, dp + (i << 1), (char) src[offset + i]);
+        }
+        dp += 16;
+        offset += 8;
+      } else {
+        int b0 = src[offset++];
+        if (b0 >= 0) {
+          // 1 byte, 7 bits: 0xxxxxxx
+          putUtf16Char(dst, dp, (char) b0);
+          dp += 2;
+        } else if ((b0 >> 5) == -2 && (b0 & 0x1e) != 0) {
+          // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
+          if (offset >= end) {
+            return -1;
+          }
+          int b1 = src[offset++];
+          if ((b1 & 0xc0) != 0x80) { // isNotContinuation(b1)
+            return -1;
+          } else {
+            // XOR with the marker constant strips the lead/continuation tag bits.
+            char c = (char) (((b0 << 6) ^ b1) ^ (((byte) 0xC0 << 6) ^ ((byte) 0x80)));
+            putUtf16Char(dst, dp, c);
+            dp += 2;
+          }
+        } else if ((b0 >> 4) == -2) {
+          // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
+          if (offset + 1 >= end) {
+            return -1;
+          }
+          int b1 = src[offset];
+          int b2 = src[offset + 1];
+          offset += 2;
+          if ((b0 == (byte) 0xe0 && (b1 & 0xe0) == 0x80) //
+              || (b1 & 0xc0) != 0x80 //
+              || (b2 & 0xc0) != 0x80) { // isMalformed3(b0, b1, b2)
+            return -1;
+          } else {
+            char c =
+                (char)
+                    ((b0 << 12)
+                        ^ (b1 << 6)
+                        ^ (b2 ^ (((byte) 0xE0 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80))));
+            boolean isSurrogate = c >= '\uD800' && c < (Character.MAX_LOW_SURROGATE + 1);
+            if (isSurrogate) {
+              return -1;
+            } else {
+              putUtf16Char(dst, dp, c);
+              dp += 2;
+            }
+          }
+        } else if ((b0 >> 3) == -2) {
+          // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+          if (offset + 2 >= end) {
+            return -1;
+          }
+          int b2 = src[offset];
+          int b3 = src[offset + 1];
+          int b4 = src[offset + 2];
+          offset += 3;
+          int uc =
+              ((b0 << 18)
+                  ^ (b2 << 12)
+                  ^ (b3 << 6)
+                  ^ (b4
+                      ^ (((byte) 0xF0 << 18)
+                          ^ ((byte) 0x80 << 12)
+                          ^ ((byte) 0x80 << 6)
+                          ^ ((byte) 0x80))));
+          if (((b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || (b4 & 0xc0) != 0x80) // isMalformed4
+              ||
+              // shortest form check
+              !(uc >= 0x010000 && uc < 0X10FFFF + 1) // !Character.isSupplementaryCodePoint(uc)
+          ) {
+            return -1;
+          } else {
+            // high surrogate, then low surrogate
+            putUtf16Char(dst, dp, (char) ((uc >>> 10) + ('\uD800' - (0x010000 >>> 10))));
+            putUtf16Char(dst, dp + 2, (char) ((uc & 0x3ff) + Character.MIN_LOW_SURROGATE));
+            dp += 4;
+          }
+        } else {
+          return -1;
+        }
+      }
+    }
+    return dp;
+  }
+
+  /** Writes {@code c} into {@code dst[dp]} and {@code dst[dp + 1]} in platform byte order. */
+  private static void putUtf16Char(byte[] dst, int dp, char c) {
+    if (Platform.IS_LITTLE_ENDIAN) {
+      dst[dp] = (byte) c;
+      dst[dp + 1] = (byte) (c >> 8);
+    } else {
+      dst[dp] = (byte) (c >> 8);
+      dst[dp + 1] = (byte) c;
+    }
+  }
+
+  /**
+   * A fast convert algorithm to convert an utf8 encoded byte array into utf16 encoded char array.
+   */
+  public static int convertUTF8ToUTF16(byte[] src, int offset, int len, char[] dst) {
+    // Decodes src[offset, offset + len) into dst starting at index 0. Returns the number of chars
+    // written, or -1 as soon as the input turns out to be truncated, malformed, overlong, or an
+    // encoded surrogate.
+    int end = offset + len;
+    int dp = 0;
+    while (offset < end) {
+      // Fast path: one 8-byte load; if no byte has its top bit set, all eight are ASCII.
+      if (offset + 8 <= end
+          && (Platform.getLong(src, Platform.BYTE_ARRAY_OFFSET + offset) & 0x8080808080808080L)
+              == 0) {
+        // ascii only
+        dst[dp] = (char) src[offset];
+        dst[dp + 1] = (char) src[offset + 1];
+        dst[dp + 2] = (char) src[offset + 2];
+        dst[dp + 3] = (char) src[offset + 3];
+        dst[dp + 4] = (char) src[offset + 4];
+        dst[dp + 5] = (char) src[offset + 5];
+        dst[dp + 6] = (char) src[offset + 6];
+        dst[dp + 7] = (char) src[offset + 7];
+        dp += 8;
+        offset += 8;
+      } else {
+        // b1 is sign-extended, so the lead-byte tests below compare against negative values.
+        int b1 = src[offset++];
+        if (b1 >= 0) {
+          // 1 byte, 7 bits: 0xxxxxxx
+          dst[dp++] = (char) b1;
+        } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
+          // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
+          // (b1 & 0x1e) != 0 rejects the overlong lead bytes 0xC0 and 0xC1.
+          if (offset >= end) {
+            return -1;
+          }
+          int b2 = src[offset++];
+          if ((b2 & 0xc0) != 0x80) { // isNotContinuation(b2)
+            return -1;
+          } else {
+            // XOR with the marker constant strips the lead/continuation tag bits.
+            dst[dp++] = (char) (((b1 << 6) ^ b2) ^ (((byte) 0xC0 << 6) ^ ((byte) 0x80)));
+          }
+        } else if ((b1 >> 4) == -2) {
+          // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
+          if (offset + 1 >= end) {
+            return -1;
+          }
+
+          int b2 = src[offset];
+          int b3 = src[offset + 1];
+          offset += 2;
+          // First clause rejects overlong forms (0xE0 followed by 0x80..0x9F).
+          if ((b1 == (byte) 0xe0 && (b2 & 0xe0) == 0x80) //
+              || (b2 & 0xc0) != 0x80 //
+              || (b3 & 0xc0) != 0x80) { // isMalformed3(b1, b2, b3)
+            return -1;
+          } else {
+            char c =
+                (char)
+                    ((b1 << 12)
+                        ^ (b2 << 6)
+                        ^ (b3 ^ (((byte) 0xE0 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80))));
+            // A surrogate code point encoded directly in three bytes is rejected.
+            boolean isSurrogate = c >= '\uD800' && c < (Character.MAX_LOW_SURROGATE + 1);
+            if (isSurrogate) {
+              return -1;
+            } else {
+              dst[dp++] = c;
+            }
+          }
+        } else if ((b1 >> 3) == -2) {
+          // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+          if (offset + 2 >= end) {
+            return -1;
+          }
+          int b2 = src[offset];
+          int b3 = src[offset + 1];
+          int b4 = src[offset + 2];
+          offset += 3;
+          int uc =
+              ((b1 << 18)
+                  ^ (b2 << 12)
+                  ^ (b3 << 6)
+                  ^ (b4
+                      ^ (((byte) 0xF0 << 18)
+                          ^ ((byte) 0x80 << 12)
+                          ^ ((byte) 0x80 << 6)
+                          ^ ((byte) 0x80))));
+          if (((b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || (b4 & 0xc0) != 0x80) // isMalformed4
+              ||
+              // shortest form check
+              !(uc >= 0x010000 && uc < 0X10FFFF + 1) // !Character.isSupplementaryCodePoint(uc)
+          ) {
+            return -1;
+          } else {
+            // A supplementary code point becomes a high/low surrogate pair (two chars).
+            dst[dp] =
+                (char)
+                    ((uc >>> 10) + ('\uD800' - (0x010000 >>> 10))); // Character.highSurrogate(uc);
+            dst[dp + 1] =
+                (char) ((uc & 0x3ff) + Character.MIN_LOW_SURROGATE); // Character.lowSurrogate(uc);
+            dp += 2;
+          }
+        } else {
+          // Not a valid UTF-8 lead byte.
+          return -1;
+        }
+      }
+    }
+    return dp;
+  }
+
+  /**
+   * Encodes the surrogate pair formed by {@code c} (expected high surrogate) and {@code
+   * src[charOffset]} (expected low surrogate) as four UTF-8 bytes at {@code dst[dp..dp + 3]}.
+   *
+   * @throws RuntimeException if {@code c} is not a high surrogate, if no char follows it, or if
+   *     the following char is not a low surrogate
+   */
+  private static void utf8ToChar2(char[] src, int charOffset, char c, byte[] dst, int dp) {
+    char d;
+    // Callers only pass c in the surrogate range, so c > MAX_HIGH_SURROGATE means a stray low
+    // surrogate; d is assigned inside the condition only once a following char is known to exist.
+    if (c > Character.MAX_HIGH_SURROGATE
+        || charOffset == src.length
+        || (d = src[charOffset]) < Character.MIN_LOW_SURROGATE
+        || d > Character.MAX_LOW_SURROGATE) {
+      throw new RuntimeException("malformed input off : " + charOffset);
+    }
+
+    // Combine the pair into the supplementary code point, then emit 11110xxx + three 10xxxxxx.
+    int uc = ((c << 10) + d) + (0x010000 - ('\uD800' << 10) - Character.MIN_LOW_SURROGATE);
+    dst[dp] = (byte) (0xf0 | ((uc >> 18)));
+    dst[dp + 1] = (byte) (0x80 | ((uc >> 12) & 0x3f));
+    dst[dp + 2] = (byte) (0x80 | ((uc >> 6) & 0x3f));
+    dst[dp + 3] = (byte) (0x80 | (uc & 0x3f));
+  }
+
+  /** convert two utf16 char c and char(src[offset], src[offset+1]) to a four byte utf8 bytes.
*/
+  private static void utf8ToChar2(
+      byte[] src, int offset, char c, int numBytes, byte[] dst, int dp) {
+    // c is the first code unit of the pair (expected high surrogate); the second one is read
+    // from src[offset..offset + 1] in platform byte order. The resulting four UTF-8 bytes are
+    // written to dst[dp..dp + 3]. Throws RuntimeException when the pair is malformed.
+    char d;
+    // Reading the second code unit needs two bytes, so anything less than 2 remaining bytes is
+    // treated as a missing low surrogate instead of reading past the end of src.
+    if (c > Character.MAX_HIGH_SURROGATE
+        || numBytes - offset < 2
+        || (d = Platform.getChar(src, Platform.BYTE_ARRAY_OFFSET + offset))
+            < Character.MIN_LOW_SURROGATE
+        || d > Character.MAX_LOW_SURROGATE) {
+      throw new RuntimeException("malformed input off : " + offset);
+    }
+
+    // Combine the pair into the supplementary code point, then emit 11110xxx + three 10xxxxxx.
+    int uc = ((c << 10) + d) + (0x010000 - ('\uD800' << 10) - Character.MIN_LOW_SURROGATE);
+    dst[dp] = (byte) (0xf0 | ((uc >> 18)));
+    dst[dp + 1] = (byte) (0x80 | ((uc >> 12) & 0x3f));
+    dst[dp + 2] = (byte) (0x80 | ((uc >> 6) & 0x3f));
+    dst[dp + 3] = (byte) (0x80 | (uc & 0x3f));
+  }
+}
diff --git a/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java b/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java
index cc892bef11..99ea8b967c 100644
--- a/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java
+++ b/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java
@@ -26,7 +26,7 @@ public class StringUtils { // A long mask used to clear all-higher bits of char in a super-word way.
- private static final long MULTI_CHARS_NON_LATIN_MASK; + public static final long MULTI_CHARS_NON_LATIN_MASK; private static final char[] BASE16_CHARS2 = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' @@ -267,12 +267,20 @@ public static String lowerCamelToLowerUnderscore(String lowerCamel) { } public static boolean isLatin(char[] chars) { + return isLatin(chars, 0); + } + + public static boolean isLatin(char[] chars, int start) { + if (start > chars.length) { + return false; + } + int byteOffset = start << 1; int numChars = chars.length; int vectorizedLen = numChars >> 2; int vectorizedChars = vectorizedLen << 2; int endOffset = Platform.CHAR_ARRAY_OFFSET + (vectorizedChars << 1); boolean isLatin = true; - for (int offset = Platform.CHAR_ARRAY_OFFSET; offset < endOffset; offset += 8) { + for (int offset = Platform.CHAR_ARRAY_OFFSET + byteOffset; offset < endOffset; offset += 8) { // check 4 chars in a vectorized way, 4 times faster than scalar check loop. // See benchmark in CompressStringSuite.latinSuperWordCheck. 
long multiChars = Platform.getLong(chars, offset); diff --git a/java/fury-core/src/test/java/org/apache/fury/builder/JITContextTest.java b/java/fury-core/src/test/java/org/apache/fury/builder/JITContextTest.java index 86143fe99e..e375a60954 100644 --- a/java/fury-core/src/test/java/org/apache/fury/builder/JITContextTest.java +++ b/java/fury-core/src/test/java/org/apache/fury/builder/JITContextTest.java @@ -154,6 +154,12 @@ public void testAsyncCompilationSwitch() throws InterruptedException { LOG.warn("Wait async compilation finish for {}", cls); } } + while (fury.getJITContext().hasJITResult(PkgAccessLevel.class)) { + Thread.sleep(10); // allow serializer be switched to generated version + } + while (fury.getJITContext().hasJITResult(PrivateAccessLevel.class)) { + Thread.sleep(10); // allow serializer be switched to generated version + } Serializer serializer = fury.getClassResolver().getSerializer(TestAccessLevel.class); assertTrue(ReflectionUtils.getObjectFieldValue(serializer, "serializer") instanceof Generated); diff --git a/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java b/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java index fc891ac512..15c46c57f7 100644 --- a/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java +++ b/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java @@ -303,7 +303,7 @@ public void testCompressJava8String() { @Test public void testReadUtf8String() { - Fury fury = getJavaFury(); + Fury fury = Fury.builder().withStringCompressed(true).requireClassRegistration(false).build(); for (MemoryBuffer buffer : new MemoryBuffer[] { MemoryUtils.buffer(32), MemoryUtils.wrap(ByteBuffer.allocateDirect(2048)) @@ -313,7 +313,8 @@ public void testReadUtf8String() { assertEquals(serializer.read(buffer), "abc你好"); byte[] bytes = "abc你好".getBytes(StandardCharsets.UTF_8); byte UTF8 = 2; - buffer.writeVarUint64(((long) bytes.length) << 2 
| UTF8); + buffer.writeVarUint64(((long) "abc你好".length() << 1) << 2 | UTF8); + buffer.writeInt32(bytes.length); buffer.writeBytes(bytes); assertEquals(serializer.read(buffer), "abc你好"); assertEquals(buffer.readerIndex(), buffer.writerIndex()); diff --git a/java/fury-core/src/test/java/org/apache/fury/util/StringEncodingUtilsTest.java b/java/fury-core/src/test/java/org/apache/fury/util/StringEncodingUtilsTest.java new file mode 100644 index 0000000000..0f5e5ed5cf --- /dev/null +++ b/java/fury-core/src/test/java/org/apache/fury/util/StringEncodingUtilsTest.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.apache.fury.util;
+
+import static org.testng.Assert.assertEquals;
+
+import java.nio.charset.StandardCharsets;
+import org.apache.fury.FuryTestBase;
+import org.testng.annotations.Test;
+
+/** Round-trip checks for the UTF-8 / UTF-16 converters in {@link StringEncodingUtils}. */
+public class StringEncodingUtilsTest extends FuryTestBase {
+  @Test
+  public void testUTF8ToUTF16() {
+    String expected = "你好, Fury";
+    byte[] encoded = expected.getBytes(StandardCharsets.UTF_8);
+
+    // Decode into a char[] destination.
+    char[] charsOut = new char[encoded.length * 2];
+    int written = StringEncodingUtils.convertUTF8ToUTF16(encoded, 0, encoded.length, charsOut);
+    assertEquals(new String(charsOut, 0, written), expected);
+
+    // Decode into a byte[] destination and read the result back as UTF-16LE.
+    byte[] bytesOut = new byte[encoded.length * 4];
+    written = StringEncodingUtils.convertUTF8ToUTF16(encoded, 0, encoded.length, bytesOut);
+    assertEquals(new String(bytesOut, 0, written, StandardCharsets.UTF_16LE), expected);
+  }
+
+  @Test
+  public void testUTF16ToUTF8() {
+    String expected = "你好, Fury";
+    byte[] utf8Out = new byte[expected.length() * 3];
+
+    // Encode from a char[] source.
+    char[] sourceChars = expected.toCharArray();
+    int written = StringEncodingUtils.convertUTF16ToUTF8(sourceChars, utf8Out, 0);
+    assertEquals(new String(utf8Out, 0, written, StandardCharsets.UTF_8), expected);
+
+    // Encode from a UTF-16LE byte[] source, reusing the same output buffer.
+    byte[] sourceBytes = expected.getBytes(StandardCharsets.UTF_16LE);
+    written = StringEncodingUtils.convertUTF16ToUTF8(sourceBytes, utf8Out, 0);
+    assertEquals(new String(utf8Out, 0, written, StandardCharsets.UTF_8), expected);
+  }
+}