forked from apache/arrow
Support the decompress feature in shuffle component. (apache#3)
* ARROW-10880: [Java] Support compressing RecordBatch IPC buffers by LZ4
* ARROW-10880: [Java] Support reading/writing big-endian message size
* ARROW-10880: [Java] Adjust variable names
* ARROW-10880: [Java] Support empty buffers
* ARROW-10880: [Java] Support passing raw data
* ARROW-10880: [Java] Switch to commons-compress library
* Bug fix and support for the fastpfor codec in the IPC framework
* Update the access permission from private to protected
* Disable the decompress function when loading the buffer

Co-authored-by: liyafan82 <[email protected]>
commit a634d04 (1 parent: b0f9c3a)
Showing 11 changed files with 422 additions and 19 deletions.
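The codec added below prefixes every compressed buffer with an 8-byte uncompressed-length field, stored little-endian to stay consistent with the C++ implementation; on big-endian JVMs the value is byte-swapped with Long.reverseBytes, and a sentinel length (NO_COMPRESSION_LENGTH) marks buffers that were passed through as raw data. As a minimal sketch of that prefix layout (the class and method here are illustrative, not part of the commit):

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

// Illustrative helper, not part of the diff: reads the 8-byte length prefix that
// Lz4CompressionCodec writes ahead of the LZ4 frame. The prefix is little-endian
// regardless of platform, which is why the codec below calls Long.reverseBytes
// on big-endian JVMs.
final class CompressedLengthPrefix {
  static long readUncompressedLength(byte[] compressedWithPrefix) {
    return ByteBuffer.wrap(compressedWithPrefix, 0, Long.BYTES)
        .order(ByteOrder.LITTLE_ENDIAN)
        .getLong();
  }
}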
java/vector/src/main/java/org/apache/arrow/vector/compression/Lz4CompressionCodec.java (160 additions, 0 deletions)
@@ -0,0 +1,160 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.arrow.vector.compression;

import static org.apache.arrow.memory.util.MemoryUtil.LITTLE_ENDIAN;
import static org.apache.arrow.vector.compression.CompressionUtil.NO_COMPRESSION_LENGTH;
import static org.apache.arrow.vector.compression.CompressionUtil.SIZE_OF_UNCOMPRESSED_LENGTH;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.arrow.flatbuf.CompressionType;
import org.apache.arrow.memory.ArrowBuf;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.util.Preconditions;
import org.apache.commons.compress.compressors.lz4.FramedLZ4CompressorInputStream;
import org.apache.commons.compress.compressors.lz4.FramedLZ4CompressorOutputStream;
import org.apache.commons.compress.utils.IOUtils;

import io.netty.util.internal.PlatformDependent;

/**
 * Compression codec for the LZ4 algorithm.
 */
public class Lz4CompressionCodec implements CompressionCodec {

  @Override
  public ArrowBuf compress(BufferAllocator allocator, ArrowBuf uncompressedBuffer) {
    Preconditions.checkArgument(uncompressedBuffer.writerIndex() <= Integer.MAX_VALUE,
        "The uncompressed buffer size exceeds the integer limit");

    if (uncompressedBuffer.writerIndex() == 0L) {
      // shortcut for empty buffer
      ArrowBuf compressedBuffer = allocator.buffer(SIZE_OF_UNCOMPRESSED_LENGTH);
      compressedBuffer.setLong(0, 0);
      compressedBuffer.writerIndex(SIZE_OF_UNCOMPRESSED_LENGTH);
      uncompressedBuffer.close();
      return compressedBuffer;
    }

    try {
      ArrowBuf compressedBuffer = doCompress(allocator, uncompressedBuffer);
      long compressedLength = compressedBuffer.writerIndex() - SIZE_OF_UNCOMPRESSED_LENGTH;
      if (compressedLength > uncompressedBuffer.writerIndex()) {
        // compressed buffer is larger, send the raw buffer
        compressedBuffer.close();
        compressedBuffer = CompressionUtil.compressRawBuffer(allocator, uncompressedBuffer);
      }

      uncompressedBuffer.close();
      return compressedBuffer;
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  private ArrowBuf doCompress(BufferAllocator allocator, ArrowBuf uncompressedBuffer) throws IOException {
    byte[] inBytes = new byte[(int) uncompressedBuffer.writerIndex()];
    PlatformDependent.copyMemory(uncompressedBuffer.memoryAddress(), inBytes, 0, uncompressedBuffer.writerIndex());
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    try (InputStream in = new ByteArrayInputStream(inBytes);
         OutputStream out = new FramedLZ4CompressorOutputStream(baos)) {
      IOUtils.copy(in, out);
    }

    byte[] outBytes = baos.toByteArray();

    ArrowBuf compressedBuffer = allocator.buffer(SIZE_OF_UNCOMPRESSED_LENGTH + outBytes.length);

    long uncompressedLength = uncompressedBuffer.writerIndex();
    if (!LITTLE_ENDIAN) {
      uncompressedLength = Long.reverseBytes(uncompressedLength);
    }
    // first 8 bytes reserved for uncompressed length, to be consistent with the
    // C++ implementation.
    compressedBuffer.setLong(0, uncompressedLength);

    PlatformDependent.copyMemory(
        outBytes, 0, compressedBuffer.memoryAddress() + SIZE_OF_UNCOMPRESSED_LENGTH, outBytes.length);
    compressedBuffer.writerIndex(SIZE_OF_UNCOMPRESSED_LENGTH + outBytes.length);
    return compressedBuffer;
  }

  @Override
  public ArrowBuf decompress(BufferAllocator allocator, ArrowBuf compressedBuffer) {
    Preconditions.checkArgument(compressedBuffer.writerIndex() <= Integer.MAX_VALUE,
        "The compressed buffer size exceeds the integer limit");

    Preconditions.checkArgument(compressedBuffer.writerIndex() >= SIZE_OF_UNCOMPRESSED_LENGTH,
        "Not enough data to decompress.");

    long decompressedLength = compressedBuffer.getLong(0);
    if (!LITTLE_ENDIAN) {
      decompressedLength = Long.reverseBytes(decompressedLength);
    }

    if (decompressedLength == 0L) {
      // shortcut for empty buffer
      compressedBuffer.close();
      return allocator.getEmpty();
    }

    if (decompressedLength == NO_COMPRESSION_LENGTH) {
      // no compression
      return CompressionUtil.decompressRawBuffer(compressedBuffer);
    }

    try {
      ArrowBuf decompressedBuffer = doDecompress(allocator, compressedBuffer);
      compressedBuffer.close();
      return decompressedBuffer;
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  private ArrowBuf doDecompress(BufferAllocator allocator, ArrowBuf compressedBuffer) throws IOException {
    long decompressedLength = compressedBuffer.getLong(0);
    if (!LITTLE_ENDIAN) {
      decompressedLength = Long.reverseBytes(decompressedLength);
    }

    byte[] inBytes = new byte[(int) (compressedBuffer.writerIndex() - SIZE_OF_UNCOMPRESSED_LENGTH)];
    PlatformDependent.copyMemory(
        compressedBuffer.memoryAddress() + SIZE_OF_UNCOMPRESSED_LENGTH, inBytes, 0, inBytes.length);
    ByteArrayOutputStream out = new ByteArrayOutputStream((int) decompressedLength);
    try (InputStream in = new FramedLZ4CompressorInputStream(new ByteArrayInputStream(inBytes))) {
      IOUtils.copy(in, out);
    }

    byte[] outBytes = out.toByteArray();
    ArrowBuf decompressedBuffer = allocator.buffer(outBytes.length);
    PlatformDependent.copyMemory(outBytes, 0, decompressedBuffer.memoryAddress(), outBytes.length);
    decompressedBuffer.writerIndex(decompressedLength);
    return decompressedBuffer;
  }

  @Override
  public CompressionUtil.CodecType getCodecType() {
    return CompressionUtil.CodecType.LZ4_FRAME;
  }
}
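For reference, a minimal round-trip sketch (not part of the diff) showing how the codec above is used. RootAllocator and ArrowBuf come from the standard arrow-memory module; the payload and buffer sizes are illustrative. Note that compress() and decompress() each close the buffer they are given.

import java.nio.charset.StandardCharsets;

import org.apache.arrow.memory.ArrowBuf;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.compression.Lz4CompressionCodec;

public class Lz4RoundTripExample {
  public static void main(String[] args) {
    try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE)) {
      byte[] payload = "hello arrow shuffle".getBytes(StandardCharsets.UTF_8);

      // Fill an ArrowBuf with the raw payload.
      ArrowBuf raw = allocator.buffer(payload.length);
      raw.setBytes(0, payload);
      raw.writerIndex(payload.length);

      Lz4CompressionCodec codec = new Lz4CompressionCodec();

      // compress() closes the input buffer and returns an 8-byte length prefix
      // followed by the LZ4 frame (or the raw bytes if compression does not help).
      ArrowBuf compressed = codec.compress(allocator, raw);

      // decompress() likewise closes the compressed buffer.
      ArrowBuf decompressed = codec.decompress(allocator, compressed);

      byte[] result = new byte[(int) decompressed.writerIndex()];
      decompressed.getBytes(0, result);
      decompressed.close();

      System.out.println(new String(result, StandardCharsets.UTF_8));
    }
  }
}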