Added support for IPC 2.0 (compression).
jorgecarleitao committed Jun 24, 2021
1 parent c93b3eb commit 5b24f33
Showing 10 changed files with 370 additions and 72 deletions.
6 changes: 5 additions & 1 deletion Cargo.toml
@@ -39,6 +39,10 @@ prettytable-rs = { version = "^0.8", optional = true }
 flatbuffers = { version = "=0.8.4", optional = true }
 hex = { version = "^0.4", optional = true }
 
+# for IPC compression
+lz4 = { version = "1.23.1", optional = true }
+zstd = { version = "^0.6", optional = true }
+
 rand = { version = "0.7", optional = true }
 
 itertools = { version = "^0.10", optional = true }
@@ -72,7 +76,7 @@ default = ["io_csv", "io_json", "io_ipc", "io_json_integration", "io_print", "io
 merge_sort = ["itertools"]
 io_csv = ["csv", "lazy_static", "regex"]
 io_json = ["serde", "serde_derive", "serde_json", "indexmap"]
-io_ipc = ["flatbuffers"]
+io_ipc = ["flatbuffers", "lz4", "zstd"]
 io_json_integration = ["io_json", "hex"]
 io_print = ["prettytable-rs"]
 # base64 + io_ipc because arrow schemas are stored as base64-encoded ipc format.
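
The new codecs are folded into the existing io_ipc feature rather than gated behind a new one, so downstream users opt in exactly as before. A minimal sketch of a consumer's Cargo.toml, assuming a published release of this crate (the version number below is a placeholder):

[dependencies]
# io_ipc now pulls in the optional lz4 and zstd dependencies transitively.
arrow2 = { version = "0.1", features = ["io_ipc"] }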
1 change: 1 addition & 0 deletions README.md
@@ -57,6 +57,7 @@ venv/bin/python parquet_integration/write_parquet.py
 * Generalized parsing of CSV based on logical data types
 * conditional compilation based on cargo `features` to reduce dependencies and size
 * faster IPC reader (different design that avoids an extra copy of all data)
+* Support for IPC 2.0 (compression)
 
 ## Features in the original not available in this crate
 
1 change: 1 addition & 0 deletions arrow-flight/src/utils.rs
@@ -169,6 +169,7 @@ pub fn flight_data_to_arrow_batch(
                 &dictionaries_by_field,
                 &mut reader,
                 0,
+                message.compression(),
             )
         })?
 }
@@ -296,6 +296,7 @@ async fn record_batch_from_message(
             &dictionaries_by_field,
             &mut reader,
             0,
+            message.compression(),
         );
 
         arrow_batch_result
13 changes: 13 additions & 0 deletions src/io/ipc/compression.rs
@@ -0,0 +1,13 @@
+use std::io::Read;
+
+use crate::error::Result;
+
+pub fn decompress_lz4(input_buf: &[u8], output_buf: &mut [u8]) -> Result<()> {
+    let mut decoder = lz4::Decoder::new(input_buf)?;
+    decoder.read_exact(output_buf).map_err(|e| e.into())
+}
+
+pub fn decompress_zstd(input_buf: &[u8], output_buf: &mut [u8]) -> Result<()> {
+    let mut decoder = zstd::Decoder::new(input_buf)?;
+    decoder.read_exact(output_buf).map_err(|e| e.into())
+}
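
Both helpers decompress into a pre-sized output buffer. That works because the Arrow IPC format prefixes every compressed buffer with its uncompressed length as a little-endian int64, where -1 signals the body was left uncompressed. A minimal sketch of a hypothetical caller that handles the prefix; decompress_buffer is illustrative and not part of this commit:

use std::convert::TryInto;

use crate::error::Result;

// Split the int64 length prefix off an IPC buffer, then decompress the
// remainder with the LZ4 frame codec above (the ZSTD path is analogous).
pub fn decompress_buffer(data: &[u8]) -> Result<Vec<u8>> {
    let length = i64::from_le_bytes(data[..8].try_into().unwrap());
    if length == -1 {
        // A length of -1 means this buffer was written uncompressed.
        return Ok(data[8..].to_vec());
    }
    let mut output = vec![0u8; length as usize];
    decompress_lz4(&data[8..], &mut output)?;
    Ok(output)
}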
1 change: 1 addition & 0 deletions src/io/ipc/mod.rs
@@ -26,6 +26,7 @@
 pub mod gen;
 
 pub(crate) mod common;
+mod compression;
 mod convert;
 
 pub use convert::fb_to_schema;
9 changes: 2 additions & 7 deletions src/io/ipc/read/common.rs
@@ -40,12 +40,6 @@ pub fn read_record_batch<R: Read + Seek>(
     block_offset: u64,
     compression: Option<BodyCompression>,
 ) -> Result<RecordBatch> {
-    if compression.is_some() {
-        return Err(ArrowError::NotYetImplemented(
-            "IPC format with compression".to_string(),
-        ));
-    }
-
     let buffers = batch
         .buffers()
         .ok_or_else(|| ArrowError::Ipc("Unable to get buffers from IPC RecordBatch".to_string()))?;
@@ -73,9 +67,10 @@ pub fn read_record_batch<R: Read + Seek>(
                 reader,
                 block_offset,
                 is_little_endian,
+                compression,
             )
         })
-        .collect::<std::io::Result<Vec<_>>>()?;
+        .collect::<Result<Vec<_>>>()?;
 
     RecordBatch::try_new(schema.clone(), arrays)
 }
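
With the NotYetImplemented guard removed, compressed record batches decode transparently: each buffer is inflated by the new compression module as it is read. An end-to-end sketch of reading a compressed file, assuming this crate's IPC read API (read_file_metadata / FileReader) and a hypothetical file name; exact signatures may differ between versions:

use std::fs::File;

use arrow2::error::Result;
use arrow2::io::ipc::read::{read_file_metadata, FileReader};

fn main() -> Result<()> {
    // "compressed.arrow" is a placeholder: any IPC 2.0 file written with
    // LZ4-frame or ZSTD body compression.
    let mut file = File::open("compressed.arrow")?;
    let metadata = read_file_metadata(&mut file)?;
    let reader = FileReader::new(&mut file, metadata, None);
    for maybe_batch in reader {
        let batch = maybe_batch?;
        println!("read a batch with {} rows", batch.num_rows());
    }
    Ok(())
}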
