diff --git a/Cargo.toml b/Cargo.toml
index 9aa2b01eae0..895467ed86a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -111,6 +111,7 @@ full = [
"io_ipc",
"io_flight",
"io_ipc_write_async",
+ "io_ipc_read_async",
"io_ipc_compression",
"io_json_integration",
"io_print",
@@ -132,6 +133,7 @@ io_csv_write = ["csv", "streaming-iterator", "lexical-core"]
io_json = ["serde", "serde_json", "streaming-iterator", "fallible-streaming-iterator", "indexmap", "lexical-core"]
io_ipc = ["arrow-format"]
io_ipc_write_async = ["io_ipc", "futures"]
+io_ipc_read_async = ["io_ipc", "futures"]
io_ipc_compression = ["lz4", "zstd"]
io_flight = ["io_ipc", "arrow-format/flight-data"]
# base64 + io_ipc because arrow schemas are stored as base64-encoded ipc format.
diff --git a/src/io/ipc/read/mod.rs b/src/io/ipc/read/mod.rs
index 22b3a2fe448..3a45d4ecac6 100644
--- a/src/io/ipc/read/mod.rs
+++ b/src/io/ipc/read/mod.rs
@@ -16,6 +16,9 @@ mod read_basic;
mod reader;
mod schema;
mod stream;
+#[cfg(feature = "io_ipc_read_async")]
+#[cfg_attr(docsrs, doc(cfg(feature = "io_ipc_read_async")))]
+pub mod stream_async;
pub use common::{read_dictionary, read_record_batch};
pub use reader::{read_file_metadata, FileMetadata, FileReader};
diff --git a/src/io/ipc/read/schema.rs b/src/io/ipc/read/schema.rs
index 7dbeeedfd73..bde3deba1e2 100644
--- a/src/io/ipc/read/schema.rs
+++ b/src/io/ipc/read/schema.rs
@@ -8,7 +8,10 @@ use crate::{
error::{ArrowError, Result},
};
-use super::super::{IpcField, IpcSchema};
+use super::{
+ super::{IpcField, IpcSchema},
+ StreamMetadata,
+};
fn try_unzip_vec>>(iter: I) -> Result<(Vec, Vec)> {
let mut a = vec![];
@@ -370,3 +373,28 @@ pub(super) fn fb_to_schema(schema: arrow_format::ipc::SchemaRef) -> Result<(Sche
},
))
}
+
+/// Deserializes `meta`, the bytes of the first message of an IPC stream, into a [`StreamMetadata`].
+///
+/// # Errors
+/// Errors if `meta` is not a valid message, if its version or header cannot be read,
+/// if the header is not a schema, or if that schema cannot be converted.
+pub(super) fn deserialize_stream_metadata(meta: &[u8]) -> Result {
+    let root = arrow_format::ipc::MessageRef::read_as_root(meta);
+    let message = root.map_err(|err| {
+        ArrowError::OutOfSpec(format!("Unable to get root as message: {:?}", err))
+    })?;
+    let version = message.version()?;
+    // the header of the first message must be the schema of the stream
+    let schema = match message.header()? {
+        Some(arrow_format::ipc::MessageHeaderRef::Schema(schema)) => schema,
+        Some(_) => {
+            let msg = "The first IPC message of the stream must be a schema";
+            return Err(ArrowError::oos(msg));
+        }
+        None => return Err(ArrowError::oos("Unable to read the first IPC message")),
+    };
+    let (schema, ipc_schema) = fb_to_schema(schema)?;
+
+    Ok(StreamMetadata { schema, version, ipc_schema })
+}
diff --git a/src/io/ipc/read/stream.rs b/src/io/ipc/read/stream.rs
index 370ea9f429d..81ede969cca 100644
--- a/src/io/ipc/read/stream.rs
+++ b/src/io/ipc/read/stream.rs
@@ -12,7 +12,7 @@ use crate::io::ipc::IpcSchema;
use super::super::CONTINUATION_MARKER;
use super::common::*;
-use super::schema::fb_to_schema;
+use super::schema::deserialize_stream_metadata;
use super::Dictionaries;
/// Metadata of an Arrow IPC stream, written at the start of the stream
@@ -45,29 +45,7 @@ pub fn read_stream_metadata(reader: &mut R) -> Result {
let mut meta_buffer = vec![0; meta_len as usize];
reader.read_exact(&mut meta_buffer)?;
- let message =
- arrow_format::ipc::MessageRef::read_as_root(meta_buffer.as_slice()).map_err(|err| {
- ArrowError::OutOfSpec(format!("Unable to get root as message: {:?}", err))
- })?;
- let version = message.version()?;
- // message header is a Schema, so read it
- let header = message
- .header()?
- .ok_or_else(|| ArrowError::oos("Unable to read the first IPC message"))?;
- let schema = if let arrow_format::ipc::MessageHeaderRef::Schema(schema) = header {
- schema
- } else {
- return Err(ArrowError::oos(
- "The first IPC message of the stream must be a schema",
- ));
- };
- let (schema, ipc_schema) = fb_to_schema(schema)?;
-
- Ok(StreamMetadata {
- schema,
- version,
- ipc_schema,
- })
+ deserialize_stream_metadata(&meta_buffer)
}
/// Encodes the stream's status after each read.
diff --git a/src/io/ipc/read/stream_async.rs b/src/io/ipc/read/stream_async.rs
new file mode 100644
index 00000000000..9e054cd07ce
--- /dev/null
+++ b/src/io/ipc/read/stream_async.rs
@@ -0,0 +1,212 @@
+//! APIs to read Arrow streams asynchronously
+use std::sync::Arc;
+
+use arrow_format::ipc::planus::ReadAsRoot;
+use futures::future::BoxFuture;
+use futures::AsyncRead;
+use futures::AsyncReadExt;
+use futures::Stream;
+
+use crate::array::*;
+use crate::chunk::Chunk;
+use crate::error::{ArrowError, Result};
+
+use super::super::CONTINUATION_MARKER;
+use super::common::{read_dictionary, read_record_batch};
+use super::schema::deserialize_stream_metadata;
+use super::Dictionaries;
+use super::StreamMetadata;
+
+/// The (private) state of a stream being read: the reader, its [`StreamMetadata`], its [`Dictionaries`] and two buffers
+struct ReadState {
+ pub reader: R,
+ pub metadata: StreamMetadata,
+ pub dictionaries: Dictionaries,
+ /// The internal buffer into which the data inside the messages (records and dictionaries) is read
+ pub data_buffer: Vec,
+ /// The internal buffer into which the messages themselves are read
+ pub message_buffer: Vec,
+}
+
+/// The state of an Arrow stream; both variants carry the [`ReadState`]
+enum StreamState {
+ /// The stream does not contain new chunks (and it has not been closed)
+ Waiting(ReadState),
+ /// The stream contains a new chunk, held alongside the [`ReadState`]
+ Some((ReadState, Chunk>)),
+}
+
+/// Reads the [`StreamMetadata`] of the Arrow stream asynchronously.
+/// Errors if reading fails, the declared length is negative, or the metadata cannot be deserialized.
+pub async fn read_stream_metadata_async(
+    reader: &mut R,
+) -> Result {
+    let mut meta_size: [u8; 4] = [0; 4];
+    reader.read_exact(&mut meta_size).await?;
+    // If a continuation marker is encountered, skip over it and read
+    // the size from the next four bytes.
+    if meta_size == CONTINUATION_MARKER {
+        reader.read_exact(&mut meta_size).await?;
+    }
+    // a negative length would wrap into a huge allocation through `as usize`: reject it
+    let meta_len = i32::from_le_bytes(meta_size);
+    if meta_len < 0 {
+        return Err(ArrowError::oos("The metadata length of the stream must not be negative"));
+    }
+    let mut meta_buffer = vec![0; meta_len as usize];
+    reader.read_exact(&mut meta_buffer).await?;
+    deserialize_stream_metadata(&meta_buffer)
+}
+
+/// Reads the next item, yielding `None` if the stream has been closed,
+/// or a [`StreamState`] otherwise.
+async fn maybe_next(
+ mut state: ReadState,
+) -> Result