From f0455d12ddcb174f1f8d2bbfd5874f7b708c9a74 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 4 Oct 2023 12:42:49 +0100 Subject: [PATCH] Support Parsing Avro File Headers (#4888) * Add arrow-avro * Add HeaderDecoder * Add schema parsing * Add BlockDecoder * Further docs * Apply suggestions from code review Co-authored-by: Andrew Lamb * Review feedback --------- Co-authored-by: Andrew Lamb --- .github/workflows/arrow.yml | 5 + .github/workflows/dev_pr/labeler.yml | 1 + .github/workflows/integration.yml | 1 + .github/workflows/miri.yaml | 1 + .github/workflows/parquet.yml | 1 + Cargo.toml | 1 + arrow-avro/Cargo.toml | 46 +++ arrow-avro/src/compression.rs | 32 ++ arrow-avro/src/lib.rs | 28 ++ arrow-avro/src/reader/block.rs | 141 ++++++++ arrow-avro/src/reader/header.rs | 289 ++++++++++++++++ arrow-avro/src/reader/mod.rs | 92 +++++ arrow-avro/src/reader/vlq.rs | 46 +++ arrow-avro/src/schema.rs | 484 +++++++++++++++++++++++++++ dev/release/README.md | 1 + 15 files changed, 1169 insertions(+) create mode 100644 arrow-avro/Cargo.toml create mode 100644 arrow-avro/src/compression.rs create mode 100644 arrow-avro/src/lib.rs create mode 100644 arrow-avro/src/reader/block.rs create mode 100644 arrow-avro/src/reader/header.rs create mode 100644 arrow-avro/src/reader/mod.rs create mode 100644 arrow-avro/src/reader/vlq.rs create mode 100644 arrow-avro/src/schema.rs diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index cde931c3c6b8..da56c23b5cd9 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -39,6 +39,7 @@ on: - arrow-integration-test/** - arrow-ipc/** - arrow-json/** + - arrow-avro/** - arrow-ord/** - arrow-row/** - arrow-schema/** @@ -78,6 +79,8 @@ jobs: run: cargo test -p arrow-csv --all-features - name: Test arrow-json with all features run: cargo test -p arrow-json --all-features + - name: Test arrow-avro with all features + run: cargo test -p arrow-avro --all-features - name: Test arrow-string with all features run: cargo test -p arrow-string --all-features - name: Test arrow-ord with all features @@ -202,6 +205,8 @@ jobs: run: cargo clippy -p arrow-csv --all-targets --all-features -- -D warnings - name: Clippy arrow-json with all features run: cargo clippy -p arrow-json --all-targets --all-features -- -D warnings + - name: Clippy arrow-avro with all features + run: cargo clippy -p arrow-avro --all-targets --all-features -- -D warnings - name: Clippy arrow-string with all features run: cargo clippy -p arrow-string --all-targets --all-features -- -D warnings - name: Clippy arrow-ord with all features diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index e5b86e8bcdf0..ea5873081f18 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -27,6 +27,7 @@ arrow: - arrow-integration-testing/**/* - arrow-ipc/**/* - arrow-json/**/* + - arrow-avro/**/* - arrow-ord/**/* - arrow-row/**/* - arrow-schema/**/* diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index aaf39d22bbce..eca51a80c164 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -38,6 +38,7 @@ on: - arrow-integration-testing/** - arrow-ipc/** - arrow-json/** + - arrow-avro/** - arrow-ord/** - arrow-pyarrow-integration-testing/** - arrow-schema/** diff --git a/.github/workflows/miri.yaml b/.github/workflows/miri.yaml index e3704d036aca..19b432121b6f 100644 --- a/.github/workflows/miri.yaml +++ b/.github/workflows/miri.yaml @@ -36,6 +36,7 @@ on: - arrow-data/** - arrow-ipc/** - arrow-json/** + - arrow-avro/** - arrow-schema/** - arrow-select/** - arrow-string/** diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index 7a649e16b1ec..d664a0dc0730 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -40,6 +40,7 @@ on: - arrow-ipc/** - arrow-csv/** - arrow-json/** + - arrow-avro/** - parquet/** - .github/** diff --git a/Cargo.toml b/Cargo.toml index 936935ec7e3d..d874e335eeae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ members = [ "arrow", "arrow-arith", "arrow-array", + "arrow-avro", "arrow-buffer", "arrow-cast", "arrow-csv", diff --git a/arrow-avro/Cargo.toml b/arrow-avro/Cargo.toml new file mode 100644 index 000000000000..9575874c41d2 --- /dev/null +++ b/arrow-avro/Cargo.toml @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "arrow-avro" +version = { workspace = true } +description = "Support for parsing Avro format into the Arrow format" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } + +[lib] +name = "arrow_avro" +path = "src/lib.rs" +bench = false + +[dependencies] +arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-cast = { workspace = true } +arrow-data = { workspace = true } +arrow-schema = { workspace = true } +serde_json = { version = "1.0", default-features = false, features = ["std"] } +serde = { version = "1.0.188", features = ["derive"] } + +[dev-dependencies] + diff --git a/arrow-avro/src/compression.rs b/arrow-avro/src/compression.rs new file mode 100644 index 000000000000..a1a44fc22b68 --- /dev/null +++ b/arrow-avro/src/compression.rs @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use serde::{Deserialize, Serialize}; + +/// The metadata key used for storing the JSON encoded [`CompressionCodec`] +pub const CODEC_METADATA_KEY: &str = "avro.codec"; + +#[derive(Debug, Copy, Clone, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum CompressionCodec { + Null, + Deflate, + BZip2, + Snappy, + XZ, + ZStandard, +} diff --git a/arrow-avro/src/lib.rs b/arrow-avro/src/lib.rs new file mode 100644 index 000000000000..e134d9d798f2 --- /dev/null +++ b/arrow-avro/src/lib.rs @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Convert data to / from the [Apache Arrow] memory format and [Apache Avro] +//! +//! [Apache Arrow]: https://arrow.apache.org +//! [Apache Avro]: https://avro.apache.org/ + +#![allow(unused)] // Temporary + +pub mod reader; +mod schema; + +mod compression; diff --git a/arrow-avro/src/reader/block.rs b/arrow-avro/src/reader/block.rs new file mode 100644 index 000000000000..479f0ef90909 --- /dev/null +++ b/arrow-avro/src/reader/block.rs @@ -0,0 +1,141 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Decoder for [`Block`] + +use crate::reader::vlq::VLQDecoder; +use arrow_schema::ArrowError; + +/// A file data block +/// +/// +#[derive(Debug, Default)] +pub struct Block { + /// The number of objects in this block + pub count: usize, + /// The serialized objects within this block + pub data: Vec, + /// The sync marker + pub sync: [u8; 16], +} + +/// A decoder for [`Block`] +#[derive(Debug)] +pub struct BlockDecoder { + state: BlockDecoderState, + in_progress: Block, + vlq_decoder: VLQDecoder, + bytes_remaining: usize, +} + +#[derive(Debug)] +enum BlockDecoderState { + Count, + Size, + Data, + Sync, + Finished, +} + +impl Default for BlockDecoder { + fn default() -> Self { + Self { + state: BlockDecoderState::Count, + in_progress: Default::default(), + vlq_decoder: Default::default(), + bytes_remaining: 0, + } + } +} + +impl BlockDecoder { + /// Parse [`Block`] from `buf`, returning the number of bytes read + /// + /// This method can be called multiple times with consecutive chunks of data, allowing + /// integration with chunked IO systems like [`BufRead::fill_buf`] + /// + /// All errors should be considered fatal, and decoding aborted + /// + /// Once an entire [`Block`] has been decoded this method will not read any further + /// input bytes, until [`Self::flush`] is called. Afterwards [`Self::decode`] + /// can then be used again to read the next block, if any + /// + /// [`BufRead::fill_buf`]: std::io::BufRead::fill_buf + pub fn decode(&mut self, mut buf: &[u8]) -> Result { + let max_read = buf.len(); + while !buf.is_empty() { + match self.state { + BlockDecoderState::Count => { + if let Some(c) = self.vlq_decoder.long(&mut buf) { + self.in_progress.count = c.try_into().map_err(|_| { + ArrowError::ParseError(format!( + "Block count cannot be negative, got {c}" + )) + })?; + + self.state = BlockDecoderState::Size; + } + } + BlockDecoderState::Size => { + if let Some(c) = self.vlq_decoder.long(&mut buf) { + self.bytes_remaining = c.try_into().map_err(|_| { + ArrowError::ParseError(format!( + "Block size cannot be negative, got {c}" + )) + })?; + + self.in_progress.data.reserve(self.bytes_remaining); + self.state = BlockDecoderState::Data; + } + } + BlockDecoderState::Data => { + let to_read = self.bytes_remaining.min(buf.len()); + self.in_progress.data.extend_from_slice(&buf[..to_read]); + buf = &buf[to_read..]; + self.bytes_remaining -= to_read; + if self.bytes_remaining == 0 { + self.bytes_remaining = 16; + self.state = BlockDecoderState::Sync; + } + } + BlockDecoderState::Sync => { + let to_decode = buf.len().min(self.bytes_remaining); + let write = &mut self.in_progress.sync[16 - to_decode..]; + write[..to_decode].copy_from_slice(&buf[..to_decode]); + self.bytes_remaining -= to_decode; + buf = &buf[to_decode..]; + if self.bytes_remaining == 0 { + self.state = BlockDecoderState::Finished; + } + } + BlockDecoderState::Finished => return Ok(max_read - buf.len()), + } + } + Ok(max_read) + } + + /// Flush this decoder returning the parsed [`Block`] if any + pub fn flush(&mut self) -> Option { + match self.state { + BlockDecoderState::Finished => { + self.state = BlockDecoderState::Count; + Some(std::mem::take(&mut self.in_progress)) + } + _ => None, + } + } +} diff --git a/arrow-avro/src/reader/header.rs b/arrow-avro/src/reader/header.rs new file mode 100644 index 000000000000..92db8b1dc76d --- /dev/null +++ b/arrow-avro/src/reader/header.rs @@ -0,0 +1,289 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Decoder for [`Header`] + +use crate::reader::vlq::VLQDecoder; +use crate::schema::Schema; +use arrow_schema::ArrowError; + +#[derive(Debug)] +enum HeaderDecoderState { + /// Decoding the [`MAGIC`] prefix + Magic, + /// Decoding a block count + BlockCount, + /// Decoding a block byte length + BlockLen, + /// Decoding a key length + KeyLen, + /// Decoding a key string + Key, + /// Decoding a value length + ValueLen, + /// Decoding a value payload + Value, + /// Decoding sync marker + Sync, + /// Finished decoding + Finished, +} + +/// A decoded header for an [Object Container File](https://avro.apache.org/docs/1.11.1/specification/#object-container-files) +#[derive(Debug, Clone)] +pub struct Header { + meta_offsets: Vec, + meta_buf: Vec, + sync: [u8; 16], +} + +impl Header { + /// Returns an iterator over the meta keys in this header + pub fn metadata(&self) -> impl Iterator { + let mut last = 0; + self.meta_offsets.windows(2).map(move |w| { + let start = last; + last = w[1]; + (&self.meta_buf[start..w[0]], &self.meta_buf[w[0]..w[1]]) + }) + } + + /// Returns the value for a given metadata key if present + pub fn get(&self, key: impl AsRef<[u8]>) -> Option<&[u8]> { + self.metadata() + .find_map(|(k, v)| (k == key.as_ref()).then_some(v)) + } + + /// Returns the sync token for this file + pub fn sync(&self) -> [u8; 16] { + self.sync + } +} + +/// A decoder for [`Header`] +/// +/// The avro file format does not encode the length of the header, and so it +/// is necessary to provide a push-based decoder that can be used with streams +#[derive(Debug)] +pub struct HeaderDecoder { + state: HeaderDecoderState, + vlq_decoder: VLQDecoder, + + /// The end offsets of strings in `meta_buf` + meta_offsets: Vec, + /// The raw binary data of the metadata map + meta_buf: Vec, + + /// The decoded sync marker + sync_marker: [u8; 16], + + /// The number of remaining tuples in the current block + tuples_remaining: usize, + /// The number of bytes remaining in the current string/bytes payload + bytes_remaining: usize, +} + +impl Default for HeaderDecoder { + fn default() -> Self { + Self { + state: HeaderDecoderState::Magic, + meta_offsets: vec![], + meta_buf: vec![], + sync_marker: [0; 16], + vlq_decoder: Default::default(), + tuples_remaining: 0, + bytes_remaining: MAGIC.len(), + } + } +} + +const MAGIC: &[u8; 4] = b"Obj\x01"; + +impl HeaderDecoder { + /// Parse [`Header`] from `buf`, returning the number of bytes read + /// + /// This method can be called multiple times with consecutive chunks of data, allowing + /// integration with chunked IO systems like [`BufRead::fill_buf`] + /// + /// All errors should be considered fatal, and decoding aborted + /// + /// Once the entire [`Header`] has been decoded this method will not read any further + /// input bytes, and the header can be obtained with [`Self::flush`] + /// + /// [`BufRead::fill_buf`]: std::io::BufRead::fill_buf + pub fn decode(&mut self, mut buf: &[u8]) -> Result { + let max_read = buf.len(); + while !buf.is_empty() { + match self.state { + HeaderDecoderState::Magic => { + let remaining = &MAGIC[MAGIC.len() - self.bytes_remaining..]; + let to_decode = buf.len().min(remaining.len()); + if !buf.starts_with(&remaining[..to_decode]) { + return Err(ArrowError::ParseError( + "Incorrect avro magic".to_string(), + )); + } + self.bytes_remaining -= to_decode; + buf = &buf[to_decode..]; + if self.bytes_remaining == 0 { + self.state = HeaderDecoderState::BlockCount; + } + } + HeaderDecoderState::BlockCount => { + if let Some(block_count) = self.vlq_decoder.long(&mut buf) { + match block_count.try_into() { + Ok(0) => { + self.state = HeaderDecoderState::Sync; + self.bytes_remaining = 16; + } + Ok(remaining) => { + self.tuples_remaining = remaining; + self.state = HeaderDecoderState::KeyLen; + } + Err(_) => { + self.tuples_remaining = block_count.unsigned_abs() as _; + self.state = HeaderDecoderState::BlockLen; + } + } + } + } + HeaderDecoderState::BlockLen => { + if self.vlq_decoder.long(&mut buf).is_some() { + self.state = HeaderDecoderState::KeyLen + } + } + HeaderDecoderState::Key => { + let to_read = self.bytes_remaining.min(buf.len()); + self.meta_buf.extend_from_slice(&buf[..to_read]); + self.bytes_remaining -= to_read; + buf = &buf[to_read..]; + if self.bytes_remaining == 0 { + self.meta_offsets.push(self.meta_buf.len()); + self.state = HeaderDecoderState::ValueLen; + } + } + HeaderDecoderState::Value => { + let to_read = self.bytes_remaining.min(buf.len()); + self.meta_buf.extend_from_slice(&buf[..to_read]); + self.bytes_remaining -= to_read; + buf = &buf[to_read..]; + if self.bytes_remaining == 0 { + self.meta_offsets.push(self.meta_buf.len()); + + self.tuples_remaining -= 1; + match self.tuples_remaining { + 0 => self.state = HeaderDecoderState::BlockCount, + _ => self.state = HeaderDecoderState::KeyLen, + } + } + } + HeaderDecoderState::KeyLen => { + if let Some(len) = self.vlq_decoder.long(&mut buf) { + self.bytes_remaining = len as _; + self.state = HeaderDecoderState::Key; + } + } + HeaderDecoderState::ValueLen => { + if let Some(len) = self.vlq_decoder.long(&mut buf) { + self.bytes_remaining = len as _; + self.state = HeaderDecoderState::Value; + } + } + HeaderDecoderState::Sync => { + let to_decode = buf.len().min(self.bytes_remaining); + let write = &mut self.sync_marker[16 - to_decode..]; + write[..to_decode].copy_from_slice(&buf[..to_decode]); + self.bytes_remaining -= to_decode; + buf = &buf[to_decode..]; + if self.bytes_remaining == 0 { + self.state = HeaderDecoderState::Finished; + } + } + HeaderDecoderState::Finished => return Ok(max_read - buf.len()), + } + } + Ok(max_read) + } + + /// Flush this decoder returning the parsed [`Header`] if any + pub fn flush(&mut self) -> Option
{ + match self.state { + HeaderDecoderState::Finished => { + self.state = HeaderDecoderState::Magic; + Some(Header { + meta_offsets: std::mem::take(&mut self.meta_offsets), + meta_buf: std::mem::take(&mut self.meta_buf), + sync: self.sync_marker, + }) + } + _ => None, + } + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::reader::read_header; + use crate::schema::SCHEMA_METADATA_KEY; + use std::fs::File; + use std::io::{BufRead, BufReader}; + + #[test] + fn test_header_decode() { + let mut decoder = HeaderDecoder::default(); + for m in MAGIC { + decoder.decode(std::slice::from_ref(m)).unwrap(); + } + + let mut decoder = HeaderDecoder::default(); + assert_eq!(decoder.decode(MAGIC).unwrap(), 4); + + let mut decoder = HeaderDecoder::default(); + decoder.decode(b"Ob").unwrap(); + let err = decoder.decode(b"s").unwrap_err().to_string(); + assert_eq!(err, "Parser error: Incorrect avro magic"); + } + + fn decode_file(file: &str) -> Header { + let file = File::open(file).unwrap(); + read_header(BufReader::with_capacity(100, file)).unwrap() + } + + #[test] + fn test_header() { + let header = decode_file("../testing/data/avro/alltypes_plain.avro"); + let schema_json = header.get(SCHEMA_METADATA_KEY).unwrap(); + let expected = br#"{"type":"record","name":"topLevelRecord","fields":[{"name":"id","type":["int","null"]},{"name":"bool_col","type":["boolean","null"]},{"name":"tinyint_col","type":["int","null"]},{"name":"smallint_col","type":["int","null"]},{"name":"int_col","type":["int","null"]},{"name":"bigint_col","type":["long","null"]},{"name":"float_col","type":["float","null"]},{"name":"double_col","type":["double","null"]},{"name":"date_string_col","type":["bytes","null"]},{"name":"string_col","type":["bytes","null"]},{"name":"timestamp_col","type":[{"type":"long","logicalType":"timestamp-micros"},"null"]}]}"#; + assert_eq!(schema_json, expected); + let _schema: Schema<'_> = serde_json::from_slice(schema_json).unwrap(); + assert_eq!( + u128::from_le_bytes(header.sync()), + 226966037233754408753420635932530907102 + ); + + let header = decode_file("../testing/data/avro/fixed_length_decimal.avro"); + let schema_json = header.get(SCHEMA_METADATA_KEY).unwrap(); + let expected = br#"{"type":"record","name":"topLevelRecord","fields":[{"name":"value","type":[{"type":"fixed","name":"fixed","namespace":"topLevelRecord.value","size":11,"logicalType":"decimal","precision":25,"scale":2},"null"]}]}"#; + assert_eq!(schema_json, expected); + let _schema: Schema<'_> = serde_json::from_slice(schema_json).unwrap(); + assert_eq!( + u128::from_le_bytes(header.sync()), + 325166208089902833952788552656412487328 + ); + } +} diff --git a/arrow-avro/src/reader/mod.rs b/arrow-avro/src/reader/mod.rs new file mode 100644 index 000000000000..a42011e3b2ad --- /dev/null +++ b/arrow-avro/src/reader/mod.rs @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Read Avro data to Arrow + +use crate::reader::block::{Block, BlockDecoder}; +use crate::reader::header::{Header, HeaderDecoder}; +use arrow_schema::ArrowError; +use std::io::BufRead; + +mod header; + +mod block; + +mod vlq; + +/// Read a [`Header`] from the provided [`BufRead`] +fn read_header(mut reader: R) -> Result { + let mut decoder = HeaderDecoder::default(); + loop { + let buf = reader.fill_buf()?; + if buf.is_empty() { + break; + } + let read = buf.len(); + let decoded = decoder.decode(buf)?; + reader.consume(decoded); + if decoded != read { + break; + } + } + + decoder + .flush() + .ok_or_else(|| ArrowError::ParseError("Unexpected EOF".to_string())) +} + +/// Return an iterator of [`Block`] from the provided [`BufRead`] +fn read_blocks( + mut reader: R, +) -> impl Iterator> { + let mut decoder = BlockDecoder::default(); + + let mut try_next = move || { + loop { + let buf = reader.fill_buf()?; + if buf.is_empty() { + break; + } + let read = buf.len(); + let decoded = decoder.decode(buf)?; + reader.consume(decoded); + if decoded != read { + break; + } + } + Ok(decoder.flush()) + }; + std::iter::from_fn(move || try_next().transpose()) +} + +#[cfg(test)] +mod test { + use crate::reader::{read_blocks, read_header}; + use std::fs::File; + use std::io::BufReader; + + #[test] + fn test_mux() { + let file = File::open("../testing/data/avro/alltypes_plain.avro").unwrap(); + let mut reader = BufReader::new(file); + let header = read_header(&mut reader).unwrap(); + for result in read_blocks(reader) { + let block = result.unwrap(); + assert_eq!(block.sync, header.sync()); + } + } +} diff --git a/arrow-avro/src/reader/vlq.rs b/arrow-avro/src/reader/vlq.rs new file mode 100644 index 000000000000..80f1c60eec7d --- /dev/null +++ b/arrow-avro/src/reader/vlq.rs @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// Decoder for zig-zag encoded variable length (VLW) integers +/// +/// See also: +/// +/// +#[derive(Debug, Default)] +pub struct VLQDecoder { + /// Scratch space for decoding VLQ integers + in_progress: u64, + shift: u32, +} + +impl VLQDecoder { + /// Decode a signed long from `buf` + pub fn long(&mut self, buf: &mut &[u8]) -> Option { + while let Some(byte) = buf.first().copied() { + *buf = &buf[1..]; + self.in_progress |= ((byte & 0x7F) as u64) << self.shift; + self.shift += 7; + if byte & 0x80 == 0 { + let val = self.in_progress; + self.in_progress = 0; + self.shift = 0; + return Some((val >> 1) as i64 ^ -((val & 1) as i64)); + } + } + None + } +} diff --git a/arrow-avro/src/schema.rs b/arrow-avro/src/schema.rs new file mode 100644 index 000000000000..839ba65bd5fc --- /dev/null +++ b/arrow-avro/src/schema.rs @@ -0,0 +1,484 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +/// The metadata key used for storing the JSON encoded [`Schema`] +pub const SCHEMA_METADATA_KEY: &str = "avro.schema"; + +/// Either a [`PrimitiveType`] or a reference to a previously defined named type +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(untagged)] +pub enum TypeName<'a> { + Primitive(PrimitiveType), + Ref(&'a str), +} + +/// A primitive type +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum PrimitiveType { + Null, + Boolean, + Int, + Long, + Float, + Double, + Bytes, + String, +} + +/// Additional attributes within a [`Schema`] +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Default, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct Attributes<'a> { + /// A logical type name + /// + /// + #[serde(default)] + pub logical_type: Option<&'a str>, + + /// Additional JSON attributes + #[serde(flatten)] + pub additional: HashMap<&'a str, serde_json::Value>, +} + +/// A type definition that is not a variant of [`ComplexType`] +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct Type<'a> { + #[serde(borrow)] + pub r#type: TypeName<'a>, + #[serde(flatten)] + pub attributes: Attributes<'a>, +} + +/// An Avro schema +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(untagged)] +pub enum Schema<'a> { + #[serde(borrow)] + TypeName(TypeName<'a>), + #[serde(borrow)] + Union(Vec>), + #[serde(borrow)] + Complex(ComplexType<'a>), + #[serde(borrow)] + Type(Type<'a>), +} + +/// A complex type +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "camelCase")] +pub enum ComplexType<'a> { + #[serde(borrow)] + Union(Vec>), + #[serde(borrow)] + Record(Record<'a>), + #[serde(borrow)] + Enum(Enum<'a>), + #[serde(borrow)] + Array(Array<'a>), + #[serde(borrow)] + Map(Map<'a>), + #[serde(borrow)] + Fixed(Fixed<'a>), +} + +/// A record +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Record<'a> { + #[serde(borrow)] + pub name: &'a str, + #[serde(borrow, default)] + pub namespace: Option<&'a str>, + #[serde(borrow, default)] + pub doc: Option<&'a str>, + #[serde(borrow, default)] + pub aliases: Vec<&'a str>, + #[serde(borrow)] + pub fields: Vec>, + #[serde(flatten)] + pub attributes: Attributes<'a>, +} + +/// A field within a [`Record`] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Field<'a> { + #[serde(borrow)] + pub name: &'a str, + #[serde(borrow, default)] + pub doc: Option<&'a str>, + #[serde(borrow)] + pub r#type: Schema<'a>, + #[serde(borrow, default)] + pub default: Option<&'a str>, +} + +/// An enumeration +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Enum<'a> { + #[serde(borrow)] + pub name: &'a str, + #[serde(borrow, default)] + pub namespace: Option<&'a str>, + #[serde(borrow, default)] + pub doc: Option<&'a str>, + #[serde(borrow, default)] + pub aliases: Vec<&'a str>, + #[serde(borrow)] + pub symbols: Vec<&'a str>, + #[serde(borrow, default)] + pub default: Option<&'a str>, + #[serde(flatten)] + pub attributes: Attributes<'a>, +} + +/// An array +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Array<'a> { + #[serde(borrow)] + pub items: Box>, + #[serde(flatten)] + pub attributes: Attributes<'a>, +} + +/// A map +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Map<'a> { + #[serde(borrow)] + pub values: Box>, + #[serde(flatten)] + pub attributes: Attributes<'a>, +} + +/// A fixed length binary array +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Fixed<'a> { + #[serde(borrow)] + pub name: &'a str, + #[serde(borrow, default)] + pub namespace: Option<&'a str>, + #[serde(borrow, default)] + pub aliases: Vec<&'a str>, + pub size: usize, + #[serde(flatten)] + pub attributes: Attributes<'a>, +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + #[test] + fn test_deserialize() { + let t: Schema = serde_json::from_str("\"string\"").unwrap(); + assert_eq!( + t, + Schema::TypeName(TypeName::Primitive(PrimitiveType::String)) + ); + + let t: Schema = serde_json::from_str("[\"int\", \"null\"]").unwrap(); + assert_eq!( + t, + Schema::Union(vec![ + Schema::TypeName(TypeName::Primitive(PrimitiveType::Int)), + Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + ]) + ); + + let t: Type = serde_json::from_str( + r#"{ + "type":"long", + "logicalType":"timestamp-micros" + }"#, + ) + .unwrap(); + + let timestamp = Type { + r#type: TypeName::Primitive(PrimitiveType::Long), + attributes: Attributes { + logical_type: Some("timestamp-micros"), + additional: Default::default(), + }, + }; + + assert_eq!(t, timestamp); + + let t: ComplexType = serde_json::from_str( + r#"{ + "type":"fixed", + "name":"fixed", + "namespace":"topLevelRecord.value", + "size":11, + "logicalType":"decimal", + "precision":25, + "scale":2 + }"#, + ) + .unwrap(); + + let decimal = ComplexType::Fixed(Fixed { + name: "fixed", + namespace: Some("topLevelRecord.value"), + aliases: vec![], + size: 11, + attributes: Attributes { + logical_type: Some("decimal"), + additional: vec![("precision", json!(25)), ("scale", json!(2))] + .into_iter() + .collect(), + }, + }); + + assert_eq!(t, decimal); + + let schema: Schema = serde_json::from_str( + r#"{ + "type":"record", + "name":"topLevelRecord", + "fields":[ + { + "name":"value", + "type":[ + { + "type":"fixed", + "name":"fixed", + "namespace":"topLevelRecord.value", + "size":11, + "logicalType":"decimal", + "precision":25, + "scale":2 + }, + "null" + ] + } + ] + }"#, + ) + .unwrap(); + + assert_eq!( + schema, + Schema::Complex(ComplexType::Record(Record { + name: "topLevelRecord", + namespace: None, + doc: None, + aliases: vec![], + fields: vec![Field { + name: "value", + doc: None, + r#type: Schema::Union(vec![ + Schema::Complex(decimal), + Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + ]), + default: None, + },], + attributes: Default::default(), + })) + ); + + let schema: Schema = serde_json::from_str( + r#"{ + "type": "record", + "name": "LongList", + "aliases": ["LinkedLongs"], + "fields" : [ + {"name": "value", "type": "long"}, + {"name": "next", "type": ["null", "LongList"]} + ] + }"#, + ) + .unwrap(); + + assert_eq!( + schema, + Schema::Complex(ComplexType::Record(Record { + name: "LongList", + namespace: None, + doc: None, + aliases: vec!["LinkedLongs"], + fields: vec![ + Field { + name: "value", + doc: None, + r#type: Schema::TypeName(TypeName::Primitive( + PrimitiveType::Long + )), + default: None, + }, + Field { + name: "next", + doc: None, + r#type: Schema::Union(vec![ + Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + Schema::TypeName(TypeName::Ref("LongList")), + ]), + default: None, + } + ], + attributes: Attributes::default(), + })) + ); + + let schema: Schema = serde_json::from_str( + r#"{ + "type":"record", + "name":"topLevelRecord", + "fields":[ + { + "name":"id", + "type":[ + "int", + "null" + ] + }, + { + "name":"timestamp_col", + "type":[ + { + "type":"long", + "logicalType":"timestamp-micros" + }, + "null" + ] + } + ] + }"#, + ) + .unwrap(); + + assert_eq!( + schema, + Schema::Complex(ComplexType::Record(Record { + name: "topLevelRecord", + namespace: None, + doc: None, + aliases: vec![], + fields: vec![ + Field { + name: "id", + doc: None, + r#type: Schema::Union(vec![ + Schema::TypeName(TypeName::Primitive(PrimitiveType::Int)), + Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + ]), + default: None, + }, + Field { + name: "timestamp_col", + doc: None, + r#type: Schema::Union(vec![ + Schema::Type(timestamp), + Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + ]), + default: None, + } + ], + attributes: Default::default(), + })) + ); + + let schema: Schema = serde_json::from_str( + r#"{ + "type": "record", + "name": "HandshakeRequest", "namespace":"org.apache.avro.ipc", + "fields": [ + {"name": "clientHash", + "type": {"type": "fixed", "name": "MD5", "size": 16}}, + {"name": "clientProtocol", "type": ["null", "string"]}, + {"name": "serverHash", "type": "MD5"}, + {"name": "meta", "type": ["null", {"type": "map", "values": "bytes"}]} + ] + }"#, + ) + .unwrap(); + + assert_eq!( + schema, + Schema::Complex(ComplexType::Record(Record { + name: "HandshakeRequest", + namespace: Some("org.apache.avro.ipc"), + doc: None, + aliases: vec![], + fields: vec![ + Field { + name: "clientHash", + doc: None, + r#type: Schema::Complex(ComplexType::Fixed(Fixed { + name: "MD5", + namespace: None, + aliases: vec![], + size: 16, + attributes: Default::default(), + })), + default: None, + }, + Field { + name: "clientProtocol", + doc: None, + r#type: Schema::Union(vec![ + Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + Schema::TypeName(TypeName::Primitive(PrimitiveType::String)), + ]), + default: None, + }, + Field { + name: "serverHash", + doc: None, + r#type: Schema::TypeName(TypeName::Ref("MD5")), + default: None, + }, + Field { + name: "meta", + doc: None, + r#type: Schema::Union(vec![ + Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + Schema::Complex(ComplexType::Map(Map { + values: Box::new(Schema::TypeName(TypeName::Primitive( + PrimitiveType::Bytes + ))), + attributes: Default::default(), + })), + ]), + default: None, + } + ], + attributes: Default::default(), + })) + ); + } +} diff --git a/dev/release/README.md b/dev/release/README.md index 30b3a4a8a569..177f33bcbb4d 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -258,6 +258,7 @@ Rust Arrow Crates: (cd arrow-ipc && cargo publish) (cd arrow-csv && cargo publish) (cd arrow-json && cargo publish) +(cd arrow-avro && cargo publish) (cd arrow-ord && cargo publish) (cd arrow-arith && cargo publish) (cd arrow-string && cargo publish)