This repository has been archived by the owner on Feb 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 222
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
3685ae8
commit 81de974
Showing
16 changed files
with
550 additions
and
31 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
use std::{fs::File, sync::Arc}; | ||
|
||
use arrow2::{ | ||
array::{Array, Int32Array}, | ||
datatypes::{Field, Schema}, | ||
error::Result, | ||
io::avro::write, | ||
record_batch::RecordBatch, | ||
}; | ||
|
||
fn main() -> Result<()> { | ||
use std::env; | ||
let args: Vec<String> = env::args().collect(); | ||
|
||
let path = &args[1]; | ||
|
||
let array = Int32Array::from(&[ | ||
Some(0), | ||
Some(1), | ||
Some(2), | ||
Some(3), | ||
Some(4), | ||
Some(5), | ||
Some(6), | ||
]); | ||
let field = Field::new("c1", array.data_type().clone(), true); | ||
let schema = Schema::new(vec![field]); | ||
|
||
let avro_schema = write::to_avro_schema(&schema)?; | ||
|
||
let mut file = File::create(path)?; | ||
|
||
let compression = None; | ||
|
||
write::write_metadata(&mut file, &avro_schema, compression)?; | ||
|
||
let serializer = write::new_serializer(&array, avro_schema.fields()[0]); | ||
let mut block = write::Block::new(array.len(), vec![]); | ||
|
||
write::serialize(&mut vec![serializer], &mut block)?; | ||
|
||
let mut compressed_block = write::CompressedBlock::default(); | ||
|
||
if let Some(compression) = compression { | ||
write::compress(&block, &mut compressed_block, compression)?; | ||
} else { | ||
compressed_block.number_of_rows = block.number_of_rows; | ||
std::mem::swap(&mut compressed_block.data, &mut block.data); | ||
} | ||
|
||
write::write_block(&mut file, &compressed_block)?; | ||
|
||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
use std::io::Write; | ||
|
||
use crate::{error::Result, io::avro::Compression}; | ||
|
||
use super::{util::zigzag_encode, SYNC_NUMBER}; | ||
|
||
/// A compressed Avro block. | ||
#[derive(Debug, Clone, Default, PartialEq)] | ||
pub struct CompressedBlock { | ||
/// The number of rows | ||
pub number_of_rows: usize, | ||
/// The compressed data | ||
pub data: Vec<u8>, | ||
} | ||
|
||
impl CompressedBlock { | ||
/// Creates a new CompressedBlock | ||
pub fn new(number_of_rows: usize, data: Vec<u8>) -> Self { | ||
Self { | ||
number_of_rows, | ||
data, | ||
} | ||
} | ||
} | ||
|
||
/// An uncompressed Avro block. | ||
#[derive(Debug, Clone, Default, PartialEq)] | ||
pub struct Block { | ||
/// The number of rows | ||
pub number_of_rows: usize, | ||
/// The uncompressed data | ||
pub data: Vec<u8>, | ||
} | ||
|
||
impl Block { | ||
/// Creates a new Block | ||
pub fn new(number_of_rows: usize, data: Vec<u8>) -> Self { | ||
Self { | ||
number_of_rows, | ||
data, | ||
} | ||
} | ||
} | ||
|
||
/// Writes a [`CompressedBlock`] to `writer` | ||
pub fn write_block<W: Write>(writer: &mut W, compressed_block: &CompressedBlock) -> Result<()> { | ||
// write size and rows | ||
zigzag_encode(compressed_block.number_of_rows as i64, writer)?; | ||
zigzag_encode(compressed_block.data.len() as i64, writer)?; | ||
|
||
writer.write_all(&compressed_block.data)?; | ||
|
||
writer.write_all(&SYNC_NUMBER)?; | ||
|
||
Ok(()) | ||
} | ||
|
||
/// Compresses an [`Block`] to a [`CompressedBlock`]. | ||
pub fn compress( | ||
block: &Block, | ||
compressed_block: &mut CompressedBlock, | ||
compression: Compression, | ||
) -> Result<()> { | ||
match compression { | ||
Compression::Deflate => todo!(), | ||
Compression::Snappy => todo!(), | ||
} | ||
compressed_block.number_of_rows = block.number_of_rows; | ||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
use std::collections::HashMap; | ||
|
||
use avro_rs::Schema; | ||
use serde_json; | ||
|
||
use crate::error::{ArrowError, Result}; | ||
|
||
use super::Compression; | ||
|
||
/// Serializes an [`Schema`] and optional [`Compression`] into an avro header. | ||
pub(crate) fn serialize_header( | ||
schema: &Schema, | ||
compression: Option<Compression>, | ||
) -> Result<HashMap<String, Vec<u8>>> { | ||
let schema = | ||
serde_json::to_string(schema).map_err(|e| ArrowError::ExternalFormat(e.to_string()))?; | ||
|
||
let mut header = HashMap::<String, Vec<u8>>::default(); | ||
|
||
header.insert("avro.schema".to_string(), schema.into_bytes()); | ||
if let Some(compression) = compression { | ||
let value = match compression { | ||
Compression::Snappy => b"snappy".to_vec(), | ||
Compression::Deflate => b"deflate".to_vec(), | ||
}; | ||
header.insert("avro.codec".to_string(), value); | ||
}; | ||
|
||
Ok(header) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
//! APIs to write to Avro format. | ||
use std::io::Write; | ||
|
||
use crate::error::Result; | ||
|
||
use super::Compression; | ||
|
||
mod header; | ||
use header::serialize_header; | ||
mod schema; | ||
pub use schema::{to_avro_schema, AvroSchema}; | ||
mod serialize; | ||
pub use serialize::{can_serialize, new_serializer, BoxSerializer}; | ||
mod block; | ||
pub use block::*; | ||
mod util; | ||
|
||
const SYNC_NUMBER: [u8; 16] = [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4]; | ||
|
||
/// Writes Avro's metadata to `writer`. | ||
pub fn write_metadata<W: std::io::Write>( | ||
writer: &mut W, | ||
schema: &AvroSchema, | ||
compression: Option<Compression>, | ||
) -> Result<()> { | ||
// * Four bytes, ASCII 'O', 'b', 'j', followed by 1. | ||
let avro_magic = [b'O', b'b', b'j', 1u8]; | ||
writer.write_all(&avro_magic)?; | ||
|
||
// * file metadata, including the schema. | ||
let header = serialize_header(&schema.0, compression)?; | ||
|
||
util::zigzag_encode(header.len() as i64, writer)?; | ||
for (name, item) in header { | ||
util::write_binary(name.as_bytes(), writer)?; | ||
util::write_binary(&item, writer)?; | ||
} | ||
writer.write_all(&[0])?; | ||
|
||
// The 16-byte, randomly-generated sync marker for this file. | ||
writer.write_all(&SYNC_NUMBER)?; | ||
|
||
Ok(()) | ||
} | ||
|
||
/// consumes a set of [`BoxSerializer`] into an [`Block`]. | ||
/// # Panics | ||
/// Panics iff the number of items in any of the serializers is not equal to the number of rows | ||
/// declared in the `block`. | ||
pub fn serialize<'a>(serializers: &mut [BoxSerializer<'a>], block: &mut Block) -> Result<()> { | ||
let Block { | ||
data, | ||
number_of_rows, | ||
} = block; | ||
|
||
data.clear(); // restart it | ||
|
||
// _the_ transpose (columns -> rows) | ||
for _ in 0..*number_of_rows { | ||
for serializer in &mut *serializers { | ||
let item_data = serializer.next().unwrap(); | ||
data.write_all(item_data)?; | ||
} | ||
} | ||
Ok(()) | ||
} |
Oops, something went wrong.