This repository has been archived by the owner on Feb 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 222
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added example showing parallel writes to parquet. (#436)
- Loading branch information
1 parent
93c56d7
commit 688e979
Showing
4 changed files
with
118 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
*.parquet |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
[package] | ||
name = "parquet_write_parallel" | ||
version = "0.1.0" | ||
edition = "2018" | ||
|
||
[dependencies] | ||
arrow2 = { path = "../../", default-features = false, features = ["io_parquet", "io_parquet_compression"] } | ||
rayon = { version = "1", default-features = false } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
//! Example demonstrating how to write to parquet in parallel. | ||
use std::sync::Arc; | ||
|
||
use rayon::prelude::*; | ||
|
||
use arrow2::{ | ||
array::*, datatypes::PhysicalType, error::Result, io::parquet::write::*, | ||
record_batch::RecordBatch, | ||
}; | ||
|
||
fn parallel_write(path: &str, batch: &RecordBatch) -> Result<()> { | ||
let options = WriteOptions { | ||
write_statistics: true, | ||
compression: Compression::Snappy, | ||
version: Version::V2, | ||
}; | ||
let encodings = batch.schema().fields().par_iter().map(|field| { | ||
match field.data_type().to_physical_type() { | ||
// let's be fancy and use delta-encoding for binary fields | ||
PhysicalType::Binary | ||
| PhysicalType::LargeBinary | ||
| PhysicalType::Utf8 | ||
| PhysicalType::LargeUtf8 => Encoding::DeltaLengthByteArray, | ||
// remaining is plain | ||
_ => Encoding::Plain, | ||
} | ||
}); | ||
|
||
let parquet_schema = to_parquet_schema(batch.schema())?; | ||
|
||
// write batch to pages; parallelized by rayon | ||
let columns = batch | ||
.columns() | ||
.par_iter() | ||
.zip(parquet_schema.columns().to_vec().into_par_iter()) | ||
.zip(encodings) | ||
.map(|((array, descriptor), encoding)| { | ||
let array = array.clone(); | ||
|
||
// create encoded and compressed pages this column | ||
Ok(array_to_pages(array, descriptor, options, encoding)?.collect::<Vec<_>>()) | ||
}) | ||
.collect::<Result<Vec<_>>>()?; | ||
|
||
// create the iterator over groups (one in this case) | ||
// (for more batches, create the iterator from them here) | ||
let row_groups = std::iter::once(Result::Ok(DynIter::new( | ||
columns | ||
.into_iter() | ||
.map(|column| Ok(DynIter::new(column.into_iter()))), | ||
))); | ||
|
||
// Create a new empty file | ||
let mut file = std::fs::File::create(path)?; | ||
|
||
// Write the file. | ||
let _file_size = write_file( | ||
&mut file, | ||
row_groups, | ||
batch.schema(), | ||
parquet_schema, | ||
options, | ||
None, | ||
)?; | ||
|
||
Ok(()) | ||
} | ||
|
||
fn create_batch(size: usize) -> Result<RecordBatch> { | ||
let c1: Int32Array = (0..size) | ||
.map(|x| if x % 9 == 0 { None } else { Some(x as i32) }) | ||
.collect(); | ||
let c2: Utf8Array<i32> = (0..size) | ||
.map(|x| { | ||
if x % 8 == 0 { | ||
None | ||
} else { | ||
Some(x.to_string()) | ||
} | ||
}) | ||
.collect(); | ||
|
||
RecordBatch::try_from_iter([ | ||
("c1", Arc::new(c1) as Arc<dyn Array>), | ||
("c2", Arc::new(c2) as Arc<dyn Array>), | ||
]) | ||
} | ||
|
||
fn main() -> Result<()> { | ||
let batch = create_batch(10_000_000)?; | ||
|
||
parallel_write("example.parquet", &batch) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters