Sampling tests for parquet round trips (#1519)
AnIrishDuck authored Jul 27, 2023
1 parent 031bc7b commit f175c1c
Showing 3 changed files with 133 additions and 0 deletions.
11 changes: 11 additions & 0 deletions Cargo.toml
@@ -129,6 +129,14 @@ proptest = { version = "1", default_features = false, features = ["std"] }
avro-rs = { version = "0.13", features = ["snappy"] }
# use for flaky testing
rand = "0.8"
# use for generating and testing random data samples
sample-arrow2 = "0.1"
sample-std = "0.1"
sample-test = "0.1"

# hack: patch the crates.io arrow2 that sample-arrow2 depends on to this
# local copy, so the arrays it generates use this crate's types
[patch.crates-io]
arrow2 = { path = "." }

[package.metadata.docs.rs]
features = ["full"]
@@ -188,6 +196,9 @@ io_parquet_compression = [
"io_parquet_brotli"
]

# sample testing of generated arrow data
io_parquet_sample_test = ["io_parquet"]

# compression backends
io_parquet_zstd = ["parquet2/zstd"]
io_parquet_snappy = ["parquet2/snappy"]
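Because the new suite is gated behind the io_parquet_sample_test feature, it does not run by default. A plausible local invocation, assuming the integration-test harness is named "it" after the tests/it directory below:

    cargo test --features io_parquet_sample_test --test it io::parquet::sample_tests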
3 changes: 3 additions & 0 deletions tests/it/io/parquet/mod.rs
@@ -21,6 +21,9 @@ mod read_indexes;
mod write;
mod write_async;

#[cfg(feature = "io_parquet_sample_test")]
mod sample_tests;

type ArrayStats = (Box<dyn Array>, Statistics);

fn new_struct(
119 changes: 119 additions & 0 deletions tests/it/io/parquet/sample_tests.rs
@@ -0,0 +1,119 @@
use arrow2::io::parquet::write::*;
use arrow2::{
    chunk::Chunk,
    datatypes::{Field, Metadata, Schema},
    error::Result,
    io::parquet::read as p_read,
};
use std::borrow::Borrow;
use std::io::Cursor;

use sample_arrow2::{
    array::ArbitraryArray,
    chunk::{ArbitraryChunk, ChainedChunk},
    datatypes::{sample_flat, ArbitraryDataType},
};
use sample_std::{Chance, Random, Regex, Sample};
use sample_test::sample_test;

fn deep_chunk(depth: usize, len: usize) -> ArbitraryChunk<Regex, Chance> {
    let names = Regex::new("[a-z]{4,8}");
    let data_type = ArbitraryDataType {
        struct_branch: 1..3,
        names: names.clone(),
        // TODO: this breaks the test
        // nullable: Chance(0.5),
        nullable: Chance(0.0),
        flat: sample_flat,
    }
    .sample_depth(depth);

    let array = ArbitraryArray {
        names,
        branch: 0..10,
        len: len..(len + 1),
        null: Chance(0.1),
        // TODO: this breaks the test
        // is_nullable: true,
        is_nullable: false,
    };

    ArbitraryChunk {
        // TODO: shrinking appears to be an issue with chunks this large.
        // Issues currently reproduce on the smaller sizes anyway.
        // chunk_len: 10..1000,
        chunk_len: 1..10,
        array_count: 1..2,
        data_type,
        array,
    }
}

#[sample_test]
fn round_trip_sample(
    #[sample(deep_chunk(5, 100).sample_one())] chained: ChainedChunk,
) -> Result<()> {
    sample_test::env_logger_init();
    let chunks = vec![chained.value];
    let name = Regex::new("[a-z]{4,8}");
    let mut g = Random::new();

    // TODO: this probably belongs in a helper in sample-arrow2
    let schema = Schema {
        fields: chunks
            .first()
            .unwrap()
            .iter()
            .map(|arr| {
                Field::new(
                    name.generate(&mut g),
                    arr.data_type().clone(),
                    arr.validity().is_some(),
                )
            })
            .collect(),
        metadata: Metadata::default(),
    };

    let options = WriteOptions {
        write_statistics: true,
        compression: CompressionOptions::Uncompressed,
        version: Version::V2,
        data_pagesize_limit: None,
    };

    let encodings: Vec<_> = schema
        .borrow()
        .fields
        .iter()
        .map(|field| transverse(field.data_type(), |_| Encoding::Plain))
        .collect();

    let row_groups = RowGroupIterator::try_new(
        chunks.clone().into_iter().map(Ok),
        &schema,
        options,
        encodings,
    )?;

    let buffer = Cursor::new(vec![]);
    let mut writer = FileWriter::try_new(buffer, schema, options)?;

    for group in row_groups {
        writer.write(group?)?;
    }
    writer.end(None)?;

    let mut buffer = writer.into_inner();

    let metadata = p_read::read_metadata(&mut buffer)?;
    let schema = p_read::infer_schema(&metadata)?;

    let mut reader =
        p_read::FileReader::new(buffer, metadata.row_groups, schema, None, None, None);

    let result: Vec<_> = reader.collect::<Result<_>>()?;

    assert_eq!(result, chunks);

    Ok(())
}
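
The TODO at the top of round_trip_sample notes that deriving a Schema from a sampled chunk probably belongs in a helper in sample-arrow2. A minimal sketch of what such a helper could look like, reusing only calls that already appear in the test; the name schema_for_chunk is hypothetical:

use arrow2::array::Array;
use arrow2::chunk::Chunk;
use arrow2::datatypes::{Field, Metadata, Schema};
use sample_std::{Random, Regex, Sample};

// Hypothetical helper: build a Schema for a chunk, drawing field names from a
// regex-shaped sampler; nullability follows each array's validity, as above.
fn schema_for_chunk(chunk: &Chunk<Box<dyn Array>>, names: &Regex, g: &mut Random) -> Schema {
    Schema {
        fields: chunk
            .iter()
            .map(|arr| {
                Field::new(
                    names.generate(g),
                    arr.data_type().clone(),
                    arr.validity().is_some(),
                )
            })
            .collect(),
        metadata: Metadata::default(),
    }
}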
