Skip to content

Commit

Permalink
Separate parquet writer benchmarks (#818)
Browse files Browse the repository at this point in the history
* split benchmarks of primitive arrays

* add list benches
  • Loading branch information
nevi-me authored Oct 7, 2021
1 parent a835f2c commit 180776a
Showing 1 changed file with 224 additions and 29 deletions.
253 changes: 224 additions & 29 deletions parquet/benches/arrow_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,25 +36,164 @@ fn create_primitive_bench_batch(
true_density: f32,
) -> Result<RecordBatch> {
let fields = vec![
Field::new("_1", DataType::Int8, true),
Field::new("_2", DataType::Int16, true),
Field::new("_3", DataType::Int32, true),
Field::new("_4", DataType::Int64, true),
Field::new("_5", DataType::UInt8, true),
Field::new("_6", DataType::UInt16, true),
Field::new("_7", DataType::UInt32, true),
Field::new("_8", DataType::UInt64, true),
Field::new("_9", DataType::Float32, true),
Field::new("_10", DataType::Float64, true),
Field::new("_11", DataType::Date32, true),
Field::new("_12", DataType::Date64, true),
Field::new("_13", DataType::Time32(TimeUnit::Second), true),
Field::new("_14", DataType::Time32(TimeUnit::Millisecond), true),
Field::new("_15", DataType::Time64(TimeUnit::Microsecond), true),
Field::new("_16", DataType::Time64(TimeUnit::Nanosecond), true),
Field::new("_17", DataType::Utf8, true),
Field::new("_18", DataType::LargeUtf8, true),
Field::new("_19", DataType::Boolean, true),
Field::new("_1", DataType::Int32, true),
Field::new("_2", DataType::Int64, true),
Field::new("_3", DataType::UInt32, true),
Field::new("_4", DataType::UInt64, true),
Field::new("_5", DataType::Float32, true),
Field::new("_6", DataType::Float64, true),
Field::new("_7", DataType::Date64, true),
];
let schema = Schema::new(fields);
Ok(create_random_batch(
Arc::new(schema),
size,
null_density,
true_density,
)?)
}

/// Builds a random benchmark [`RecordBatch`] of seven primitive columns whose
/// fields are all declared non-nullable.
///
/// Columns `_1` through `_7` are `Int32`, `Int64`, `UInt32`, `UInt64`,
/// `Float32`, `Float64` and `Date64`. The schema, together with `size`,
/// `null_density` and `true_density`, is handed unchanged to
/// `create_random_batch`.
///
/// NOTE(review): `null_density` is still forwarded although no field is
/// nullable; presumably the generator disregards it for such fields — confirm.
///
/// # Errors
///
/// Returns whatever error `create_random_batch` reports.
fn create_primitive_bench_batch_non_null(
    size: usize,
    null_density: f32,
    true_density: f32,
) -> Result<RecordBatch> {
    let schema = Arc::new(Schema::new(vec![
        Field::new("_1", DataType::Int32, false),
        Field::new("_2", DataType::Int64, false),
        Field::new("_3", DataType::UInt32, false),
        Field::new("_4", DataType::UInt64, false),
        Field::new("_5", DataType::Float32, false),
        Field::new("_6", DataType::Float64, false),
        Field::new("_7", DataType::Date64, false),
    ]));
    // `?` followed by `Ok` is kept on purpose: it lets the generator's error
    // be converted into this function's error type if the two differ.
    let batch = create_random_batch(schema, size, null_density, true_density)?;
    Ok(batch)
}

/// Builds a random benchmark [`RecordBatch`] with two nullable string columns.
///
/// Column `_1` is `Utf8` and column `_2` is `LargeUtf8`. The schema, together
/// with `size`, `null_density` and `true_density`, is handed unchanged to
/// `create_random_batch`.
///
/// # Errors
///
/// Returns whatever error `create_random_batch` reports.
fn create_string_bench_batch(
    size: usize,
    null_density: f32,
    true_density: f32,
) -> Result<RecordBatch> {
    let schema = Arc::new(Schema::new(vec![
        Field::new("_1", DataType::Utf8, true),
        Field::new("_2", DataType::LargeUtf8, true),
    ]));
    // `?` followed by `Ok` is kept on purpose: it lets the generator's error
    // be converted into this function's error type if the two differ.
    let batch = create_random_batch(schema, size, null_density, true_density)?;
    Ok(batch)
}

/// Builds a random benchmark [`RecordBatch`] with two string columns whose
/// fields are declared non-nullable.
///
/// Column `_1` is `Utf8` and column `_2` is `LargeUtf8`. The schema, together
/// with `size`, `null_density` and `true_density`, is handed unchanged to
/// `create_random_batch`.
///
/// NOTE(review): `null_density` is still forwarded although no field is
/// nullable; presumably the generator disregards it for such fields — confirm.
///
/// # Errors
///
/// Returns whatever error `create_random_batch` reports.
fn create_string_bench_batch_non_null(
    size: usize,
    null_density: f32,
    true_density: f32,
) -> Result<RecordBatch> {
    let schema = Arc::new(Schema::new(vec![
        Field::new("_1", DataType::Utf8, false),
        Field::new("_2", DataType::LargeUtf8, false),
    ]));
    // `?` followed by `Ok` is kept on purpose: it lets the generator's error
    // be converted into this function's error type if the two differ.
    let batch = create_random_batch(schema, size, null_density, true_density)?;
    Ok(batch)
}

/// Builds a random benchmark [`RecordBatch`] with a single nullable `Boolean`
/// column named `_1`.
///
/// The schema, together with `size`, `null_density` and `true_density`, is
/// handed unchanged to `create_random_batch`.
///
/// # Errors
///
/// Returns whatever error `create_random_batch` reports.
fn create_bool_bench_batch(
    size: usize,
    null_density: f32,
    true_density: f32,
) -> Result<RecordBatch> {
    let bool_column = Field::new("_1", DataType::Boolean, true);
    let schema = Arc::new(Schema::new(vec![bool_column]));
    // `?` followed by `Ok` is kept on purpose: it lets the generator's error
    // be converted into this function's error type if the two differ.
    let batch = create_random_batch(schema, size, null_density, true_density)?;
    Ok(batch)
}

/// Builds a random benchmark [`RecordBatch`] with a single `Boolean` column
/// named `_1` whose field is declared non-nullable.
///
/// The schema, together with `size`, `null_density` and `true_density`, is
/// handed unchanged to `create_random_batch`.
///
/// NOTE(review): `null_density` is still forwarded although the field is not
/// nullable; presumably the generator disregards it in that case — confirm.
///
/// # Errors
///
/// Returns whatever error `create_random_batch` reports.
fn create_bool_bench_batch_non_null(
    size: usize,
    null_density: f32,
    true_density: f32,
) -> Result<RecordBatch> {
    let bool_column = Field::new("_1", DataType::Boolean, false);
    let schema = Arc::new(Schema::new(vec![bool_column]));
    // `?` followed by `Ok` is kept on purpose: it lets the generator's error
    // be converted into this function's error type if the two differ.
    let batch = create_random_batch(schema, size, null_density, true_density)?;
    Ok(batch)
}

/// Builds a random benchmark [`RecordBatch`] with three nullable list columns
/// whose child fields (all named `item`) are nullable as well.
///
/// * `_1` — `List` of `Int32`
/// * `_2` — `List` of `Boolean`
/// * `_3` — `LargeList` of `Utf8`
///
/// The schema, together with `size`, `null_density` and `true_density`, is
/// handed unchanged to `create_random_batch`.
///
/// # Errors
///
/// Returns whatever error `create_random_batch` reports.
fn create_list_primitive_bench_batch(
    size: usize,
    null_density: f32,
    true_density: f32,
) -> Result<RecordBatch> {
    // Child ("item") fields of each list column.
    let int_items = Field::new("item", DataType::Int32, true);
    let bool_items = Field::new("item", DataType::Boolean, true);
    let utf8_items = Field::new("item", DataType::Utf8, true);

    let schema = Arc::new(Schema::new(vec![
        Field::new("_1", DataType::List(Box::new(int_items)), true),
        Field::new("_2", DataType::List(Box::new(bool_items)), true),
        Field::new("_3", DataType::LargeList(Box::new(utf8_items)), true),
    ]));
    // `?` followed by `Ok` is kept on purpose: it lets the generator's error
    // be converted into this function's error type if the two differ.
    let batch = create_random_batch(schema, size, null_density, true_density)?;
    Ok(batch)
}

fn create_list_primitive_bench_batch_non_null(
size: usize,
null_density: f32,
true_density: f32,
) -> Result<RecordBatch> {
let fields = vec![
Field::new(
"_1",
DataType::List(Box::new(Field::new("item", DataType::Int32, false))),
false,
),
Field::new(
"_2",
DataType::List(Box::new(Field::new("item", DataType::Boolean, false))),
false,
),
Field::new(
"_3",
DataType::LargeList(Box::new(Field::new("item", DataType::Utf8, false))),
false,
),
];
let schema = Schema::new(fields);
Ok(create_random_batch(
Expand Down Expand Up @@ -148,7 +287,7 @@ fn write_batch(batch: &RecordBatch) -> Result<()> {
}

fn bench_primitive_writer(c: &mut Criterion) {
let batch = create_primitive_bench_batch(1024, 0.25, 0.75).unwrap();
let batch = create_primitive_bench_batch(4096, 0.25, 0.75).unwrap();
let mut group = c.benchmark_group("write_batch primitive");
group.throughput(Throughput::Bytes(
batch
Expand All @@ -157,24 +296,76 @@ fn bench_primitive_writer(c: &mut Criterion) {
.map(|f| f.get_array_memory_size() as u64)
.sum(),
));
group.bench_function("1024 values", |b| b.iter(|| write_batch(&batch).unwrap()));
group.bench_function("4096 values primitive", |b| {
b.iter(|| write_batch(&batch).unwrap())
});

let batch = create_primitive_bench_batch(4096, 0.25, 0.75).unwrap();
let batch = create_primitive_bench_batch_non_null(4096, 0.25, 0.75).unwrap();
group.throughput(Throughput::Bytes(
batch
.columns()
.iter()
.map(|f| f.get_array_memory_size() as u64)
.sum(),
));
group.bench_function("4096 values primitive non-null", |b| {
b.iter(|| write_batch(&batch).unwrap())
});

let batch = create_bool_bench_batch(4096, 0.25, 0.75).unwrap();
group.throughput(Throughput::Bytes(
batch
.columns()
.iter()
.map(|f| f.get_array_memory_size() as u64)
.sum(),
));
group.bench_function("4096 values bool", |b| {
b.iter(|| write_batch(&batch).unwrap())
});

let batch = create_bool_bench_batch_non_null(4096, 0.25, 0.75).unwrap();
group.throughput(Throughput::Bytes(
batch
.columns()
.iter()
.map(|f| f.get_array_memory_size() as u64)
.sum(),
));
group.bench_function("4096 values bool non-null", |b| {
b.iter(|| write_batch(&batch).unwrap())
});

let batch = create_string_bench_batch(4096, 0.25, 0.75).unwrap();
group.throughput(Throughput::Bytes(
batch
.columns()
.iter()
.map(|f| f.get_array_memory_size() as u64)
.sum(),
));
group.bench_function("4096 values", |b| b.iter(|| write_batch(&batch).unwrap()));
group.bench_function("4096 values string", |b| {
b.iter(|| write_batch(&batch).unwrap())
});

let batch = create_string_bench_batch_non_null(4096, 0.25, 0.75).unwrap();
group.throughput(Throughput::Bytes(
batch
.columns()
.iter()
.map(|f| f.get_array_memory_size() as u64)
.sum(),
));
group.bench_function("4096 values string non-null", |b| {
b.iter(|| write_batch(&batch).unwrap())
});

group.finish();
}

// This bench triggers a write error, it is ignored for now
fn _bench_nested_writer(c: &mut Criterion) {
let batch = _create_nested_bench_batch(1024, 0.25, 0.75).unwrap();
fn bench_nested_writer(c: &mut Criterion) {
let batch = create_list_primitive_bench_batch(4096, 0.25, 0.75).unwrap();
let mut group = c.benchmark_group("write_batch nested");
group.throughput(Throughput::Bytes(
batch
Expand All @@ -183,20 +374,24 @@ fn _bench_nested_writer(c: &mut Criterion) {
.map(|f| f.get_array_memory_size() as u64)
.sum(),
));
group.bench_function("1024 values", |b| b.iter(|| write_batch(&batch).unwrap()));
group.bench_function("4096 values primitive list", |b| {
b.iter(|| write_batch(&batch).unwrap())
});

let batch = create_primitive_bench_batch(4096, 0.25, 0.75).unwrap();
let batch = create_list_primitive_bench_batch_non_null(4096, 0.25, 0.75).unwrap();
group.throughput(Throughput::Bytes(
batch
.columns()
.iter()
.map(|f| f.get_array_memory_size() as u64)
.sum(),
));
group.bench_function("4096 values", |b| b.iter(|| write_batch(&batch).unwrap()));
group.bench_function("4096 values primitive list non-null", |b| {
b.iter(|| write_batch(&batch).unwrap())
});

group.finish();
}

criterion_group!(benches, bench_primitive_writer);
criterion_group!(benches, bench_primitive_writer, bench_nested_writer);
criterion_main!(benches);

0 comments on commit 180776a

Please sign in to comment.