From b8b76bc225a9b0c51407261cc7b55770db1a958b Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+Kev1n8@users.noreply.github.com> Date: Fri, 23 Aug 2024 02:10:55 +0800 Subject: [PATCH] Add benchmark for SUBSTR to evaluate improvements using StringView (#12111) * add a bench file substr.rs * taplo format --------- Co-authored-by: Andrew Lamb --- datafusion/functions/Cargo.toml | 5 + datafusion/functions/benches/substr.rs | 202 +++++++++++++++++++++++++ 2 files changed, 207 insertions(+) create mode 100644 datafusion/functions/benches/substr.rs diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 9ef020b772f0..337379a74670 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -161,3 +161,8 @@ required-features = ["string_expressions"] harness = false name = "random" required-features = ["math_expressions"] + +[[bench]] +harness = false +name = "substr" +required-features = ["unicode_expressions"] diff --git a/datafusion/functions/benches/substr.rs b/datafusion/functions/benches/substr.rs new file mode 100644 index 000000000000..14a3389da380 --- /dev/null +++ b/datafusion/functions/benches/substr.rs @@ -0,0 +1,202 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use arrow::array::{ArrayRef, Int64Array, OffsetSizeTrait}; +use arrow::util::bench_util::{ + create_string_array_with_len, create_string_view_array_with_len, +}; +use criterion::{black_box, criterion_group, criterion_main, Criterion, SamplingMode}; +use datafusion_expr::ColumnarValue; +use datafusion_functions::unicode; +use std::sync::Arc; + +fn create_args_without_count( + size: usize, + str_len: usize, + start_half_way: bool, + use_string_view: bool, +) -> Vec { + let start_array = Arc::new(Int64Array::from( + (0..size) + .map(|_| { + if start_half_way { + (str_len / 2) as i64 + } else { + 1i64 + } + }) + .collect::>(), + )); + + if use_string_view { + let string_array = + Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false)); + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(start_array), + ] + } else { + let string_array = + Arc::new(create_string_array_with_len::(size, 0.1, str_len)); + + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(Arc::clone(&start_array) as ArrayRef), + ] + } +} + +fn create_args_with_count( + size: usize, + str_len: usize, + count_max: usize, + use_string_view: bool, +) -> Vec { + let start_array = + Arc::new(Int64Array::from((0..size).map(|_| 1).collect::>())); + let count = count_max.min(str_len) as i64; + let count_array = Arc::new(Int64Array::from( + (0..size).map(|_| count).collect::>(), + )); + + if use_string_view { + let string_array = + Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false)); + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(start_array), + ColumnarValue::Array(count_array), + ] + } else { + let string_array = + Arc::new(create_string_array_with_len::(size, 0.1, str_len)); + + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(Arc::clone(&start_array) as ArrayRef), + ColumnarValue::Array(Arc::clone(&count_array) as ArrayRef), + ] + } +} + +fn criterion_benchmark(c: &mut Criterion) { + let substr = unicode::substr(); + for size in [1024, 4096] { + // string_len = 12, substring_len=6 (see `create_args_without_count`) + let len = 12; + let mut group = c.benchmark_group("SHORTER THAN 12"); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + + let args = create_args_without_count::(size, len, true, true); + group.bench_function( + &format!("substr_string_view [size={}, strlen={}]", size, len), + |b| b.iter(|| black_box(substr.invoke(&args))), + ); + + let args = create_args_without_count::(size, len, false, false); + group.bench_function( + &format!("substr_string [size={}, strlen={}]", size, len), + |b| b.iter(|| black_box(substr.invoke(&args))), + ); + + let args = create_args_without_count::(size, len, true, false); + group.bench_function( + &format!("substr_large_string [size={}, strlen={}]", size, len), + |b| b.iter(|| black_box(substr.invoke(&args))), + ); + + group.finish(); + + // string_len = 128, start=1, count=64, substring_len=64 + let len = 128; + let count = 64; + let mut group = c.benchmark_group("LONGER THAN 12"); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + + let args = create_args_with_count::(size, len, count, true); + group.bench_function( + &format!( + "substr_string_view [size={}, count={}, strlen={}]", + size, count, len, + ), + |b| b.iter(|| black_box(substr.invoke(&args))), + ); + + let args = create_args_with_count::(size, len, count, false); + group.bench_function( + &format!( + "substr_string [size={}, count={}, strlen={}]", + size, count, len, + ), + |b| b.iter(|| black_box(substr.invoke(&args))), + ); + + let args = create_args_with_count::(size, len, count, false); + group.bench_function( + &format!( + "substr_large_string [size={}, count={}, strlen={}]", + size, count, len, + ), + |b| b.iter(|| black_box(substr.invoke(&args))), + ); + + group.finish(); + + // string_len = 128, start=1, count=6, substring_len=6 + let len = 128; + let count = 6; + let mut group = c.benchmark_group("SRC_LEN > 12, SUB_LEN < 12"); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + + let args = create_args_with_count::(size, len, count, true); + group.bench_function( + &format!( + "substr_string_view [size={}, count={}, strlen={}]", + size, count, len, + ), + |b| b.iter(|| black_box(substr.invoke(&args))), + ); + + let args = create_args_with_count::(size, len, count, false); + group.bench_function( + &format!( + "substr_string [size={}, count={}, strlen={}]", + size, count, len, + ), + |b| b.iter(|| black_box(substr.invoke(&args))), + ); + + let args = create_args_with_count::(size, len, count, false); + group.bench_function( + &format!( + "substr_large_string [size={}, count={}, strlen={}]", + size, count, len, + ), + |b| b.iter(|| black_box(substr.invoke(&args))), + ); + + group.finish(); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches);