From 695abb60bc9c9ade998c671a1de18d1cb5aff646 Mon Sep 17 00:00:00 2001
From: Bruce Ritchie
Date: Sat, 30 Mar 2024 16:42:53 -0400
Subject: [PATCH] Add benchmark for substr_index

---
 datafusion/functions/Cargo.toml              |   5 +
 datafusion/functions/benches/substr_index.rs | 115 +++++++++++++++++++
 2 files changed, 120 insertions(+)
 create mode 100644 datafusion/functions/benches/substr_index.rs

diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index 425ac207c33e..933893492c2a 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -106,4 +106,9 @@ required-features = ["datetime_expressions"]
 [[bench]]
 harness = false
 name = "to_char"
 required-features = ["datetime_expressions"]
+
+[[bench]]
+harness = false
+name = "substr_index"
+required-features = ["unicode_expressions"]
diff --git a/datafusion/functions/benches/substr_index.rs b/datafusion/functions/benches/substr_index.rs
new file mode 100644
index 000000000000..71e66b941084
--- /dev/null
+++ b/datafusion/functions/benches/substr_index.rs
@@ -0,0 +1,115 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate criterion;
+
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, Int64Array, StringArray};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use rand::distributions::{Alphanumeric, Uniform};
+use rand::prelude::Distribution;
+use rand::Rng;
+
+use datafusion_expr::ColumnarValue;
+use datafusion_functions::unicode::substr_index;
+
+/// A distribution adapter that draws from `dist` and keeps only the samples
+/// for which `test` returns `true`.
+struct Filter<Dist, Test> {
+    dist: Dist,
+    test: Test,
+}
+
+impl<T, Dist, Test> Distribution<T> for Filter<Dist, Test>
+where
+    Dist: Distribution<T>,
+    Test: Fn(&T) -> bool,
+{
+    /// Samples `dist` repeatedly until a value passes `test`, then returns it.
+    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> T {
+        loop {
+            let x = self.dist.sample(rng);
+            if (self.test)(&x) {
+                return x;
+            }
+        }
+    }
+}
+
+/// Builds 1000 random rows of `(string, delimiter, count)` benchmark input.
+///
+/// Each string is 20 to 49 random alphanumeric characters, each delimiter is
+/// a single character picked from its own string, and each count is a
+/// non-zero value in `-4..=4`.
+fn data() -> (StringArray, StringArray, Int64Array) {
+    // `Uniform::new(-4, 5)` excludes the upper bound; the filter rejects zero.
+    let dist = Filter {
+        dist: Uniform::new(-4, 5),
+        test: |x: &i64| x != &0,
+    };
+    let mut rng = rand::thread_rng();
+    let mut strings: Vec<String> = vec![];
+    let mut delimiters: Vec<String> = vec![];
+    let mut counts: Vec<i64> = vec![];
+
+    for _ in 0..1000 {
+        let length = rng.gen_range(20..50);
+        let text: String = (&mut rng)
+            .sample_iter(&Alphanumeric)
+            .take(length)
+            .map(char::from)
+            .collect();
+        // `Alphanumeric` yields ASCII, so a byte offset is a valid char index.
+        let char = rng.gen_range(0..text.len());
+        let delimiter = &text.chars().nth(char).unwrap();
+        let count = rng.sample(&dist);
+
+        strings.push(text);
+        delimiters.push(delimiter.to_string());
+        counts.push(count);
+    }
+
+    (
+        StringArray::from(strings),
+        StringArray::from(delimiters),
+        Int64Array::from(counts),
+    )
+}
+
+/// Registers the `substr_index_array_array_1000` benchmark, which calls
+/// `substr_index` with the three array arguments built by `data`.
+fn criterion_benchmark(c: &mut Criterion) {
+    c.bench_function("substr_index_array_array_1000", |b| {
+        // Build the inputs here so generation stays outside the `b.iter` closure.
+        let (strings, delimiters, counts) = data();
+        let strings = ColumnarValue::Array(Arc::new(strings) as ArrayRef);
+        let delimiters = ColumnarValue::Array(Arc::new(delimiters) as ArrayRef);
+        let counts = ColumnarValue::Array(Arc::new(counts) as ArrayRef);
+
+        b.iter(|| {
+            black_box(
+                substr_index()
+                    .invoke(&[strings.clone(), delimiters.clone(), counts.clone()])
+                    .expect("substr_index should work on valid values"),
+            )
+        })
+    });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);