Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add additional regexp function regexp_count() #12080

Closed
wants to merge 15 commits into from
Closed
2 changes: 1 addition & 1 deletion datafusion/functions/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ encoding_expressions = ["base64", "hex"]
# enable math functions
math_expressions = []
# enable regular expressions
regex_expressions = ["regex"]
regex_expressions = ["regex", "string_expressions"]
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had to do this to get access to StringArrayType. Maybe we should consider relocating it to a common package?

Copy link
Contributor

@Omega359 Omega359 Sep 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would be very open to moving StringArrayType, StringArrayBuilder, StringViewArrayBuilder, etc. to a file in functions, such as string_array.rs, as they are used in multiple modules (unicode, regex, likely datetime in the future, etc.) and are quite useful in general for any external UDF that might need them.

# enable string functions
string_expressions = ["uuid"]
# enable unicode functions
Expand Down
54 changes: 53 additions & 1 deletion datafusion/functions/benches/regx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@
extern crate criterion;

use arrow::array::builder::StringBuilder;
use arrow::array::{ArrayRef, StringArray};
use arrow::array::{ArrayRef, Int64Array, StringArray};
use arrow::compute::cast;
use arrow::datatypes::DataType;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_functions::regex::regexpcount::regexp_count_func;
use datafusion_functions::regex::regexplike::regexp_like;
use datafusion_functions::regex::regexpmatch::regexp_match;
use datafusion_functions::regex::regexpreplace::regexp_replace;
Expand Down Expand Up @@ -59,6 +62,15 @@ fn regex(rng: &mut ThreadRng) -> StringArray {
StringArray::from(data)
}

/// Builds the `start` argument column used by the `regexp_count` benchmarks:
/// 1000 random `i64` values, each drawn from the half-open range `1..5`.
fn start(rng: &mut ThreadRng) -> Int64Array {
    // One `gen_range` call per element, in order, exactly like a push loop.
    let offsets: Vec<i64> = (0..1000).map(|_| rng.gen_range(1..5)).collect();
    Int64Array::from(offsets)
}

fn flags(rng: &mut ThreadRng) -> StringArray {
let samples = [Some("i".to_string()), Some("im".to_string()), None];
let mut sb = StringBuilder::new();
Expand All @@ -75,6 +87,46 @@ fn flags(rng: &mut ThreadRng) -> StringArray {
}

fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("regexp_count_1000 string", |b| {
let mut rng = rand::thread_rng();
let data = Arc::new(data(&mut rng)) as ArrayRef;
let regex = Arc::new(regex(&mut rng)) as ArrayRef;
let start = Arc::new(start(&mut rng)) as ArrayRef;
let flags = Arc::new(flags(&mut rng)) as ArrayRef;

b.iter(|| {
black_box(
regexp_count_func(&[
Arc::clone(&data),
Arc::clone(&regex),
Arc::clone(&start),
Arc::clone(&flags),
])
.expect("regexp_count should work on utf8"),
)
})
});

c.bench_function("regexp_count_1000 utf8view", |b| {
let mut rng = rand::thread_rng();
let data = cast(&data(&mut rng), &DataType::Utf8View).unwrap();
let regex = cast(&regex(&mut rng), &DataType::Utf8View).unwrap();
let start = Arc::new(start(&mut rng)) as ArrayRef;
let flags = cast(&flags(&mut rng), &DataType::Utf8View).unwrap();

b.iter(|| {
black_box(
regexp_count_func(&[
Arc::clone(&data),
Arc::clone(&regex),
Arc::clone(&start),
Arc::clone(&flags),
])
.expect("regexp_count should work on utf8view"),
)
})
});

c.bench_function("regexp_like_1000", |b| {
let mut rng = rand::thread_rng();
let data = Arc::new(data(&mut rng)) as ArrayRef;
Expand Down
27 changes: 26 additions & 1 deletion datafusion/functions/src/regex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,12 @@

//! "regx" DataFusion functions

pub mod regexpcount;
pub mod regexplike;
pub mod regexpmatch;
pub mod regexpreplace;
// create UDFs
make_udf_function!(regexpcount::RegexpCountFunc, REGEXP_COUNT, regexp_count);
make_udf_function!(regexpmatch::RegexpMatchFunc, REGEXP_MATCH, regexp_match);
make_udf_function!(regexplike::RegexpLikeFunc, REGEXP_LIKE, regexp_like);
make_udf_function!(
Expand All @@ -32,6 +34,24 @@ make_udf_function!(
pub mod expr_fn {
use datafusion_expr::Expr;

/// Returns the number of consecutive occurrences of a regular expression in a string.
pub fn regexp_count(
values: Expr,
regex: Expr,
start: Option<Expr>,
flags: Option<Expr>,
) -> Expr {
let mut args = vec![values, regex];
if let Some(start) = start {
args.push(start);
};

if let Some(flags) = flags {
args.push(flags);
};
super::regexp_count().call(args)
}

/// Returns a list of regular expression matches in a string.
pub fn regexp_match(values: Expr, regex: Expr, flags: Option<Expr>) -> Expr {
let mut args = vec![values, regex];
Expand Down Expand Up @@ -67,5 +87,10 @@ pub mod expr_fn {

/// Returns all DataFusion functions defined in this package.
///
/// Each element is the `Arc<ScalarUDF>` handle returned by the accessor
/// function of the same name that is called in the list below.
pub fn functions() -> Vec<std::sync::Arc<datafusion_expr::ScalarUDF>> {
vec![regexp_match(), regexp_like(), regexp_replace()]
vec![
regexp_count(),
regexp_match(),
regexp_like(),
regexp_replace(),
]
}
Loading