Skip to content

Commit

Permalink
Add regexp_like scalar function
Browse files Browse the repository at this point in the history
  • Loading branch information
Omega359 committed Feb 5, 2024
1 parent c43c5f1 commit 1b4988c
Show file tree
Hide file tree
Showing 17 changed files with 1,137 additions and 133 deletions.
7 changes: 4 additions & 3 deletions datafusion-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,21 +52,22 @@ cargo run --example csv_sql
- [`dataframe_output.rs`](examples/dataframe_output.rs): Examples of methods which write data out from a DataFrame
- [`dataframe_in_memory.rs`](examples/dataframe_in_memory.rs): Run a query using a DataFrame against data in memory
- [`deserialize_to_struct.rs`](examples/deserialize_to_struct.rs): Convert query results into rust structs using serde
- [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify and anaylze `Expr`s
- [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify and analyze `Expr`s
- [`flight_sql_server.rs`](examples/flight/flight_sql_server.rs): Run DataFusion as a standalone process and execute SQL queries from JDBC clients
- [`make_date.rs`](examples/make_date.rs): Examples of using the make_date function
- [`memtable.rs`](examples/memtable.rs): Create and query data in memory using SQL and `RecordBatch`es
- [`parquet_sql.rs`](examples/parquet_sql.rs): Build and run a query plan from a SQL statement against a local Parquet file
- [`parquet_sql_multiple_files.rs`](examples/parquet_sql_multiple_files.rs): Build and run a query plan from a SQL statement against multiple local Parquet files
- [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3
- [`query-http-csv.rs`](examples/query-http-csv.rs): Configure `object_store` and run a query against files via HTTP
- [`regexp.rs`](examples/regexp.rs): Examples of using regular expression functions
- [`rewrite_expr.rs`](examples/rewrite_expr.rs): Define and invoke a custom Query Optimizer pass
- [`to_timestamp.rs`](examples/to_timestamp.rs): Examples of using to_timestamp functions
- [`simple_udf.rs`](examples/simple_udf.rs): Define and invoke a User Defined Scalar Function (UDF)
- [`advanced_udf.rs`](examples/advanced_udf.rs): Define and invoke a more complicated User Defined Scalar Function (UDF)
- [`simple_udaf.rs`](examples/simple_udaf.rs): Define and invoke a User Defined Aggregate Function (UDAF)
- [`advanced_udaf.rs`](examples/advanced_udaf.rs): Define and invoke a more complicated User Defined Aggregate Function (UDAF)
- [`simple_udwf.rs`](examples/simple_udwf.rs): Define and invoke a User Defined Window Function (UDWF)
- [`make_date.rs`](examples/make_date.rs): Examples of using the make_date function
- [`to_timestamp.rs`](examples/to_timestamp.rs): Examples of using the to_timestamp functions
- [`advanced_udwf.rs`](examples/advanced_udwf.rs): Define and invoke a more complicated User Defined Window Function (UDWF)

## Distributed
Expand Down
300 changes: 300 additions & 0 deletions datafusion-examples/examples/regexp.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,300 @@
// Licensed to the Apache Software Foundation (ASF) under one
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::sync::Arc;

use arrow::array::{BooleanArray, LargeStringArray, StringArray, StringBuilder};
use log::info;

use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::error::Result;
use datafusion::prelude::*;
use datafusion_common::assert_contains;

/// This example demonstrates how to use the regexp_*
/// functions in the DataFrame API as well as via sql.
///
/// It builds an in-memory table named `examples` with three string columns
/// (`values`, `patterns`, `flags`) and then walks through:
///
/// * `regexp_like`    - DataFrame API and SQL, with and without flags
/// * `regexp_match`   - DataFrame API and SQL, with and without flags
/// * `regexp_replace` - SQL only
///
/// Most results are simply printed with `show()`; a few error / edge cases
/// are checked with assertions instead.
#[tokio::main]
async fn main() -> Result<()> {
    // define a schema. Regex functions are restricted to Utf8 and LargeUtf8 data.
    // `values` and `patterns` are non-nullable; `flags` is nullable.
    let schema = Arc::new(Schema::new(vec![
        Field::new("values", DataType::Utf8, false),
        Field::new("patterns", DataType::LargeUtf8, false),
        Field::new("flags", DataType::Utf8, true),
    ]));

    // build the `flags` column: the first three rows use the "i" flag, the
    // remaining eight rows are null, giving 11 rows to line up with the
    // 11 entries in `values` and `patterns` below
    let mut sb = StringBuilder::new();
    sb.append_value("i");
    sb.append_value("i");
    sb.append_value("i");
    sb.append_null();
    sb.append_null();
    sb.append_null();
    sb.append_null();
    sb.append_null();
    sb.append_null();
    sb.append_null();
    sb.append_null();

    // define data for our examples
    let batch = RecordBatch::try_new(
        schema,
        vec![
            // column `values`: the text that is matched against
            Arc::new(StringArray::from(vec![
                "abc",
                "ABC",
                "aBc",
                "AbC",
                "aBC",
                "4000",
                "4010",
                "Düsseldorf",
                "Москва",
                "Köln",
                "إسرائيل",
            ])),
            // column `patterns`: one regular expression per row.
            // the full list of supported features and
            // syntax can be found at
            // https://docs.rs/regex/latest/regex/#syntax

            // NOTE: double backslashes are required to escape the backslash
            // NOTE: character when not using the r"" raw string syntax
            Arc::new(LargeStringArray::from(vec![
                // simple regex examples
                "^(a)",
                "^(A).*",
                "(b|d)",
                "(B|D)",
                "^(b|c)",
                // word boundaries, grouping, etc
                r"\b4([1-9]\d\d|\d[1-9]\d|\d\d[1-9])\b",
                r"\b4([1-9]\d\d|\d[1-9]\d|\d\d[1-9])\b",
                // unicode is supported
                r"[\p{Letter}-]+",
                r"[\p{L}-]+",
                "[a-zA-Z]ö[a-zA-Z]{2}",
                // unicode character classes work
                r"^\p{Arabic}+$",
            ])),
            // column `flags`.
            // supported flags can be found at
            // https://docs.rs/regex/latest/regex/#grouping-and-flags
            Arc::new(sb.finish()),
        ],
    )?;

    // declare a new context. In spark API, this corresponds to a new spark SQLsession
    let ctx = SessionContext::new();

    // declare a table in memory. In spark API, this corresponds to createDataFrame(...).
    ctx.register_batch("examples", batch)?;
    let df = ctx.table("examples").await?;

    //
    //
    // regexp_like examples
    //
    //
    // regexp_like format is regexp_like(text, regex[, flags])
    //

    // use dataframe and regexp_like function to test col 'values', against patterns in col 'patterns' without flags
    let df = df.with_column("a", regexp_like(vec![col("values"), col("patterns")]))?;
    // use dataframe and regexp_like function to test col 'values', against patterns in col 'patterns' with flags
    let df = df.with_column(
        "b",
        regexp_like(vec![col("values"), col("patterns"), col("flags")]),
    )?;

    // you can use literals as well with dataframe calls
    let df = df.with_column(
        "c",
        regexp_like(vec![lit("foobarbequebaz"), lit("(bar)(beque)")]),
    )?;

    // keep only the three computed columns for display
    let df = df.select_columns(&["a", "b", "c"])?;

    // print the results
    df.show().await?;

    // use sql and regexp_like function to test col 'values', against patterns in col 'patterns' without flags
    let df = ctx
        .sql("select regexp_like(values, patterns) from examples")
        .await?;

    // print the results
    df.show().await?;

    // use sql and regexp_like function to test col 'values', against patterns in col 'patterns' with flags
    let df = ctx
        .sql("select regexp_like(values, patterns, flags) from examples")
        .await?;

    // print the results
    df.show().await?;

    // literals work as well
    // to match against the entire input use ^ and $ in the regex
    let df = ctx.sql("select regexp_like('John Smith', '^.*Smith$'), regexp_like('Smith Jones', '^Smith.*$')").await?;

    // print the results
    df.show().await?;

    // look-around and back references are not supported for performance
    // reasons.
    // Note that an error may not always be returned but the result
    // if returned will always be false
    //
    // NOTE(review): every other call in this file passes (text, regex), but
    // here - and in the two SQL statements below that use look-around - the
    // look-around pattern is the FIRST argument and the plain text is the
    // second. That looks like swapped arguments, in which case the `false`
    // result asserted below would come from matching the wrong way round
    // rather than from look-around handling. Confirm the intended order
    // before relying on this section.
    let df = ctx.read_empty()?.with_column(
        "a",
        regexp_like(vec![
            lit(r"(?<=[A-Z]\w* )Smith"),
            lit("John Smith"),
            lit("i"),
        ]),
    )?;
    let df = df.select_columns(&["a"])?;

    // print the results
    df.show().await?;

    // same scenario via sql; the outcome is collected so it can be asserted on
    let result = ctx
        .sql(r"select regexp_like('(?<=[A-Z]\w )Smith', 'John Smith', 'i') as a")
        .await?
        .collect()
        .await;

    // the expected outcome: a single non-nullable boolean column `a` holding `false`
    let expected = RecordBatch::try_new(
        Arc::new(Schema::new(vec![Field::new("a", DataType::Boolean, false)])),
        vec![Arc::new(BooleanArray::from(vec![false]))],
    )
    .unwrap();

    assert!(result.is_ok());
    let result = result.unwrap();

    // exactly one batch is expected back
    assert_eq!(result.len(), 1);
    // logged through the `log` facade; this function does not install a
    // logger itself, so these lines are only visible if one is configured
    info!("{:?}", result[0]);
    info!("{expected:?}");

    // the batches are compared through their `Debug` representation
    assert_eq!(format!("{:?}", result[0]), format!("{expected:?}"));

    // invalid flags will result in an error
    // NOTE(review): the pattern is again the first argument and the second
    // argument is the integer literal 4010 rather than a string - confirm
    // this is intentional; only the flag error is asserted here.
    let result = ctx
        .sql(r"select regexp_like('\b4(?!000)\d\d\d\b', 4010, 'g')")
        .await?
        .collect()
        .await;

    let expected = "regexp_like() does not support the \"global\" option";
    assert_contains!(result.unwrap_err().to_string(), expected);

    // there is a size limit on the regex during regex compilation
    let result = ctx
        .sql("select regexp_like('aaaaa', 'a{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}')")
        .await?
        .collect()
        .await;

    let expected = "Regular expression did not compile: CompiledTooBig(";
    assert_contains!(result.unwrap_err().to_string(), expected);

    //
    //
    // regexp_match examples
    //
    //
    // regexp_match format is regexp_match(text, regex[, flags])
    //

    // start again from the registered table so the columns added above are not carried over
    let df = ctx.table("examples").await?;

    // use dataframe and regexp_match function to test col 'values', against patterns in col 'patterns' without flags
    let df = df.with_column("a", regexp_match(vec![col("values"), col("patterns")]))?;
    // use dataframe and regexp_match function to test col 'values', against patterns in col 'patterns' with flags
    let df = df.with_column(
        "b",
        regexp_match(vec![col("values"), col("patterns"), col("flags")]),
    )?;

    // you can use literals as well with dataframe calls
    let df = df.with_column(
        "c",
        regexp_match(vec![lit("foobarbequebaz"), lit("(bar)(beque)")]),
    )?;

    // keep only the three computed columns for display
    let df = df.select_columns(&["a", "b", "c"])?;

    // print the results
    df.show().await?;

    // use sql and regexp_match function to test col 'values', against patterns in col 'patterns' without flags
    let df = ctx
        .sql("select regexp_match(values, patterns) from examples")
        .await?;

    // print the results
    df.show().await?;

    // use sql and regexp_match function to test col 'values', against patterns in col 'patterns' with flags
    let df = ctx
        .sql("select regexp_match(values, patterns, flags) from examples")
        .await?;

    // print the results
    df.show().await?;

    // literals work as well
    // to match against the entire input use ^ and $ in the regex
    let df = ctx.sql("select regexp_match('John Smith', '^.*Smith$'), regexp_match('Smith Jones', '^Smith.*$')").await?;

    // print the results
    df.show().await?;

    //
    //
    // regexp_replace examples
    //
    //
    // regexp_replace format is regexp_replace(text, regex, replace, flags)
    //
    // NOTE: these are ordinary (non-raw) Rust strings, so `\\1` in the source
    // NOTE: reaches the SQL engine as `\1` (presumably a reference to the
    // NOTE: first capture group - confirm against the function docs)

    // global flag example
    let df = ctx
        .sql("SELECT regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', 'g')")
        .await?;

    // print the results
    df.show().await?;

    // without global flag
    let df = ctx
        .sql("SELECT regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', null)")
        .await?;

    // print the results
    df.show().await?;

    // null regex means null result
    let df = ctx
        .sql("SELECT regexp_replace('foobarbaz', NULL, 'X\\1Y', 'g')")
        .await?;

    // print the results
    df.show().await?;

    Ok(())
}
21 changes: 21 additions & 0 deletions datafusion/core/tests/dataframe/dataframe_functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,27 @@ async fn test_fn_md5() -> Result<()> {
Ok(())
}

/// Checks `regexp_like` when built through the expression API: column `a`
/// is tested against the literal pattern `[a-z]`, and all four rows are
/// expected to be `true`.
///
/// The comparison itself is delegated to the `assert_fn_batches!` macro
/// (defined elsewhere in this file; presumably it evaluates the expression
/// against the shared `test` table and compares the pretty-printed output
/// line by line).
#[tokio::test]
#[cfg(feature = "unicode_expressions")]
async fn test_fn_regexp_like() -> Result<()> {
    let expr = regexp_like(vec![col("a"), lit("[a-z]")]);

    // Every row is padded to the full 37-character table width so that it
    // lines up with the border; a row narrower than the border can never
    // equal a line of pretty-printed table output.
    let expected = [
        "+-----------------------------------+",
        "| regexp_like(test.a,Utf8(\"[a-z]\")) |",
        "+-----------------------------------+",
        "| true                              |",
        "| true                              |",
        "| true                              |",
        "| true                              |",
        "+-----------------------------------+",
    ];

    assert_fn_batches!(expr, expected);

    Ok(())
}

#[tokio::test]
#[cfg(feature = "unicode_expressions")]
async fn test_fn_regexp_match() -> Result<()> {
Expand Down
Loading

0 comments on commit 1b4988c

Please sign in to comment.