Add regexp_like scalar function

Omega359 · Feb 5, 2024 · 1b4988c · 1b4988c
1 parent c43c5f1
commit 1b4988c
Show file tree

Hide file tree

Showing 17 changed files with 1,137 additions and 133 deletions.
diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md
@@ -52,21 +52,22 @@ cargo run --example csv_sql
 - [`dataframe_output.rs`](examples/dataframe_output.rs): Examples of methods which write data out from a DataFrame
 - [`dataframe_in_memory.rs`](examples/dataframe_in_memory.rs): Run a query using a DataFrame against data in memory
 - [`deserialize_to_struct.rs`](examples/deserialize_to_struct.rs): Convert query results into rust structs using serde
-- [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify and anaylze `Expr`s
+- [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify and analyze `Expr`s
 - [`flight_sql_server.rs`](examples/flight/flight_sql_server.rs): Run DataFusion as a standalone process and execute SQL queries from JDBC clients
+- [`make_date.rs`](examples/make_date.rs): Examples of using the make_date function
 - [`memtable.rs`](examples/memtable.rs): Create an query data in memory using SQL and `RecordBatch`es
 - [`parquet_sql.rs`](examples/parquet_sql.rs): Build and run a query plan from a SQL statement against a local Parquet file
 - [`parquet_sql_multiple_files.rs`](examples/parquet_sql_multiple_files.rs): Build and run a query plan from a SQL statement against multiple local Parquet files
 - [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3
 - [`query-http-csv.rs`](examples/query-http-csv.rs): Configure `object_store` and run a query against files vi HTTP
+- [`regexp.rs`](examples/regexp.rs): Examples of using regular expression functions
 - [`rewrite_expr.rs`](examples/rewrite_expr.rs): Define and invoke a custom Query Optimizer pass
+- [`to_timestamp.rs`](examples/to_timestamp.rs): Examples of using to_timestamp functions
 - [`simple_udf.rs`](examples/simple_udf.rs): Define and invoke a User Defined Scalar Function (UDF)
 - [`advanced_udf.rs`](examples/advanced_udf.rs): Define and invoke a more complicated User Defined Scalar Function (UDF)
 - [`simple_udaf.rs`](examples/simple_udaf.rs): Define and invoke a User Defined Aggregate Function (UDAF)
 - [`advanced_udaf.rs`](examples/advanced_udaf.rs): Define and invoke a more complicated User Defined Aggregate Function (UDAF)
 - [`simple_udfw.rs`](examples/simple_udwf.rs): Define and invoke a User Defined Window Function (UDWF)
-- [`make_date.rs`](examples/make_date.rs): Examples of using the make_date function
-- [`to_timestamp.rs`](examples/to_timestamp.rs): Examples of using the to_timestamp functions
 - [`advanced_udwf.rs`](examples/advanced_udwf.rs): Define and invoke a more complicated User Defined Window Function (UDWF)
 
 ## Distributed

diff --git a/datafusion-examples/examples/regexp.rs b/datafusion-examples/examples/regexp.rs
@@ -0,0 +1,300 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow::array::{BooleanArray, LargeStringArray, StringArray, StringBuilder};
+use log::info;
+
+use datafusion::arrow::datatypes::{DataType, Field, Schema};
+use datafusion::arrow::record_batch::RecordBatch;
+use datafusion::error::Result;
+use datafusion::prelude::*;
+use datafusion_common::assert_contains;
+
+/// This example demonstrates how to use the `regexp_like`, `regexp_match`
+/// and `regexp_replace` functions in the DataFrame API as well as via SQL.
+#[tokio::main]
+async fn main() -> Result<()> {
+    // define a schema. Regex functions are restricted to Utf8 and LargeUtf8 data
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("values", DataType::Utf8, false),
+        Field::new("patterns", DataType::LargeUtf8, false),
+        Field::new("flags", DataType::Utf8, true),
+    ]));
+
+    // flags column: "i" for the first three rows, null for the remaining eight
+    let mut sb = StringBuilder::new();
+    sb.append_value("i");
+    sb.append_value("i");
+    sb.append_value("i");
+    sb.append_null();
+    sb.append_null();
+    sb.append_null();
+    sb.append_null();
+    sb.append_null();
+    sb.append_null();
+    sb.append_null();
+    sb.append_null();
+
+    // define data for our examples
+    let batch = RecordBatch::try_new(
+        schema,
+        vec![
+            Arc::new(StringArray::from(vec![
+                "abc",
+                "ABC",
+                "aBc",
+                "AbC",
+                "aBC",
+                "4000",
+                "4010",
+                "Düsseldorf",
+                "Москва",
+                "Köln",
+                "إسرائيل",
+            ])),
+            // the full list of supported features and
+            // syntax can be found at
+            // https://docs.rs/regex/latest/regex/#syntax
+
+            // NOTE: backslashes must be doubled (`\\`) to escape them
+            // NOTE: when not using the r"" raw string syntax
+            Arc::new(LargeStringArray::from(vec![
+                // simple regex examples
+                "^(a)",
+                "^(A).*",
+                "(b|d)",
+                "(B|D)",
+                "^(b|c)",
+                // word boundaries, grouping, etc
+                r"\b4([1-9]\d\d|\d[1-9]\d|\d\d[1-9])\b",
+                r"\b4([1-9]\d\d|\d[1-9]\d|\d\d[1-9])\b",
+                // unicode is supported
+                r"[\p{Letter}-]+",
+                r"[\p{L}-]+",
+                "[a-zA-Z]ö[a-zA-Z]{2}",
+                // unicode character classes work
+                r"^\p{Arabic}+$",
+            ])),
+            // supported flags can be found at
+            // https://docs.rs/regex/latest/regex/#grouping-and-flags
+            Arc::new(sb.finish()),
+        ],
+    )?;
+
+    // declare a new context. In spark API, this corresponds to a new spark SQLsession
+    let ctx = SessionContext::new();
+
+    // declare a table in memory. In spark API, this corresponds to createDataFrame(...).
+    ctx.register_batch("examples", batch)?;
+    let df = ctx.table("examples").await?;
+
+    //
+    //
+    //regexp_like examples
+    //
+    //
+    // regexp_like format is regexp_like(text, regex[, flags])
+    //
+
+    // use dataframe and regexp_like function to test col 'values', against patterns in col 'patterns' without flags
+    let df = df.with_column("a", regexp_like(vec![col("values"), col("patterns")]))?;
+    // use dataframe and regexp_like function to test col 'values', against patterns in col 'patterns' with flags
+    let df = df.with_column(
+        "b",
+        regexp_like(vec![col("values"), col("patterns"), col("flags")]),
+    )?;
+
+    // you can use literals as well with dataframe calls
+    let df = df.with_column(
+        "c",
+        regexp_like(vec![lit("foobarbequebaz"), lit("(bar)(beque)")]),
+    )?;
+
+    let df = df.select_columns(&["a", "b", "c"])?;
+
+    // print the results
+    df.show().await?;
+
+    // use sql and regexp_like function to test col 'values', against patterns in col 'patterns' without flags
+    let df = ctx
+        .sql("select regexp_like(values, patterns) from examples")
+        .await?;
+
+    // print the results
+    df.show().await?;
+
+    // use sql and regexp_like function to test col 'values', against patterns in col 'patterns' with flags
+    let df = ctx
+        .sql("select regexp_like(values, patterns, flags) from examples")
+        .await?;
+
+    // print the results
+    df.show().await?;
+
+    // literals work as well
+    // to match against the entire input use ^ and $ in the regex
+    let df = ctx.sql("select regexp_like('John Smith', '^.*Smith$'), regexp_like('Smith Jones', '^Smith.*$')").await?;
+
+    // print the results
+    df.show().await?;
+
+    // look-around and back references are not supported for performance
+    // reasons.
+    // Note that an error may not always be returned but the result
+    // if returned will always be false
+    // NOTE(review): the calls above pass (text, regex[, flags]), but the
+    // look-around and invalid-flag calls below pass the pattern as the first
+    // argument, so the second argument is what acts as the regex - confirm.
+    let df = ctx.read_empty()?.with_column(
+        "a",
+        regexp_like(vec![
+            lit(r"(?<=[A-Z]\w* )Smith"),
+            lit("John Smith"),
+            lit("i"),
+        ]),
+    )?;
+    let df = df.select_columns(&["a"])?;
+
+    // print the results
+    df.show().await?;
+
+    let result = ctx
+        .sql(r"select regexp_like('(?<=[A-Z]\w )Smith', 'John Smith', 'i') as a")
+        .await?
+        .collect()
+        .await?;
+
+    let expected = RecordBatch::try_new(
+        Arc::new(Schema::new(vec![Field::new("a", DataType::Boolean, false)])),
+        vec![Arc::new(BooleanArray::from(vec![false]))],
+    )?;
+
+    assert_eq!(result.len(), 1);
+    info!("{:?}", result[0]);
+    info!("{expected:?}");
+
+    assert_eq!(format!("{:?}", result[0]), format!("{expected:?}"));
+
+    // invalid flags will result in an error
+    let result = ctx
+        .sql(r"select regexp_like('\b4(?!000)\d\d\d\b', 4010, 'g')")
+        .await?
+        .collect()
+        .await;
+
+    let expected = "regexp_like() does not support the \"global\" option";
+    assert_contains!(result.unwrap_err().to_string(), expected);
+
+    // there is a size limit on the regex during regex compilation
+    let result = ctx
+        .sql("select regexp_like('aaaaa', 'a{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}')")
+        .await?
+        .collect()
+        .await;
+
+    let expected = "Regular expression did not compile: CompiledTooBig(";
+    assert_contains!(result.unwrap_err().to_string(), expected);
+
+    //
+    //
+    //regexp_match examples
+    //
+    //
+    // regexp_match format is regexp_match(text, regex[, flags])
+    //
+
+    let df = ctx.table("examples").await?;
+
+    // use dataframe and regexp_match function to test col 'values', against patterns in col 'patterns' without flags
+    let df = df.with_column("a", regexp_match(vec![col("values"), col("patterns")]))?;
+    // use dataframe and regexp_match function to test col 'values', against patterns in col 'patterns' with flags
+    let df = df.with_column(
+        "b",
+        regexp_match(vec![col("values"), col("patterns"), col("flags")]),
+    )?;
+
+    // you can use literals as well with dataframe calls
+    let df = df.with_column(
+        "c",
+        regexp_match(vec![lit("foobarbequebaz"), lit("(bar)(beque)")]),
+    )?;
+
+    let df = df.select_columns(&["a", "b", "c"])?;
+
+    // print the results
+    df.show().await?;
+
+    // use sql and regexp_match function to test col 'values', against patterns in col 'patterns' without flags
+    let df = ctx
+        .sql("select regexp_match(values, patterns) from examples")
+        .await?;
+
+    // print the results
+    df.show().await?;
+
+    // use sql and regexp_match function to test col 'values', against patterns in col 'patterns' with flags
+    let df = ctx
+        .sql("select regexp_match(values, patterns, flags) from examples")
+        .await?;
+
+    // print the results
+    df.show().await?;
+
+    // literals work as well
+    // to match against the entire input use ^ and $ in the regex
+    let df = ctx.sql("select regexp_match('John Smith', '^.*Smith$'), regexp_match('Smith Jones', '^Smith.*$')").await?;
+
+    // print the results
+    df.show().await?;
+
+    //
+    //
+    //regexp_replace examples
+    //
+    //
+    // regexp_replace format is regexp_replace(text, regex, replace, flags)
+    //
+
+    // global flag example
+    let df = ctx
+        .sql("SELECT regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', 'g')")
+        .await?;
+
+    // print the results
+    df.show().await?;
+
+    // without global flag
+    let df = ctx
+        .sql("SELECT regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', null)")
+        .await?;
+
+    // print the results
+    df.show().await?;
+
+    // null regex means null result
+    let df = ctx
+        .sql("SELECT regexp_replace('foobarbaz', NULL, 'X\\1Y', 'g')")
+        .await?;
+
+    // print the results
+    df.show().await?;
+
+    Ok(())
+}
diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs
@@ -434,6 +434,27 @@ async fn test_fn_md5() -> Result<()> {
     Ok(())
 }
 
+// regexp_like on column `a` with a literal pattern: expects `true` for all four rows
+#[tokio::test]
+#[cfg(feature = "unicode_expressions")]
+async fn test_fn_regexp_like() -> Result<()> {
+    let expected_rows = [
+        "+-----------------------------------+",
+        "| regexp_like(test.a,Utf8(\"[a-z]\")) |",
+        "+-----------------------------------+",
+        "| true                              |",
+        "| true                              |",
+        "| true                              |",
+        "| true                              |",
+        "+-----------------------------------+",
+    ];
+    let like_expr = regexp_like(vec![col("a"), lit("[a-z]")]);
+
+    assert_fn_batches!(like_expr, expected_rows);
+
+    Ok(())
+}
+
 #[tokio::test]
 #[cfg(feature = "unicode_expressions")]
 async fn test_fn_regexp_match() -> Result<()> {