Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve LIKE regex performance up to 12x #6145

Merged
merged 1 commit into from
Jul 29, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 57 additions & 26 deletions arrow-string/src/predicate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,39 +140,54 @@ fn ends_with_ignore_ascii_case(haystack: &str, needle: &str) -> bool {

/// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does:
///
/// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.`
/// 2. Escape regex meta characters to match them and not be evaluated as regex special chars. For example: `.` => `\\.`
/// 3. Replace escaped like wildcards removing the escape characters to be able to match it as a regex. For example: `\\%` => `%`
/// 1. Replace `LIKE` multi-character wildcards `%` => `.*` (unless they're at the start or end of the pattern,
/// where the regex is just truncated - e.g. `%foo%` => `foo` rather than `^.*foo.*$`)
/// 2. Replace `LIKE` single-character wildcards `_` => `.`
/// 3. Escape regex meta characters so they are matched literally instead of being evaluated as regex special chars. e.g. `.` => `\.`
/// 4. Replace escaped `LIKE` wildcards, removing the escape character so the wildcard is matched literally by the regex. e.g. `\%` => `%`
fn regex_like(pattern: &str, case_insensitive: bool) -> Result<Regex, ArrowError> {
let mut result = String::with_capacity(pattern.len() * 2);
result.push('^');
let mut chars_iter = pattern.chars().peekable();
match chars_iter.peek() {
// if the pattern starts with `%`, we avoid starting the regex with a slow but meaningless `^.*`
Some('%') => {
chars_iter.next();
}
_ => result.push('^'),
};

while let Some(c) = chars_iter.next() {
if c == '\\' {
let next = chars_iter.peek();
match next {
Some(next) if is_like_pattern(*next) => {
result.push(*next);
// Skipping the next char as it is already appended
chars_iter.next();
match c {
'\\' => {
match chars_iter.peek() {
Some(next) if is_like_pattern(*next) => {
result.push(*next);
// Skipping the next char as it is already appended
chars_iter.next();
}
_ => {
result.push('\\');
result.push('\\');
}
}
_ => {
result.push('\\');
}
'%' => result.push_str(".*"),
'_' => result.push('.'),
c => {
if regex_syntax::is_meta_character(c) {
result.push('\\');
}
result.push(c);
}
} else if regex_syntax::is_meta_character(c) {
result.push('\\');
result.push(c);
} else if c == '%' {
result.push_str(".*");
} else if c == '_' {
result.push('.');
} else {
result.push(c);
}
}
result.push('$');
// instead of ending the regex with `.*$` and making it needlessly slow, we drop the trailing `.*` and leave the end of the regex unanchored
if result.ends_with(".*") {
result.pop();
result.pop();
} else {
result.push('$');
}
RegexBuilder::new(&result)
.case_insensitive(case_insensitive)
.dot_matches_new_line(true)
Expand All @@ -197,9 +212,25 @@ mod tests {
use super::*;

#[test]
fn test_replace_like_wildcards() {
let a_eq = "_%";
let expected = "^..*$";
fn test_replace_start_end_percent() {
let a_eq = "%foobar%";
let expected = "foobar";
let r = regex_like(a_eq, false).unwrap();
assert_eq!(r.to_string(), expected);
}

/// A `%` in the middle of a `LIKE` pattern is translated to `.*`, and the
/// resulting regex keeps both the `^` and `$` anchors.
#[test]
fn test_replace_middle_percent() {
    let regex = regex_like("foo%bar", false).unwrap();
    assert_eq!(regex.to_string(), "^foo.*bar$");
}

/// A `_` in a `LIKE` pattern is translated to the single-character regex
/// wildcard `.`, with the regex anchored at both ends.
#[test]
fn test_replace_underscore() {
    let regex = regex_like("foo_bar", false).unwrap();
    assert_eq!(regex.to_string(), "^foo.bar$");
}
Expand Down
Loading