Skip to content

Commit

Permalink
[Kernel][Expressions] Add support for LIKE expression (#3103)
Browse files Browse the repository at this point in the history
## Description
Add SQL `LIKE` expression support in Kernel list of supported expressions and a default implementation.

Addresses part of #2539 (where `STARTS_WITH` can be expressed as `LIKE 'str%'`).

## How was this patch tested?
Added unit tests.

Signed-off-by: Krishnan Paranji Ravi <[email protected]>
  • Loading branch information
krishnanravi authored May 23, 2024
1 parent 35c7536 commit 0deef04
Show file tree
Hide file tree
Showing 7 changed files with 444 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,12 @@
* <li>Since version: 3.2.0</li>
* </ul>
* </li>
* <li>Name: <code>LIKE</code>
* <ul>
* <li>SQL semantic: <code>expr LIKE expr</code></li>
* <li>Since version: 3.3.0</li>
* </ul>
* </li>
* </ol>
*
* @since 3.0.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,15 @@ public static UnsupportedOperationException unsupportedExpressionException(
reason);
return new UnsupportedOperationException(message);
}

/**
 * Builds the exception reported when the pattern of a LIKE expression contains an
 * invalid escape sequence.
 *
 * @param pattern the LIKE pattern that contains the invalid escape sequence
 * @param index character index within {@code pattern} of the offending escape
 * @return an {@link IllegalArgumentException} whose message names the pattern and the index
 */
public static IllegalArgumentException invalidEscapeSequence(String pattern, int index) {
    final String message = String.format(
        "LIKE expression has invalid escape sequence '%s' at index %d",
        pattern,
        index);
    return new IllegalArgumentException(message);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import java.util.stream.Collectors;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
import static java.util.stream.Collectors.toList;

import io.delta.kernel.data.ColumnVector;
import io.delta.kernel.data.ColumnarBatch;
Expand All @@ -31,8 +32,6 @@
import static io.delta.kernel.internal.util.ExpressionUtils.getRight;
import static io.delta.kernel.internal.util.ExpressionUtils.getUnaryChild;
import static io.delta.kernel.internal.util.Preconditions.checkArgument;


import io.delta.kernel.defaults.internal.data.vector.DefaultBooleanVector;
import io.delta.kernel.defaults.internal.data.vector.DefaultConstantVector;
import static io.delta.kernel.defaults.internal.DefaultEngineErrors.unsupportedExpressionException;
Expand Down Expand Up @@ -280,6 +279,21 @@ ExpressionTransformResult visitCoalesce(ScalarExpression coalesce) {
);
}

/**
 * Transforms a {@code LIKE} predicate: every operand is transformed first, then the
 * transformed operands and their output types are handed to
 * {@link LikeExpressionEvaluator#validateAndTransform} for validation. The result is
 * always of boolean type.
 */
@Override
ExpressionTransformResult visitLike(final Predicate like) {
    final List<ExpressionTransformResult> transformedOperands = like.getChildren()
        .stream()
        .map(operand -> visit(operand))
        .collect(toList());

    final List<Expression> operandExpressions = transformedOperands.stream()
        .map(operand -> operand.expression)
        .collect(toList());

    final Predicate validatedLike = LikeExpressionEvaluator.validateAndTransform(
        like,
        operandExpressions,
        transformedOperands.stream().map(operand -> operand.outputType).collect(toList()));

    return new ExpressionTransformResult(validatedLike, BooleanType.BOOLEAN);
}

private Predicate validateIsPredicate(
Expression baseExpression,
ExpressionTransformResult result) {
Expand Down Expand Up @@ -560,6 +574,15 @@ ColumnVector visitCoalesce(ScalarExpression coalesce) {
);
}

/**
 * Evaluates a {@code LIKE} predicate: each operand is evaluated to a column vector, in
 * operand order, and the resulting vectors are passed to
 * {@link LikeExpressionEvaluator#eval}.
 */
@Override
ColumnVector visitLike(final Predicate like) {
    final List<ColumnVector> operandVectors = like.getChildren()
        .stream()
        .map(operand -> visit(operand))
        .collect(toList());
    return LikeExpressionEvaluator.eval(operandVectors);
}

/**
* Utility method to evaluate inputs to the binary input expression. Also validates the
* evaluated expression result {@link ColumnVector}s are of the same size.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ abstract class ExpressionVisitor<R> {

/**
 * Visits a scalar expression named {@code COALESCE}; the scalar-expression dispatch
 * passes the expression through unchanged.
 */
abstract R visitCoalesce(ScalarExpression ifNull);

/**
 * Visits a scalar expression named {@code LIKE}; the scalar-expression dispatch wraps
 * the expression's name and children in a new {@link Predicate} before calling this.
 */
abstract R visitLike(Predicate predicate);

final R visit(Expression expression) {
if (expression instanceof PartitionValueExpression) {
return visitPartitionValue((PartitionValueExpression) expression);
Expand Down Expand Up @@ -105,6 +107,8 @@ private R visitScalarExpression(ScalarExpression expression) {
return visitIsNull(new Predicate(name, children));
case "COALESCE":
return visitCoalesce(expression);
case "LIKE":
return visitLike(new Predicate(name, children));
default:
throw new UnsupportedOperationException(
String.format("Scalar expression `%s` is not supported.", name));
Expand All @@ -114,8 +118,8 @@ private R visitScalarExpression(ScalarExpression expression) {
private static Predicate elemAsPredicate(List<Expression> expressions, int index) {
if (expressions.size() <= index) {
throw new RuntimeException(
String.format("Trying to access invalid entry (%d) in list %s", index,
expressions.stream().map(Object::toString).collect(joining(","))));
String.format("Trying to access invalid entry (%d) in list %s", index,
expressions.stream().map(Object::toString).collect(joining(","))));
}
Expression elemExpression = expressions.get(index);
if (!(elemExpression instanceof Predicate)) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
/*
* Copyright (2023) The Delta Lake Project Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.delta.kernel.defaults.internal.expressions;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.regex.Pattern;

import io.delta.kernel.data.ColumnVector;
import io.delta.kernel.expressions.Expression;
import io.delta.kernel.expressions.Literal;
import io.delta.kernel.expressions.Predicate;
import io.delta.kernel.types.BooleanType;
import io.delta.kernel.types.DataType;
import io.delta.kernel.types.StringType;
import io.delta.kernel.internal.util.Utils;

import static io.delta.kernel.defaults.internal.DefaultEngineErrors.invalidEscapeSequence;
import static io.delta.kernel.defaults.internal.DefaultEngineErrors.unsupportedExpressionException;

/**
 * Utility methods to validate and evaluate the SQL {@code LIKE} expression.
 *
 * <p>A {@code LIKE} expression takes two or three operands: the input string, the pattern
 * and an optional single-character escape token. In the pattern {@code _} matches any one
 * character and {@code %} matches any sequence of characters (including none); the escape
 * character may only precede {@code _}, {@code %} or itself.
 */
public class LikeExpressionEvaluator {
    /** Escape character used when the expression does not supply a (non-empty) one. */
    private static final char DEFAULT_ESCAPE_CHAR = '\\';

    private LikeExpressionEvaluator() {
    }

    /**
     * Validates the already-transformed operands of a {@code LIKE} expression and rebuilds
     * the predicate from them.
     *
     * @param like the original {@code LIKE} predicate; used for its name and error reporting
     * @param childrenExpressions transformed operands: input, pattern and optional escape
     * @param childrenOutputTypes output types of {@code childrenExpressions}, in the same order
     * @return a predicate with the same name as {@code like} over the transformed operands
     * @throws UnsupportedOperationException if the operand count is not 2 or 3, if the input
     *         or pattern is not of string type, or if the escape token is not a string
     *         literal holding exactly one character
     */
    static Predicate validateAndTransform(
            Predicate like,
            List<Expression> childrenExpressions,
            List<DataType> childrenOutputTypes) {
        int size = childrenExpressions.size();
        if (size < 2 || size > 3) {
            throw unsupportedExpressionException(like,
                "Invalid number of inputs to LIKE expression. " +
                "Example usage: LIKE(column, 'test%'), LIKE(column, 'test\\[%', '\\')");
        }

        Expression left = childrenExpressions.get(0);
        DataType leftOutputType = childrenOutputTypes.get(0);
        Expression right = childrenExpressions.get(1);
        DataType rightOutputType = childrenOutputTypes.get(1);
        Expression escapeCharExpr = size == 3 ? childrenExpressions.get(2) : null;
        DataType escapeCharOutputType = size == 3 ? childrenOutputTypes.get(2) : null;

        if (!(StringType.STRING.equivalent(leftOutputType)
                && StringType.STRING.equivalent(rightOutputType))) {
            throw unsupportedExpressionException(like,
                "LIKE is only supported for string type expressions");
        }

        if (escapeCharExpr != null) {
            if (!(escapeCharExpr instanceof Literal
                    && StringType.STRING.equivalent(escapeCharOutputType))) {
                throw unsupportedExpressionException(like,
                    "LIKE expects escape token expression to be a literal of String type");
            }

            // A literal may carry a null value; reject it here instead of failing with a
            // NullPointerException on toString().
            Object escapeValue = ((Literal) escapeCharExpr).getValue();
            if (escapeValue == null || escapeValue.toString().length() != 1) {
                throw unsupportedExpressionException(like,
                    "LIKE expects escape token to be a single character");
            }
        }

        List<Expression> children = new ArrayList<>(Arrays.asList(left, right));
        if (Objects.nonNull(escapeCharExpr)) {
            children.add(escapeCharExpr);
        }
        return new Predicate(like.getName(), children);
    }

    /**
     * Builds a boolean {@link ColumnVector} that lazily evaluates {@code LIKE} row by row.
     *
     * @param children evaluated operands: input vector, pattern vector and, optionally, the
     *                 escape-character vector (its value at row 0 is used for all rows)
     * @return a vector whose row is null when the input or the pattern is null at that row,
     *         and otherwise tells whether the input matches the pattern
     */
    static ColumnVector eval(List<ColumnVector> children) {
        return new ColumnVector() {
            final ColumnVector escapeCharVector =
                children.size() == 3 ?
                    children.get(2) :
                    null;
            final ColumnVector left = children.get(0);
            final ColumnVector right = children.get(1);

            // Resolved on first use so that the escape vector is not read for vectors
            // whose rows are never evaluated.
            Character escapeChar = null;

            // Most recently compiled pattern. The pattern operand is commonly the same
            // string on every row, so this avoids re-building and re-compiling the regex
            // per row. Kept in one immutable holder so the pattern text and its compiled
            // form are always read as a consistent pair.
            CompiledLikePattern lastCompiled = null;

            private void initEscapeCharIfRequired() {
                if (escapeChar == null) {
                    String escapeString =
                        escapeCharVector != null ? escapeCharVector.getString(0) : null;
                    escapeChar =
                        escapeString != null && !escapeString.isEmpty() ?
                            escapeString.charAt(0) :
                            DEFAULT_ESCAPE_CHAR;
                }
            }

            @Override
            public DataType getDataType() {
                return BooleanType.BOOLEAN;
            }

            @Override
            public int getSize() {
                return left.getSize();
            }

            @Override
            public void close() {
                // Every operand vector handed to eval() is released here, including the
                // optional escape-character vector.
                if (escapeCharVector == null) {
                    Utils.closeCloseables(left, right);
                } else {
                    Utils.closeCloseables(left, right, escapeCharVector);
                }
            }

            @Override
            public boolean getBoolean(int rowId) {
                initEscapeCharIfRequired();
                return isLike(left.getString(rowId), right.getString(rowId), escapeChar);
            }

            @Override
            public boolean isNullAt(int rowId) {
                return left.isNullAt(rowId) || right.isNullAt(rowId);
            }

            private boolean isLike(String input, String pattern, char escape) {
                if (Objects.isNull(input) || Objects.isNull(pattern)) {
                    return false;
                }
                CompiledLikePattern compiled = lastCompiled;
                if (compiled == null || !compiled.likePattern.equals(pattern)) {
                    compiled = new CompiledLikePattern(
                        pattern,
                        Pattern.compile(escapeLikeRegex(pattern, escape)));
                    lastCompiled = compiled;
                }
                // Whole-input match, same as String#matches.
                return compiled.regex.matcher(input).matches();
            }
        };
    }

    /**
     * Immutable pairing of a LIKE pattern string with the Java regex compiled from it.
     */
    private static final class CompiledLikePattern {
        private final String likePattern;
        private final Pattern regex;

        private CompiledLikePattern(String likePattern, Pattern regex) {
            this.likePattern = likePattern;
            this.regex = regex;
        }
    }

    /**
     * Utility method to convert a LIKE pattern to a Java regex.
     *
     * <p>{@code _} becomes {@code .}, {@code %} becomes {@code .*}, and every other
     * character, including an escaped {@code _}, {@code %} or escape character, is quoted
     * so that it matches literally. The regex is prefixed with {@code (?s)} so that
     * {@code .} also matches line terminators.
     *
     * @param pattern the pattern used in the expression
     * @param escape escape character to use
     * @return java regex
     * @throws IllegalArgumentException if the escape character is the last character of the
     *         pattern or is followed by anything other than {@code _}, {@code %} or itself
     */
    private static String escapeLikeRegex(String pattern, char escape) {
        final int len = pattern.length();
        final StringBuilder javaPattern = new StringBuilder(len + len);
        for (int i = 0; i < len; i++) {
            char c = pattern.charAt(i);

            if (c == escape) {
                if (i == len - 1) {
                    // Dangling escape at the end of the pattern.
                    throw invalidEscapeSequence(pattern, i);
                }
                char nextChar = pattern.charAt(i + 1);
                if ((nextChar == '_')
                        || (nextChar == '%')
                        || (nextChar == escape)) {
                    javaPattern.append(Pattern.quote(Character.toString(nextChar)));
                    // Skip the escaped character; it has already been emitted.
                    i++;
                } else {
                    throw invalidEscapeSequence(pattern, i);
                }
            } else if (c == '_') {
                javaPattern.append('.');
            } else if (c == '%') {
                javaPattern.append(".*");
            } else {
                javaPattern.append(Pattern.quote(Character.toString(c)));
            }
        }
        return "(?s)" + javaPattern;
    }
}
Loading

0 comments on commit 0deef04

Please sign in to comment.