Skip to content

Commit

Permalink
Integ match bool prefix #187 (#634)
Browse files Browse the repository at this point in the history
Signed-off-by: MaxKsyunz <[email protected]>

Co-authored-by: MaxKsyunz <[email protected]>
Co-authored-by: Max Ksyunz <[email protected]>
  • Loading branch information
3 people authored Jun 27, 2022
1 parent 7cbb121 commit 86dcd51
Show file tree
Hide file tree
Showing 32 changed files with 627 additions and 55 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package org.opensearch.sql.common.antlr;

import org.antlr.v4.runtime.tree.ParseTree;

/**
 * Contract for a component that converts a query string into an ANTLR parse tree.
 */
public interface Parser {
/**
 * Parses the given query text.
 *
 * @param query the query text to parse
 * @return the ANTLR {@link ParseTree} produced for {@code query}
 */
ParseTree parse(String query);
}
4 changes: 4 additions & 0 deletions core/src/main/java/org/opensearch/sql/expression/DSL.java
Original file line number Diff line number Diff line change
Expand Up @@ -666,6 +666,10 @@ public FunctionExpression simple_query_string(Expression... args) {
return compile(BuiltinFunctionName.SIMPLE_QUERY_STRING, args);
}

/**
 * Builds a {@code match_bool_prefix} function expression.
 *
 * @param args arguments handed to the function as-is
 * @return the function expression compiled for {@link BuiltinFunctionName#MATCH_BOOL_PREFIX}
 */
public FunctionExpression match_bool_prefix(Expression... args) {
  final BuiltinFunctionName function = BuiltinFunctionName.MATCH_BOOL_PREFIX;
  return compile(function, args);
}

/**
 * Compiles a built-in function through the function repository.
 *
 * @param bfn  the built-in function whose name is looked up in the repository
 * @param args the function arguments; a copy of the array is passed on
 * @return the compiled expression, cast to {@link FunctionExpression}
 */
private FunctionExpression compile(BuiltinFunctionName bfn, Expression... args) {
  // Hand the repository a list backed by a copy, not by the caller's varargs array.
  final Expression[] copiedArgs = args.clone();
  return (FunctionExpression) repository.compile(bfn.getName(), Arrays.asList(copiedArgs));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ public enum BuiltinFunctionName {
SIMPLE_QUERY_STRING(FunctionName.of("simple_query_string")),
MATCH_PHRASE(FunctionName.of("match_phrase")),
MATCHPHRASE(FunctionName.of("matchphrase")),
MATCH_BOOL_PREFIX(FunctionName.of("match_bool_prefix")),

/**
* Legacy Relevance Function.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import static org.opensearch.sql.data.type.ExprCoreType.STRING;
import static org.opensearch.sql.data.type.ExprCoreType.STRUCT;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import java.util.ArrayList;
import java.util.Collections;
Expand All @@ -27,6 +28,7 @@
public class OpenSearchFunctions {

public static final int MATCH_MAX_NUM_PARAMETERS = 14;
public static final int MATCH_BOOL_PREFIX_MAX_NUM_PARAMETERS = 9;
public static final int MATCH_PHRASE_MAX_NUM_PARAMETERS = 5;
public static final int MIN_NUM_PARAMETERS = 2;
public static final int MULTI_MATCH_MAX_NUM_PARAMETERS = 17;
Expand All @@ -36,6 +38,7 @@ public class OpenSearchFunctions {
* Add functions specific to OpenSearch to repository.
*/
public void register(BuiltinFunctionRepository repository) {
repository.register(match_bool_prefix());
repository.register(match());
repository.register(multi_match());
repository.register(simple_query_string());
Expand All @@ -45,6 +48,11 @@ public void register(BuiltinFunctionRepository repository) {
repository.register(match_phrase(BuiltinFunctionName.MATCHPHRASE));
}

/**
 * Creates the resolver for {@code match_bool_prefix}, which accepts at most
 * {@link #MATCH_BOOL_PREFIX_MAX_NUM_PARAMETERS} parameters.
 */
private static FunctionResolver match_bool_prefix() {
  return getRelevanceFunctionResolver(
      BuiltinFunctionName.MATCH_BOOL_PREFIX.getName(),
      MATCH_BOOL_PREFIX_MAX_NUM_PARAMETERS,
      STRING);
}

private static FunctionResolver match() {
FunctionName funcName = BuiltinFunctionName.MATCH.getName();
return getRelevanceFunctionResolver(funcName, MATCH_MAX_NUM_PARAMETERS, STRING);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,17 @@ public void named_non_parse_expression() {
assertAnalyzeEqual(DSL.ref("string_field", STRING), qualifiedName("string_field"));
}

/**
 * Verifies the analysis of a {@code match_bool_prefix} call that carries only the
 * {@code field} and {@code query} arguments.
 */
@Test
void match_bool_prefix_expression() {
// First argument: the expected DSL expression; second: the unresolved AST function call.
// NOTE(review): assertAnalyzeEqual presumably analyzes the AST and compares it with the
// expected expression - confirm against its definition.
assertAnalyzeEqual(
dsl.match_bool_prefix(
dsl.namedArgument("field", DSL.literal("fieldA")),
dsl.namedArgument("query", DSL.literal("sample query"))),
AstDSL.function("match_bool_prefix",
AstDSL.unresolvedArg("field", stringLiteral("fieldA")),
AstDSL.unresolvedArg("query", stringLiteral("sample query"))));
}

@Test
void visit_span() {
assertAnalyzeEqual(
Expand Down
41 changes: 41 additions & 0 deletions docs/user/dql/functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2233,6 +2233,47 @@ Another example to show how to set custom values for the optional parameters::
+----------------------+--------------------------+


MATCH_BOOL_PREFIX
-----------------

Description
>>>>>>>>>>>

``match_bool_prefix(field_expression, query_expression)``

The match_bool_prefix function maps to the match_bool_prefix query in the search engine. match_bool_prefix creates a match query from all but the last term in the query string. The last term is used to create a prefix query. The following optional parameters are supported:

- fuzziness
- max_expansions
- prefix_length
- fuzzy_transpositions
- fuzzy_rewrite
- minimum_should_match
- boost
- operator
- analyzer

Example with only ``field`` and ``query`` expressions, with all other parameters set to their default values::

os> SELECT firstname, address FROM accounts WHERE match_bool_prefix(address, 'Bristol Stre');
fetched rows / total rows = 2/2
+-------------+--------------------+
| firstname | address |
|-------------+--------------------|
| Hattie | 671 Bristol Street |
| Nanette | 789 Madison Street |
+-------------+--------------------+

Another example to show how to set custom values for the optional parameters::

os> SELECT firstname, address FROM accounts WHERE match_bool_prefix(address, 'Bristol Street', minimum_should_match=2);
fetched rows / total rows = 1/1
+-------------+--------------------+
| firstname | address |
|-------------+--------------------|
| Hattie | 671 Bristol Street |
+-------------+--------------------+

MULTI_MATCH
-----------

Expand Down
44 changes: 43 additions & 1 deletion docs/user/ppl/functions/relevance.rst
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,49 @@ Another example to show how to set custom values for the optional parameters::
| 1 | The House at Pooh Corner | Alan Alexander Milne |
+------+--------------------------+----------------------+

MATCH_BOOL_PREFIX
-----------------

Description
>>>>>>>>>>>

``match_bool_prefix(field_expression, query_expression)``

The match_bool_prefix function maps to the match_bool_prefix query in the search engine. match_bool_prefix creates a match query from all but the last term in the query string. The last term is used to create a prefix query. The following optional parameters are supported:

- analyzer
- fuzziness
- max_expansions
- prefix_length
- fuzzy_transpositions
- operator
- fuzzy_rewrite
- minimum_should_match
- boost

Example with only ``field`` and ``query`` expressions, with all other parameters set to their default values::

os> source=accounts | where match_bool_prefix(address, 'Bristol Stre') | fields firstname, address
fetched rows / total rows = 2/2
+-------------+--------------------+
| firstname | address |
|-------------+--------------------|
| Hattie | 671 Bristol Street |
| Nanette | 789 Madison Street |
+-------------+--------------------+

Another example to show how to set custom values for the optional parameters::

os> source=accounts | where match_bool_prefix(address, 'Bristol Stre', minimum_should_match = 2) | fields firstname, address
fetched rows / total rows = 1/1
+-------------+--------------------+
| firstname | address |
|-------------+--------------------|
| Hattie | 671 Bristol Street |
+-------------+--------------------+

Limitations
>>>>>>>>>>>

The relevance functions are available to execute only in OpenSearch DSL but not in memory as of now, so the relevance search might fail for queries that are too complex to translate into DSL if the relevance function is following after a complex PPL query. To make your queries always work-able, it is recommended to place the relevance commands as close to the search command as possible, to ensure the relevance functions are eligible to push down. For example, a complex query like ``search source = people | rename firstname as name | dedup account_number | fields name, account_number, balance, employer | where match(employer, 'Open Search') | stats count() by city`` could fail because it is difficult to translate to DSL, but it would be better if we rewrite it to an equivalent query as ``search source = people | where match(employer, 'Open Search') | rename firstname as name | dedup account_number | fields name, account_number, balance, employer | stats count() by city`` by moving the where command with relevance function to the second command right after the search command, and the relevance would be optimized and executed smoothly in OpenSearch DSL. See `Optimization <../../optimization/optimization.rst>`_ to get more details about the query engine optimization.
The relevance functions are available to execute only in OpenSearch DSL but not in memory as of now, so the relevance search might fail for queries that are too complex to translate into DSL if the relevance function is following after a complex PPL query. To make your queries always work-able, it is recommended to place the relevance commands as close to the search command as possible, to ensure the relevance functions are eligible to push down. For example, a complex query like ``search source = people | rename firstname as name | dedup account_number | fields name, account_number, balance, employer | where match(employer, 'Open Search') | stats count() by city`` could fail because it is difficult to translate to DSL, but it would be better if we rewrite it to an equivalent query as ``search source = people | where match(employer, 'Open Search') | rename firstname as name | dedup account_number | fields name, account_number, balance, employer | stats count() by city`` by moving the where command with relevance function to the second command right after the search command, and the relevance would be optimized and executed smoothly in OpenSearch DSL. See `Optimization <../../optimization/optimization.rst>`_ to get more details about the query engine optimization.

Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,7 @@ public void castStatementInWhereClauseLessThanConstantTest() {
public void castStatementInWhereClauseDatetimeCastTest() {
JSONObject response = executeJdbcRequest("SELECT date_keyword FROM "
+ TestsConstants.TEST_INDEX_DATE
+ " WHERE (CAST(date_keyword AS DATETIME) = \'2014-08-19T07:09:13.434Z\')");
+ " WHERE (CAST(date_keyword AS DATETIME) = '2014-08-19T07:09:13.434Z')");

String schema_result = "{\"name\":\"date_keyword\",\"type\":\"keyword\"}";
assertEquals(response.getJSONArray("schema").get(0).toString(), schema_result);
Expand Down Expand Up @@ -704,7 +704,7 @@ public void ifFuncShouldPassJDBC() {
JSONObject response = executeJdbcRequest(
"SELECT IF(age > 30, 'True', 'False') AS Ages FROM " + TEST_INDEX_ACCOUNT
+ " WHERE age IS NOT NULL GROUP BY Ages");
assertEquals("IF(age > 30, \'True\', \'False\')", response.query("/schema/0/name"));
assertEquals("IF(age > 30, 'True', 'False')", response.query("/schema/0/name"));
assertEquals("Ages", response.query("/schema/0/alias"));
assertEquals("keyword", response.query("/schema/0/type"));
}
Expand Down Expand Up @@ -742,7 +742,7 @@ public void ifnullShouldPassJDBC() throws IOException {
JSONObject response = executeJdbcRequest(
"SELECT IFNULL(lastname, 'unknown') AS name FROM " + TEST_INDEX_ACCOUNT
+ " GROUP BY name");
assertEquals("IFNULL(lastname, \'unknown\')", response.query("/schema/0/name"));
assertEquals("IFNULL(lastname, 'unknown')", response.query("/schema/0/name"));
assertEquals("name", response.query("/schema/0/alias"));
assertEquals("keyword", response.query("/schema/0/type"));
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.sql.ppl;

import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_PHRASE;
import static org.opensearch.sql.util.MatcherUtils.rows;
import static org.opensearch.sql.util.MatcherUtils.verifyDataRows;

import java.io.IOException;
import org.json.JSONObject;
import org.junit.Test;

/**
 * Integration tests for the PPL {@code match_bool_prefix} relevance function, run against the
 * phrase test index.
 */
public class MatchBoolPrefixIT extends PPLIntegTestCase {

  /** Loads the phrase index queried by every test in this class. */
  @Override
  public void init() throws IOException {
    loadIndex(Index.PHRASE);
  }

  /** A query of {@code 'qui'} is expected to return both "quick fox" phrases. */
  @Test
  public void valid_query_match_test() throws IOException {
    final String ppl = String.format(
        "source=%s | where match_bool_prefix(phrase, 'qui') | fields phrase",
        TEST_INDEX_PHRASE);
    final JSONObject response = executeQuery(ppl);

    verifyDataRows(response, rows("quick fox"), rows("quick fox here"));
  }

  /** Optional parameters (minimum_should_match, fuzziness) are accepted alongside the query. */
  @Test
  public void optional_parameter_match_test() throws IOException {
    final String ppl = String.format(
        "source=%s | where match_bool_prefix(phrase, '2 tes', minimum_should_match=1, fuzziness=2) | fields phrase",
        TEST_INDEX_PHRASE);
    final JSONObject response = executeQuery(ppl);

    verifyDataRows(response, rows("my test"), rows("my test 2"));
  }

  /** A term absent from the index yields an empty result. */
  @Test
  public void no_matches_test() throws IOException {
    final String ppl = String.format(
        "source=%s | where match_bool_prefix(phrase, 'rice') | fields phrase",
        TEST_INDEX_PHRASE);
    final JSONObject response = executeQuery(ppl);

    assertEquals(0, response.getInt("total"));
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ public void ifnullShouldPassJDBC() throws IOException {
JSONObject response = executeJdbcRequest(
"SELECT IFNULL(lastname, 'unknown') AS name FROM " + TEST_INDEX_ACCOUNT
+ " GROUP BY name");
assertEquals("IFNULL(lastname, \'unknown\')", response.query("/schema/0/name"));
assertEquals("IFNULL(lastname, 'unknown')", response.query("/schema/0/name"));
assertEquals("name", response.query("/schema/0/alias"));
assertEquals("keyword", response.query("/schema/0/type"));
}
Expand Down Expand Up @@ -92,7 +92,7 @@ public void ifnullWithMissingInputTest() {
public void nullifShouldPassJDBC() throws IOException {
JSONObject response = executeJdbcRequest(
"SELECT NULLIF(lastname, 'unknown') AS name FROM " + TEST_INDEX_ACCOUNT);
assertEquals("NULLIF(lastname, \'unknown\')", response.query("/schema/0/name"));
assertEquals("NULLIF(lastname, 'unknown')", response.query("/schema/0/name"));
assertEquals("name", response.query("/schema/0/alias"));
assertEquals("keyword", response.query("/schema/0/type"));
}
Expand Down Expand Up @@ -181,8 +181,8 @@ public void isnullWithMathExpr() throws IOException{
@Test
public void ifShouldPassJDBC() throws IOException {
JSONObject response = executeJdbcRequest(
"SELECT IF(2 > 0, \'hello\', \'world\') AS name FROM " + TEST_INDEX_ACCOUNT);
assertEquals("IF(2 > 0, \'hello\', \'world\')", response.query("/schema/0/name"));
"SELECT IF(2 > 0, 'hello', 'world') AS name FROM " + TEST_INDEX_ACCOUNT);
assertEquals("IF(2 > 0, 'hello', 'world')", response.query("/schema/0/name"));
assertEquals("name", response.query("/schema/0/alias"));
assertEquals("keyword", response.query("/schema/0/type"));
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.sql.sql;

import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_PHRASE;
import static org.opensearch.sql.util.MatcherUtils.rows;
import static org.opensearch.sql.util.MatcherUtils.schema;
import static org.opensearch.sql.util.MatcherUtils.verifyDataRows;
import static org.opensearch.sql.util.MatcherUtils.verifySchema;

import java.io.IOException;
import org.json.JSONObject;
import org.junit.Test;
import org.opensearch.sql.legacy.SQLIntegTestCase;

public class MatchBoolPrefixIT extends SQLIntegTestCase {
public void init() throws IOException {
loadIndex(SQLIntegTestCase.Index.PHRASE);
}

@Test
public void query_matches_test() throws IOException {
String query = "SELECT phrase FROM "
+ TEST_INDEX_PHRASE + " WHERE match_bool_prefix(phrase, 'quick')";
var result = new JSONObject(executeQuery(query, "jdbc"));
verifySchema(result, schema("phrase", "text"));

verifyDataRows(result,
rows("quick fox"),
rows("quick fox here"));
}

@Test
public void additional_parameters_test() throws IOException {
String query = "SELECT phrase FROM "
+ TEST_INDEX_PHRASE + " WHERE match_bool_prefix(phrase, '2 test', minimum_should_match=1, fuzziness=2)";
var result = new JSONObject(executeQuery(query, "jdbc"));
verifySchema(result, schema("phrase", "text"));

verifyDataRows(result,
rows("my test"),
rows("my test 2"));
}

@Test
public void no_matches_test() throws IOException {
String query = "SELECT * FROM "
+ TEST_INDEX_PHRASE + " WHERE match_bool_prefix(phrase, 'rice')";
var result = new JSONObject(executeQuery(query, "jdbc"));
assertEquals(0, result.getInt("total"));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -594,4 +594,4 @@ public void fieldWithSpacesInNameShouldPass() {
Assert.assertSame(TEXT, type.get());
}

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
public class EnvironmentTest {

/** Use context class for push/pop */
private SemanticContext context = new SemanticContext();
private final SemanticContext context = new SemanticContext();

@Test
public void defineFieldSymbolInDifferentEnvironmentsShouldBeAbleToResolve() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.opensearch.sql.opensearch.storage.script.filter.lucene.RangeQuery.Comparison;
import org.opensearch.sql.opensearch.storage.script.filter.lucene.TermQuery;
import org.opensearch.sql.opensearch.storage.script.filter.lucene.WildcardQuery;
import org.opensearch.sql.opensearch.storage.script.filter.lucene.relevance.MatchBoolPrefixQuery;
import org.opensearch.sql.opensearch.storage.script.filter.lucene.relevance.MatchPhraseQuery;
import org.opensearch.sql.opensearch.storage.script.filter.lucene.relevance.MatchQuery;
import org.opensearch.sql.opensearch.storage.script.filter.lucene.relevance.MultiMatchQuery;
Expand Down Expand Up @@ -62,6 +63,7 @@ public class FilterQueryBuilder extends ExpressionNodeVisitor<QueryBuilder, Obje
.put(BuiltinFunctionName.MATCHQUERY.getName(), new MatchQuery())
.put(BuiltinFunctionName.MULTI_MATCH.getName(), new MultiMatchQuery())
.put(BuiltinFunctionName.SIMPLE_QUERY_STRING.getName(), new SimpleQueryStringQuery())
.put(BuiltinFunctionName.MATCH_BOOL_PREFIX.getName(), new MatchBoolPrefixQuery())
.build();

/**
Expand Down
Loading

0 comments on commit 86dcd51

Please sign in to comment.