Bit-Quill · Yury-Fridlyand · Aug 21, 2023 · Jul 19, 2023 · Jul 21, 2023 · Jul 21, 2023
@@ -50,7 +50,7 @@ default boolean shouldCast(ExprType other) {
    * Get the parent type.
    */
   default List<ExprType> getParent() {
-    return Arrays.asList(UNKNOWN);
+    return List.of(UNKNOWN);
   }
 
   /**
@@ -64,4 +64,18 @@ default List<ExprType> getParent() {
   default String legacyTypeName() {
     return typeName();
   }
+
+  /**
+   * Perform field name conversion if needed before inserting it into a search query.
+   */
+  default String convertFieldForSearchQuery(String fieldName) {
+    return fieldName;
+  }
+
+  /**
+   * Perform value conversion if needed before inserting it into a search query.
+   */
+  default Object convertValueForSearchQuery(ExprValue value) {
+    return value.value();
+  }
 public class LikeQuery extends LuceneQuery { 
   @Override 
   public QueryBuilder doBuild(String fieldName, ExprType fieldType, ExprValue literal) { 
     String field = OpenSearchTextType.convertTextToKeyword(fieldName, fieldType); 
 public class LikeQuery extends LuceneQuery { 
   @Override 
   public QueryBuilder doBuild(String fieldName, ExprType fieldType, ExprValue literal) { 
     String field = OpenSearchTextType.convertTextToKeyword(fieldName, fieldType); 
 }
@@ -13,15 +13,8 @@
 
 /**
  * The definition of widening type rule for expression value.
- * ExprType     Widens to data types
- * INTEGER      LONG, FLOAT, DOUBLE
- * LONG         FLOAT, DOUBLE
- * FLOAT        DOUBLE
- * DOUBLE       DOUBLE
- * STRING       STRING
- * BOOLEAN      BOOLEAN
- * ARRAY        ARRAY
- * STRUCT       STRUCT
+ * See type widening definitions in {@link ExprCoreType}.
+ * For example, SHORT widens BYTE and so on.
  */
 @UtilityClass
 public class WideningTypeRule {
@@ -41,9 +34,9 @@ public static int distance(ExprType type1, ExprType type2) {
   }
 
   private static int distance(ExprType type1, ExprType type2, int distance) {
-    if (type1 == type2) {
+    if (type1.equals(type2)) {
       return distance;
-    } else if (type1 == UNKNOWN) {
+    } else if (type1.equals(UNKNOWN)) {
       return IMPOSSIBLE_WIDENING;
     } else {
       return type1.getParent().stream()
@@ -53,9 +46,9 @@ private static int distance(ExprType type1, ExprType type2, int distance) {
   }
 
   /**
-   * The max type among two types. The max is defined as follow
+   * The max type among two types. The max is defined as follows:
    * if type1 could widen to type2, then max is type2, vice versa
-   * if type1 could't widen to type2 and type2 could't widen to type1,
+   * if type1 couldn't widen to type2 and type2 couldn't widen to type1,
    * then throw {@link ExpressionEvaluationException}.
    *
    * @param type1 type1

@@ -44,6 +44,7 @@
 + **Query Analyzing**
   + [Semantic Analysis](query-semantic-analysis.md): performs semantic analysis to ensure semantic correctness
   + [Type Conversion](query-type-conversion.md): implement implicit data type conversion
+  + [`text` type](text-type.md): support for `text` data type
 + **Query Planning**
   + [Logical Optimization](query-optimizer-improvement.md): improvement on logical optimizer and physical implementer
 + **Query Execution**

@@ -9,7 +9,7 @@ Type conversion means conversion from one data type to another which has two asp
 1. Whether the conversion is implicit or explicit (implicit conversion is often called coercion)
 2. Whether the data is converted within the family or reinterpreted as another data type outside
 
-It’s common that strong typed language only supports little implicit conversions and no data reinterpretation. While languages with weak typing allows many implicit conversions and flexible reinterpretation.
+It's common that strong typed language only supports little implicit conversions and no data reinterpretation. While languages with weak typing allows many implicit conversions and flexible reinterpretation.
 
 ### 1.2 Problem Statement
 
@@ -30,8 +30,8 @@ However, more general conversions for non-numeric types are missing, such as con
 
 The common use case and motivation include:
 
-1. *User-friendly*: Although it doesn’t matter for application or BI tool which can always follow the strict grammar rule, it’s more friendly and accessible to human by implicit type conversion, ex. `date > DATE('2020-06-01') => date > '2020-06-01'`
-2. *Schema-on-read*: More importantly, implicit conversion from string is required for schema on read (stored as raw string on write and extract field(s) on read), ex. `regex ‘...’ | abs(a)`
+1. *User-friendly*: Although it doesn't matter for application or BI tool which can always follow the strict grammar rule, it's more friendly and accessible to human by implicit type conversion, ex. `date > DATE('2020-06-01') => date > '2020-06-01'`
+2. *Schema-on-read*: More importantly, implicit conversion from string is required for schema on read (stored as raw string on write and extract field(s) on read), ex. `regex ‘...' | abs(a)`
 
 ### 2.2 Functionalities
 
@@ -50,7 +50,7 @@ Future:
 
 ### 3.1 Type Precedence
 
-Type precedence determines the direction of conversion when fields involved in an expression has different type from resolved signature. Before introducing it into our type system, let’s check how an expression is resolved to a function implementation and why type precedence is required.
+Type precedence determines the direction of conversion when fields involved in an expression has different type from resolved signature. Before introducing it into our type system, let's check how an expression is resolved to a function implementation and why type precedence is required.
 
 ```
 Compiling time:
@@ -60,7 +60,7 @@ Compiling time:
  Function builder: returns equal(DOUBLE, DOUBLE) impl
 ```
 
-Now let’s follow the same idea to add support for conversion from `BOOLEAN` to `STRING`. Because all boolean values can be converted to a string (in other word string is “wider”), String type is made the parent of Boolean. However, this leads to wrong semantic as the following expression `false = ‘FALSE’` for example:
+Now let's follow the same idea to add support for conversion from `BOOLEAN` to `STRING`. Because all boolean values can be converted to a string (in other word string is 'wider'), String type is made the parent of Boolean. However, this leads to wrong semantic as the following expression `false = 'FALSE'` for example:
 
 ```
 Compiling time:
@@ -74,17 +74,18 @@ Runtime:
  Evaluation result: *false*
 ```
 
-Therefore type precedence is supposed to be defined based on semantic expected rather than intuitive “width” of type. Now let’s reverse the direction and make Boolean the parent of String type.
+Therefore type precedence is supposed to be defined based on semantic expected rather than intuitive 'width' of type. Now let's reverse the direction and make Boolean the parent of String type.
 
 ![New type hierarchy](img/type-hierarchy-tree-with-implicit-cast.png)
+Note: type hierarchy structure shown on the picture below was implemented in [#166](https://github.com/opensearch-project/sql/pull/166), but was changed later.
 
 ```
 Compiling time:
  Expression: false = 'FALSE'
  Unresolved signature: equal(BOOL, STRING)
  Resovled signature: equal(BOOL, BOOL)
  Function builder: 1) returns equal(BOOL, cast_to_bool(STRING)) impl
-                   2) returns equal(BOOL, BOOL) impl
+                   1) returns equal(BOOL, BOOL) impl
 Runtime:
  equal impl: false.equals(cast_to_bool('FALSE'))
  cast_to_bool impl: Boolean.valueOf('FALSE')
@@ -147,15 +148,15 @@ public enum ExprCoreType implements ExprType {
   DOUBLE(FLOAT),
 
   STRING(UNDEFINED),
-  BOOLEAN(STRING), // PR: change STRING's parent to BOOLEAN
+  BOOLEAN(STRING), // #166 changes: STRING's parent to BOOLEAN
 
   /**
    * Date.
    */
-  TIMESTAMP(UNDEFINED),
-  DATE(UNDEFINED),
-  TIME(UNDEFINED),
-  DATETIME(UNDEFINED),
+  DATE(STRING),  // #171 changes: datetime types parent to STRING
+  TIME(STRING),
+  DATETIME(STRING, DATE, TIME), // #1196 changes: extend DATETIME and TIMESTAMP parent list
+  TIMESTAMP(STRING, DATETIME),
   INTERVAL(UNDEFINED),
 
   STRUCT(UNDEFINED),
@@ -170,3 +171,12 @@ As with examples in section 3.1, the implementation is:
 1. Define all possible conversions in CAST function family.
 2. Define implicit conversions by type hierarchy tree (auto implicit cast from child to parent)
 3. During compile time, wrap original function builder by a new one which cast arguments to target type.
+
+## Final type hierarchy scheme
+
+![Most relevant type hierarchy](img/type-hierarchy-tree-final.png)
+
+## References
+* [#166](https://github.com/opensearch-project/sql/pull/166): Add automatic `STRING` -> `BOOLEAN` cast
+* [#171](https://github.com/opensearch-project/sql/pull/171): Add automatic `STRING` -> `DATE`/`TIME`/`DATETIME`/`TIMESTAMP` cast
+* [#1196](https://github.com/opensearch-project/sql/pull/1196): Add automatic casts between datetime types `DATE`/`TIME`/`DATETIME` -> `DATE`/`TIME`/`DATETIME`/`TIMESTAMP`
@@ -0,0 +1,45 @@
+## History overview
+
+* [odfe#620](https://github.com/opendistro-for-elasticsearch/sql/pull/620) - added two types for `text` (just text and text with fields), querying and string functions are supported, but searching is not.
+* [odfe#682](https://github.com/opendistro-for-elasticsearch/sql/pull/682) and [odfe#730](https://github.com/opendistro-for-elasticsearch/sql/pull/730) - added support for querying (in 682) and aggregation (730); if `text` has sub-fields, `.keyword` prefix is added to the field name regardless of the sub-field names.
+* [#1314](https://github.com/opensearch-project/sql/pull/1314) and [#1664](https://github.com/opensearch-project/sql/pull/1664) - changed format of storing `text` type; it is one type now, subfield information is stored, but not yet used.
+* Current changes (no PR number yet) - correctly resolve sub-field name if a `text` field has only one subfield. Fixes [#1112](https://github.com/opensearch-project/sql/issues/1112) and [#1038](https://github.com/opensearch-project/sql/issues/1038).
+
+## Further changes
+
+* Support search for text sub-fields ([#1113](https://github.com/opensearch-project/sql/issues/1113)).
+* Support multiple sub-fields for text ([#1887](https://github.com/opensearch-project/sql/issues/1887)).
+* Support non-default date formats for search queries ([#1847](https://github.com/opensearch-project/sql/issues/1847)). Fix for this bug depends on the current changes.
+
+## Problem statement
+
+`:opensearch` module parses index mapping and builds instances of `OpenSearchDataType` (a base class), but ships simplified types (`ExprCoreType` - a enum) to `:core` module, because `:core` uses `ExprCoreType`s to resolve functions.
+
+```mermaid
+sequenceDiagram
+  participant core as :core
+  participant opensearch as :opensearch
+
+  core ->>+ opensearch : resolve types
+  opensearch ->>- core : simplified types
+  note over core: preparing query
+  core ->> opensearch : translate request into datasource DSL
+```
+
+Later, `:core` returns to `:opensearch` the DSL request with types stored. Since types were simplified, all mapping information is lost. Adding new `TEXT` entry to `ExprCoreType` enum is not enough, because `ExprCoreType` is datasource agnostic and can't store any specific mapping info.
+
+## Solution
+
+The solution is to provide to `:core` non simplified types, but full types. Those objects should be fully compatible with `ExprCoreType` and implement all required APIs to allow `:core` to manipulate with built-in functions. Once those type objects are returned back to `:opensearch`, it can get all required information to build the correct search request.
+
+1. Pass full (non simplified) types to and through `:core`.
+2. Update `OpenSearchDataType` (and inheritors if needed) to be comparable with `ExprCoreType`.
+3. Update `:core` to do proper comparison (use `.equals` instead of `==`).
+5. Update `:opensearch` to use the mapping information received from `:core` and properly build the search query.
+
+## Type Schema
+
+| JDBC type | `ExprCoreType` | `OpenSearchDataType` | OpenSearch type |
+| --- | --- | --- | --- |
+| `VARCHAR`/`CHAR` | `STRING` | -- | `keyword` |
+| `LONGVARCHAR`/`TEXT` | `STRING` | `OpenSearchTextType` | `text` |
@@ -681,7 +681,16 @@ public enum Index {
     NESTED_WITH_NULLS(TestsConstants.TEST_INDEX_NESTED_WITH_NULLS,
         "multi_nested",
         getNestedTypeIndexMapping(),
-        "src/test/resources/nested_with_nulls.json");
+        "src/test/resources/nested_with_nulls.json"),
+    TEXT(TestsConstants.TEST_INDEX_TEXT,
+        "text",
+        getMappingFile("text_index_mappings.json"),
+        "src/test/resources/text.json"),
+    TEXT_FOR_CAST(TestsConstants.TEST_INDEX_TEXT_FOR_CAST,
+        "text",
+        getMappingFile("text_for_cast_index_mappings.json"),
+        "src/test/resources/text_for_cast.json"),
+    ;
 
     private final String name;
     private final String type;

@@ -60,6 +60,8 @@ public class TestsConstants {
   public final static String TEST_INDEX_WILDCARD = TEST_INDEX + "_wildcard";
   public final static String TEST_INDEX_MULTI_NESTED_TYPE = TEST_INDEX + "_multi_nested";
   public final static String TEST_INDEX_NESTED_WITH_NULLS = TEST_INDEX + "_nested_with_nulls";
+  public final static String TEST_INDEX_TEXT = TEST_INDEX + "_text";
+  public final static String TEST_INDEX_TEXT_FOR_CAST = TEST_INDEX + "_text_for_cast";
   public final static String DATASOURCES = ".ql-datasources";
 
   public final static String DATE_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";

@@ -36,8 +36,8 @@ public void test_numeric_data_types() throws IOException {
         schema("byte_number", "byte"),
         schema("double_number", "double"),
         schema("float_number", "float"),
-        schema("half_float_number", "float"),
-        schema("scaled_float_number", "double"));
+        schema("half_float_number", "half_float"),
+        schema("scaled_float_number", "scaled_float"));
   }
 
   @Test
@@ -51,8 +51,8 @@ public void test_nonnumeric_data_types() throws IOException {
         schema("binary_value", "binary"),
         schema("date_value", "timestamp"),
         schema("ip_value", "ip"),
-        schema("object_value", "struct"),
-        schema("nested_value", "array"),
+        schema("object_value", "object"),
+        schema("nested_value", "nested"),
         schema("geo_point_value", "geo_point"));
   }
 

@@ -56,19 +56,18 @@ public void typeof_opensearch_types() throws IOException {
         + " | fields `double`, `long`, `integer`, `byte`, `short`, `float`, `half_float`, `scaled_float`",
             TEST_INDEX_DATATYPE_NUMERIC));
     verifyDataRows(response,
-        rows("DOUBLE", "LONG", "INTEGER", "BYTE", "SHORT", "FLOAT", "FLOAT", "DOUBLE"));
+        rows("DOUBLE", "LONG", "INTEGER", "BYTE", "SHORT", "FLOAT", "HALF_FLOAT", "SCALED_FLOAT"));
 
     response = executeQuery(String.format("source=%s | eval "
         + "`text` = typeof(text_value), `date` = typeof(date_value),"
         + "`boolean` = typeof(boolean_value), `object` = typeof(object_value),"
         + "`keyword` = typeof(keyword_value), `ip` = typeof(ip_value),"
-        + "`binary` = typeof(binary_value), `geo_point` = typeof(geo_point_value)"
-        // TODO activate this test once `ARRAY` type supported, see ExpressionAnalyzer::isTypeNotSupported
-        //+ ", `nested` = typeof(nested_value)"
-        + " | fields `text`, `date`, `boolean`, `object`, `keyword`, `ip`, `binary`, `geo_point`",
+        + "`binary` = typeof(binary_value), `geo_point` = typeof(geo_point_value),"
+        + "`nested` = typeof(nested_value)"
+        + " | fields `text`, `date`, `boolean`, `object`, `keyword`, `ip`, `binary`, `geo_point`, `nested`",
             TEST_INDEX_DATATYPE_NONNUMERIC));
     verifyDataRows(response,
         rows("TEXT", "TIMESTAMP", "BOOLEAN", "OBJECT", "KEYWORD",
-                "IP", "BINARY", "GEO_POINT"));
+                "IP", "BINARY", "GEO_POINT", "NESTED"));
   }
 }
@@ -45,16 +45,14 @@ public void typeof_opensearch_types() {
         + "typeof(float_number), typeof(half_float_number), typeof(scaled_float_number)"
         + " from %s;", TEST_INDEX_DATATYPE_NUMERIC));
     verifyDataRows(response,
-        rows("DOUBLE", "LONG", "INTEGER", "BYTE", "SHORT", "FLOAT", "FLOAT", "DOUBLE"));
+        rows("DOUBLE", "LONG", "INTEGER", "BYTE", "SHORT", "FLOAT", "HALF_FLOAT", "SCALED_FLOAT"));
 
     response = executeJdbcRequest(String.format("SELECT typeof(text_value),"
         + "typeof(date_value), typeof(boolean_value), typeof(object_value), typeof(keyword_value),"
-        + "typeof(ip_value), typeof(binary_value), typeof(geo_point_value)"
-        // TODO activate this test once `ARRAY` type supported, see ExpressionAnalyzer::isTypeNotSupported
-        //+ ", typeof(nested_value)"
+        + "typeof(ip_value), typeof(binary_value), typeof(geo_point_value), typeof(nested_value)"
         + " from %s;", TEST_INDEX_DATATYPE_NONNUMERIC));
     verifyDataRows(response,
         rows("TEXT", "TIMESTAMP", "BOOLEAN", "OBJECT", "KEYWORD",
-                "IP", "BINARY", "GEO_POINT"));
+                "IP", "BINARY", "GEO_POINT", "NESTED"));
   }
 }