Support widening of numeric types in union-types #112610

Merged
6 changes: 6 additions & 0 deletions docs/changelog/112610.yaml
@@ -0,0 +1,6 @@
pr: 112610
summary: Support widening of numeric types in union-types
area: ES|QL
type: bug
issues:
- 111277
22 changes: 12 additions & 10 deletions x-pack/plugin/esql/qa/testFixtures/build.gradle
@@ -2,16 +2,18 @@ apply plugin: 'elasticsearch.java'
apply plugin: org.elasticsearch.gradle.dependencies.CompileOnlyResolvePlugin

dependencies {
implementation project(':x-pack:plugin:esql:compute')
implementation project(':x-pack:plugin:esql')
compileOnly project(path: xpackModule('core'))
implementation project(":libs:elasticsearch-x-content")
implementation project(':client:rest')
implementation project(':libs:elasticsearch-logging')
implementation project(':test:framework')
api(testArtifact(project(xpackModule('esql-core'))))
implementation project(':server')
implementation "net.sf.supercsv:super-csv:${versions.supercsv}"
implementation project(':x-pack:plugin:esql:compute')
implementation project(':x-pack:plugin:esql')
compileOnly project(path: xpackModule('core'))
implementation project(":libs:elasticsearch-x-content")
implementation project(':client:rest')
implementation project(':libs:elasticsearch-logging')
implementation project(':test:framework')
api(testArtifact(project(xpackModule('esql-core'))))
implementation project(':server')
implementation "net.sf.supercsv:super-csv:${versions.supercsv}"
implementation "com.fasterxml.jackson.core:jackson-core:${versions.jackson}"
implementation "com.fasterxml.jackson.core:jackson-databind:${versions.jackson}"
}

/**
@@ -118,7 +118,7 @@ public static Tuple<Version, Version> skipVersionRange(String testName, String i
return null;
}

public static Tuple<Page, List<String>> loadPageFromCsv(URL source) throws Exception {
public static Tuple<Page, List<String>> loadPageFromCsv(URL source, Map<String, String> typeMapping) throws Exception {

record CsvColumn(String name, Type type, BuilderWrapper builderWrapper) implements Releasable {
void append(String stringValue) {
@@ -164,21 +164,16 @@ public void close() {
if (columns == null) {
columns = new CsvColumn[entries.length];
for (int i = 0; i < entries.length; i++) {
int split = entries[i].indexOf(':');
String name, typeName;
String[] header = entries[i].split(":");
String name = header[0].trim();
String typeName = (typeMapping != null && typeMapping.containsKey(name)) ? typeMapping.get(name)
: header.length > 1 ? header[1].trim()
: null;

if (split < 0) {
if (typeName == null || typeName.isEmpty()) {
throw new IllegalArgumentException(
"A type is always expected in the schema definition; found " + entries[i]
);
} else {
name = entries[i].substring(0, split).trim();
typeName = entries[i].substring(split + 1).trim();
if (typeName.length() == 0) {
throw new IllegalArgumentException(
"A type is always expected in the schema definition; found " + entries[i]
);
}
}
Type type = Type.asType(typeName);
if (type == null) {
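For illustration, the precedence implemented in the change above (an explicit entry in typeMapping wins over the inline ":type" suffix in the CSV schema header, and a type missing from both places is an error) boils down to something like this toy reconstruction; it is not the fixture code itself:

```java
import java.util.Map;

// Toy reconstruction of the header/type-override precedence, for illustration only.
final class HeaderTypeResolution {
    static String resolveTypeName(String headerEntry, Map<String, String> typeMapping) {
        String[] header = headerEntry.split(":");
        String name = header[0].trim();
        if (typeMapping != null && typeMapping.containsKey(name)) {
            return typeMapping.get(name);                    // e.g. "short" supplied by a dataset-level override
        }
        return header.length > 1 ? header[1].trim() : null;  // e.g. "integer" from a header entry "id:integer"
    }
}
```

So a header entry of id:integer combined with an override map of {id=short} loads the column as short, which is presumably what the new apps_short dataset relies on.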

Large diffs are not rendered by default.

This file was deleted.

This file was deleted.

This file was deleted.

@@ -1351,3 +1351,54 @@ FROM sample_data, sample_data_ts_long
null | 172.21.0.5 | 1232382 | Disconnected | Disconnected
null | 172.21.0.5 | 1232382 | Disconnected | Disconnected
;

shortIntegerWidening
required_capability: union_types
required_capability: metadata_fields
required_capability: casting_operator
required_capability: union_types_numeric_widening

FROM apps, apps_short METADATA _index
| EVAL id = id::integer
Contributor:
This is cool!

However, to reduce risk some more (some data types can be quirky and, I guess, something could go wrong at block-loading time for types like float and byte ...); how about we add to this, or replace it with, a parameterized integration test that exercises all widened data types?

RestEsqlTestCase is quite nice and, like the CSV tests, is executed in multiple environments (single/multi-node etc.).

Contributor:
Tests in RestEsqlTestCase also have the benefit that we can more easily test cases where more than two different types become a union type, e.g. float, byte and, say, keyword all being fed into to_string.
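A rough sketch of what such a parameterized REST-level test could look like. The class name, index names, helpers and the exact set of widened types are assumptions for illustration, not what this PR implements; it only leans on the public _query endpoint and standard ESRestTestCase plumbing.

```java
import java.io.IOException;
import java.util.Locale;
import java.util.Map;

import org.elasticsearch.client.Request;
import org.elasticsearch.test.rest.ESRestTestCase;

public class UnionTypeWideningRestIT extends ESRestTestCase {

    // narrow mapped type -> type it is widened to (and cast to) at query time
    private static final Map<String, String> WIDENED = Map.of(
        "byte", "integer",
        "short", "integer",
        "half_float", "double",
        "float", "double"
    );

    public void testNumericWideningAcrossIndices() throws IOException {
        for (Map.Entry<String, String> e : WIDENED.entrySet()) {
            String narrow = e.getKey();
            String wide = e.getValue();
            String narrowIndex = "idx_" + narrow;
            String wideIndex = "idx_" + narrow + "_wide";

            // Two indices mapping the same field with different numeric types.
            createIndexWithId(narrowIndex, narrow);
            createIndexWithId(wideIndex, wide);
            indexDoc(narrowIndex, 1);
            indexDoc(wideIndex, 2);

            // The union-typed field should be usable once cast to the wide type.
            Request query = new Request("POST", "/_query");
            query.setJsonEntity(String.format(Locale.ROOT, """
                { "query": "FROM %s,%s | EVAL id = id::%s | STATS c = COUNT() BY id | SORT id" }
                """, narrowIndex, wideIndex, wide));
            assertOK(client().performRequest(query));
        }
    }

    private void createIndexWithId(String index, String numericType) throws IOException {
        Request request = new Request("PUT", "/" + index);
        request.setJsonEntity(String.format(Locale.ROOT, """
            { "mappings": { "properties": { "id": { "type": "%s" } } } }
            """, numericType));
        assertOK(client().performRequest(request));
    }

    private void indexDoc(String index, int id) throws IOException {
        Request request = new Request("POST", "/" + index + "/_doc?refresh=true");
        request.setJsonEntity("{ \"id\": " + id + " }");
        assertOK(client().performRequest(request));
    }
}
```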

| KEEP _index, id, version, name
| WHERE name == "aaaaa" OR name == "hhhhh"
| SORT _index ASC, id ASC
;

_index:keyword | id:integer | version:version | name:keyword
apps | 1 | 1 | aaaaa
apps | 8 | 1.2.3.4 | hhhhh
apps | 12 | 1.2.3.4 | aaaaa
apps_short | 1 | 1 | aaaaa
apps_short | 8 | 1.2.3.4 | hhhhh
apps_short | 12 | 1.2.3.4 | aaaaa
;

shortIntegerWideningStats
required_capability: union_types
required_capability: casting_operator
required_capability: union_types_numeric_widening

FROM apps, apps_short
| EVAL id = id::integer
| STATS count=count() BY name, id
| KEEP id, name, count
| SORT id ASC, name ASC
;

id:integer | name:keyword | count:long
1 | aaaaa | 2
2 | bbbbb | 2
3 | ccccc | 2
4 | ddddd | 2
5 | eeeee | 2
6 | fffff | 2
7 | ggggg | 2
8 | hhhhh | 2
9 | iiiii | 2
10 | jjjjj | 2
11 | kkkkk | 2
12 | aaaaa | 2
13 | lllll | 2
14 | mmmmm | 2
;
@@ -188,6 +188,11 @@ public enum Cap {
*/
UNION_TYPES_MISSING_FIELD,

/**
* Fix for widening of short numeric types in union-types. Done in #112610
*/
UNION_TYPES_NUMERIC_WIDENING,

/**
* Fix a parsing issue where numbers below Long.MIN_VALUE threw an exception instead of parsing as doubles.
* see <a href="https://github.com/elastic/elasticsearch/issues/104323"> Parsing large numbers is inconsistent #104323 </a>
@@ -115,7 +115,6 @@
import static org.elasticsearch.xpack.esql.core.type.DataType.KEYWORD;
import static org.elasticsearch.xpack.esql.core.type.DataType.LONG;
import static org.elasticsearch.xpack.esql.core.type.DataType.TEXT;
import static org.elasticsearch.xpack.esql.core.type.DataType.UNSIGNED_LONG;
import static org.elasticsearch.xpack.esql.core.type.DataType.VERSION;
import static org.elasticsearch.xpack.esql.core.type.DataType.isTemporalAmount;
import static org.elasticsearch.xpack.esql.stats.FeatureMetric.LIMIT;
@@ -1223,8 +1222,7 @@ private Expression resolveConvertFunction(AbstractConvertFunction convert, List<
HashMap<TypeResolutionKey, Expression> typeResolutions = new HashMap<>();
Set<DataType> supportedTypes = convert.supportedTypes();
imf.types().forEach(type -> {
// TODO: Shouldn't we perform widening of small numerical types here?
if (supportedTypes.contains(type)) {
if (supportedTypes.contains(type.widenSmallNumeric())) {
Comment on lines -1226 to +1225
Contributor:
Contrary to the funny TODO that I left here, haha, maybe we should actually widen at the point where the InvalidMappedField gets created? That'd be consistent with how we widen the types of regular EsFields in Analyzer.mappingAsAttributes. If we do so, we could enforce this as an invariant of FieldAttribute (so that when it contains an IMF, it has to be widened).
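For context, a minimal sketch of the widening being discussed, under the assumption that DataType.widenSmallNumeric maps the sub-integer whole-number types onto integer and the reduced-precision floating point types onto double (the authoritative mapping lives in DataType itself):

```java
import org.elasticsearch.xpack.esql.core.type.DataType;

// Sketch of the assumed semantics of DataType#widenSmallNumeric; details may differ.
final class WideningSketch {
    static DataType widenSmallNumeric(DataType type) {
        return switch (type) {
            // whole-number types narrower than int are represented as int blocks
            case BYTE, SHORT -> DataType.INTEGER;
            // reduced-precision floating point types are represented as double blocks
            case HALF_FLOAT, FLOAT, SCALED_FLOAT -> DataType.DOUBLE;
            // everything else already has a block representation of its own
            default -> type;
        };
    }
}
```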

TypeResolutionKey key = new TypeResolutionKey(fa.name(), type);
var concreteConvert = typeSpecificConvert(convert, fa.source(), type, imf);
typeResolutions.put(key, concreteConvert);
@@ -63,7 +63,7 @@ protected AbstractConvertFunction(StreamInput in) throws IOException {
* Build the evaluator given the evaluator a multivalued field.
*/
protected final ExpressionEvaluator.Factory evaluator(ExpressionEvaluator.Factory fieldEval) {
DataType sourceType = field().dataType();
DataType sourceType = field().dataType().widenSmallNumeric();
Contributor:
Uhh, how did that field not end up being widened in the first place? In Analyzer.mappingAsAttributes, line 240, each field attribute's field should already have been widened, including the contained EsField's type.

What fails if we do not widen here? I'd like to figure out if this is really necessary.

Also, to avoid confusion in the future, do you think we should add an assert to FieldAttribute's constructor to ensure it only ever gets built with already widened data types?
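A minimal sketch of that suggested invariant, with the constructor shape simplified (the real FieldAttribute constructor takes a Source, a parent, a name and more; the class and field names here are placeholders):

```java
import org.elasticsearch.xpack.esql.core.type.EsField;

// Illustration of the proposed assertion only, not the real FieldAttribute.
class FieldAttributeSketch {
    private final EsField field;

    FieldAttributeSketch(EsField field) {
        assert field.getDataType() == field.getDataType().widenSmallNumeric()
            : "FieldAttribute built with a non-widened type [" + field.getDataType() + "]";
        this.field = field;
    }
}
```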

Contributor (author):
I was surprised this failed too. It fails later in planning; I don't remember the exact stack trace, but the error was that this function was being called with an unsupported type (the narrow type). I understood the evaluator to be built much later, so this surprised me as well. I could re-investigate, because perhaps there is a code path to this that is happening unnecessarily early?

Contributor:
I think this deserves a follow-up, because there are starting to be many places where widening is necessary - and we only find out through failed queries/tests :/ I opened #112691.

Contributor (author):
I investigated, and this exception is thrown during query execution (i.e. after local physical planning), during the setup of the type-converting block loader at https://github.com/elastic/elasticsearch/blob/main/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/EsPhysicalOperationProviders.java#L384.

This is well past the analysis and planning phases, so it is clear that the types are not widened in general, but only in specific cases. A follow-up investigation makes a lot of sense.

Contributor (author):
For completeness, this is the stack trace:

org.elasticsearch.xpack.esql.EsqlIllegalArgumentException: illegal data type [short]
	at org.elasticsearch.xpack.esql.EsqlIllegalArgumentException.illegalDataType(EsqlIllegalArgumentException.java:43)
	at org.elasticsearch.xpack.esql.EsqlIllegalArgumentException.illegalDataType(EsqlIllegalArgumentException.java:39)
	at org.elasticsearch.xpack.esql.expression.function.scalar.convert.AbstractConvertFunction.evaluator(AbstractConvertFunction.java:69)
	at org.elasticsearch.xpack.esql.expression.function.scalar.convert.AbstractConvertFunction.toEvaluator(AbstractConvertFunction.java:128)
	at org.elasticsearch.xpack.esql.planner.EsPhysicalOperationProviders.<init>(EsPhysicalOperationProviders.java:384)
	at org.elasticsearch.xpack.esql.planner.EsPhysicalOperationProviders.getBlockLoaderFor(EsPhysicalOperationProviders.java:143)
	at org.elasticsearch.xpack.esql.planner.EsPhysicalOperationProviders.lambda(EsPhysicalOperationProviders.java:123)
	at org.elasticsearch.compute.lucene.ValuesSourceReaderOperator.newShard(ValuesSourceReaderOperator.java:470)
	at org.elasticsearch.compute.lucene.ValuesSourceReaderOperator.positionFieldWork(ValuesSourceReaderOperator.java:195)
	at org.elasticsearch.compute.lucene.ValuesSourceReaderOperator.loadFromSingleLeaf(ValuesSourceReaderOperator.java:220)
	at org.elasticsearch.compute.lucene.ValuesSourceReaderOperator.process(ValuesSourceReaderOperator.java:143)
	at org.elasticsearch.compute.operator.AbstractPageMappingOperator.getOutput(AbstractPageMappingOperator.java:76)
	at org.elasticsearch.compute.operator.Driver.runSingleLoopIteration(Driver.java:258)
	at org.elasticsearch.compute.operator.Driver.run(Driver.java:189)
	at org.elasticsearch.compute.operator.Driver.doRun(Driver.java:378)

var factory = factories().get(sourceType);
if (factory == null) {
throw EsqlIllegalArgumentException.illegalDataType(sourceType);
@@ -54,6 +54,8 @@
import org.elasticsearch.xpack.esql.analysis.EnrichResolution;
import org.elasticsearch.xpack.esql.analysis.PreAnalyzer;
import org.elasticsearch.xpack.esql.core.expression.Attribute;
import org.elasticsearch.xpack.esql.core.type.DataType;
import org.elasticsearch.xpack.esql.core.type.EsField;
import org.elasticsearch.xpack.esql.enrich.EnrichLookupService;
import org.elasticsearch.xpack.esql.enrich.ResolvedEnrichPolicy;
import org.elasticsearch.xpack.esql.expression.function.EsqlFunctionRegistry;
@@ -308,8 +310,18 @@ protected void assertResults(ExpectedResults expected, ActualResults actual, boo
// CsvTestUtils.logData(actual.values(), LOGGER);
}

private static IndexResolution loadIndexResolution(String mappingName, String indexName) {
private static IndexResolution loadIndexResolution(String mappingName, String indexName, Map<String, String> typeMapping) {
var mapping = new TreeMap<>(loadMapping(mappingName));
if ((typeMapping == null || typeMapping.isEmpty()) == false) {
for (var entry : typeMapping.entrySet()) {
if (mapping.containsKey(entry.getKey())) {
DataType dataType = DataType.fromTypeName(entry.getValue());
EsField field = mapping.get(entry.getKey());
EsField editedField = new EsField(field.getName(), dataType, field.getProperties(), field.isAggregatable());
mapping.put(entry.getKey(), editedField);
}
}
}
return IndexResolution.valid(new EsIndex(indexName, mapping, Map.of(indexName, IndexMode.STANDARD)));
}

@@ -320,7 +332,7 @@ private static EnrichResolution loadEnrichPolicies() {
CsvTestsDataLoader.TestsDataset sourceIndex = CSV_DATASET_MAP.get(policy.getIndices().get(0));
// this could practically work, but it's wrong:
// EnrichPolicyResolution should contain the policy (system) index, not the source index
EsIndex esIndex = loadIndexResolution(sourceIndex.mappingFileName(), sourceIndex.indexName()).get();
EsIndex esIndex = loadIndexResolution(sourceIndex.mappingFileName(), sourceIndex.indexName(), null).get();
var concreteIndices = Map.of(RemoteClusterService.LOCAL_CLUSTER_GROUP_KEY, Iterables.get(esIndex.concreteIndices(), 0));
enrichResolution.addResolvedPolicy(
policyConfig.policyName(),
@@ -349,7 +361,7 @@ private static EnrichPolicy loadEnrichPolicyMapping(String policyFileName) {
}

private LogicalPlan analyzedPlan(LogicalPlan parsed, CsvTestsDataLoader.TestsDataset dataset) {
var indexResolution = loadIndexResolution(dataset.mappingFileName(), dataset.indexName());
var indexResolution = loadIndexResolution(dataset.mappingFileName(), dataset.indexName(), dataset.typeMapping());
var enrichPolicies = loadEnrichPolicies();
var analyzer = new Analyzer(new AnalyzerContext(configuration, functionRegistry, indexResolution, enrichPolicies), TEST_VERIFIER);
LogicalPlan plan = analyzer.analyze(parsed);
@@ -392,7 +404,7 @@ private static CsvTestsDataLoader.TestsDataset testsDataset(LogicalPlan parsed)
}

private static TestPhysicalOperationProviders testOperationProviders(CsvTestsDataLoader.TestsDataset dataset) throws Exception {
var testData = loadPageFromCsv(CsvTests.class.getResource("/" + dataset.dataFileName()));
var testData = loadPageFromCsv(CsvTests.class.getResource("/" + dataset.dataFileName()), dataset.typeMapping());
return new TestPhysicalOperationProviders(testData.v1(), testData.v2());
}
