-
Notifications
You must be signed in to change notification settings - Fork 25k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ES|QL: better management of exact subfields for TEXT fields #103510
Changes from all commits
1d062a2
81b9af6
81fd0c8
d51155e
389251d
8031d34
e8803b9
aa7e362
597ab2a
dcca601
bc0c910
6eb7c09
ee197aa
2cc0cb1
ea1bd1a
df9375c
940f406
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
pr: 103510 | ||
summary: "ES|QL: better management of exact subfields for TEXT fields" | ||
area: ES|QL | ||
type: bug | ||
issues: | ||
- 99899 |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,7 @@ | |
import org.elasticsearch.xpack.esql.planner.EsqlTranslatorHandler; | ||
import org.elasticsearch.xpack.esql.planner.PhysicalVerificationException; | ||
import org.elasticsearch.xpack.esql.planner.PhysicalVerifier; | ||
import org.elasticsearch.xpack.esql.stats.SearchStats; | ||
import org.elasticsearch.xpack.ql.common.Failure; | ||
import org.elasticsearch.xpack.ql.expression.Alias; | ||
import org.elasticsearch.xpack.ql.expression.Attribute; | ||
|
@@ -54,6 +55,7 @@ | |
import org.elasticsearch.xpack.ql.querydsl.query.Query; | ||
import org.elasticsearch.xpack.ql.rule.ParameterizedRuleExecutor; | ||
import org.elasticsearch.xpack.ql.rule.Rule; | ||
import org.elasticsearch.xpack.ql.type.DataTypes; | ||
import org.elasticsearch.xpack.ql.util.Queries; | ||
import org.elasticsearch.xpack.ql.util.Queries.Clause; | ||
import org.elasticsearch.xpack.ql.util.StringUtils; | ||
|
@@ -64,6 +66,7 @@ | |
import java.util.LinkedList; | ||
import java.util.List; | ||
import java.util.Set; | ||
import java.util.function.Predicate; | ||
import java.util.stream.Collectors; | ||
|
||
import static java.util.Arrays.asList; | ||
|
@@ -193,15 +196,18 @@ private static Set<Attribute> missingAttributes(PhysicalPlan p) { | |
} | ||
} | ||
|
||
public static class PushFiltersToSource extends OptimizerRule<FilterExec> { | ||
public static class PushFiltersToSource extends PhysicalOptimizerRules.ParameterizedOptimizerRule< | ||
FilterExec, | ||
LocalPhysicalOptimizerContext> { | ||
|
||
@Override | ||
protected PhysicalPlan rule(FilterExec filterExec) { | ||
protected PhysicalPlan rule(FilterExec filterExec, LocalPhysicalOptimizerContext ctx) { | ||
PhysicalPlan plan = filterExec; | ||
if (filterExec.child() instanceof EsQueryExec queryExec) { | ||
List<Expression> pushable = new ArrayList<>(); | ||
List<Expression> nonPushable = new ArrayList<>(); | ||
for (Expression exp : splitAnd(filterExec.condition())) { | ||
(canPushToSource(exp) ? pushable : nonPushable).add(exp); | ||
(canPushToSource(exp, x -> hasIdenticalDelegate(x, ctx.searchStats())) ? pushable : nonPushable).add(exp); | ||
} | ||
if (pushable.size() > 0) { // update the executable with pushable conditions | ||
Query queryDSL = TRANSLATOR_HANDLER.asQuery(Predicates.combineAnd(pushable)); | ||
|
@@ -227,26 +233,30 @@ protected PhysicalPlan rule(FilterExec filterExec) { | |
return plan; | ||
} | ||
|
||
public static boolean canPushToSource(Expression exp) { | ||
public static boolean canPushToSource(Expression exp, Predicate<FieldAttribute> hasIdenticalDelegate) { | ||
if (exp instanceof BinaryComparison bc) { | ||
return isAttributePushable(bc.left(), bc) && bc.right().foldable(); | ||
return isAttributePushable(bc.left(), bc, hasIdenticalDelegate) && bc.right().foldable(); | ||
} else if (exp instanceof BinaryLogic bl) { | ||
return canPushToSource(bl.left()) && canPushToSource(bl.right()); | ||
return canPushToSource(bl.left(), hasIdenticalDelegate) && canPushToSource(bl.right(), hasIdenticalDelegate); | ||
} else if (exp instanceof In in) { | ||
return isAttributePushable(in.value(), null) && Expressions.foldable(in.list()); | ||
return isAttributePushable(in.value(), null, hasIdenticalDelegate) && Expressions.foldable(in.list()); | ||
} else if (exp instanceof Not not) { | ||
return canPushToSource(not.field()); | ||
return canPushToSource(not.field(), hasIdenticalDelegate); | ||
} else if (exp instanceof UnaryScalarFunction usf) { | ||
if (usf instanceof RegexMatch<?> || usf instanceof IsNull || usf instanceof IsNotNull) { | ||
return isAttributePushable(usf.field(), usf); | ||
return isAttributePushable(usf.field(), usf, hasIdenticalDelegate); | ||
} | ||
} | ||
return false; | ||
} | ||
|
||
private static boolean isAttributePushable(Expression expression, Expression operation) { | ||
if (expression instanceof FieldAttribute f && f.getExactInfo().hasExact()) { | ||
return isAggregatable(f); | ||
private static boolean isAttributePushable( | ||
Expression expression, | ||
Expression operation, | ||
Predicate<FieldAttribute> hasIdenticalDelegate | ||
) { | ||
if (isPushableFieldAttribute(expression, hasIdenticalDelegate)) { | ||
return true; | ||
} | ||
if (expression instanceof MetadataAttribute ma && ma.searchable()) { | ||
return operation == null | ||
|
@@ -282,15 +292,17 @@ protected PhysicalPlan rule(LimitExec limitExec) { | |
} | ||
} | ||
|
||
private static class PushTopNToSource extends OptimizerRule<TopNExec> { | ||
private static class PushTopNToSource extends PhysicalOptimizerRules.ParameterizedOptimizerRule< | ||
TopNExec, | ||
LocalPhysicalOptimizerContext> { | ||
@Override | ||
protected PhysicalPlan rule(TopNExec topNExec) { | ||
protected PhysicalPlan rule(TopNExec topNExec, LocalPhysicalOptimizerContext ctx) { | ||
PhysicalPlan plan = topNExec; | ||
PhysicalPlan child = topNExec.child(); | ||
|
||
boolean canPushDownTopN = child instanceof EsQueryExec | ||
|| (child instanceof ExchangeExec exchangeExec && exchangeExec.child() instanceof EsQueryExec); | ||
if (canPushDownTopN && canPushDownOrders(topNExec.order())) { | ||
if (canPushDownTopN && canPushDownOrders(topNExec.order(), x -> hasIdenticalDelegate(x, ctx.searchStats()))) { | ||
var sorts = buildFieldSorts(topNExec.order()); | ||
var limit = topNExec.limit(); | ||
|
||
|
@@ -303,10 +315,9 @@ protected PhysicalPlan rule(TopNExec topNExec) { | |
return plan; | ||
} | ||
|
||
private boolean canPushDownOrders(List<Order> orders) { | ||
private boolean canPushDownOrders(List<Order> orders, Predicate<FieldAttribute> hasIdenticalDelegate) { | ||
// allow only exact FieldAttributes (no expressions) for sorting | ||
return orders.stream() | ||
.allMatch(o -> o.child() instanceof FieldAttribute fa && fa.getExactInfo().hasExact() && isAggregatable(fa)); | ||
return orders.stream().allMatch(o -> isPushableFieldAttribute(o.child(), hasIdenticalDelegate)); | ||
} | ||
|
||
private List<EsQueryExec.FieldSort> buildFieldSorts(List<Order> orders) { | ||
|
@@ -405,4 +416,15 @@ private Tuple<List<Attribute>, List<Stat>> pushableStats(AggregateExec aggregate | |
} | ||
} | ||
|
||
public static boolean hasIdenticalDelegate(FieldAttribute attr, SearchStats stats) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I feel like this should return the There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's possible, but it would require a bit of refactoring in the pushdown rules to really take advantage of the returned There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Cool. We just have to make sure the sub-field it finds is the right one, in case there is more than one candidate. Evil, I know. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We are safe in this sense: if there are two subfields, neither is used, even if one of them is good (it's QL logic, not optimal, but in this case it makes our life easier). |
||
return stats.hasIdenticalDelegate(attr.name()); | ||
} | ||
|
||
public static boolean isPushableFieldAttribute(Expression exp, Predicate<FieldAttribute> hasIdenticalDelegate) { | ||
if (exp instanceof FieldAttribute fa && fa.getExactInfo().hasExact() && isAggregatable(fa)) { | ||
return fa.dataType() != DataTypes.TEXT || hasIdenticalDelegate.test(fa); | ||
} | ||
return false; | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,7 +37,6 @@ | |
import org.elasticsearch.xpack.esql.planner.LocalExecutionPlanner.PhysicalOperation; | ||
import org.elasticsearch.xpack.esql.type.EsqlDataTypes; | ||
import org.elasticsearch.xpack.ql.expression.Attribute; | ||
import org.elasticsearch.xpack.ql.expression.FieldAttribute; | ||
import org.elasticsearch.xpack.ql.type.DataType; | ||
|
||
import java.util.ArrayList; | ||
|
@@ -73,9 +72,6 @@ public final PhysicalOperation fieldExtractPhysicalOperation(FieldExtractExec fi | |
List<ValuesSourceReaderOperator.FieldInfo> fields = new ArrayList<>(); | ||
int docChannel = source.layout.get(sourceAttr.id()).channel(); | ||
for (Attribute attr : fieldExtractExec.attributesToExtract()) { | ||
if (attr instanceof FieldAttribute fa && fa.getExactInfo().hasExact()) { | ||
attr = fa.exactAttribute(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not needed, |
||
} | ||
layout.append(attr); | ||
DataType dataType = attr.dataType(); | ||
ElementType elementType = PlannerUtils.toElementType(dataType); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A synthetic _source delegate will never have a normalizer. It might have `ignore_above` though. If it does, I think it's still safe to use it for fetching, but it won't help at the moment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This method should probably have a name like `canUseSyntheticSourceDelegateForQuerying`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh! There's no use in using the delegate for querying if the field isn't indexed. Maybe check that?
And! I'm not 100% sure block loading works for synthetic source keyword fields with doc values disabled. We should fall back to the `originalName()` stored field. If we don't, that's a bug.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure I understand the exact use case here. I tried a few cases with KEYWORD fields alone and with TEXT fields with KEYWORD subfields, but every time I try to disable doc_values, I get an error at index creation time like `field .. doesn't support synthetic source unless it is stored or has a sub-field of type [keyword] with doc values or stored and without a normalizer`.
Do you have an example?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry - it looks like you need both `doc_values: false, stored: true, ignore_above: 12` or something. I think it's probably just an issue to file and run down later.