Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ESQL: Remove aliasing inside Eval #100238

Merged
merged 6 commits into from
Oct 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/100238.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 100238
summary: "ESQL: Remove aliasing inside Eval"
area: ES|QL
type: bug
issues:
- 100174
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,7 @@ public enum Type {
LOOKUP.put("BYTE", INTEGER);

// add also the types with short names
LOOKUP.put("BOOL", BOOLEAN);
LOOKUP.put("I", INTEGER);
LOOKUP.put("L", LONG);
LOOKUP.put("UL", UNSIGNED_LONG);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,16 @@ x:integer | y:integer | x2:integer | y2:integer
4 | 4 | 5 | 6
;

duplicateProjectEval
costin marked this conversation as resolved.
Show resolved Hide resolved
from employees | eval y = languages, x = languages | keep x, y | eval x2 = x + 1 | eval y2 = y + 2 | limit 3;

x:integer | y:integer | x2:integer | y2:integer
2 | 2 | 3 | 4
5 | 5 | 6 | 7
4 | 4 | 5 | 6
;


renameWithFilterPushedToES
from employees | sort emp_no | rename emp_no as x | keep languages, first_name, last_name, x | where x > 10030 and x < 10040 | limit 5;

Expand Down Expand Up @@ -147,3 +157,20 @@ y:integer | x:date
10076 | 1985-07-09T00:00:00.000Z
10061 | 1985-09-17T00:00:00.000Z
;

renameIntertwinedWithSort
FROM employees | eval x = salary | rename x as y | rename y as x | sort x | rename x as y | limit 10;

avg_worked_seconds:l | birth_date:date | emp_no:i | first_name:s | gender:s | height:d | height.float:d | height.half_float:d | height.scaled_float:d| hire_date:date | is_rehired:bool | job_positions:s | languages:i | languages.byte:i | languages.long:l | languages.short:i | last_name:s | salary:i | salary_change:d | salary_change.int:i | salary_change.keyword:s | salary_change.long:l | still_hired:bool | y:i

390266432 | 1959-08-19T00:00:00.000Z | 10015 | Guoxiang | null | 1.66 | 1.659999966621399 | 1.66015625 | 1.6600000000000001 | 1987-07-02T00:00:00.000Z | [false, false, false, true]| [Head Human Resources, Junior Developer, Principal Support Engineer, Support Engineer] | 5 | 5 | 5 | 5 | Nooteboom | 25324 | [12.4, 14.25] | [12, 14] | [12.40, 14.25] | [12, 14] | true | 25324
203838153 | 1953-02-08T00:00:00.000Z | 10035 | null | M | 1.81 | 1.809999942779541 | 1.8095703125 | 1.81 | 1988-09-05T00:00:00.000Z | false | [Data Scientist, Senior Python Developer] | 5 | 5 | 5 | 5 | Chappelet | 25945 | [-6.58, -2.54] | [-6, -2] | [-2.54, -6.58] | [-6, -2] | false | 25945
313407352 | 1964-10-18T00:00:00.000Z | 10092 | Valdiodio | F | 1.75 | 1.75 | 1.75 | 1.75 | 1989-09-22T00:00:00.000Z | [false, false, true, true] | [Accountant, Junior Developer] | 1 | 1 | 1 | 1 | Niizuma | 25976 | [-6.77, 0.39, 8.3, 8.78] | [-6, 0, 8, 8] | [-6.77, 0.39, 8.30,8.78] | [-6, 0, 8, 8] | false | 25976
248451647 | null | 10048 | Florian | M | 2.0 | 2.0 | 2.0 | 2.0 | 1985-02-24T00:00:00.000Z | [true, true] | Internship | 3 | 3 | 3 | 3 | Syrotiuk | 26436 | null | null | null | null | false | 26436
324356269 | 1954-05-30T00:00:00.000Z | 10057 | Ebbe | F | 1.59 | 1.590000033378601 | 1.58984375 | 1.59 | 1992-01-15T00:00:00.000Z | null | [Head Human Resources, Python Developer] | 4 | 4 | 4 | 4 | Callaway | 27215 | [-6.73, -5.27, -2.43, 1.03] | [-6, -5, -2, 1] | [-2.43, -5.27, -6.73, 1.03]| [-6, -5, -2, 1] | true | 27215
359067056 | 1960-05-25T00:00:00.000Z | 10084 | Tuval | M | 1.51 | 1.5099999904632568 | 1.509765625 | 1.51 | 1995-12-15T00:00:00.000Z | false | Principal Support Engineer | 1 | 1 | 1 | 1 | Kalloufi | 28035 | null | null | null | null | true | 28035
359208133 | 1953-04-03T00:00:00.000Z | 10026 | Yongqiao | M | 2.1 | 2.0999999046325684 | 2.099609375 | 2.1 | 1995-03-20T00:00:00.000Z | [false, true] | Reporting Analyst | null | null | null | null | Berztiss | 28336 | [-7.37, 10.62, 11.20] | [-7, 10, 11] | [-7.37, 10.62, 11.20] | [-7, 10, 11] | true | 28336
233999584 | 1962-11-26T00:00:00.000Z | 10068 | Charlene | M | 1.58 | 1.5800000429153442 | 1.580078125 | 1.58 | 1987-08-07T00:00:00.000Z | true | Architect | 3 | 3 | 3 | 3 | Brattka | 28941 | [-5.61, -5.29, 3.43] | [-5, -5, 3] | [-5.29, -5.61, 3.43] | [-5, -5, 3] | true | 28941
341158890 | 1961-10-15T00:00:00.000Z | 10060 | Breannda | M | 1.42 | 1.4199999570846558 | 1.419921875 | 1.42 | 1987-11-02T00:00:00.000Z | [false, false, false, true]| [Business Analyst, Data Scientist, Senior Team Lead] | 2 | 2 | 2 | 2 | Billingsley | 29175 | [-1.76, -0.85] | [-1, 0] | [-0.85, -1.76] | [-1, 0] | true | 29175
246355863 | null | 10042 | Magy | F | 1.44 | 1.440000057220459 | 1.4404296875 | 1.44 | 1993-03-21T00:00:00.000Z | null | [Architect, Business Analyst, Internship, Junior Developer] | 3 | 3 | 3 | 3 | Stamatiou | 30404 | [-9.28, 9.42] | [-9, 9] | [-9.28, 9.42] | [-9, 9] | true | 30404
;
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,13 @@ protected List<Batch<LogicalPlan>> batches() {
}

protected static List<Batch<LogicalPlan>> rules() {
var substitutions = new Batch<>("Substitutions", Limiter.ONCE, new SubstituteSurrogates(), new ReplaceRegexMatch()
// new ReplaceTextFieldAttributesWithTheKeywordSubfield()
var substitutions = new Batch<>(
"Substitutions",
Limiter.ONCE,
new SubstituteSurrogates(),
new ReplaceRegexMatch(),
new ReplaceAliasingEvalWithProject()
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The nice thing about the current state of the rule is that it only gets applied once.

// new ReplaceTextFieldAttributesWithTheKeywordSubfield()
);

var operators = new Batch<>(
Expand Down Expand Up @@ -850,4 +855,85 @@ protected Expression regexToEquals(RegexMatch<?> regexMatch, Literal literal) {
return new Equals(regexMatch.source(), regexMatch.field(), literal);
}
}

/**
* Replace aliasing evals (eval x=a) with a projection which can be further combined / simplified.
* The rule gets applied only if there's another project (Project/Stats) above it.
*
* Needs to take into account shadowing of potentially intermediate fields:
* eval x = a + 1, y = x, z = y + 1, y = z, w = y + 1
* The output should be
* eval x = a + 1, z = a + 1 + 1, w = a + 1 + 1
* project x, z, z as y, w
*/
static class ReplaceAliasingEvalWithProject extends Rule<LogicalPlan, LogicalPlan> {

@Override
public LogicalPlan apply(LogicalPlan logicalPlan) {
Holder<Boolean> enabled = new Holder<>(false);

return logicalPlan.transformDown(p -> {
// found projection, turn enable flag on
if (p instanceof Aggregate || p instanceof Project) {
enabled.set(true);
costin marked this conversation as resolved.
Show resolved Hide resolved
} else if (enabled.get() && p instanceof Eval eval) {
p = rule(eval);
}

return p;
});
}

private LogicalPlan rule(Eval eval) {
LogicalPlan plan = eval;

// holds simple aliases such as b = a, c = b, d = c
AttributeMap<Expression> basicAliases = new AttributeMap<>();
// same as above but keeps the original expression
AttributeMap<NamedExpression> basicAliasSources = new AttributeMap<>();

List<Alias> keptFields = new ArrayList<>();

var fields = eval.fields();
for (int i = 0, size = fields.size(); i < size; i++) {
Alias field = fields.get(i);
Expression child = field.child();
var attribute = field.toAttribute();
// put the aliases in a separate map to separate the underlying resolve from other aliases
if (child instanceof Attribute) {
basicAliases.put(attribute, child);
basicAliasSources.put(attribute, field);
} else {
// be lazy and start replacing name aliases only if needed
if (basicAliases.size() > 0) {
// update the child through the field
field = (Alias) field.transformUp(e -> basicAliases.resolve(e, e));
}
keptFields.add(field);
}
}

// at least one alias encountered, move it into a project
if (basicAliases.size() > 0) {
// preserve the eval output (takes care of shadowing and order) but replace the basic aliases
List<NamedExpression> projections = new ArrayList<>(eval.output());
// replace the removed aliases with their initial definition - however use the output to preserve the shadowing
for (int i = projections.size() - 1; i >= 0; i--) {
NamedExpression project = projections.get(i);
projections.set(i, basicAliasSources.getOrDefault(project, project));
}

LogicalPlan child = eval.child();
if (keptFields.size() > 0) {
// replace the eval with just the kept fields
child = new Eval(eval.source(), eval.child(), keptFields);
}
// put the projection in place
plan = new Project(eval.source(), child, projections);
}

return plan;
}

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -111,9 +111,10 @@ public Layout build() {
for (NameId id : set.nameIds) {
ChannelAndType next = new ChannelAndType(channel, set.type);
ChannelAndType prev = layout.put(id, next);
if (prev != null) {
throw new IllegalArgumentException("Name [" + id + "] is on two channels [" + prev + "] and [" + next + "]");
}
// Do allow multiple name to point to the same channel - see https://github.com/elastic/elasticsearch/pull/100238
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Do allow multiple name to point to the same channel - see https://github.com/elastic/elasticsearch/pull/100238
// Do allow multiple name to point to the same channel - see https://github.com/elastic/elasticsearch/pull/100299

// if (prev != null) {
costin marked this conversation as resolved.
Show resolved Hide resolved
// throw new IllegalArgumentException("Name [" + id + "] is on two channels [" + prev + "] and [" + next + "]");
// }
}
}
return new DefaultLayout(Collections.unmodifiableMap(layout), numberOfChannels);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
import org.elasticsearch.xpack.esql.plan.physical.EsStatsQueryExec;
import org.elasticsearch.xpack.esql.plan.physical.EsStatsQueryExec.Stat;
import org.elasticsearch.xpack.esql.plan.physical.EstimatesRowSize;
import org.elasticsearch.xpack.esql.plan.physical.EvalExec;
import org.elasticsearch.xpack.esql.plan.physical.ExchangeExec;
import org.elasticsearch.xpack.esql.plan.physical.LimitExec;
import org.elasticsearch.xpack.esql.plan.physical.PhysicalPlan;
Expand Down Expand Up @@ -208,27 +207,31 @@ public void testCountFieldWithFilter() {
}

/**
* Expects - for now
* Expects
* LimitExec[500[INTEGER]]
* \_AggregateExec[[],[COUNT(hidden_s{r}#8) AS c],FINAL,null]
* \_AggregateExec[[],[COUNT(salary{f}#20) AS c],FINAL,null]
* \_ExchangeExec[[count{r}#25, seen{r}#26],true]
* \_AggregateExec[[],[COUNT(hidden_s{r}#8) AS c],PARTIAL,8]
* \_EvalExec[[salary{f}#20 AS s, s{r}#3 AS hidden_s]]
* \_FieldExtractExec[salary{f}#20]
* \_EsQueryExec[test], query[{"esql_single_value":{"field":"emp_no","next":{"range":{"emp_no":{"lt":10050,"boost":1.0}}}}}]
* [_doc{f}#42], limit[], sort[] estimatedRowSize[16]
* \_EsStatsQueryExec[test], stats[Stat[name=salary, type=COUNT, query={
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a great example on how this PR helps - since the eval goes away, there's no field extraction needed.

* "exists" : {
* "field" : "salary",
* "boost" : 1.0
* }
*/
// TODO: the eval is not yet optimized away
public void testCountFieldWithEval() {
var plan = plan("""
from test | eval s = salary | rename s as sr | eval hidden_s = sr | rename emp_no as e | where e < 10050
| stats c = count(hidden_s)
""", IS_SV_STATS);

var limit = as(plan, LimitExec.class);
var agg = as(limit.child(), AggregateExec.class);
var exg = as(agg.child(), ExchangeExec.class);
agg = as(exg.child(), AggregateExec.class);
var eval = as(agg.child(), EvalExec.class);
var esStatsQuery = as(exg.child(), EsStatsQueryExec.class);

assertThat(esStatsQuery.limit(), is(nullValue()));
assertThat(Expressions.names(esStatsQuery.output()), contains("count", "seen"));
var stat = as(esStatsQuery.stats().get(0), Stat.class);
assertThat(stat.query(), is(QueryBuilders.existsQuery("salary")));
}

// optimized doesn't know yet how to push down count over field
Expand Down
Loading