Skip to content

Commit

Permalink
ESQL: Remove aliasing inside Eval (elastic#100238)
Browse files Browse the repository at this point in the history
Evals that introduce aliased can be simplified by extracting them into a
 project (and thus signaling there's no underlying processing).

The following eval:
  eval x = a + 1, y = x, z = y + 1, y = z, w = y + 1
can be converted into:
  eval x = a + 1, z = a + 1 + 1, w = a + 1 + 1 | project x, z, z as y, w

Fix elastic#100174
Fix elastic#100050
  • Loading branch information
costin committed Oct 5, 2023
1 parent 9044e5a commit 2ff5a45
Show file tree
Hide file tree
Showing 9 changed files with 476 additions and 65 deletions.
6 changes: 6 additions & 0 deletions docs/changelog/100238.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 100238
summary: "ESQL: Remove aliasing inside Eval"
area: ES|QL
type: bug
issues:
- 100174
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@ public enum Type {
LOOKUP.put("BYTE", INTEGER);

// add also the types with short names
LOOKUP.put("BOOL", BOOLEAN);
LOOKUP.put("I", INTEGER);
LOOKUP.put("L", LONG);
LOOKUP.put("UL", UNSIGNED_LONG);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,16 @@ x:integer | y:integer | x2:integer | y2:integer
4 | 4 | 5 | 6
;

duplicateProjectEval
from employees | eval y = languages, x = languages | keep x, y | eval x2 = x + 1 | eval y2 = y + 2 | limit 3;

x:integer | y:integer | x2:integer | y2:integer
2 | 2 | 3 | 4
5 | 5 | 6 | 7
4 | 4 | 5 | 6
;


renameWithFilterPushedToES
from employees | sort emp_no | rename emp_no as x | keep languages, first_name, last_name, x | where x > 10030 and x < 10040 | limit 5;

Expand Down Expand Up @@ -147,3 +157,20 @@ y:integer | x:date
10076 | 1985-07-09T00:00:00.000Z
10061 | 1985-09-17T00:00:00.000Z
;

renameIntertwinedWithSort
FROM employees | eval x = salary | rename x as y | rename y as x | sort x | rename x as y | limit 10;

avg_worked_seconds:l | birth_date:date | emp_no:i | first_name:s | gender:s | height:d | height.float:d | height.half_float:d | height.scaled_float:d| hire_date:date | is_rehired:bool | job_positions:s | languages:i | languages.byte:i | languages.long:l | languages.short:i | last_name:s | salary:i | salary_change:d | salary_change.int:i | salary_change.keyword:s | salary_change.long:l | still_hired:bool | y:i

390266432 | 1959-08-19T00:00:00.000Z | 10015 | Guoxiang | null | 1.66 | 1.659999966621399 | 1.66015625 | 1.6600000000000001 | 1987-07-02T00:00:00.000Z | [false, false, false, true]| [Head Human Resources, Junior Developer, Principal Support Engineer, Support Engineer] | 5 | 5 | 5 | 5 | Nooteboom | 25324 | [12.4, 14.25] | [12, 14] | [12.40, 14.25] | [12, 14] | true | 25324
203838153 | 1953-02-08T00:00:00.000Z | 10035 | null | M | 1.81 | 1.809999942779541 | 1.8095703125 | 1.81 | 1988-09-05T00:00:00.000Z | false | [Data Scientist, Senior Python Developer] | 5 | 5 | 5 | 5 | Chappelet | 25945 | [-6.58, -2.54] | [-6, -2] | [-2.54, -6.58] | [-6, -2] | false | 25945
313407352 | 1964-10-18T00:00:00.000Z | 10092 | Valdiodio | F | 1.75 | 1.75 | 1.75 | 1.75 | 1989-09-22T00:00:00.000Z | [false, false, true, true] | [Accountant, Junior Developer] | 1 | 1 | 1 | 1 | Niizuma | 25976 | [-6.77, 0.39, 8.3, 8.78] | [-6, 0, 8, 8] | [-6.77, 0.39, 8.30,8.78] | [-6, 0, 8, 8] | false | 25976
248451647 | null | 10048 | Florian | M | 2.0 | 2.0 | 2.0 | 2.0 | 1985-02-24T00:00:00.000Z | [true, true] | Internship | 3 | 3 | 3 | 3 | Syrotiuk | 26436 | null | null | null | null | false | 26436
324356269 | 1954-05-30T00:00:00.000Z | 10057 | Ebbe | F | 1.59 | 1.590000033378601 | 1.58984375 | 1.59 | 1992-01-15T00:00:00.000Z | null | [Head Human Resources, Python Developer] | 4 | 4 | 4 | 4 | Callaway | 27215 | [-6.73, -5.27, -2.43, 1.03] | [-6, -5, -2, 1] | [-2.43, -5.27, -6.73, 1.03]| [-6, -5, -2, 1] | true | 27215
359067056 | 1960-05-25T00:00:00.000Z | 10084 | Tuval | M | 1.51 | 1.5099999904632568 | 1.509765625 | 1.51 | 1995-12-15T00:00:00.000Z | false | Principal Support Engineer | 1 | 1 | 1 | 1 | Kalloufi | 28035 | null | null | null | null | true | 28035
359208133 | 1953-04-03T00:00:00.000Z | 10026 | Yongqiao | M | 2.1 | 2.0999999046325684 | 2.099609375 | 2.1 | 1995-03-20T00:00:00.000Z | [false, true] | Reporting Analyst | null | null | null | null | Berztiss | 28336 | [-7.37, 10.62, 11.20] | [-7, 10, 11] | [-7.37, 10.62, 11.20] | [-7, 10, 11] | true | 28336
233999584 | 1962-11-26T00:00:00.000Z | 10068 | Charlene | M | 1.58 | 1.5800000429153442 | 1.580078125 | 1.58 | 1987-08-07T00:00:00.000Z | true | Architect | 3 | 3 | 3 | 3 | Brattka | 28941 | [-5.61, -5.29, 3.43] | [-5, -5, 3] | [-5.29, -5.61, 3.43] | [-5, -5, 3] | true | 28941
341158890 | 1961-10-15T00:00:00.000Z | 10060 | Breannda | M | 1.42 | 1.4199999570846558 | 1.419921875 | 1.42 | 1987-11-02T00:00:00.000Z | [false, false, false, true]| [Business Analyst, Data Scientist, Senior Team Lead] | 2 | 2 | 2 | 2 | Billingsley | 29175 | [-1.76, -0.85] | [-1, 0] | [-0.85, -1.76] | [-1, 0] | true | 29175
246355863 | null | 10042 | Magy | F | 1.44 | 1.440000057220459 | 1.4404296875 | 1.44 | 1993-03-21T00:00:00.000Z | null | [Architect, Business Analyst, Internship, Junior Developer] | 3 | 3 | 3 | 3 | Stamatiou | 30404 | [-9.28, 9.42] | [-9, 9] | [-9.28, 9.42] | [-9, 9] | true | 30404
;
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,13 @@ protected List<Batch<LogicalPlan>> batches() {
}

protected static List<Batch<LogicalPlan>> rules() {
var substitutions = new Batch<>("Substitutions", Limiter.ONCE, new SubstituteSurrogates(), new ReplaceRegexMatch()
// new ReplaceTextFieldAttributesWithTheKeywordSubfield()
var substitutions = new Batch<>(
"Substitutions",
Limiter.ONCE,
new SubstituteSurrogates(),
new ReplaceRegexMatch(),
new ReplaceAliasingEvalWithProject()
// new ReplaceTextFieldAttributesWithTheKeywordSubfield()
);

var operators = new Batch<>(
Expand Down Expand Up @@ -851,4 +856,85 @@ protected Expression regexToEquals(RegexMatch<?> regexMatch, Literal literal) {
return new Equals(regexMatch.source(), regexMatch.field(), literal);
}
}

/**
* Replace aliasing evals (eval x=a) with a projection which can be further combined / simplified.
* The rule gets applied only if there's another project (Project/Stats) above it.
*
* Needs to take into account shadowing of potentially intermediate fields:
* eval x = a + 1, y = x, z = y + 1, y = z, w = y + 1
* The output should be
* eval x = a + 1, z = a + 1 + 1, w = a + 1 + 1
* project x, z, z as y, w
*/
static class ReplaceAliasingEvalWithProject extends Rule<LogicalPlan, LogicalPlan> {

@Override
public LogicalPlan apply(LogicalPlan logicalPlan) {
Holder<Boolean> enabled = new Holder<>(false);

return logicalPlan.transformDown(p -> {
// found projection, turn enable flag on
if (p instanceof Aggregate || p instanceof Project) {
enabled.set(true);
} else if (enabled.get() && p instanceof Eval eval) {
p = rule(eval);
}

return p;
});
}

private LogicalPlan rule(Eval eval) {
LogicalPlan plan = eval;

// holds simple aliases such as b = a, c = b, d = c
AttributeMap<Expression> basicAliases = new AttributeMap<>();
// same as above but keeps the original expression
AttributeMap<NamedExpression> basicAliasSources = new AttributeMap<>();

List<Alias> keptFields = new ArrayList<>();

var fields = eval.fields();
for (int i = 0, size = fields.size(); i < size; i++) {
Alias field = fields.get(i);
Expression child = field.child();
var attribute = field.toAttribute();
// put the aliases in a separate map to separate the underlying resolve from other aliases
if (child instanceof Attribute) {
basicAliases.put(attribute, child);
basicAliasSources.put(attribute, field);
} else {
// be lazy and start replacing name aliases only if needed
if (basicAliases.size() > 0) {
// update the child through the field
field = (Alias) field.transformUp(e -> basicAliases.resolve(e, e));
}
keptFields.add(field);
}
}

// at least one alias encountered, move it into a project
if (basicAliases.size() > 0) {
// preserve the eval output (takes care of shadowing and order) but replace the basic aliases
List<NamedExpression> projections = new ArrayList<>(eval.output());
// replace the removed aliases with their initial definition - however use the output to preserve the shadowing
for (int i = projections.size() - 1; i >= 0; i--) {
NamedExpression project = projections.get(i);
projections.set(i, basicAliasSources.getOrDefault(project, project));
}

LogicalPlan child = eval.child();
if (keptFields.size() > 0) {
// replace the eval with just the kept fields
child = new Eval(eval.source(), eval.child(), keptFields);
}
// put the projection in place
plan = new Project(eval.source(), child, projections);
}

return plan;
}

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -111,9 +111,10 @@ public Layout build() {
for (NameId id : set.nameIds) {
ChannelAndType next = new ChannelAndType(channel, set.type);
ChannelAndType prev = layout.put(id, next);
if (prev != null) {
throw new IllegalArgumentException("Name [" + id + "] is on two channels [" + prev + "] and [" + next + "]");
}
// Do allow multiple name to point to the same channel - see https://github.com/elastic/elasticsearch/pull/100238
// if (prev != null) {
// throw new IllegalArgumentException("Name [" + id + "] is on two channels [" + prev + "] and [" + next + "]");
// }
}
}
return new DefaultLayout(Collections.unmodifiableMap(layout), numberOfChannels);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
import org.elasticsearch.xpack.esql.plan.physical.EsStatsQueryExec;
import org.elasticsearch.xpack.esql.plan.physical.EsStatsQueryExec.Stat;
import org.elasticsearch.xpack.esql.plan.physical.EstimatesRowSize;
import org.elasticsearch.xpack.esql.plan.physical.EvalExec;
import org.elasticsearch.xpack.esql.plan.physical.ExchangeExec;
import org.elasticsearch.xpack.esql.plan.physical.LimitExec;
import org.elasticsearch.xpack.esql.plan.physical.PhysicalPlan;
Expand Down Expand Up @@ -208,27 +207,31 @@ public void testCountFieldWithFilter() {
}

/**
* Expects - for now
* Expects
* LimitExec[500[INTEGER]]
* \_AggregateExec[[],[COUNT(hidden_s{r}#8) AS c],FINAL,null]
* \_AggregateExec[[],[COUNT(salary{f}#20) AS c],FINAL,null]
* \_ExchangeExec[[count{r}#25, seen{r}#26],true]
* \_AggregateExec[[],[COUNT(hidden_s{r}#8) AS c],PARTIAL,8]
* \_EvalExec[[salary{f}#20 AS s, s{r}#3 AS hidden_s]]
* \_FieldExtractExec[salary{f}#20]
* \_EsQueryExec[test], query[{"esql_single_value":{"field":"emp_no","next":{"range":{"emp_no":{"lt":10050,"boost":1.0}}}}}]
* [_doc{f}#42], limit[], sort[] estimatedRowSize[16]
* \_EsStatsQueryExec[test], stats[Stat[name=salary, type=COUNT, query={
* "exists" : {
* "field" : "salary",
* "boost" : 1.0
* }
*/
// TODO: the eval is not yet optimized away
public void testCountFieldWithEval() {
var plan = plan("""
from test | eval s = salary | rename s as sr | eval hidden_s = sr | rename emp_no as e | where e < 10050
| stats c = count(hidden_s)
""", IS_SV_STATS);

var limit = as(plan, LimitExec.class);
var agg = as(limit.child(), AggregateExec.class);
var exg = as(agg.child(), ExchangeExec.class);
agg = as(exg.child(), AggregateExec.class);
var eval = as(agg.child(), EvalExec.class);
var esStatsQuery = as(exg.child(), EsStatsQueryExec.class);

assertThat(esStatsQuery.limit(), is(nullValue()));
assertThat(Expressions.names(esStatsQuery.output()), contains("count", "seen"));
var stat = as(esStatsQuery.stats().get(0), Stat.class);
assertThat(stat.query(), is(QueryBuilders.existsQuery("salary")));
}

// optimized doesn't know yet how to push down count over field
Expand Down
Loading

0 comments on commit 2ff5a45

Please sign in to comment.