From c02c55ee8b28b3ee9883b3855552f60366160f91 Mon Sep 17 00:00:00 2001 From: Bogdan Pintea Date: Fri, 22 Nov 2024 14:33:03 +0100 Subject: [PATCH] Add docs for aggs filtering (#116681) (#117335) Add documentation for aggs filtering (the WHERE in STATS command). Fixes: #115083 --- .../esql/processing-commands/stats.asciidoc | 49 +++++++++++++++---- .../src/main/resources/stats.csv-spec | 36 ++++++++++++++ 2 files changed, 75 insertions(+), 10 deletions(-) diff --git a/docs/reference/esql/processing-commands/stats.asciidoc b/docs/reference/esql/processing-commands/stats.asciidoc index 0c479c1f62b76..3ed296fb6db24 100644 --- a/docs/reference/esql/processing-commands/stats.asciidoc +++ b/docs/reference/esql/processing-commands/stats.asciidoc @@ -1,16 +1,18 @@ [discrete] [[esql-stats-by]] -=== `STATS ... BY` +=== `STATS` -The `STATS ... BY` processing command groups rows according to a common value +The `STATS` processing command groups rows according to a common value and calculates one or more aggregated values over the grouped rows. **Syntax** [source,esql] ---- -STATS [column1 =] expression1[, ..., [columnN =] expressionN] -[BY grouping_expression1[, ..., grouping_expressionN]] +STATS [column1 =] expression1 [WHERE boolean_expression1][, + ..., + [columnN =] expressionN [WHERE boolean_expressionN]] + [BY grouping_expression1[, ..., grouping_expressionN]] ---- *Parameters* @@ -28,14 +30,18 @@ An expression that computes an aggregated value. An expression that outputs the values to group by. If its name coincides with one of the computed columns, that column will be ignored. +`boolean_expressionX`:: +The condition that must be met for a row to be included in the evaluation of `expressionX`. + NOTE: Individual `null` values are skipped when computing aggregations. *Description* -The `STATS ... BY` processing command groups rows according to a common value -and calculate one or more aggregated values over the grouped rows. 
If `BY` is -omitted, the output table contains exactly one row with the aggregations applied -over the entire dataset. +The `STATS` processing command groups rows according to a common value +and calculates one or more aggregated values over the grouped rows. For the +calculation of each aggregated value, the rows in a group can be filtered with +`WHERE`. If `BY` is omitted, the output table contains exactly one row with +the aggregations applied over the entire dataset. The following <<esql-agg-functions,aggregation functions>> are supported: @@ -90,6 +96,29 @@ include::{esql-specs}/stats.csv-spec[tag=statsCalcMultipleValues] include::{esql-specs}/stats.csv-spec[tag=statsCalcMultipleValues-result] |=== +To filter the rows that go into an aggregation, use the `WHERE` clause: + +[source.merge.styled,esql] +---- +include::{esql-specs}/stats.csv-spec[tag=aggFiltering] +---- +[%header.monospaced.styled,format=dsv,separator=|] +|=== +include::{esql-specs}/stats.csv-spec[tag=aggFiltering-result] +|=== + +Aggregations with and without a filter can be mixed, and grouping is +optional as well: + +[source.merge.styled,esql] +---- +include::{esql-specs}/stats.csv-spec[tag=aggFilteringNoGroup] +---- +[%header.monospaced.styled,format=dsv,separator=|] +|=== +include::{esql-specs}/stats.csv-spec[tag=aggFilteringNoGroup-result] +|=== + [[esql-stats-mv-group]] If the grouping key is multivalued then the input row is in all groups: @@ -109,7 +138,7 @@ It's also possible to group by multiple values: include::{esql-specs}/stats.csv-spec[tag=statsGroupByMultipleValues] ---- -If the all grouping keys are multivalued then the input row is in all groups: +If all the grouping keys are multivalued then the input row is in all groups: [source.merge.styled,esql] ---- @@ -121,7 +150,7 @@ include::{esql-specs}/stats.csv-spec[tag=multi-mv-group-result] |=== Both the aggregating functions and the grouping expressions accept other -functions. This is useful for using `STATS...BY` on multivalue columns. +functions. 
This is useful for using `STATS` on multivalue columns. For example, to calculate the average salary change, you can use `MV_AVG` to first average the multiple values per employee, and use the result with the `AVG` function: diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/stats.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/stats.csv-spec index b2333c077400d..859f06ed5f22e 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/stats.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/stats.csv-spec @@ -2348,6 +2348,42 @@ v:integer | job_positions:keyword 10094 | Accountant ; +docsStatsWithSimpleFiltering +required_capability: per_agg_filtering +// tag::aggFiltering[] +FROM employees +| STATS avg50s = AVG(salary)::LONG WHERE birth_date < "1960-01-01", + avg60s = AVG(salary)::LONG WHERE birth_date >= "1960-01-01" + BY gender +| SORT gender +// end::aggFiltering[] +| WHERE gender IS NOT NULL +; + +// tag::aggFiltering-result[] +avg50s:long |avg60s:long |gender:keyword +55462 |46637 |F +48279 |44879 |M +// end::aggFiltering-result[] +; + +docsStatsWithFilteringNoGroups +required_capability: per_agg_filtering +// tag::aggFilteringNoGroup[] +FROM employees +| EVAL Ks = salary / 1000 // thousands +| STATS under_40K = COUNT(*) WHERE Ks < 40, + inbetween = COUNT(*) WHERE 40 <= Ks AND Ks < 60, + over_60K = COUNT(*) WHERE 60 <= Ks, + total = COUNT(*) +// end::aggFilteringNoGroup[] +; + +// tag::aggFilteringNoGroup-result[] +under_40K:long |inbetween:long |over_60K:long |total:long +36 |39 |25 |100 +// end::aggFilteringNoGroup-result[] +; statsWithFiltering required_capability: per_agg_filtering