From c02c55ee8b28b3ee9883b3855552f60366160f91 Mon Sep 17 00:00:00 2001
From: Bogdan Pintea <bogdan.pintea@elastic.co>
Date: Fri, 22 Nov 2024 14:33:03 +0100
Subject: [PATCH] Add docs for aggs filtering (#116681) (#117335)

Add documentation for aggs filtering (the WHERE in STATS command).

Fixes: #115083
---
 .../esql/processing-commands/stats.asciidoc   | 49 +++++++++++++++----
 .../src/main/resources/stats.csv-spec         | 36 ++++++++++++++
 2 files changed, 75 insertions(+), 10 deletions(-)

diff --git a/docs/reference/esql/processing-commands/stats.asciidoc b/docs/reference/esql/processing-commands/stats.asciidoc
index 0c479c1f62b76..3ed296fb6db24 100644
--- a/docs/reference/esql/processing-commands/stats.asciidoc
+++ b/docs/reference/esql/processing-commands/stats.asciidoc
@@ -1,16 +1,18 @@
 [discrete]
 [[esql-stats-by]]
-=== `STATS ... BY`
+=== `STATS`
 
-The `STATS ... BY` processing command groups rows according to a common value
+The `STATS` processing command groups rows according to a common value
 and calculates one or more aggregated values over the grouped rows.
 
 **Syntax**
 
 [source,esql]
 ----
-STATS [column1 =] expression1[, ..., [columnN =] expressionN]
-[BY grouping_expression1[, ..., grouping_expressionN]]
+STATS [column1 =] expression1 [WHERE boolean_expression1][,
+      ...,
+      [columnN =] expressionN [WHERE boolean_expressionN]]
+      [BY grouping_expression1[, ..., grouping_expressionN]]
 ----
 
 *Parameters*
@@ -28,14 +30,18 @@ An expression that computes an aggregated value.
 An expression that outputs the values to group by.
 If its name coincides with one of the computed columns, that column will be ignored.
 
+`boolean_expressionX`::
+The condition that must be met for a row to be included in the evaluation of `expressionX`.
+
 NOTE: Individual `null` values are skipped when computing aggregations.
 
 *Description*
 
-The `STATS ... BY` processing command groups rows according to a common value
-and calculate one or more aggregated values over the grouped rows. If `BY` is
-omitted, the output table contains exactly one row with the aggregations applied
-over the entire dataset.
+The `STATS` processing command groups rows according to a common value
+and calculates one or more aggregated values over the grouped rows. For the
+calculation of each aggregated value, the rows in a group can be filtered with
+`WHERE`. If `BY` is omitted, the output table contains exactly one row with
+the aggregations applied over the entire dataset.
 
 The following <<esql-agg-functions,aggregation functions>> are supported:
 
@@ -90,6 +96,29 @@ include::{esql-specs}/stats.csv-spec[tag=statsCalcMultipleValues]
 include::{esql-specs}/stats.csv-spec[tag=statsCalcMultipleValues-result]
 |===
 
+To filter the rows that go into an aggregation, use the `WHERE` clause:
+
+[source.merge.styled,esql]
+----
+include::{esql-specs}/stats.csv-spec[tag=aggFiltering]
+----
+[%header.monospaced.styled,format=dsv,separator=|]
+|===
+include::{esql-specs}/stats.csv-spec[tag=aggFiltering-result]
+|===
+
+Aggregations with and without a filter can be mixed, and grouping is
+optional as well:
+
+[source.merge.styled,esql]
+----
+include::{esql-specs}/stats.csv-spec[tag=aggFilteringNoGroup]
+----
+[%header.monospaced.styled,format=dsv,separator=|]
+|===
+include::{esql-specs}/stats.csv-spec[tag=aggFilteringNoGroup-result]
+|===
+
 [[esql-stats-mv-group]]
 If the grouping key is multivalued then the input row is in all groups:
 
@@ -109,7 +138,7 @@ It's also possible to group by multiple values:
 include::{esql-specs}/stats.csv-spec[tag=statsGroupByMultipleValues]
 ----
 
-If the all grouping keys are multivalued then the input row is in all groups:
+If all the grouping keys are multivalued then the input row is in all groups:
 
 [source.merge.styled,esql]
 ----
@@ -121,7 +150,7 @@ include::{esql-specs}/stats.csv-spec[tag=multi-mv-group-result]
 |===
 
 Both the aggregating functions and the grouping expressions accept other
-functions. This is useful for using `STATS...BY` on multivalue columns.
+functions. This is useful for using `STATS` on multivalue columns.
 For example, to calculate the average salary change, you can use `MV_AVG` to
 first average the multiple values per employee, and use the result with the
 `AVG` function:
diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/stats.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/stats.csv-spec
index b2333c077400d..859f06ed5f22e 100644
--- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/stats.csv-spec
+++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/stats.csv-spec
@@ -2348,6 +2348,42 @@ v:integer | job_positions:keyword
     10094 | Accountant
 ;
 
+docsStatsWithSimpleFiltering
+required_capability: per_agg_filtering
+// tag::aggFiltering[]
+FROM employees
+| STATS avg50s = AVG(salary)::LONG WHERE birth_date < "1960-01-01",
+        avg60s = AVG(salary)::LONG WHERE birth_date >= "1960-01-01"
+        BY gender
+| SORT gender
+// end::aggFiltering[]
+| WHERE gender IS NOT NULL
+;
+
+// tag::aggFiltering-result[]
+avg50s:long    |avg60s:long    |gender:keyword
+55462          |46637          |F
+48279          |44879          |M
+// end::aggFiltering-result[]
+;
+
+docsStatsWithFilteringNoGroups
+required_capability: per_agg_filtering
+// tag::aggFilteringNoGroup[]
+FROM employees
+| EVAL Ks = salary / 1000 // thousands
+| STATS under_40K = COUNT(*) WHERE Ks < 40,
+        inbetween = COUNT(*) WHERE 40 <= Ks AND Ks < 60,
+        over_60K  = COUNT(*) WHERE 60 <= Ks,
+        total     = COUNT(*)
+// end::aggFilteringNoGroup[]
+;
+
+// tag::aggFilteringNoGroup-result[]
+under_40K:long |inbetween:long |over_60K:long  |total:long
+36             |39             |25             |100
+// end::aggFilteringNoGroup-result[]
+;
 
 statsWithFiltering
 required_capability: per_agg_filtering