Add Student's t-test aggregation support

Adds t_test metric aggregation that can perform paired and unpaired two-sample t-tests. In this PR support for filters in unpaired is still missing. It will be added in a follow-up PR. Relates to elastic#53692
imotov · Mar 30, 2020 · 70c36a2 · 70c36a2
1 parent 04c39ae
commit 70c36a2
Show file tree

Hide file tree

Showing 26 changed files with 2,447 additions and 5 deletions.
diff --git a/docs/build.gradle b/docs/build.gradle
@@ -539,6 +539,41 @@ for (int i = 0; i < 100; i++) {
             {"load_time": "$value"}"""
 }
 
+// Used by t_test aggregations
+buildRestTests.setups['node_upgrade'] = '''
+  - do:
+        indices.create:
+          index: node_upgrade
+          body:
+            settings:
+              number_of_shards: 1
+              number_of_replicas: 1
+            mappings:
+              properties:
+                name:
+                  type: keyword
+                startup_time_before:
+                  type: long
+                startup_time_after:
+                  type: long
+  - do:
+        bulk:
+          index: node_upgrade
+          refresh: true
+          body: |
+            {"index":{}}
+            {"name": "A", "startup_time_before": 102, "startup_time_after": 89}
+            {"index":{}}
+            {"name": "B", "startup_time_before": 99, "startup_time_after": 93}
+            {"index":{}}
+            {"name": "C", "startup_time_before": 111, "startup_time_after": 72}
+            {"index":{}}
+            {"name": "D", "startup_time_before": 97, "startup_time_after": 98}
+            {"index":{}}
+            {"name": "E", "startup_time_before": 101, "startup_time_after": 102}
+            {"index":{}}
+            {"name": "F", "startup_time_before": 99, "startup_time_after": 98}'''
+
 // Used by iprange agg
 buildRestTests.setups['iprange'] = '''
   - do:

diff --git a/docs/reference/aggregations/metrics.asciidoc b/docs/reference/aggregations/metrics.asciidoc
@@ -49,7 +49,7 @@ include::metrics/median-absolute-deviation-aggregation.asciidoc[]
 
 include::metrics/boxplot-aggregation.asciidoc[]
 
-
+include::metrics/t-test-aggregation.asciidoc[]
 
 
 

diff --git a/docs/reference/aggregations/metrics/t-test-aggregation.asciidoc b/docs/reference/aggregations/metrics/t-test-aggregation.asciidoc
@@ -0,0 +1,111 @@
+[role="xpack"]
+[testenv="basic"]
+[[search-aggregations-metrics-ttest-aggregation]]
+=== TTest Aggregation
+
+A `t_test` metrics aggregation that performs a statistical hypothesis test in which the test statistic follows a Student's t-distribution
+under the null hypothesis on numeric values extracted from the aggregated documents or generated by provided scripts.
+
+==== Syntax
+
+A `t_test` aggregation looks like this in isolation:
+
+[source,js]
+--------------------------------------------------
+{
+    "t_test": {
+        "a": "value_before",
+        "b": "value_after",
+        "type": "paired"
+    }
+}
+--------------------------------------------------
+// NOTCONSOLE
+
+Assuming that we have a record of node startup times before
+and after an upgrade, let's use a t-test to see if the upgrade affected
+the node startup time in a meaningful way.
+
+[source,console]
+--------------------------------------------------
+GET node_upgrade/_search
+{
+    "size": 0,
+    "aggs" : {
+        "startup_time_ttest" : {
+            "t_test" : {
+                "a" : {"field": "startup_time_before" }, <1>
+                "b" : {"field": "startup_time_after"}, <2>
+                "type": "paired" <3>
+            }
+        }
+    }
+}
+--------------------------------------------------
+// TEST[setup:node_upgrade]
+<1> The field `startup_time_before` must be a numeric field
+<2> The field `startup_time_after` must be a numeric field
+<3> Since we have data from the same nodes, we are using a paired t-test.
+
+The response will look like this:
+
+[source,console-result]
+--------------------------------------------------
+{
+    ...
+
+   "aggregations": {
+      "startup_time_ttest": {
+         "value":  0.1914368843365979
+      }
+   }
+}
+--------------------------------------------------
+// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/]
+
+==== T-Test Types
+
+The `t_test` aggregation supports unpaired and paired two-sample t-tests. The type of the test can be specified using the `type` parameter:
+
+`"type": "paired"`:: performs paired t-test
+`"type": "homoscedastic"`:: performs two-sample equal variance test
+`"type": "heteroscedastic"`:: performs two-sample unequal variance test (this is the default)
+
+==== Script
+
+The `t_test` metric supports scripting.  For example, if we need to adjust the startup times recorded before the upgrade, we could use
+a script to recalculate them on-the-fly:
+
+[source,console]
+--------------------------------------------------
+GET node_upgrade/_search
+{
+    "size": 0,
+    "aggs" : {
+        "startup_time_ttest" : {
+            "t_test" : {
+                "a": {
+                    "script" : {
+                        "lang": "painless",
+                        "source": "doc['startup_time_before'].value - params.adjustment", <1>
+                        "params" : {
+                            "adjustment" : 10   <2>
+                        }
+                    }
+                },
+                "b": {
+                    "field": "startup_time_after" <3>
+                },
+                "type": "paired"
+            }
+        }
+    }
+}
+--------------------------------------------------
+// TEST[setup:node_upgrade]
+
+<1> The `field` parameter is replaced with a `script` parameter, which uses the
+script to generate the values on which the t-test is calculated
+<2> Scripting supports parameterized input just like any other script
+<3> We can mix scripts and fields
+
diff --git a/x-pack/plugin/analytics/build.gradle b/x-pack/plugin/analytics/build.gradle
@@ -18,6 +18,8 @@ dependencies {
 
   compileOnly project(path: xpackModule('core'), configuration: 'default')
   testCompile project(path: xpackModule('core'), configuration: 'testArtifacts')
+
+  compile 'org.apache.commons:commons-math3:3.2'
 }
 
 integTest.enabled = false
diff --git a/x-pack/plugin/analytics/licenses/commons-math3-3.2.jar.sha1 b/x-pack/plugin/analytics/licenses/commons-math3-3.2.jar.sha1
@@ -0,0 +1 @@
+ec2544ab27e110d2d431bdad7d538ed509b21e62
Original file line number	Diff line number	Diff line change
Expand Up		@@ -49,7 +49,7 @@ include::metrics/median-absolute-deviation-aggregation.asciidoc[]

		include::metrics/boxplot-aggregation.asciidoc[]


		include::metrics/t-test-aggregation.asciidoc[]



Expand Down