From 70c36a210a23f726a9094d2510199454a29a1e93 Mon Sep 17 00:00:00 2001 From: Igor Motov Date: Mon, 30 Mar 2020 18:07:53 -0400 Subject: [PATCH] Add Student's t-test aggregation support Adds t_test metric aggregation that can perform paired and unpaired two-sample t-tests. In this PR support for filters in unpaired is still missing. It will be added in a follow-up PR. Relates to #53692 --- docs/build.gradle | 35 ++ docs/reference/aggregations/metrics.asciidoc | 2 +- .../metrics/t-test-aggregation.asciidoc | 111 ++++ x-pack/plugin/analytics/build.gradle | 2 + .../licenses/commons-math3-3.2.jar.sha1 | 1 + .../licenses/commons-math3-LICENSE.txt | 475 +++++++++++++++ .../licenses/commons-math3-NOTICE.txt | 9 + .../xpack/analytics/AnalyticsPlugin.java | 20 +- .../xpack/analytics/AnalyticsUsage.java | 6 +- .../xpack/analytics/ttest/InternalTTest.java | 101 ++++ .../ttest/PairedTTestAggregator.java | 91 +++ .../analytics/ttest/PairedTTestState.java | 88 +++ .../xpack/analytics/ttest/TStatsBuilder.java | 75 +++ .../xpack/analytics/ttest/TTest.java | 15 + .../ttest/TTestAggregationBuilder.java | 140 +++++ .../analytics/ttest/TTestAggregator.java | 67 +++ .../ttest/TTestAggregatorFactory.java | 81 +++ .../xpack/analytics/ttest/TTestState.java | 20 + .../xpack/analytics/ttest/TTestStats.java | 85 +++ .../xpack/analytics/ttest/TTestType.java | 25 + .../ttest/UnpairedTTestAggregator.java | 99 ++++ .../analytics/ttest/UnpairedTTestState.java | 114 ++++ .../analytics/ttest/InternalTTestTests.java | 132 +++++ .../ttest/TTestAggregationBuilderTests.java | 84 +++ .../analytics/ttest/TTestAggregatorTests.java | 558 ++++++++++++++++++ .../action/AnalyticsStatsAction.java | 16 +- 26 files changed, 2447 insertions(+), 5 deletions(-) create mode 100644 docs/reference/aggregations/metrics/t-test-aggregation.asciidoc create mode 100644 x-pack/plugin/analytics/licenses/commons-math3-3.2.jar.sha1 create mode 100644 x-pack/plugin/analytics/licenses/commons-math3-LICENSE.txt create mode 100644 
x-pack/plugin/analytics/licenses/commons-math3-NOTICE.txt create mode 100644 x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/InternalTTest.java create mode 100644 x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/PairedTTestAggregator.java create mode 100644 x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/PairedTTestState.java create mode 100644 x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TStatsBuilder.java create mode 100644 x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTest.java create mode 100644 x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestAggregationBuilder.java create mode 100644 x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestAggregator.java create mode 100644 x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestAggregatorFactory.java create mode 100644 x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestState.java create mode 100644 x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestStats.java create mode 100644 x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestType.java create mode 100644 x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/UnpairedTTestAggregator.java create mode 100644 x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/UnpairedTTestState.java create mode 100644 x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/ttest/InternalTTestTests.java create mode 100644 x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/ttest/TTestAggregationBuilderTests.java create mode 100644 x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/ttest/TTestAggregatorTests.java diff --git 
a/docs/build.gradle b/docs/build.gradle index 0deeb0ea9cdd2..79576a7fb2b1a 100644 --- a/docs/build.gradle +++ b/docs/build.gradle @@ -539,6 +539,41 @@ for (int i = 0; i < 100; i++) { {"load_time": "$value"}""" } +// Used by t_test aggregations +buildRestTests.setups['node_upgrade'] = ''' + - do: + indices.create: + index: node_upgrade + body: + settings: + number_of_shards: 1 + number_of_replicas: 1 + mappings: + properties: + name: + type: keyword + startup_time_before: + type: long + startup_time_after: + type: long + - do: + bulk: + index: node_upgrade + refresh: true + body: | + {"index":{}} + {"name": "A", "startup_time_before": 102, "startup_time_after": 89} + {"index":{}} + {"name": "B", "startup_time_before": 99, "startup_time_after": 93} + {"index":{}} + {"name": "C", "startup_time_before": 111, "startup_time_after": 72} + {"index":{}} + {"name": "D", "startup_time_before": 97, "startup_time_after": 98} + {"index":{}} + {"name": "E", "startup_time_before": 101, "startup_time_after": 102} + {"index":{}} + {"name": "F", "startup_time_before": 99, "startup_time_after": 98}''' + // Used by iprange agg buildRestTests.setups['iprange'] = ''' - do: diff --git a/docs/reference/aggregations/metrics.asciidoc b/docs/reference/aggregations/metrics.asciidoc index 6c518a2f6cd82..4b176e64e96e9 100644 --- a/docs/reference/aggregations/metrics.asciidoc +++ b/docs/reference/aggregations/metrics.asciidoc @@ -49,7 +49,7 @@ include::metrics/median-absolute-deviation-aggregation.asciidoc[] include::metrics/boxplot-aggregation.asciidoc[] - +include::metrics/t-test-aggregation.asciidoc[] diff --git a/docs/reference/aggregations/metrics/t-test-aggregation.asciidoc b/docs/reference/aggregations/metrics/t-test-aggregation.asciidoc new file mode 100644 index 0000000000000..573cb2095a08e --- /dev/null +++ b/docs/reference/aggregations/metrics/t-test-aggregation.asciidoc @@ -0,0 +1,111 @@ +[role="xpack"] +[testenv="basic"] +[[search-aggregations-metrics-ttest-aggregation]] +=== TTest 
Aggregation + +A `t_test` metrics aggregation that performs a statistical hypothesis test in which the test statistic follows a Student's t-distribution +under the null hypothesis on numeric values extracted from the aggregated documents or generated by provided scripts. + +==== Syntax + +A `t_test` aggregation looks like this in isolation: + +[source,js] +-------------------------------------------------- +{ + "t_test": { + "a": "value_before", + "b": "value_after", + "type": "paired" + } +} +-------------------------------------------------- +// NOTCONSOLE + +Assuming that we have a record of node start up times before +and after upgrade, let's look at a t-test to see if the upgrade affected +the node start up time in a meaningful way. + +[source,console] +-------------------------------------------------- +GET node_upgrade/_search +{ + "size": 0, + "aggs" : { + "startup_time_ttest" : { + "t_test" : { + "a" : {"field": "startup_time_before" }, <1> + "b" : {"field": "startup_time_after"}, <2> + "type": "paired" <3> + } + } + } +} +-------------------------------------------------- +// TEST[setup:node_upgrade] +<1> The field `startup_time_before` must be a numeric field +<2> The field `startup_time_after` must be a numeric field +<3> Since we have data from the same nodes, we are using a paired t-test. + +The response will look like this: + +[source,console-result] +-------------------------------------------------- +{ + ... + + "aggregations": { + "startup_time_ttest": { + "value": 0.1914368843365979 + } + } +} +-------------------------------------------------- +// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/] + +==== T-Test Types + +The `t_test` aggregation supports unpaired and paired two-sample t-tests. 
The type of the test can be specified using the `type` parameter: + +`"type": "paired"`:: performs paired t-test +`"type": "homoscedastic"`:: performs two-sample equal variance test +`"type": "heteroscedastic"`:: performs two-sample unequal variance test (this is the default) + +==== Script + +The `t_test` metric supports scripting. For example, if we need to adjust our load times for the before values, we could use +a script to recalculate them on-the-fly: + +[source,console] +-------------------------------------------------- +GET node_upgrade/_search +{ + "size": 0, + "aggs" : { + "startup_time_ttest" : { + "t_test" : { + "a": { + "script" : { + "lang": "painless", + "source": "doc['startup_time_before'].value - params.adjustment", <1> + "params" : { + "adjustment" : 10 <2> + } + } + }, + "b": { + "field": "startup_time_after" <3> + }, + "type": "paired" + } + } + } +} +-------------------------------------------------- +// TEST[setup:node_upgrade] + +<1> The `field` parameter is replaced with a `script` parameter, which uses the +script to generate values on which the t-test is calculated +<2> Scripting supports parameterized input just like any other script +<3> We can mix scripts and fields + diff --git a/x-pack/plugin/analytics/build.gradle b/x-pack/plugin/analytics/build.gradle index 889a3235bdeb7..9edd16498d80d 100644 --- a/x-pack/plugin/analytics/build.gradle +++ b/x-pack/plugin/analytics/build.gradle @@ -18,6 +18,8 @@ dependencies { compileOnly project(path: xpackModule('core'), configuration: 'default') testCompile project(path: xpackModule('core'), configuration: 'testArtifacts') + + compile 'org.apache.commons:commons-math3:3.2' } integTest.enabled = false diff --git a/x-pack/plugin/analytics/licenses/commons-math3-3.2.jar.sha1 b/x-pack/plugin/analytics/licenses/commons-math3-3.2.jar.sha1 new file mode 100644 index 0000000000000..9d0b27affe5d6 --- /dev/null +++ b/x-pack/plugin/analytics/licenses/commons-math3-3.2.jar.sha1 @@ -0,0 +1 @@ 
+ec2544ab27e110d2d431bdad7d538ed509b21e62 \ No newline at end of file diff --git a/x-pack/plugin/analytics/licenses/commons-math3-LICENSE.txt b/x-pack/plugin/analytics/licenses/commons-math3-LICENSE.txt new file mode 100644 index 0000000000000..28b134f5f8e4d --- /dev/null +++ b/x-pack/plugin/analytics/licenses/commons-math3-LICENSE.txt @@ -0,0 +1,475 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + + +Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was +derived from unicode conversion examples available at +http://www.unicode.org/Public/PROGRAMS/CVTUTF. Here is the copyright +from those sources: + +/* + * Copyright 2001-2004 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + + +Some code in core/src/java/org/apache/lucene/util/ArrayUtil.java was +derived from Python 2.4.2 sources available at +http://www.python.org. 
Full license is here: + + http://www.python.org/download/releases/2.4.2/license/ + +Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was +derived from Python 3.1.2 sources available at +http://www.python.org. Full license is here: + + http://www.python.org/download/releases/3.1.2/license/ + +Some code in core/src/java/org/apache/lucene/util/automaton was +derived from Brics automaton sources available at +www.brics.dk/automaton/. Here is the copyright from those sources: + +/* + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +The levenshtein automata tables in core/src/java/org/apache/lucene/util/automaton +were automatically generated with the moman/finenight FSA package. +Here is the copyright for those sources: + +# Copyright (c) 2010, Jean-Philippe Barrette-LaPierre, +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was +derived from ICU (http://www.icu-project.org) +The full license is available here: + http://source.icu-project.org/repos/icu/icu/trunk/license.html + +/* + * Copyright (C) 1999-2010, International Business Machines + * Corporation and others. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, and/or sell copies of the + * Software, and to permit persons to whom the Software is furnished to do so, + * provided that the above copyright notice(s) and this permission notice appear + * in all copies of the Software and that both the above copyright notice(s) and + * this permission notice appear in supporting documentation. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE + * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR + * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER + * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall not + * be used in advertising or otherwise to promote the sale, use or other + * dealings in this Software without prior written authorization of the + * copyright holder. + */ + +The following license applies to the Snowball stemmers: + +Copyright (c) 2001, Dr Martin Porter +Copyright (c) 2002, Richard Boulton +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holders nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The following license applies to the KStemmer: + +Copyright © 2003, +Center for Intelligent Information Retrieval, +University of Massachusetts, Amherst. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. 
The names "Center for Intelligent Information Retrieval" and +"University of Massachusetts" must not be used to endorse or promote products +derived from this software without prior written permission. To obtain +permission, contact info@ciir.cs.umass.edu. + +THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. + +The following license applies to the Morfologik project: + +Copyright (c) 2006 Dawid Weiss +Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of Morfologik nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +--- + +The dictionary comes from Morfologik project. Morfologik uses data from +Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and +is licenced on the terms of (inter alia) LGPL and Creative Commons +ShareAlike. The part-of-speech tags were added in Morfologik project and +are not found in the data from sjp.pl. The tagset is similar to IPI PAN +tagset. + +--- + +The following license applies to the Morfeusz project, +used by org.apache.lucene.analysis.morfologik. + +BSD-licensed dictionary of Polish (SGJP) +http://sgjp.pl/morfeusz/ + +Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński, + Marcin Woliński, Robert Wołosz + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. 
+ +THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS +OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/x-pack/plugin/analytics/licenses/commons-math3-NOTICE.txt b/x-pack/plugin/analytics/licenses/commons-math3-NOTICE.txt new file mode 100644 index 0000000000000..587cd7f1da7dd --- /dev/null +++ b/x-pack/plugin/analytics/licenses/commons-math3-NOTICE.txt @@ -0,0 +1,9 @@ +Apache Commons Math +Copyright 2001-2020 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). 
+ +This product includes software developed for Orekit by +CS Systèmes d'Information (http://www.c-s.fr/) +Copyright 2010-2012 CS Systèmes d'Information diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/AnalyticsPlugin.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/AnalyticsPlugin.java index 295cb91ab0bd9..7d006efefa2d5 100644 --- a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/AnalyticsPlugin.java +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/AnalyticsPlugin.java @@ -41,6 +41,11 @@ import org.elasticsearch.xpack.analytics.topmetrics.InternalTopMetrics; import org.elasticsearch.xpack.analytics.topmetrics.TopMetricsAggregationBuilder; import org.elasticsearch.xpack.analytics.topmetrics.TopMetricsAggregatorFactory; +import org.elasticsearch.xpack.analytics.ttest.InternalTTest; +import org.elasticsearch.xpack.analytics.ttest.PairedTTestState; +import org.elasticsearch.xpack.analytics.ttest.TTestAggregationBuilder; +import org.elasticsearch.xpack.analytics.ttest.TTestState; +import org.elasticsearch.xpack.analytics.ttest.UnpairedTTestState; import org.elasticsearch.xpack.core.XPackField; import org.elasticsearch.xpack.core.XPackPlugin; import org.elasticsearch.xpack.core.action.XPackInfoFeatureAction; @@ -94,7 +99,12 @@ public List getAggregations() { TopMetricsAggregationBuilder.NAME, TopMetricsAggregationBuilder::new, usage.track(AnalyticsUsage.Item.TOP_METRICS, checkLicense(TopMetricsAggregationBuilder.PARSER))) - .addResultReader(InternalTopMetrics::new) + .addResultReader(InternalTopMetrics::new), + new AggregationSpec( + TTestAggregationBuilder.NAME, + TTestAggregationBuilder::new, + usage.track(AnalyticsUsage.Item.T_TEST, checkLicense(TTestAggregationBuilder.PARSER))) + .addResultReader(InternalTTest::new) ); } @@ -130,6 +140,14 @@ public Collection createComponents(Client client, ClusterService cluster return singletonList(new 
AnalyticsUsage()); } + @Override + public List getNamedWriteables() { + return Arrays.asList( + new NamedWriteableRegistry.Entry(TTestState.class, PairedTTestState.NAME, PairedTTestState::new), + new NamedWriteableRegistry.Entry(TTestState.class, UnpairedTTestState.NAME, UnpairedTTestState::new) + ); + } + private static ContextParser checkLicense(ContextParser realParser) { return (parser, name) -> { if (getLicenseState().isDataScienceAllowed() == false) { diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/AnalyticsUsage.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/AnalyticsUsage.java index 508966301c704..02ca990a08f61 100644 --- a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/AnalyticsUsage.java +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/AnalyticsUsage.java @@ -25,7 +25,8 @@ public enum Item { BOXPLOT, CUMULATIVE_CARDINALITY, STRING_STATS, - TOP_METRICS; + TOP_METRICS, + T_TEST; } private final Map trackers = new EnumMap<>(Item.class); @@ -54,6 +55,7 @@ public AnalyticsStatsAction.NodeResponse stats(DiscoveryNode node) { trackers.get(Item.BOXPLOT).get(), trackers.get(Item.CUMULATIVE_CARDINALITY).get(), trackers.get(Item.STRING_STATS).get(), - trackers.get(Item.TOP_METRICS).get()); + trackers.get(Item.TOP_METRICS).get(), + trackers.get(Item.T_TEST).get()); } } diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/InternalTTest.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/InternalTTest.java new file mode 100644 index 0000000000000..789618ed62561 --- /dev/null +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/InternalTTest.java @@ -0,0 +1,101 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.analytics.ttest; + +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.metrics.InternalNumericMetricsAggregation; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +public class InternalTTest extends InternalNumericMetricsAggregation.SingleValue implements TTest { + + protected final TTestState state; + + InternalTTest(String name, TTestState state, DocValueFormat formatter, + List pipelineAggregators, Map metaData) { + super(name, pipelineAggregators, metaData); + this.state = state; + this.format = formatter; + } + + /** + * Read from a stream. 
+ */ + public InternalTTest(StreamInput in) throws IOException { + super(in); + format = in.readNamedWriteable(DocValueFormat.class); + state = in.readNamedWriteable(TTestState.class); + } + + @Override + protected void doWriteTo(StreamOutput out) throws IOException { + out.writeNamedWriteable(format); + out.writeNamedWriteable(state); + } + + @Override + public String getWriteableName() { + return TTestAggregationBuilder.NAME; + } + + // for testing only + DocValueFormat format() { + return format; + } + + @Override + public InternalTTest reduce(List aggregations, ReduceContext reduceContext) { + TTestState reduced = state.reduce(aggregations.stream().map(a -> ((InternalTTest) a).state)); + return new InternalTTest(name, reduced, format, pipelineAggregators(), getMetaData()); + } + + @Override + public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { + double value = state.getValue(); + boolean hasValue = Double.isNaN(value) == false; + builder.field(CommonFields.VALUE.getPreferredName(), hasValue ? 
value : null); + if (hasValue && format != DocValueFormat.RAW) { + builder.field(CommonFields.VALUE_AS_STRING.getPreferredName(), format.format(value).toString()); + } + return builder; + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), state); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null || getClass() != obj.getClass()) return false; + if (super.equals(obj) == false) return false; + + InternalTTest that = (InternalTTest) obj; + return Objects.equals(state, that.state); + } + + @Override + public double value() { + return state.getValue(); + } + + @Override + public double getValue() { + return state.getValue(); + } + +} + diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/PairedTTestAggregator.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/PairedTTestAggregator.java new file mode 100644 index 0000000000000..b6c2078dd38c1 --- /dev/null +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/PairedTTestAggregator.java @@ -0,0 +1,91 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +package org.elasticsearch.xpack.analytics.ttest; + +import org.apache.lucene.index.LeafReaderContext; +import org.elasticsearch.common.lease.Releasables; +import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.index.fielddata.SortedNumericDoubleValues; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.AggregationExecutionException; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.LeafBucketCollector; +import org.elasticsearch.search.aggregations.LeafBucketCollectorBase; +import org.elasticsearch.search.aggregations.metrics.CompensatedSum; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; +import org.elasticsearch.search.aggregations.support.MultiValuesSource; +import org.elasticsearch.search.internal.SearchContext; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import static org.elasticsearch.xpack.analytics.ttest.TTestAggregationBuilder.A_FIELD; +import static org.elasticsearch.xpack.analytics.ttest.TTestAggregationBuilder.B_FIELD; + +public class PairedTTestAggregator extends TTestAggregator { + private TStatsBuilder statsBuilder; + + PairedTTestAggregator(String name, MultiValuesSource.NumericMultiValuesSource valuesSources, int tails, DocValueFormat format, + SearchContext context, Aggregator parent, List pipelineAggregators, + Map metaData) throws IOException { + super(name, valuesSources, tails, format, context, parent, pipelineAggregators, metaData); + statsBuilder = new TStatsBuilder(context.bigArrays()); + } + + @Override + protected PairedTTestState getState(long bucket) { + return new PairedTTestState(statsBuilder.get(bucket), tails); + } + + @Override + protected PairedTTestState getEmptyState() { + return new PairedTTestState(new TTestStats(0, 0, 0), tails); + } + + @Override + protected long size() { + return statsBuilder.getSize(); + } + + @Override + public 
LeafBucketCollector getLeafCollector(LeafReaderContext ctx, + final LeafBucketCollector sub) throws IOException { + if (valuesSources == null) { + return LeafBucketCollector.NO_OP_COLLECTOR; + } + final BigArrays bigArrays = context.bigArrays(); + final SortedNumericDoubleValues docAValues = valuesSources.getField(A_FIELD.getPreferredName(), ctx); + final SortedNumericDoubleValues docBValues = valuesSources.getField(B_FIELD.getPreferredName(), ctx); + final CompensatedSum compDiffSum = new CompensatedSum(0, 0); + final CompensatedSum compDiffSumOfSqr = new CompensatedSum(0, 0); + + return new LeafBucketCollectorBase(sub, docAValues) { + @Override + public void collect(int doc, long bucket) throws IOException { + statsBuilder.grow(bigArrays, bucket + 1); + if (docAValues.advanceExact(doc) && docBValues.advanceExact(doc)) { + if (docAValues.docValueCount() > 1 || docBValues.docValueCount() > 1) { + throw new AggregationExecutionException("Encountered more than one value for a " + + "single document. Use a script to combine multiple values per doc into a single value."); + } + // There should always be one value if advanceExact lands us here, either + // a real value or a `missing` value + assert docAValues.docValueCount() == 1; + assert docBValues.docValueCount() == 1; + double diff = docAValues.nextValue() - docBValues.nextValue(); + statsBuilder.addValue(compDiffSum, compDiffSumOfSqr, bucket, diff); + } + } + }; + } + + @Override + public void doClose() { + Releasables.close(statsBuilder); + } +} diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/PairedTTestState.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/PairedTTestState.java new file mode 100644 index 0000000000000..f61067f30ec84 --- /dev/null +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/PairedTTestState.java @@ -0,0 +1,88 @@ +/* + * Copyright Elasticsearch B.V. 
and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.analytics.ttest; + +import org.apache.commons.math3.distribution.TDistribution; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; + +import java.io.IOException; +import java.util.Objects; +import java.util.stream.Stream; + +public class PairedTTestState implements TTestState { + + public static final String NAME = "P"; + + private final TTestStats stats; + + private final int tails; + + public PairedTTestState(TTestStats stats, int tails) { + this.stats = stats; + this.tails = tails; + } + + public PairedTTestState(StreamInput in) throws IOException { + stats = new TTestStats(in); + tails = in.readVInt(); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + stats.writeTo(out); + out.writeVInt(tails); + } + + @Override + public double getValue() { + if (stats.count < 2) { + return Double.NaN; + } + long n = stats.count - 1; + double meanDiff = stats.sum / stats.count; + double variance = (stats.sumOfSqrs - ((stats.sum * stats.sum) / stats.count)) / stats.count; + if (variance <= 0.0) { + return meanDiff == 0.0 ? 
Double.NaN : 0.0; + } + double stdDiv = Math.sqrt(variance); + double stdErr = stdDiv / Math.sqrt(n); + double t = Math.abs(meanDiff / stdErr); + TDistribution dist = new TDistribution(n); + return dist.cumulativeProbability(-t) * tails; + } + + @Override + public TTestState reduce(Stream states) { + TTestStats.Reducer reducer = new TTestStats.Reducer(); + states.forEach(tTestState -> { + PairedTTestState state = (PairedTTestState) tTestState; + reducer.accept(state.stats); + assert state.tails == tails; + }); + return new PairedTTestState(reducer.result(), tails); + } + + @Override + public String getWriteableName() { + return NAME; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + PairedTTestState that = (PairedTTestState) o; + return tails == that.tails && + stats.equals(that.stats); + } + + @Override + public int hashCode() { + return Objects.hash(stats, tails); + } +} diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TStatsBuilder.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TStatsBuilder.java new file mode 100644 index 0000000000000..4c0f02787eda7 --- /dev/null +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TStatsBuilder.java @@ -0,0 +1,75 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +package org.elasticsearch.xpack.analytics.ttest; + +import org.elasticsearch.common.lease.Releasable; +import org.elasticsearch.common.lease.Releasables; +import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.common.util.DoubleArray; +import org.elasticsearch.common.util.LongArray; +import org.elasticsearch.search.aggregations.metrics.CompensatedSum; + +public class TStatsBuilder implements Releasable { + + private LongArray counts; + private DoubleArray sums; + private DoubleArray compensations; + private DoubleArray sumOfSqrs; + private DoubleArray sumOfSqrCompensations; + + TStatsBuilder(BigArrays bigArrays) { + counts = bigArrays.newLongArray(1, true); + sums = bigArrays.newDoubleArray(1, true); + compensations = bigArrays.newDoubleArray(1, true); + sumOfSqrs = bigArrays.newDoubleArray(1, true); + sumOfSqrCompensations = bigArrays.newDoubleArray(1, true); + } + + public TTestStats get(long bucket) { + return new TTestStats(counts.get(bucket), sums.get(bucket), sumOfSqrs.get(bucket)); + } + + public long build(long bucket) { + return counts.get(bucket); + } + + public long getSize() { + return counts.size(); + } + + public void grow(BigArrays bigArrays, long buckets) { + counts = bigArrays.grow(counts, buckets); + sums = bigArrays.grow(sums, buckets); + compensations = bigArrays.grow(compensations, buckets); + sumOfSqrs = bigArrays.grow(sumOfSqrs, buckets); + sumOfSqrCompensations = bigArrays.grow(sumOfSqrCompensations, buckets); + } + + public void addValue(CompensatedSum compSum, CompensatedSum compSumOfSqr, long bucket, double val) { + counts.increment(bucket, 1); + double sum = sums.get(bucket); + double compensation = compensations.get(bucket); + compSum.reset(sum, compensation); + + double sumOfSqr = sumOfSqrs.get(bucket); + double sumOfSqrCompensation = sumOfSqrCompensations.get(bucket); + compSumOfSqr.reset(sumOfSqr, sumOfSqrCompensation); + + compSum.add(val); + compSumOfSqr.add(val * val); + + sums.set(bucket, 
compSum.value()); + compensations.set(bucket, compSum.delta()); + sumOfSqrs.set(bucket, compSumOfSqr.value()); + sumOfSqrCompensations.set(bucket, compSumOfSqr.delta()); + } + + @Override + public void close() { + Releasables.close(counts, sums, compensations, sumOfSqrs, sumOfSqrCompensations); + } +} diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTest.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTest.java new file mode 100644 index 0000000000000..df3b266654f52 --- /dev/null +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTest.java @@ -0,0 +1,15 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.analytics.ttest; + +import org.elasticsearch.search.aggregations.metrics.NumericMetricsAggregation; + +public interface TTest extends NumericMetricsAggregation.SingleValue { + + double getValue(); + +} diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestAggregationBuilder.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestAggregationBuilder.java new file mode 100644 index 0000000000000..4c0c621ac040e --- /dev/null +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestAggregationBuilder.java @@ -0,0 +1,140 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +package org.elasticsearch.xpack.analytics.ttest; + +import org.elasticsearch.common.ParseField; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.xcontent.ObjectParser; +import org.elasticsearch.common.xcontent.ToXContent; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.index.query.QueryShardContext; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.AggregationBuilder; +import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.AggregatorFactory; +import org.elasticsearch.search.aggregations.support.CoreValuesSourceType; +import org.elasticsearch.search.aggregations.support.MultiValuesSourceAggregationBuilder; +import org.elasticsearch.search.aggregations.support.MultiValuesSourceAggregatorFactory; +import org.elasticsearch.search.aggregations.support.MultiValuesSourceFieldConfig; +import org.elasticsearch.search.aggregations.support.MultiValuesSourceParseHelper; +import org.elasticsearch.search.aggregations.support.ValueType; +import org.elasticsearch.search.aggregations.support.ValuesSourceConfig; +import org.elasticsearch.search.aggregations.support.ValuesSourceType; + +import java.io.IOException; +import java.util.Map; +import java.util.Objects; + +public class TTestAggregationBuilder extends MultiValuesSourceAggregationBuilder.LeafOnly { + public static final String NAME = "t_test"; + public static final ParseField A_FIELD = new ParseField("a"); + public static final ParseField B_FIELD = new ParseField("b"); + public static final ParseField TYPE_FIELD = new ParseField("type"); + public static final ParseField TAILS_FIELD = new ParseField("tails"); + + public static final ObjectParser PARSER = + ObjectParser.fromBuilder(NAME, TTestAggregationBuilder::new); + + static { + MultiValuesSourceParseHelper.declareCommon(PARSER, true, 
ValueType.NUMERIC); + MultiValuesSourceParseHelper.declareField(A_FIELD.getPreferredName(), PARSER, true, false); + MultiValuesSourceParseHelper.declareField(B_FIELD.getPreferredName(), PARSER, true, false); + PARSER.declareString(TTestAggregationBuilder::testType, TYPE_FIELD); + PARSER.declareInt(TTestAggregationBuilder::tails, TAILS_FIELD); + + } + + private TTestType testType = TTestType.HETEROSCEDASTIC; + + private int tails = 2; + + public TTestAggregationBuilder(String name) { + super(name); + } + + public TTestAggregationBuilder(TTestAggregationBuilder clone, + AggregatorFactories.Builder factoriesBuilder, + Map metaData) { + super(clone, factoriesBuilder, metaData); + } + + public TTestAggregationBuilder a(MultiValuesSourceFieldConfig valueConfig) { + field(A_FIELD.getPreferredName(), Objects.requireNonNull(valueConfig, "Configuration for field [" + A_FIELD + "] cannot be null")); + return this; + } + + public TTestAggregationBuilder b(MultiValuesSourceFieldConfig weightConfig) { + field(B_FIELD.getPreferredName(), Objects.requireNonNull(weightConfig, "Configuration for field [" + B_FIELD + "] cannot be null")); + return this; + } + + public TTestAggregationBuilder testType(String testType) { + return testType(TTestType.resolve(Objects.requireNonNull(testType, "Test type cannot be null"))); + } + + public TTestAggregationBuilder testType(TTestType testType) { + this.testType = Objects.requireNonNull(testType, "Test type cannot be null"); + return this; + } + + public TTestAggregationBuilder tails(int tails) { + if (tails < 1 || tails > 2) { + throw new IllegalArgumentException( + "[tails] must be 1 or 2. 
Found [" + tails + "] in [" + name + "]"); + } + this.tails = tails; + return this; + } + + public TTestAggregationBuilder(StreamInput in) throws IOException { + super(in); + testType = in.readEnum(TTestType.class); + tails = in.readVInt(); + } + + @Override + protected AggregationBuilder shallowCopy(AggregatorFactories.Builder factoriesBuilder, Map metaData) { + return new TTestAggregationBuilder(this, factoriesBuilder, metaData); + } + + @Override + public BucketCardinality bucketCardinality() { + return BucketCardinality.NONE; + } + + @Override + protected void innerWriteTo(StreamOutput out) throws IOException { + out.writeEnum(testType); + out.writeVInt(tails); + } + + @Override + protected ValuesSourceType defaultValueSourceType() { + return CoreValuesSourceType.NUMERIC; + } + + @Override + protected MultiValuesSourceAggregatorFactory innerBuild( + QueryShardContext queryShardContext, + Map configs, + DocValueFormat format, + AggregatorFactory parent, + AggregatorFactories.Builder subFactoriesBuilder) throws IOException { + return new TTestAggregatorFactory(name, configs, testType, tails, format, queryShardContext, parent, subFactoriesBuilder, metaData); + } + + @Override + public XContentBuilder doXContentBody(XContentBuilder builder, ToXContent.Params params) throws IOException { + return builder; + } + + @Override + public String getType() { + return NAME; + } +} diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestAggregator.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestAggregator.java new file mode 100644 index 0000000000000..b4c36f8ec920e --- /dev/null +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestAggregator.java @@ -0,0 +1,67 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.analytics.ttest; + +import org.apache.lucene.search.ScoreMode; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.metrics.NumericMetricsAggregator; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; +import org.elasticsearch.search.aggregations.support.MultiValuesSource; +import org.elasticsearch.search.internal.SearchContext; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + + +public abstract class TTestAggregator extends NumericMetricsAggregator.SingleValue { + + protected final MultiValuesSource.NumericMultiValuesSource valuesSources; + protected final int tails; + + private DocValueFormat format; + + TTestAggregator(String name, MultiValuesSource.NumericMultiValuesSource valuesSources, int tails, DocValueFormat format, + SearchContext context, Aggregator parent, + List pipelineAggregators, Map metaData) throws IOException { + super(name, context, parent, pipelineAggregators, metaData); + this.valuesSources = valuesSources; + this.tails = tails; + this.format = format; + } + + @Override + public ScoreMode scoreMode() { + return valuesSources != null && valuesSources.needsScores() ? 
ScoreMode.COMPLETE : ScoreMode.COMPLETE_NO_SCORES; + } + + protected abstract T getState(long bucket); + + protected abstract T getEmptyState(); + + protected abstract long size(); + + @Override + public InternalAggregation buildAggregation(long bucket) { + if (valuesSources == null || bucket >= size()) { + return buildEmptyAggregation(); + } + return new InternalTTest(name, getState(bucket), format, pipelineAggregators(), metaData()); + } + + @Override + public InternalAggregation buildEmptyAggregation() { + return new InternalTTest(name, getEmptyState(), format, pipelineAggregators(), metaData()); + } + + @Override + public double metric(long owningBucketOrd) { + return getState(owningBucketOrd).getValue(); + } +} diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestAggregatorFactory.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestAggregatorFactory.java new file mode 100644 index 0000000000000..07098b86f6009 --- /dev/null +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestAggregatorFactory.java @@ -0,0 +1,81 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +package org.elasticsearch.xpack.analytics.ttest; + +import org.elasticsearch.index.query.QueryShardContext; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.AggregatorFactory; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; +import org.elasticsearch.search.aggregations.support.MultiValuesSource; +import org.elasticsearch.search.aggregations.support.MultiValuesSourceAggregatorFactory; +import org.elasticsearch.search.aggregations.support.ValuesSourceConfig; +import org.elasticsearch.search.internal.SearchContext; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +class TTestAggregatorFactory extends MultiValuesSourceAggregatorFactory { + + private final TTestType testType; + private final int tails; + + TTestAggregatorFactory(String name, Map configs, TTestType testType, int tails, + DocValueFormat format, QueryShardContext queryShardContext, AggregatorFactory parent, + AggregatorFactories.Builder subFactoriesBuilder, + Map metaData) throws IOException { + super(name, configs, format, queryShardContext, parent, subFactoriesBuilder, metaData); + this.testType = testType; + this.tails = tails; + } + + @Override + protected Aggregator createUnmapped(SearchContext searchContext, + Aggregator parent, + List pipelineAggregators, + Map metaData) throws IOException { + switch (testType) { + case PAIRED: + return new PairedTTestAggregator(name, null, tails, format, searchContext, parent, pipelineAggregators, metaData); + case HOMOSCEDASTIC: + return new UnpairedTTestAggregator(name, null, tails, true, format, searchContext, parent, pipelineAggregators, metaData); + case HETEROSCEDASTIC: + return new UnpairedTTestAggregator(name, null, tails, false, format, searchContext, parent, pipelineAggregators, metaData); + default: + throw new 
UnsupportedOperationException("Unsupported t-test type " + testType); + } + } + + @Override + protected Aggregator doCreateInternal(SearchContext searchContext, + Map configs, + DocValueFormat format, + Aggregator parent, + boolean collectsFromSingleBucket, + List pipelineAggregators, + Map metaData) throws IOException { + MultiValuesSource.NumericMultiValuesSource numericMultiVS + = new MultiValuesSource.NumericMultiValuesSource(configs, queryShardContext); + if (numericMultiVS.areValuesSourcesEmpty()) { + return createUnmapped(searchContext, parent, pipelineAggregators, metaData); + } + switch (testType) { + case PAIRED: + return new PairedTTestAggregator(name, numericMultiVS, tails, format, searchContext, parent, pipelineAggregators, metaData); + case HOMOSCEDASTIC: + return new UnpairedTTestAggregator(name, numericMultiVS, tails, true, format, searchContext, parent, pipelineAggregators, + metaData); + case HETEROSCEDASTIC: + return new UnpairedTTestAggregator(name, numericMultiVS, tails, false, format, searchContext, parent, pipelineAggregators, + metaData); + default: + throw new UnsupportedOperationException("Unsupported t-test type " + testType); + } + } +} diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestState.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestState.java new file mode 100644 index 0000000000000..60606fad1b5ae --- /dev/null +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestState.java @@ -0,0 +1,20 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +package org.elasticsearch.xpack.analytics.ttest; + +import org.elasticsearch.common.io.stream.NamedWriteable; + +import java.util.stream.Stream; + +/** + * Base class for t-test aggregation state + */ +public interface TTestState extends NamedWriteable { + double getValue(); + + TTestState reduce(Stream states); +} diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestStats.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestStats.java new file mode 100644 index 0000000000000..55f7c545d460e --- /dev/null +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestStats.java @@ -0,0 +1,85 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.analytics.ttest; + +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.search.aggregations.metrics.CompensatedSum; + +import java.io.IOException; +import java.util.Objects; +import java.util.function.Consumer; + +/** + * Collects basic stats that are needed to perform t-test + */ +public class TTestStats implements Writeable { + public final long count; + public final double sum; + public final double sumOfSqrs; + + public TTestStats(long count, double sum, double sumOfSqrs) { + this.count = count; + this.sum = sum; + this.sumOfSqrs = sumOfSqrs; + } + + public TTestStats(StreamInput in) throws IOException { + count = in.readVLong(); + sum = in.readDouble(); + sumOfSqrs = in.readDouble(); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeVLong(count); + out.writeDouble(sum); + out.writeDouble(sumOfSqrs); + } + 
+ public double variance() { + double v = (sumOfSqrs - ((sum * sum) / count)) / (count - 1); + return v < 0 ? 0 : v; + } + + public double average() { + return sum / count; + } + + public static class Reducer implements Consumer { + private long count = 0; + CompensatedSum compSum = new CompensatedSum(0, 0); + CompensatedSum compSumOfSqrs = new CompensatedSum(0, 0); + + @Override + public void accept(TTestStats stat) { + count += stat.count; + compSum.add(stat.sum); + compSumOfSqrs.add(stat.sumOfSqrs); + } + + public TTestStats result() { + return new TTestStats(count, compSum.value(), compSumOfSqrs.value()); + } + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + TTestStats that = (TTestStats) o; + return count == that.count && + Double.compare(that.sum, sum) == 0 && + Double.compare(that.sumOfSqrs, sumOfSqrs) == 0; + } + + @Override + public int hashCode() { + return Objects.hash(count, sum, sumOfSqrs); + } +} diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestType.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestType.java new file mode 100644 index 0000000000000..95053391ccd1c --- /dev/null +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/TTestType.java @@ -0,0 +1,25 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
/**
 * The flavour of two-sample t-test to run: paired, or unpaired with equal
 * (homoscedastic) or unequal (heteroscedastic) variances.
 */
public enum TTestType {
    PAIRED, HOMOSCEDASTIC, HETEROSCEDASTIC;

    /**
     * Parses a case-insensitive type name into the matching constant.
     * Locale.ROOT keeps the conversion independent of the default locale.
     */
    public static TTestType resolve(String name) {
        return valueOf(name.toUpperCase(Locale.ROOT));
    }

    /**
     * The lower-case representation of this type, as used on the wire and in requests.
     */
    public String value() {
        return name().toLowerCase(Locale.ROOT);
    }
}
+ */ + +package org.elasticsearch.xpack.analytics.ttest; + +import org.apache.lucene.index.LeafReaderContext; +import org.elasticsearch.common.lease.Releasables; +import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.index.fielddata.SortedNumericDoubleValues; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.LeafBucketCollector; +import org.elasticsearch.search.aggregations.LeafBucketCollectorBase; +import org.elasticsearch.search.aggregations.metrics.CompensatedSum; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; +import org.elasticsearch.search.aggregations.support.MultiValuesSource; +import org.elasticsearch.search.internal.SearchContext; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import static org.elasticsearch.xpack.analytics.ttest.TTestAggregationBuilder.A_FIELD; +import static org.elasticsearch.xpack.analytics.ttest.TTestAggregationBuilder.B_FIELD; + +public class UnpairedTTestAggregator extends TTestAggregator { + private final TStatsBuilder a; + private final TStatsBuilder b; + private final boolean homoscedastic; + + UnpairedTTestAggregator(String name, MultiValuesSource.NumericMultiValuesSource valuesSources, int tails, boolean homoscedastic, + DocValueFormat format, SearchContext context, Aggregator parent, List pipelineAggregators, + Map metaData) throws IOException { + super(name, valuesSources, tails, format, context, parent, pipelineAggregators, metaData); + BigArrays bigArrays = context.bigArrays(); + a = new TStatsBuilder(bigArrays); + b = new TStatsBuilder(bigArrays); + this.homoscedastic = homoscedastic; + } + + @Override + protected UnpairedTTestState getState(long bucket) { + return new UnpairedTTestState(a.get(bucket), b.get(bucket), homoscedastic, tails); + } + + @Override + protected UnpairedTTestState getEmptyState() { + return new UnpairedTTestState(new 
TTestStats(0, 0, 0), new TTestStats(0, 0, 0), homoscedastic, tails); + } + + @Override + protected long size() { + return a.getSize(); + } + + @Override + public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, + final LeafBucketCollector sub) throws IOException { + if (valuesSources == null) { + return LeafBucketCollector.NO_OP_COLLECTOR; + } + final BigArrays bigArrays = context.bigArrays(); + final SortedNumericDoubleValues docAValues = valuesSources.getField(A_FIELD.getPreferredName(), ctx); + final SortedNumericDoubleValues docBValues = valuesSources.getField(B_FIELD.getPreferredName(), ctx); + final CompensatedSum compSumA = new CompensatedSum(0, 0); + final CompensatedSum compSumOfSqrA = new CompensatedSum(0, 0); + final CompensatedSum compSumB = new CompensatedSum(0, 0); + final CompensatedSum compSumOfSqrB = new CompensatedSum(0, 0); + + return new LeafBucketCollectorBase(sub, docAValues) { + + private void processValues(int doc, long bucket, SortedNumericDoubleValues docValues, CompensatedSum compSum, + CompensatedSum compSumOfSqr, TStatsBuilder builder) throws IOException { + if (docValues.advanceExact(doc)) { + final int numValues = docValues.docValueCount(); + for (int i = 0; i < numValues; i++) { + builder.addValue(compSum, compSumOfSqr, bucket, docValues.nextValue()); + } + } + } + + @Override + public void collect(int doc, long bucket) throws IOException { + a.grow(bigArrays, bucket + 1); + b.grow(bigArrays, bucket + 1); + processValues(doc, bucket, docAValues, compSumA, compSumOfSqrA, a); + processValues(doc, bucket, docBValues, compSumB, compSumOfSqrB, b); + } + }; + } + + @Override + public void doClose() { + Releasables.close(a, b); + } +} diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/UnpairedTTestState.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/UnpairedTTestState.java new file mode 100644 index 0000000000000..eb707d7883a34 --- /dev/null +++ 
b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/ttest/UnpairedTTestState.java @@ -0,0 +1,114 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.analytics.ttest; + +import org.apache.commons.math3.distribution.TDistribution; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; + +import java.io.IOException; +import java.util.Objects; +import java.util.stream.Stream; + +public class UnpairedTTestState implements TTestState { + + public static final String NAME = "U"; + + private final TTestStats a; + private final TTestStats b; + private boolean homoscedastic; + private int tails; + + public UnpairedTTestState(TTestStats a, TTestStats b, boolean homoscedastic, int tails) { + this.a = a; + this.b = b; + this.homoscedastic = homoscedastic; + this.tails = tails; + } + + public UnpairedTTestState(StreamInput in) throws IOException { + a = new TTestStats(in); + b = new TTestStats(in); + homoscedastic = in.readBoolean(); + tails = in.readVInt(); + } + + @Override + public double getValue() { + if (a.count < 2 || b.count < 2) { + return Double.NaN; + } + + if (homoscedastic) { + long n = a.count + b.count - 2; + double variance = ((a.count - 1) * a.variance() + (b.count - 1) * b.variance()) / n; + double nn = (1.0 / a.count + 1.0 / b.count); + return p(variance * nn, n); + } else { + double s2an = a.variance() / a.count; + double s2bn = b.variance() / b.count; + double variance = s2an + s2bn; + double degreeOfFreedom = variance * variance / (s2an * s2an / (a.count - 1) + s2bn * s2bn / (b.count - 1)); + return p(variance, degreeOfFreedom); + } + } + + private double p(double sd2, double degreesOfFreedom) { + if (degreesOfFreedom < 0) { + return Double.NaN; + } 
+ double sd = Math.sqrt(sd2); + double meanDiff = a.average() - b.average(); + double t = Math.abs(meanDiff / sd); + TDistribution dist = new TDistribution(degreesOfFreedom); + return dist.cumulativeProbability(-t) * tails; + } + + + @Override + public TTestState reduce(Stream states) { + TTestStats.Reducer reducerA = new TTestStats.Reducer(); + TTestStats.Reducer reducerB = new TTestStats.Reducer(); + states.forEach(tTestState -> { + UnpairedTTestState state = (UnpairedTTestState) tTestState; + assert state.homoscedastic == homoscedastic; + assert state.tails == tails; + reducerA.accept(state.a); + reducerB.accept(state.b); + }); + return new UnpairedTTestState(reducerA.result(), reducerB.result(), homoscedastic, tails); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + a.writeTo(out); + b.writeTo(out); + out.writeBoolean(homoscedastic); + out.writeVInt(tails); + } + + @Override + public String getWriteableName() { + return NAME; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + UnpairedTTestState that = (UnpairedTTestState) o; + return homoscedastic == that.homoscedastic && + tails == that.tails && + a.equals(that.a) && + b.equals(that.b); + } + + @Override + public int hashCode() { + return Objects.hash(a, b, homoscedastic, tails); + } +} diff --git a/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/ttest/InternalTTestTests.java b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/ttest/InternalTTestTests.java new file mode 100644 index 0000000000000..89ed4f0c0f711 --- /dev/null +++ b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/ttest/InternalTTestTests.java @@ -0,0 +1,132 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.analytics.ttest; + +import org.elasticsearch.common.ParseField; +import org.elasticsearch.common.io.stream.BytesStreamOutput; +import org.elasticsearch.common.io.stream.NamedWriteableAwareStreamInput; +import org.elasticsearch.common.io.stream.NamedWriteableRegistry; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.xcontent.NamedXContentRegistry; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.SearchModule; +import org.elasticsearch.search.aggregations.Aggregation; +import org.elasticsearch.search.aggregations.ParsedAggregation; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; +import org.elasticsearch.test.InternalAggregationTestCase; +import org.elasticsearch.xpack.analytics.AnalyticsPlugin; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static java.util.Collections.emptyList; + +public class InternalTTestTests extends InternalAggregationTestCase { + + private TTestType type = randomFrom(TTestType.values()); + private int tails = randomIntBetween(1, 2); + + @Override + protected InternalTTest createTestInstance(String name, List pipelineAggregators, + Map metaData) { + TTestState state = randomState(); + DocValueFormat formatter = randomNumericDocValueFormat(); + return new InternalTTest(name, state, formatter, pipelineAggregators, metaData); + } + + private TTestState randomState() { + if (type == TTestType.PAIRED) { + return new PairedTTestState(randomStats(), tails); + } else { + return new UnpairedTTestState(randomStats(), randomStats(), type == TTestType.HOMOSCEDASTIC, tails); + } + } + + private TTestStats 
randomStats() { + return new TTestStats(randomNonNegativeLong(), randomDouble(), randomDouble()); + } + + @Override + protected Writeable.Reader instanceReader() { + return InternalTTest::new; + } + + @Override + protected void assertReduced(InternalTTest reduced, List inputs) { + TTestState expected = reduced.state.reduce(inputs.stream().map(a -> a.state)); + assertNotNull(expected); + assertEquals(expected.getValue(), reduced.getValue(), 0.00001); + } + + @Override + protected void assertFromXContent(InternalTTest min, ParsedAggregation parsedAggregation) { + // There is no ParsedTTest yet so we cannot test it here + } + + @Override + protected InternalTTest mutateInstance(InternalTTest instance) { + String name = instance.getName(); + TTestState state; + try (BytesStreamOutput output = new BytesStreamOutput()) { + output.writeNamedWriteable(instance.state); + try (StreamInput in = new NamedWriteableAwareStreamInput(output.bytes().streamInput(), getNamedWriteableRegistry())) { + state = in.readNamedWriteable(TTestState.class); + } + } catch (IOException ex) { + throw new IllegalStateException(ex); + } + DocValueFormat formatter = instance.format(); + List pipelineAggregators = instance.pipelineAggregators(); + Map metaData = instance.getMetaData(); + switch (between(0, 2)) { + case 0: + name += randomAlphaOfLength(5); + break; + case 1: + state = randomState(); + break; + case 2: + if (metaData == null) { + metaData = new HashMap<>(1); + } else { + metaData = new HashMap<>(instance.getMetaData()); + } + metaData.put(randomAlphaOfLength(15), randomInt()); + break; + default: + throw new AssertionError("Illegal randomisation branch"); + } + return new InternalTTest(name, state, formatter, pipelineAggregators, metaData); + } + + @Override + protected List getNamedXContents() { + List extendedNamedXContents = new ArrayList<>(super.getNamedXContents()); + extendedNamedXContents.add(new NamedXContentRegistry.Entry(Aggregation.class, + new 
ParseField(TTestAggregationBuilder.NAME), + (p, c) -> { + assumeTrue("There is no ParsedTTest yet", false); + return null; + } + )); + return extendedNamedXContents; + } + + @Override + protected NamedWriteableRegistry getNamedWriteableRegistry() { + List entries = new ArrayList<>(); + entries.addAll(new SearchModule(Settings.EMPTY, emptyList()).getNamedWriteables()); + entries.addAll(new AnalyticsPlugin().getNamedWriteables()); + return new NamedWriteableRegistry(entries); + } + +} diff --git a/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/ttest/TTestAggregationBuilderTests.java b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/ttest/TTestAggregationBuilderTests.java new file mode 100644 index 0000000000000..69a228c49fe4d --- /dev/null +++ b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/ttest/TTestAggregationBuilderTests.java @@ -0,0 +1,84 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +package org.elasticsearch.xpack.analytics.ttest; + +import org.elasticsearch.common.ParseField; +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.common.xcontent.NamedXContentRegistry; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.script.Script; +import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.BaseAggregationBuilder; +import org.elasticsearch.search.aggregations.support.MultiValuesSourceFieldConfig; +import org.elasticsearch.test.AbstractSerializingTestCase; +import org.junit.Before; + +import java.io.IOException; + +import static java.util.Collections.singletonList; +import static org.hamcrest.Matchers.hasSize; + +public class TTestAggregationBuilderTests extends AbstractSerializingTestCase { + String aggregationName; + + @Before + public void setupName() { + aggregationName = randomAlphaOfLength(10); + } + + @Override + protected NamedXContentRegistry xContentRegistry() { + return new NamedXContentRegistry(singletonList(new NamedXContentRegistry.Entry( + BaseAggregationBuilder.class, + new ParseField(TTestAggregationBuilder.NAME), + (p, n) -> TTestAggregationBuilder.PARSER.apply(p, (String) n)))); + } + + @Override + protected TTestAggregationBuilder doParseInstance(XContentParser parser) throws IOException { + assertSame(XContentParser.Token.START_OBJECT, parser.nextToken()); + AggregatorFactories.Builder parsed = AggregatorFactories.parseAggregators(parser); + assertThat(parsed.getAggregatorFactories(), hasSize(1)); + assertThat(parsed.getPipelineAggregatorFactories(), hasSize(0)); + TTestAggregationBuilder agg = (TTestAggregationBuilder) parsed.getAggregatorFactories().iterator().next(); + assertNull(parser.nextToken()); + assertNotNull(agg); + return agg; + } + + @Override + protected TTestAggregationBuilder createTestInstance() { + MultiValuesSourceFieldConfig aConfig; + if (randomBoolean()) { + aConfig = new 
MultiValuesSourceFieldConfig.Builder().setFieldName("a_field").build(); + } else { + aConfig = new MultiValuesSourceFieldConfig.Builder().setScript(new Script(randomAlphaOfLength(10))).build(); + } + MultiValuesSourceFieldConfig bConfig; + if (randomBoolean()) { + bConfig = new MultiValuesSourceFieldConfig.Builder().setFieldName("b_field").build(); + } else { + bConfig = new MultiValuesSourceFieldConfig.Builder().setScript(new Script(randomAlphaOfLength(10))).build(); + } + TTestAggregationBuilder aggregationBuilder = new TTestAggregationBuilder(aggregationName) + .a(aConfig) + .b(bConfig); + if (randomBoolean()) { + aggregationBuilder.tails(randomIntBetween(1, 2)); + } + if (randomBoolean()) { + aggregationBuilder.testType(randomFrom(TTestType.values())); + } + return aggregationBuilder; + } + + @Override + protected Writeable.Reader instanceReader() { + return TTestAggregationBuilder::new; + } +} + diff --git a/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/ttest/TTestAggregatorTests.java b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/ttest/TTestAggregatorTests.java new file mode 100644 index 0000000000000..30bd5356bc565 --- /dev/null +++ b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/ttest/TTestAggregatorTests.java @@ -0,0 +1,558 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +package org.elasticsearch.xpack.analytics.ttest; + +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.SortedNumericDocValuesField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.store.Directory; +import org.elasticsearch.common.CheckedConsumer; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.fielddata.ScriptDocValues; +import org.elasticsearch.index.mapper.KeywordFieldMapper; +import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.NumberFieldMapper; +import org.elasticsearch.script.MockScriptEngine; +import org.elasticsearch.script.Script; +import org.elasticsearch.script.ScriptEngine; +import org.elasticsearch.script.ScriptModule; +import org.elasticsearch.script.ScriptService; +import org.elasticsearch.script.ScriptType; +import org.elasticsearch.search.aggregations.AggregationBuilder; +import org.elasticsearch.search.aggregations.AggregationExecutionException; +import org.elasticsearch.search.aggregations.AggregatorTestCase; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.bucket.global.GlobalAggregationBuilder; +import org.elasticsearch.search.aggregations.bucket.global.InternalGlobal; +import org.elasticsearch.search.aggregations.bucket.histogram.HistogramAggregationBuilder; +import org.elasticsearch.search.aggregations.bucket.histogram.InternalHistogram; +import org.elasticsearch.search.aggregations.support.AggregationInspectionHelper; +import org.elasticsearch.search.aggregations.support.CoreValuesSourceType; +import org.elasticsearch.search.aggregations.support.MultiValuesSourceFieldConfig; +import 
org.elasticsearch.search.aggregations.support.ValuesSourceType; +import org.elasticsearch.search.lookup.LeafDocLookup; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.function.Consumer; +import java.util.function.Function; + +import static java.util.Arrays.asList; +import static java.util.Collections.singleton; + +public class TTestAggregatorTests extends AggregatorTestCase { + + /** + * Script to return the {@code _value} provided by aggs framework. + */ + public static final String ADD_HALF_SCRIPT = "add_one"; + + @Override + protected AggregationBuilder createAggBuilderForTypeTest(MappedFieldType fieldType, String fieldName) { + return new TTestAggregationBuilder("foo") + .a(new MultiValuesSourceFieldConfig.Builder().setFieldName(fieldName).build()) + .b(new MultiValuesSourceFieldConfig.Builder().setFieldName(fieldName).build()); + } + + @Override + protected List getSupportedValuesSourceTypes() { + return List.of(CoreValuesSourceType.NUMERIC); + } + + @Override + protected ScriptService getMockScriptService() { + Map, Object>> scripts = new HashMap<>(); + + scripts.put(ADD_HALF_SCRIPT, vars -> { + LeafDocLookup leafDocLookup = (LeafDocLookup) vars.get("doc"); + String fieldname = (String) vars.get("fieldname"); + ScriptDocValues scriptDocValues = leafDocLookup.get(fieldname); + double val = ((Number) scriptDocValues.get(0)).doubleValue(); + if (val == 1) { + val += 0.0000001; + } + return val + 0.5; + }); + + MockScriptEngine scriptEngine = new MockScriptEngine(MockScriptEngine.NAME, + scripts, + Collections.emptyMap()); + Map engines = Collections.singletonMap(scriptEngine.getType(), scriptEngine); + + return new ScriptService(Settings.EMPTY, engines, ScriptModule.CORE_CONTEXTS); + } + + public void testNoMatchingField() throws IOException { + testCase(new MatchAllDocsQuery(), randomFrom(TTestType.values()), iw -> { + iw.addDocument(asList(new 
NumericDocValuesField("wrong_a", 102), new NumericDocValuesField("wrong_b", 89))); + iw.addDocument(asList(new NumericDocValuesField("wrong_a", 99), new NumericDocValuesField("wrong_b", 93))); + }, tTest -> assertEquals(Double.NaN, tTest.getValue(), 0)); + } + + public void testNotEnoughRecords() throws IOException { + testCase(new MatchAllDocsQuery(), randomFrom(TTestType.values()), iw -> { + iw.addDocument(asList(new NumericDocValuesField("a", 102), new NumericDocValuesField("b", 89))); + }, tTest -> assertEquals(Double.NaN, tTest.getValue(), 0)); + } + + public void testSameValues() throws IOException { + TTestType tTestType = randomFrom(TTestType.values()); + testCase(new MatchAllDocsQuery(), tTestType, iw -> { + iw.addDocument(asList(new NumericDocValuesField("a", 102), new NumericDocValuesField("b", 102))); + iw.addDocument(asList(new NumericDocValuesField("a", 99), new NumericDocValuesField("b", 99))); + iw.addDocument(asList(new NumericDocValuesField("a", 111), new NumericDocValuesField("b", 111))); + iw.addDocument(asList(new NumericDocValuesField("a", 97), new NumericDocValuesField("b", 97))); + iw.addDocument(asList(new NumericDocValuesField("a", 101), new NumericDocValuesField("b", 101))); + }, tTest -> assertEquals(tTestType == TTestType.PAIRED ? 
Double.NaN : 1, tTest.getValue(), 0)); + } + + public void testMatchesSortedNumericDocValues() throws IOException { + testCase(new MatchAllDocsQuery(), TTestType.PAIRED, iw -> { + iw.addDocument(asList(new SortedNumericDocValuesField("a", 102), new SortedNumericDocValuesField("b", 89))); + iw.addDocument(asList(new SortedNumericDocValuesField("a", 99), new SortedNumericDocValuesField("b", 93))); + iw.addDocument(asList(new SortedNumericDocValuesField("a", 111), new SortedNumericDocValuesField("b", 72))); + iw.addDocument(asList(new SortedNumericDocValuesField("a", 97), new SortedNumericDocValuesField("b", 98))); + iw.addDocument(asList(new SortedNumericDocValuesField("a", 101), new SortedNumericDocValuesField("b", 102))); + iw.addDocument(asList(new SortedNumericDocValuesField("a", 99), new SortedNumericDocValuesField("b", 98))); + }, tTest -> assertEquals(0.09571844217 * 2, tTest.getValue(), 0.000001)); + } + + public void testMultiplePairedValues() { + AggregationExecutionException ex = expectThrows(AggregationExecutionException.class, () -> + testCase(new MatchAllDocsQuery(), TTestType.PAIRED, iw -> { + iw.addDocument(asList(new SortedNumericDocValuesField("a", 102), new SortedNumericDocValuesField("a", 103), + new SortedNumericDocValuesField("b", 89))); + iw.addDocument(asList(new SortedNumericDocValuesField("a", 99), new SortedNumericDocValuesField("b", 93))); + }, tTest -> fail("Should have thrown exception")) + ); + assertEquals( + "Encountered more than one value for a single document. 
Use a script to combine multiple values per doc into a single value.", + ex.getMessage()); + } + + public void testMultipleUnpairedValues() throws IOException { + TTestType tTestType = randomFrom(TTestType.HETEROSCEDASTIC, TTestType.HOMOSCEDASTIC); + testCase(new MatchAllDocsQuery(), tTestType, iw -> { + iw.addDocument(asList(new SortedNumericDocValuesField("a", 102), new SortedNumericDocValuesField("a", 103), + new SortedNumericDocValuesField("b", 89))); + iw.addDocument(asList(new SortedNumericDocValuesField("a", 99), new SortedNumericDocValuesField("b", 93))); + }, tTest -> assertEquals(tTestType == TTestType.HETEROSCEDASTIC ? 0.0607303911 : 0.01718374671, tTest.getValue(), 0.000001)); + } + + public void testMissingValues() throws IOException { + TTestType tTestType = randomFrom(TTestType.values()); + testCase(new MatchAllDocsQuery(), tTestType, iw -> { + iw.addDocument(asList(new SortedNumericDocValuesField("a", 102), new SortedNumericDocValuesField("b", 89))); + iw.addDocument(asList(new SortedNumericDocValuesField("a1", 99), new SortedNumericDocValuesField("b", 93))); + iw.addDocument(asList(new SortedNumericDocValuesField("a", 111), new SortedNumericDocValuesField("b1", 72))); + iw.addDocument(asList(new SortedNumericDocValuesField("a", 97), new SortedNumericDocValuesField("b", 98))); + iw.addDocument(asList(new SortedNumericDocValuesField("a", 101), new SortedNumericDocValuesField("b", 102))); + iw.addDocument(asList(new SortedNumericDocValuesField("a", 99), new SortedNumericDocValuesField("b", 98))); + }, tTest -> { + switch (tTestType) { + case PAIRED: + assertEquals(0.4385093524, tTest.getValue(), 0.000001); + break; + case HOMOSCEDASTIC: + assertEquals(0.1066843841, tTest.getValue(), 0.000001); + break; + case HETEROSCEDASTIC: + assertEquals(0.1068382282, tTest.getValue(), 0.000001); + break; + default: + fail("unknown t-test type " + tTestType); + } + }); + } + + public void testUnmappedWithMissingField() throws IOException { + TTestType tTestType = 
randomFrom(TTestType.values()); + boolean missA = randomBoolean(); + boolean missB = missA == false || randomBoolean(); // at least one of the fields should be missing + MappedFieldType fieldType1 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType1.setName(missA ? "not_a" : "a"); + MappedFieldType fieldType2 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType2.setName(missB ? "not_b" : "b"); + TTestAggregationBuilder aggregationBuilder = new TTestAggregationBuilder("t_test") + .a(new MultiValuesSourceFieldConfig.Builder().setFieldName("a").setMissing(100).build()) + .b(new MultiValuesSourceFieldConfig.Builder().setFieldName("b").setMissing(100).build()) + .testType(tTestType); + testCase(aggregationBuilder, new MatchAllDocsQuery(), iw -> { + iw.addDocument(asList(new NumericDocValuesField("a", 102), new NumericDocValuesField("b", 89))); + iw.addDocument(asList(new NumericDocValuesField("a", 99), new NumericDocValuesField("b", 93))); + }, (Consumer) tTest -> { + if (missA && missB) { + assertEquals(Double.NaN, tTest.getValue(), 0); + } else { + if (missA) { + switch (tTestType) { + case PAIRED: + assertEquals(0.1392089745, tTest.getValue(), 0.000001); + break; + case HOMOSCEDASTIC: + assertEquals(0.04600190799, tTest.getValue(), 0.000001); + break; + case HETEROSCEDASTIC: + assertEquals(0.1392089745, tTest.getValue(), 0.000001); + break; + default: + fail("unknown t-test type " + tTestType); + } + } else { + switch (tTestType) { + case PAIRED: + assertEquals(0.7951672353, tTest.getValue(), 0.000001); + break; + case HOMOSCEDASTIC: + assertEquals(0.7705842661, tTest.getValue(), 0.000001); + break; + case HETEROSCEDASTIC: + assertEquals(0.7951672353, tTest.getValue(), 0.000001); + break; + default: + fail("unknown t-test type " + tTestType); + } + } + } + }, fieldType1, fieldType2); + } + + public void testUnsupportedType() { + TTestType tTestType = randomFrom(TTestType.values()); + boolean 
wrongA = randomBoolean(); + boolean wrongB = wrongA == false || randomBoolean(); // at least one of the fields should have unsupported type + MappedFieldType fieldType1; + if (wrongA) { + fieldType1 = new KeywordFieldMapper.KeywordFieldType(); + fieldType1.setHasDocValues(true); + } else { + fieldType1 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + } + fieldType1.setName("a"); + MappedFieldType fieldType2; + if (wrongB) { + fieldType2 = new KeywordFieldMapper.KeywordFieldType(); + fieldType2.setHasDocValues(true); + } else { + fieldType2 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + } + fieldType2.setName("b"); + TTestAggregationBuilder aggregationBuilder = new TTestAggregationBuilder("t_test") + .a(new MultiValuesSourceFieldConfig.Builder().setFieldName("a").build()) + .b(new MultiValuesSourceFieldConfig.Builder().setFieldName("b").build()) + .testType(tTestType); + + IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> + testCase(aggregationBuilder, new MatchAllDocsQuery(), iw -> { + iw.addDocument(asList(new SortedNumericDocValuesField("a", 102), new SortedNumericDocValuesField("a", 103), + new SortedNumericDocValuesField("b", 89))); + iw.addDocument(asList(new SortedNumericDocValuesField("a", 99), new SortedNumericDocValuesField("b", 93))); + }, tTest -> fail("Should have thrown exception"), fieldType1, fieldType2) + ); + assertEquals( + "Expected numeric type on field [" + (wrongA ? 
"a" : "b") + "], but got [keyword]", + ex.getMessage()); + } + + public void testBadMissingField() { + TTestType tTestType = randomFrom(TTestType.values()); + boolean missA = randomBoolean(); + boolean missB = missA == false || randomBoolean(); // at least one of the fields should be have bad missing + MappedFieldType fieldType1 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType1.setName("a"); + MultiValuesSourceFieldConfig.Builder a = new MultiValuesSourceFieldConfig.Builder().setFieldName("a"); + if (missA) { + a.setMissing("bad_number"); + } + MappedFieldType fieldType2 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType2.setName("b"); + MultiValuesSourceFieldConfig.Builder b = new MultiValuesSourceFieldConfig.Builder().setFieldName("b"); + if (missB) { + b.setMissing("bad_number"); + } + TTestAggregationBuilder aggregationBuilder = new TTestAggregationBuilder("t_test").a(a.build()).b(b.build()).testType(tTestType); + + NumberFormatException ex = expectThrows(NumberFormatException.class, () -> + testCase(aggregationBuilder, new MatchAllDocsQuery(), iw -> { + iw.addDocument(asList(new SortedNumericDocValuesField("a", 102), new SortedNumericDocValuesField("b", 89))); + iw.addDocument(asList(new SortedNumericDocValuesField("a", 99), new SortedNumericDocValuesField("b", 93))); + }, tTest -> fail("Should have thrown exception"), fieldType1, fieldType2) + ); + assertEquals("For input string: \"bad_number\"", ex.getMessage()); + } + + + public void testUnmappedWithBadMissingField() { + TTestType tTestType = randomFrom(TTestType.values()); + boolean missA = randomBoolean(); + boolean missB = missA == false || randomBoolean(); // at least one of the fields should be have bad missing + MappedFieldType fieldType1 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType1.setName("a"); + MultiValuesSourceFieldConfig.Builder a = new 
MultiValuesSourceFieldConfig.Builder(); + if (missA) { + a.setFieldName("not_a").setMissing("bad_number"); + } else { + a.setFieldName("a"); + } + MappedFieldType fieldType2 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + + MultiValuesSourceFieldConfig.Builder b = new MultiValuesSourceFieldConfig.Builder(); + if (missB) { + b.setFieldName("not_b").setMissing("bad_number"); + } else { + b.setFieldName("b"); + } + TTestAggregationBuilder aggregationBuilder = new TTestAggregationBuilder("t_test").a(a.build()).b(b.build()).testType(tTestType); + + NumberFormatException ex = expectThrows(NumberFormatException.class, () -> + testCase(aggregationBuilder, new MatchAllDocsQuery(), iw -> { + iw.addDocument(asList(new SortedNumericDocValuesField("a", 102), new SortedNumericDocValuesField("b", 89))); + iw.addDocument(asList(new SortedNumericDocValuesField("a", 99), new SortedNumericDocValuesField("b", 93))); + }, tTest -> fail("Should have thrown exception"), fieldType1, fieldType2) + ); + assertEquals("For input string: \"bad_number\"", ex.getMessage()); + } + + public void testEmptyBucket() throws IOException { + TTestType tTestType = randomFrom(TTestType.values()); + MappedFieldType fieldType1 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType1.setName("a"); + MappedFieldType fieldType2 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType2.setName("b"); + MappedFieldType fieldTypePart = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldTypePart.setName("part"); + HistogramAggregationBuilder histogram = new HistogramAggregationBuilder("histo").field("part").interval(10).minDocCount(0) + .subAggregation(new TTestAggregationBuilder("t_test") + .a(new MultiValuesSourceFieldConfig.Builder().setFieldName("a").build()) + .b(new MultiValuesSourceFieldConfig.Builder().setFieldName("b").build()) + .testType(tTestType)); + + 
testCase(histogram, new MatchAllDocsQuery(), iw -> { + iw.addDocument(asList(new NumericDocValuesField("a", 102), new NumericDocValuesField("b", 89), + new NumericDocValuesField("part", 1))); + iw.addDocument(asList(new NumericDocValuesField("a", 99), new NumericDocValuesField("b", 93), + new NumericDocValuesField("part", 1))); + iw.addDocument(asList(new NumericDocValuesField("a", 111), new NumericDocValuesField("b", 72), + new NumericDocValuesField("part", 1))); + iw.addDocument(asList(new NumericDocValuesField("a", 97), new NumericDocValuesField("b", 98), + new NumericDocValuesField("part", 21))); + iw.addDocument(asList(new NumericDocValuesField("a", 101), new NumericDocValuesField("b", 102), + new NumericDocValuesField("part", 21))); + iw.addDocument(asList(new NumericDocValuesField("a", 99), new NumericDocValuesField("b", 98), + new NumericDocValuesField("part", 21))); + }, (Consumer) histo -> { + assertEquals(3, histo.getBuckets().size()); + assertNotNull(histo.getBuckets().get(0).getAggregations().asMap().get("t_test")); + InternalTTest tTest = (InternalTTest) histo.getBuckets().get(0).getAggregations().asMap().get("t_test"); + assertEquals(tTestType == TTestType.PAIRED ? 0.1939778614 : + tTestType == TTestType.HOMOSCEDASTIC ? 0.05878871029 : 0.07529006595, tTest.getValue(), 0.000001); + + assertNotNull(histo.getBuckets().get(1).getAggregations().asMap().get("t_test")); + tTest = (InternalTTest) histo.getBuckets().get(1).getAggregations().asMap().get("t_test"); + assertEquals(Double.NaN, tTest.getValue(), 0.000001); + + assertNotNull(histo.getBuckets().get(2).getAggregations().asMap().get("t_test")); + tTest = (InternalTTest) histo.getBuckets().get(2).getAggregations().asMap().get("t_test"); + assertEquals(tTestType == TTestType.PAIRED ? 0.6666666667 : + tTestType == TTestType.HOMOSCEDASTIC ? 
0.8593081179 : 0.8594865044, tTest.getValue(), 0.000001); + + }, fieldType1, fieldType2, fieldTypePart); + } + + @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/54365") + public void testFormatter() throws IOException { + TTestType tTestType = randomFrom(TTestType.values()); + MappedFieldType fieldType1 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType1.setName("a"); + MappedFieldType fieldType2 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType2.setName("b"); + TTestAggregationBuilder aggregationBuilder = new TTestAggregationBuilder("t_test") + .a(new MultiValuesSourceFieldConfig.Builder().setFieldName("a").build()) + .b(new MultiValuesSourceFieldConfig.Builder().setFieldName("b").build()) + .testType(tTestType).format("0.00%"); + + testCase(aggregationBuilder, new MatchAllDocsQuery(), iw -> { + iw.addDocument(asList(new NumericDocValuesField("a", 102), new NumericDocValuesField("b", 89))); + iw.addDocument(asList(new NumericDocValuesField("a", 99), new NumericDocValuesField("b", 93))); + iw.addDocument(asList(new NumericDocValuesField("a", 111), new NumericDocValuesField("b", 72))); + }, (Consumer) tTest -> { + assertEquals(tTestType == TTestType.PAIRED ? 0.1939778614 : + tTestType == TTestType.HOMOSCEDASTIC ? 0.05878871029 : 0.07529006595, tTest.getValue(), 0.000001); + assertEquals(tTestType == TTestType.PAIRED ? "19.40%" : + tTestType == TTestType.HOMOSCEDASTIC ? 
"5.88%" : "7.53%", tTest.getValueAsString()); + }, fieldType1, fieldType2); + } + + public void testGetProperty() throws IOException { + MappedFieldType fieldType1 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType1.setName("a"); + MappedFieldType fieldType2 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType2.setName("b"); + GlobalAggregationBuilder globalBuilder = new GlobalAggregationBuilder("global") + .subAggregation(new TTestAggregationBuilder("t_test") + .a(new MultiValuesSourceFieldConfig.Builder().setFieldName("a").build()) + .b(new MultiValuesSourceFieldConfig.Builder().setFieldName("b").build()) + .testType(TTestType.PAIRED)); + + testCase(globalBuilder, new MatchAllDocsQuery(), iw -> { + iw.addDocument(asList(new NumericDocValuesField("a", 102), new NumericDocValuesField("b", 89))); + iw.addDocument(asList(new NumericDocValuesField("a", 99), new NumericDocValuesField("b", 93))); + iw.addDocument(asList(new NumericDocValuesField("a", 111), new NumericDocValuesField("b", 72))); + }, (Consumer) global -> { + assertEquals(3, global.getDocCount()); + assertTrue(AggregationInspectionHelper.hasValue(global)); + assertNotNull(global.getAggregations().asMap().get("t_test")); + InternalTTest tTest = (InternalTTest) global.getAggregations().asMap().get("t_test"); + assertEquals(tTest, global.getProperty("t_test")); + assertEquals(0.1939778614, (Double) global.getProperty("t_test.value"), 0.000001); + }, fieldType1, fieldType2); + } + + public void testScript() throws IOException { + boolean fieldInA = randomBoolean(); + TTestType tTestType = randomFrom(TTestType.values()); + + MappedFieldType fieldType = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType.setName("field"); + + MultiValuesSourceFieldConfig a = new MultiValuesSourceFieldConfig.Builder().setFieldName("field").build(); + MultiValuesSourceFieldConfig b = new 
MultiValuesSourceFieldConfig.Builder().setScript( + new Script(ScriptType.INLINE, MockScriptEngine.NAME, ADD_HALF_SCRIPT, Collections.singletonMap("fieldname", "field"))).build(); + TTestAggregationBuilder aggregationBuilder = new TTestAggregationBuilder("t_test"). + a(fieldInA ? a : b).b(fieldInA ? b : a).testType(tTestType); + + testCase(aggregationBuilder, new MatchAllDocsQuery(), iw -> { + iw.addDocument(singleton(new NumericDocValuesField("field", 1))); + iw.addDocument(singleton(new NumericDocValuesField("field", 2))); + iw.addDocument(singleton(new NumericDocValuesField("field", 3))); + }, (Consumer) tTest -> { + assertEquals(tTestType == TTestType.PAIRED ? 0 : 0.5733922538, tTest.getValue(), 0.000001); + }, fieldType); + } + + public void testPaired() throws IOException { + MappedFieldType fieldType1 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType1.setName("a"); + MappedFieldType fieldType2 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType2.setName("b"); + TTestAggregationBuilder aggregationBuilder = new TTestAggregationBuilder("t_test") + .a(new MultiValuesSourceFieldConfig.Builder().setFieldName("a").build()) + .b(new MultiValuesSourceFieldConfig.Builder().setFieldName("b").build()) + .testType(TTestType.PAIRED); + int tails = randomIntBetween(1, 2); + if (tails == 1 || randomBoolean()) { + aggregationBuilder.tails(tails); + } + testCase(aggregationBuilder, new MatchAllDocsQuery(), iw -> { + iw.addDocument(asList(new NumericDocValuesField("a", 102), new NumericDocValuesField("b", 89))); + iw.addDocument(asList(new NumericDocValuesField("a", 99), new NumericDocValuesField("b", 93))); + iw.addDocument(asList(new NumericDocValuesField("a", 111), new NumericDocValuesField("b", 72))); + iw.addDocument(asList(new NumericDocValuesField("a", 97), new NumericDocValuesField("b", 98))); + iw.addDocument(asList(new NumericDocValuesField("a", 101), new NumericDocValuesField("b", 
102))); + iw.addDocument(asList(new NumericDocValuesField("a", 99), new NumericDocValuesField("b", 98))); + }, (Consumer) ttest -> { + assertEquals(0.09571844217 * tails, ttest.getValue(), 0.00001); + }, fieldType1, fieldType2); + } + + public void testHomoscedastic() throws IOException { + MappedFieldType fieldType1 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType1.setName("a"); + MappedFieldType fieldType2 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType2.setName("b"); + TTestAggregationBuilder aggregationBuilder = new TTestAggregationBuilder("t_test") + .a(new MultiValuesSourceFieldConfig.Builder().setFieldName("a").build()) + .b(new MultiValuesSourceFieldConfig.Builder().setFieldName("b").build()) + .testType(TTestType.HOMOSCEDASTIC); + int tails = randomIntBetween(1, 2); + if (tails == 1 || randomBoolean()) { + aggregationBuilder.tails(tails); + } + testCase(aggregationBuilder, new MatchAllDocsQuery(), iw -> { + iw.addDocument(asList(new NumericDocValuesField("a", 102), new NumericDocValuesField("b", 89))); + iw.addDocument(asList(new NumericDocValuesField("a", 99), new NumericDocValuesField("b", 93))); + iw.addDocument(asList(new NumericDocValuesField("a", 111), new NumericDocValuesField("b", 72))); + iw.addDocument(asList(new NumericDocValuesField("a", 97), new NumericDocValuesField("b", 98))); + iw.addDocument(asList(new NumericDocValuesField("a", 101), new NumericDocValuesField("b", 102))); + iw.addDocument(asList(new NumericDocValuesField("a", 99), new NumericDocValuesField("b", 98))); + }, (Consumer) ttest -> { + assertEquals(0.03928288693 * tails, ttest.getValue(), 0.00001); + }, fieldType1, fieldType2); + } + + + public void testHeteroscedastic() throws IOException { + MappedFieldType fieldType1 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType1.setName("a"); + MappedFieldType fieldType2 = new 
NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType2.setName("b"); + TTestAggregationBuilder aggregationBuilder = new TTestAggregationBuilder("t_test") + .a(new MultiValuesSourceFieldConfig.Builder().setFieldName("a").build()) + .b(new MultiValuesSourceFieldConfig.Builder().setFieldName("b").build()); + if (randomBoolean()) { + aggregationBuilder.testType(TTestType.HETEROSCEDASTIC); + } + int tails = randomIntBetween(1, 2); + if (tails == 1 || randomBoolean()) { + aggregationBuilder.tails(tails); + } + testCase(aggregationBuilder, new MatchAllDocsQuery(), iw -> { + iw.addDocument(asList(new NumericDocValuesField("a", 102), new NumericDocValuesField("b", 89))); + iw.addDocument(asList(new NumericDocValuesField("a", 99), new NumericDocValuesField("b", 93))); + iw.addDocument(asList(new NumericDocValuesField("a", 111), new NumericDocValuesField("b", 72))); + iw.addDocument(asList(new NumericDocValuesField("a", 97), new NumericDocValuesField("b", 98))); + iw.addDocument(asList(new NumericDocValuesField("a", 101), new NumericDocValuesField("b", 102))); + iw.addDocument(asList(new NumericDocValuesField("a", 99), new NumericDocValuesField("b", 98))); + }, (Consumer) ttest -> { + assertEquals(0.04538666214 * tails, ttest.getValue(), 0.00001); + }, fieldType1, fieldType2); + } + + private void testCase(Query query, TTestType type, + CheckedConsumer buildIndex, + Consumer verify) throws IOException { + MappedFieldType fieldType1 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType1.setName("a"); + MappedFieldType fieldType2 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType2.setName("b"); + + TTestAggregationBuilder aggregationBuilder = new TTestAggregationBuilder("t_test") + .a(new MultiValuesSourceFieldConfig.Builder().setFieldName("a").build()) + .b(new MultiValuesSourceFieldConfig.Builder().setFieldName("b").build()); + if (type != TTestType.HETEROSCEDASTIC || 
randomBoolean()) { + aggregationBuilder.testType(type); + } + testCase(aggregationBuilder, query, buildIndex, verify, fieldType1, fieldType2); + } + + private void testCase( + T aggregationBuilder, Query query, + CheckedConsumer buildIndex, + Consumer verify, MappedFieldType... fieldType) throws IOException { + try (Directory directory = newDirectory()) { + RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory); + buildIndex.accept(indexWriter); + indexWriter.close(); + + try (IndexReader indexReader = DirectoryReader.open(directory)) { + IndexSearcher indexSearcher = newSearcher(indexReader, true, true); + + V agg = searchAndReduce(indexSearcher, query, aggregationBuilder, fieldType); + verify.accept(agg); + + } + } + } +} diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/analytics/action/AnalyticsStatsAction.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/analytics/action/AnalyticsStatsAction.java index 14c7c02999881..0bc12ea025a57 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/analytics/action/AnalyticsStatsAction.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/analytics/action/AnalyticsStatsAction.java @@ -119,14 +119,16 @@ public static class NodeResponse extends BaseNodeResponse implements ToXContentO private final long cumulativeCardinalityUsage; private final long stringStatsUsage; private final long topMetricsUsage; + private final long ttestUsage; public NodeResponse(DiscoveryNode node, long boxplotUsage, long cumulativeCardinalityUsage, long stringStatsUsage, - long topMetricsUsage) { + long topMetricsUsage, long ttestUsage) { super(node); this.boxplotUsage = boxplotUsage; this.cumulativeCardinalityUsage = cumulativeCardinalityUsage; this.stringStatsUsage = stringStatsUsage; this.topMetricsUsage = topMetricsUsage; + this.ttestUsage = ttestUsage; } public NodeResponse(StreamInput in) throws IOException { @@ -144,6 +146,11 @@ public 
NodeResponse(StreamInput in) throws IOException { stringStatsUsage = 0; topMetricsUsage = 0; } + if (in.getVersion().onOrAfter(Version.V_8_0_0)) { // Will drop to 7.8.0 after backport + ttestUsage = in.readVLong(); + } else { + ttestUsage = 0; + } } @Override @@ -157,6 +164,9 @@ public void writeTo(StreamOutput out) throws IOException { out.writeVLong(stringStatsUsage); out.writeVLong(topMetricsUsage); } + if (out.getVersion().onOrAfter(Version.V_8_0_0)) { // Will drop to 7.8.0 after backport + out.writeVLong(ttestUsage); + } } @Override @@ -185,5 +195,9 @@ public long getStringStatsUsage() { public long getTopMetricsUsage() { return topMetricsUsage; } + + public long getTTestUsage() { + return ttestUsage; + } } }