[8.11] [ML] Fix data drift calculating inaccurate p value when range is not of uniform distribution (#168757) (#169168)

# Backport

This will backport the following commits from `main` to `8.11`:
- [[ML] Fix data drift calculating inaccurate p value when range is not of uniform distribution (#168757)](#168757)
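
In short (see the diff below), the drifted request now passes explicit `fractions`, derived from the reference data's per-range doc counts, to Elasticsearch's `bucket_count_ks_test` aggregation rather than relying on the assumption of uniformly distributed buckets. Below is a rough, illustrative sketch of the aggregation shape this produces; the field name and all numbers are made up for the example, not taken from the change:

```ts
// Illustrative only: roughly the shape of the aggregations the drifted request ends up with.
// The field name ('response_time') and every number here are hypothetical.
const exampleDriftedAggs = {
  response_time_ranges: {
    range: {
      field: 'response_time',
      // Ranges derived from the reference data's percentiles
      ranges: [{ to: 100 }, { from: 100, to: 250 }, { from: 250 }],
    },
  },
  response_time_ks_test: {
    bucket_count_ks_test: {
      buckets_path: 'response_time_ranges>_count',
      alternative: ['two_sided'],
      // Fractions computed from the reference data's doc counts per range,
      // e.g. counts [10, 30, 10] => fractions [0.2, 0.6, 0.2]
      fractions: [0.2, 0.6, 0.2],
    },
  },
};
```

If none of the ranges contain any documents (no overlap at all between reference and comparison data), the ks test aggregation is skipped and the p value is pinned to an effectively-zero sentinel instead.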

<!--- Backport version: 8.9.7 -->

### Questions?
Please refer to the [Backport tool documentation](https://github.com/sqren/backport)

<!--BACKPORT [{"author":{"name":"Quynh Nguyen
(Quinn)","email":"[email protected]"},"sourceCommit":{"committedDate":"2023-10-17T19:34:12Z","message":"[ML]
Fix data drift calculating inaccurate p value when range is not of
uniform distribution
(#168757)","sha":"6d06dc3d2d2fd9440ce474c9f8fdfc45b720fc59","branchLabelMapping":{"^v8.12.0$":"main","^v(\\d+).(\\d+).\\d+$":"$1.$2"}},"sourcePullRequest":{"labels":["bug",":ml","release_note:skip","ci:cloud-deploy","v8.11.0","v8.12.0"],"number":168757,"url":"https://github.com/elastic/kibana/pull/168757","mergeCommit":{"message":"[ML]
Fix data drift calculating inaccurate p value when range is not of
uniform distribution
(#168757)","sha":"6d06dc3d2d2fd9440ce474c9f8fdfc45b720fc59"}},"sourceBranch":"main","suggestedTargetBranches":["8.11"],"targetPullRequestStates":[{"branch":"8.11","label":"v8.11.0","labelRegex":"^v(\\d+).(\\d+).\\d+$","isSourceBranch":false,"state":"NOT_CREATED"},{"branch":"main","label":"v8.12.0","labelRegex":"^v8.12.0$","isSourceBranch":true,"state":"MERGED","url":"https://github.com/elastic/kibana/pull/168757","number":168757,"mergeCommit":{"message":"[ML]
Fix data drift calculating inaccurate p value when range is not of
uniform distribution
(#168757)","sha":"6d06dc3d2d2fd9440ce474c9f8fdfc45b720fc59"}}]}]
BACKPORT-->

Co-authored-by: Quynh Nguyen (Quinn) <[email protected]>
kibanamachine and qn895 authored Oct 17, 2023
1 parent ef73f85 commit 59a7ede
Showing 3 changed files with 72 additions and 11 deletions.
@@ -359,7 +359,6 @@ export const DataDriftPage: FC<Props> = ({ initialSettings }) => {
label={comparisonIndexPatternLabel}
randomSampler={randomSamplerProd}
reload={forceRefresh}
brushSelectionUpdateHandler={brushSelectionUpdate}
documentCountStats={documentStatsProd.documentCountStats}
documentCountStatsSplit={documentStatsProd.documentCountStatsCompare}
isBrushCleared={isBrushCleared}
@@ -32,7 +32,7 @@ export interface DocumentCountContentProps
| 'interval'
| 'chartPointsSplitLabel'
> {
brushSelectionUpdateHandler: BrushSelectionUpdateHandler;
brushSelectionUpdateHandler?: BrushSelectionUpdateHandler;
documentCountStats?: DocumentCountStats;
documentCountStatsSplit?: DocumentCountStats;
documentCountStatsSplitLabel?: string;
@@ -29,6 +29,7 @@ import { isDefined } from '@kbn/ml-is-defined';
import { computeChi2PValue, type Histogram } from '@kbn/ml-chi2test';
import { mapAndFlattenFilters } from '@kbn/data-plugin/public';

import type { AggregationsRangeBucketKeys } from '@elastic/elasticsearch/lib/api/types';
import { createMergedEsQuery } from '../index_data_visualizer/utils/saved_search_utils';
import { useDataVisualizerKibana } from '../kibana_context';

Expand Down Expand Up @@ -378,6 +379,7 @@ const fetchComparisonDriftedData = async ({
fields,
baselineResponseAggs,
baseRequest,
baselineRequest,
randomSamplerWrapper,
signal,
}: {
@@ -387,10 +389,19 @@
randomSamplerWrapper: RandomSamplerWrapper;
signal: AbortSignal;
baselineResponseAggs: object;
baselineRequest: EsRequestParams;
}) => {
const driftedRequest = { ...baseRequest };

const driftedRequestAggs: Record<string, estypes.AggregationsAggregationContainer> = {};

// Since the aggregation is not able to split the values into distinct 5% intervals,
// this breaks our assumption of uniformly distributed fractions in the `ks_test`.
// To fix this in the general case, we run an additional ranges aggregation to get the doc count
// for the ranges derived from the percentiles aggregation,
// and use those counts in the `bucket_count_ks_test`.
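// For example (hypothetical numbers, purely illustrative): if the reference doc counts across
// the percentile-derived ranges were [10, 30, 10], the fractions supplied to the ks test
// would be [0.2, 0.6, 0.2].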
const rangesRequestAggs: Record<string, estypes.AggregationsAggregationContainer> = {};

for (const { field, type } of fields) {
if (
isPopulatedObject(baselineResponseAggs, [`${field}_percentiles`]) &&
@@ -410,19 +421,16 @@
ranges.push({ from: percentiles[idx - 1], to: val });
}
});
// add range and bucket_count_ks_test to the request
driftedRequestAggs[`${field}_ranges`] = {
const rangeAggs = {
range: {
field,
ranges,
},
};
driftedRequestAggs[`${field}_ks_test`] = {
bucket_count_ks_test: {
buckets_path: `${field}_ranges>_count`,
alternative: ['two_sided'],
},
};
// add range and bucket_count_ks_test to the request
rangesRequestAggs[`${field}_ranges`] = rangeAggs;
driftedRequestAggs[`${field}_ranges`] = rangeAggs;
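// Note: the same range aggregation is used in two requests: in `rangesRequestAggs` against the
// reference (baseline) data to derive the expected fractions, and in `driftedRequestAggs` so the
// `bucket_count_ks_test` added below can compare the comparison set's per-range doc counts against them.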

// add stats aggregation to the request
driftedRequestAggs[`${field}_stats`] = {
stats: {
@@ -441,13 +449,66 @@
}
}

// Compute fractions based on results of ranges
const rangesResp = await dataSearch(
{
...baselineRequest,
body: { ...baselineRequest.body, aggs: randomSamplerWrapper.wrap(rangesRequestAggs) },
},
signal
);

const fieldsWithNoOverlap = new Set<string>();
for (const { field } of fields) {
if (rangesResp.aggregations[`${field}_ranges`]) {
const buckets = rangesResp.aggregations[`${field}_ranges`]
.buckets as AggregationsRangeBucketKeys[];

if (buckets) {
const totalSumOfAllBuckets = buckets.reduce((acc, bucket) => acc + bucket.doc_count, 0);

const fractions = buckets.map((bucket) => ({
...bucket,
fraction: bucket.doc_count / totalSumOfAllBuckets,
}));

if (totalSumOfAllBuckets > 0) {
driftedRequestAggs[`${field}_ks_test`] = {
bucket_count_ks_test: {
buckets_path: `${field}_ranges>_count`,
alternative: ['two_sided'],
...(totalSumOfAllBuckets > 0
? { fractions: fractions.map((bucket) => Number(bucket.fraction.toFixed(3))) }
: {}),
},
};
} else {
// If all doc_counts are 0, that means there's no overlap whatsoever,
// in which case we don't need to make the ks test agg, because it defaults to an astronomically small value
fieldsWithNoOverlap.add(field);
}
}
}
}

const driftedResp = await dataSearch(
{
...driftedRequest,
body: { ...driftedRequest.body, aggs: randomSamplerWrapper.wrap(driftedRequestAggs) },
},
signal
);

fieldsWithNoOverlap.forEach((field) => {
if (driftedResp.aggregations) {
driftedResp.aggregations[`${field}_ks_test`] = {
// Setting -Infinity to represent an astronomically small number,
// which would be shown as < 0.000001 in the table
two_sided: -Infinity,
};
}
});

return driftedResp;
};

@@ -678,7 +739,7 @@ export const useFetchDataComparisonResult = (

setResult({ data: undefined, status: FETCH_STATUS.LOADING, error: undefined });

// Place holder for when there might be difference data views in the future
// Placeholder for when there might be different data views in the future
const referenceIndex = initialSettings
? initialSettings.reference
: currentDataView?.getIndexPattern();
@@ -802,6 +863,7 @@ export const useFetchDataComparisonResult = (
fetchComparisonDriftedData({
dataSearch,
baseRequest: driftedRequest,
baselineRequest,
baselineResponseAggs,
fields: chunkedFields,
randomSamplerWrapper: prodRandomSamplerWrapper,
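
One side note on the `-Infinity` sentinel above: per the diff's comments it stands in for an astronomically small p value that the results table shows as `< 0.000001`. Purely as a sketch (the function name and threshold constant are hypothetical, not the actual Kibana UI code), such a sentinel could be folded into display formatting like this:

```ts
// Illustrative only: mapping a -Infinity sentinel (and any very small p value)
// to the '< 0.000001' display mentioned in the diff comments.
// The names and threshold here are hypothetical, not taken from the Kibana codebase.
const P_VALUE_DISPLAY_THRESHOLD = 0.000001;

function formatPValue(pValue: number): string {
  if (pValue === -Infinity || pValue < P_VALUE_DISPLAY_THRESHOLD) {
    return '< 0.000001';
  }
  return pValue.toFixed(6);
}

// formatPValue(-Infinity) => '< 0.000001'
// formatPValue(0.042)     => '0.042000'
```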
