From d1af703546fb344179d88505d46653b2fe02d850 Mon Sep 17 00:00:00 2001 From: Dima Arnautov Date: Fri, 17 Jan 2020 17:22:27 +0100 Subject: [PATCH 1/3] [ML] update data visualizer endpoint to check doc counts --- .../models/data_visualizer/data_visualizer.js | 64 ++++++++++++++++--- 1 file changed, 54 insertions(+), 10 deletions(-) diff --git a/x-pack/legacy/plugins/ml/server/models/data_visualizer/data_visualizer.js b/x-pack/legacy/plugins/ml/server/models/data_visualizer/data_visualizer.js index 7c2e3eaf07bcc..9e743aaf20298 100644 --- a/x-pack/legacy/plugins/ml/server/models/data_visualizer/data_visualizer.js +++ b/x-pack/legacy/plugins/ml/server/models/data_visualizer/data_visualizer.js @@ -261,7 +261,11 @@ export class DataVisualizer { aggregatableFields.forEach((field, i) => { const safeFieldName = getSafeAggregationName(field, i); aggs[`${safeFieldName}_count`] = { - value_count: { field }, + filter: { + exists: { + field: safeFieldName, + }, + }, }; aggs[`${safeFieldName}_cardinality`] = { cardinality: { field }, @@ -296,7 +300,7 @@ export class DataVisualizer { samplerShardSize > 0 ? _.get(aggregations, ['sample', 'doc_count'], 0) : totalCount; aggregatableFields.forEach((field, i) => { const safeFieldName = getSafeAggregationName(field, i); - const count = _.get(aggregations, [...aggsPath, `${safeFieldName}_count`, 'value'], 0); + const count = _.get(aggregations, [...aggsPath, `${safeFieldName}_count`, 'doc_count'], 0); if (count > 0) { const cardinality = _.get( aggregations, @@ -433,7 +437,16 @@ export class DataVisualizer { fields.forEach((field, i) => { const safeFieldName = getSafeAggregationName(field.fieldName, i); aggs[`${safeFieldName}_field_stats`] = { - stats: { field: field.fieldName }, + filter: { + exists: { + field: safeFieldName, + }, + }, + aggs: { + actual_stats: { + stats: { field: field.fieldName }, + }, + }, }; aggs[`${safeFieldName}_percentiles`] = { percentiles: { @@ -484,10 +497,19 @@ export class DataVisualizer { const batchStats = []; fields.forEach((field, i) => { const safeFieldName = getSafeAggregationName(field.fieldName, i); - const fieldStatsResp = _.get(aggregations, [...aggsPath, `${safeFieldName}_field_stats`], {}); + const docCount = _.get( + aggregations, + [...aggsPath, `${safeFieldName}_field_stats`, 'doc_count'], + 0 + ); + const fieldStatsResp = _.get( + aggregations, + [...aggsPath, `${safeFieldName}_field_stats`, 'actual_stats'], + {} + ); const stats = { fieldName: field.fieldName, - count: _.get(fieldStatsResp, 'count', 0), + count: docCount, min: _.get(fieldStatsResp, 'min', 0), max: _.get(fieldStatsResp, 'max', 0), avg: _.get(fieldStatsResp, 'avg', 0), @@ -632,7 +654,16 @@ export class DataVisualizer { fields.forEach((field, i) => { const safeFieldName = getSafeAggregationName(field.fieldName, i); aggs[`${safeFieldName}_field_stats`] = { - stats: { field: field.fieldName }, + filter: { + exists: { + field: safeFieldName, + }, + }, + aggs: { + actual_stats: { + stats: { field: field.fieldName }, + }, + }, }; }); @@ -651,10 +682,19 @@ export class DataVisualizer { const batchStats = []; fields.forEach((field, i) => { const safeFieldName = getSafeAggregationName(field.fieldName, i); - const fieldStatsResp = _.get(aggregations, [...aggsPath, `${safeFieldName}_field_stats`], {}); + const docCount = _.get( + aggregations, + [...aggsPath, `${safeFieldName}_field_stats`, 'doc_count'], + 0 + ); + const fieldStatsResp = _.get( + aggregations, + [...aggsPath, `${safeFieldName}_field_stats`, 'actual_stats'], + {} + ); batchStats.push({ fieldName: field.fieldName, - count: _.get(fieldStatsResp, 'count', 0), + count: docCount, earliest: _.get(fieldStatsResp, 'min', 0), latest: _.get(fieldStatsResp, 'max', 0), }); @@ -680,7 +720,11 @@ export class DataVisualizer { fields.forEach((field, i) => { const safeFieldName = getSafeAggregationName(field.fieldName, i); aggs[`${safeFieldName}_value_count`] = { - value_count: { field: field.fieldName }, + filter: { + exists: { + field: safeFieldName, + }, + }, }; aggs[`${safeFieldName}_values`] = { terms: { @@ -707,7 +751,7 @@ export class DataVisualizer { const safeFieldName = getSafeAggregationName(field.fieldName, i); const stats = { fieldName: field.fieldName, - count: _.get(aggregations, [...aggsPath, `${safeFieldName}_value_count`, 'value'], 0), + count: _.get(aggregations, [...aggsPath, `${safeFieldName}_value_count`, 'doc_count'], 0), trueCount: 0, falseCount: 0, }; From b81d6a106d2efd6d2695a170281ba938e4299359 Mon Sep 17 00:00:00 2001 From: Dima Arnautov Date: Tue, 21 Jan 2020 18:48:58 +0100 Subject: [PATCH 2/3] [ML] fix mock for cardinality tests --- .../__tests__/mock_farequote_cardinality.json | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/x-pack/legacy/plugins/ml/server/models/job_validation/__tests__/mock_farequote_cardinality.json b/x-pack/legacy/plugins/ml/server/models/job_validation/__tests__/mock_farequote_cardinality.json index 9dcfc11575abb..8d408ff0310c9 100644 --- a/x-pack/legacy/plugins/ml/server/models/job_validation/__tests__/mock_farequote_cardinality.json +++ b/x-pack/legacy/plugins/ml/server/models/job_validation/__tests__/mock_farequote_cardinality.json @@ -1 +1,7 @@ -{"took":0,"timed_out":false,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},"hits":{"total":86274,"max_score":0,"hits":[]},"aggregations":{"airline_cardinality":{"value":19},"airline_count":{"value":86274}}} +{ + "took": 0, + "timed_out": false, + "_shards": { "total": 1, "successful": 1, "skipped": 0, "failed": 0 }, + "hits": { "total": 86274, "max_score": 0, "hits": [] }, + "aggregations": { "airline_cardinality": { "value": 19 }, "airline_count": { "doc_count": 86274 } } +} From 24f4c05028b7ff956aa32b87e4d189aeaa4c7bfc Mon Sep 17 00:00:00 2001 From: Dima Arnautov Date: Tue, 21 Jan 2020 23:55:16 +0100 Subject: [PATCH 3/3] [ML] use actual field name for agg filtering instead of safeFieldName --- .../models/data_visualizer/data_visualizer.js | 24 ++++--------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/x-pack/legacy/plugins/ml/server/models/data_visualizer/data_visualizer.js b/x-pack/legacy/plugins/ml/server/models/data_visualizer/data_visualizer.js index 9e743aaf20298..f4ee032ee2dbb 100644 --- a/x-pack/legacy/plugins/ml/server/models/data_visualizer/data_visualizer.js +++ b/x-pack/legacy/plugins/ml/server/models/data_visualizer/data_visualizer.js @@ -261,11 +261,7 @@ export class DataVisualizer { aggregatableFields.forEach((field, i) => { const safeFieldName = getSafeAggregationName(field, i); aggs[`${safeFieldName}_count`] = { - filter: { - exists: { - field: safeFieldName, - }, - }, + filter: { exists: { field } }, }; aggs[`${safeFieldName}_cardinality`] = { cardinality: { field }, @@ -437,11 +433,7 @@ export class DataVisualizer { fields.forEach((field, i) => { const safeFieldName = getSafeAggregationName(field.fieldName, i); aggs[`${safeFieldName}_field_stats`] = { - filter: { - exists: { - field: safeFieldName, - }, - }, + filter: { exists: { field: field.fieldName } }, aggs: { actual_stats: { stats: { field: field.fieldName }, @@ -654,11 +646,7 @@ export class DataVisualizer { fields.forEach((field, i) => { const safeFieldName = getSafeAggregationName(field.fieldName, i); aggs[`${safeFieldName}_field_stats`] = { - filter: { - exists: { - field: safeFieldName, - }, - }, + filter: { exists: { field: field.fieldName } }, aggs: { actual_stats: { stats: { field: field.fieldName }, @@ -720,11 +708,7 @@ export class DataVisualizer { fields.forEach((field, i) => { const safeFieldName = getSafeAggregationName(field.fieldName, i); aggs[`${safeFieldName}_value_count`] = { - filter: { - exists: { - field: safeFieldName, - }, - }, + filter: { exists: { field: field.fieldName } }, }; aggs[`${safeFieldName}_values`] = { terms: {