From d37f83fe5522f397e1360179472e25a10a53769c Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Tue, 3 Mar 2020 10:49:14 -0500 Subject: [PATCH 1/2] [ML] specifying nullvalue value and using it instead of empty_string --- .../xpack/ml/dataframe/extractor/DataFrameDataExtractor.java | 4 ++-- .../xpack/ml/dataframe/process/AnalyticsProcessConfig.java | 3 +++ .../customprocessing/DatasetSplittingCustomProcessor.java | 5 ++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractor.java index a2ce8cf60d436..717c955eb9553 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractor.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractor.java @@ -52,7 +52,7 @@ public class DataFrameDataExtractor { private static final Logger LOGGER = LogManager.getLogger(DataFrameDataExtractor.class); private static final TimeValue SCROLL_TIMEOUT = new TimeValue(30, TimeUnit.MINUTES); - private static final String EMPTY_STRING = ""; + public static final String NULL_VALUE = "\\0"; private final Client client; private final DataFrameDataExtractorContext context; @@ -189,7 +189,7 @@ private Row createRow(SearchHit hit) { } else { if (values.length == 0 && context.includeRowsWithMissingValues) { // if values is empty then it means it's a missing value - extractedValues[i] = EMPTY_STRING; + extractedValues[i] = NULL_VALUE; } else { // we are here if we have a missing value but the analysis does not support those // or the value type is not supported (e.g. arrays, etc.) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/AnalyticsProcessConfig.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/AnalyticsProcessConfig.java index 714f63091801f..82f6b824e21c8 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/AnalyticsProcessConfig.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/AnalyticsProcessConfig.java @@ -9,6 +9,7 @@ import org.elasticsearch.common.xcontent.ToXContentObject; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.xpack.core.ml.dataframe.analyses.DataFrameAnalysis; +import org.elasticsearch.xpack.ml.dataframe.extractor.DataFrameDataExtractor; import org.elasticsearch.xpack.ml.extractor.ExtractedField; import org.elasticsearch.xpack.ml.extractor.ExtractedFields; @@ -28,6 +29,7 @@ public class AnalyticsProcessConfig implements ToXContentObject { private static final String ANALYSIS = "analysis"; private static final String RESULTS_FIELD = "results_field"; private static final String CATEGORICAL_FIELDS = "categorical_fields"; + private static final String MISSING_FIELD_VALUE = "missing_field_value"; private final String jobId; private final long rows; @@ -75,6 +77,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws builder.field(RESULTS_FIELD, resultsField); builder.field(CATEGORICAL_FIELDS, categoricalFields); builder.field(ANALYSIS, new DataFrameAnalysisWrapper(analysis, extractedFields)); + builder.field(MISSING_FIELD_VALUE, DataFrameDataExtractor.NULL_VALUE); builder.endObject(); return builder; } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/customprocessing/DatasetSplittingCustomProcessor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/customprocessing/DatasetSplittingCustomProcessor.java index bf6284aa7a5c8..6e6acfb271e3f 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/customprocessing/DatasetSplittingCustomProcessor.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/customprocessing/DatasetSplittingCustomProcessor.java @@ -6,6 +6,7 @@ package org.elasticsearch.xpack.ml.dataframe.process.customprocessing; import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper; +import org.elasticsearch.xpack.ml.dataframe.extractor.DataFrameDataExtractor; import java.util.List; import java.util.Random; @@ -18,8 +19,6 @@ */ class DatasetSplittingCustomProcessor implements CustomProcessor { - private static final String EMPTY = ""; - private final int dependentVariableIndex; private final double trainingPercent; private final Random random; @@ -47,7 +46,7 @@ public void process(String[] row) { // Let's make sure we have at least one training row isFirstRow = false; } else if (isRandomlyExcludedFromTraining()) { - row[dependentVariableIndex] = EMPTY; + row[dependentVariableIndex] = DataFrameDataExtractor.NULL_VALUE; } } } From 74e8b1a6401918bcef23350e9f7b129930449584 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Wed, 4 Mar 2020 11:00:00 -0500 Subject: [PATCH 2/2] fixing tests --- .../xpack/ml/dataframe/extractor/DataFrameDataExtractor.java | 2 +- .../xpack/ml/dataframe/process/AnalyticsProcessConfig.java | 3 --- .../ml/dataframe/extractor/DataFrameDataExtractorTests.java | 3 ++- .../customprocessing/DatasetSplittingCustomProcessorTests.java | 3 ++- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractor.java index 717c955eb9553..6d9b6fb04b839 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractor.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractor.java @@ -52,7 +52,7 @@ public class DataFrameDataExtractor { private static final Logger LOGGER = LogManager.getLogger(DataFrameDataExtractor.class); private static final TimeValue SCROLL_TIMEOUT = new TimeValue(30, TimeUnit.MINUTES); - public static final String NULL_VALUE = "\\0"; + public static final String NULL_VALUE = "\0"; private final Client client; private final DataFrameDataExtractorContext context; diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/AnalyticsProcessConfig.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/AnalyticsProcessConfig.java index 82f6b824e21c8..714f63091801f 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/AnalyticsProcessConfig.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/AnalyticsProcessConfig.java @@ -9,7 +9,6 @@ import org.elasticsearch.common.xcontent.ToXContentObject; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.xpack.core.ml.dataframe.analyses.DataFrameAnalysis; -import org.elasticsearch.xpack.ml.dataframe.extractor.DataFrameDataExtractor; import org.elasticsearch.xpack.ml.extractor.ExtractedField; import org.elasticsearch.xpack.ml.extractor.ExtractedFields; @@ -29,7 +28,6 @@ public class AnalyticsProcessConfig implements ToXContentObject { private static final String ANALYSIS = "analysis"; private static final String RESULTS_FIELD = "results_field"; private static final String CATEGORICAL_FIELDS = "categorical_fields"; - private static final String MISSING_FIELD_VALUE = "missing_field_value"; private final String jobId; private final long rows; @@ -77,7 +75,6 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws builder.field(RESULTS_FIELD, resultsField); builder.field(CATEGORICAL_FIELDS, categoricalFields); builder.field(ANALYSIS, new DataFrameAnalysisWrapper(analysis, extractedFields)); - builder.field(MISSING_FIELD_VALUE, DataFrameDataExtractor.NULL_VALUE); builder.endObject(); return builder; } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractorTests.java index caec7c7bab82e..d54997f1dbdc3 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractorTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractorTests.java @@ -377,7 +377,8 @@ public void testMissingValues_GivenShouldInclude() throws IOException { assertThat(rows.get().size(), equalTo(3)); assertThat(rows.get().get(0).getValues(), equalTo(new String[] {"11", "21"})); - assertThat(rows.get().get(1).getValues(), equalTo(new String[] {"", "22"})); + assertThat(rows.get().get(1).getValues()[0], equalTo(DataFrameDataExtractor.NULL_VALUE)); + assertThat(rows.get().get(1).getValues()[1], equalTo("22")); assertThat(rows.get().get(2).getValues(), equalTo(new String[] {"13", "23"})); assertThat(rows.get().get(0).shouldSkip(), is(false)); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/process/customprocessing/DatasetSplittingCustomProcessorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/process/customprocessing/DatasetSplittingCustomProcessorTests.java index d18adc3dcdb48..ac897413a4eac 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/process/customprocessing/DatasetSplittingCustomProcessorTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/process/customprocessing/DatasetSplittingCustomProcessorTests.java @@ -6,6 +6,7 @@ package org.elasticsearch.xpack.ml.dataframe.process.customprocessing; import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xpack.ml.dataframe.extractor.DataFrameDataExtractor; import org.junit.Before; import java.util.ArrayList; @@ -98,7 +99,7 @@ public void testProcess_GivenRowsWithDependentVariableValue_AndTrainingPercentIs assertThat(processedRow[fieldIndex], equalTo(row[fieldIndex])); } } - if (processedRow[dependentVariableIndex].length() > 0) { + if (DataFrameDataExtractor.NULL_VALUE.equals(processedRow[dependentVariableIndex]) == false) { assertThat(processedRow[dependentVariableIndex], equalTo(row[dependentVariableIndex])); trainingRows++; }