Skip to content

Commit

Permalink
[ML] specifying missing_field_value value and using it instead of emp…
Browse files Browse the repository at this point in the history
…ty_string (elastic#53108)

For analytics, we need a consistent way of indicating when a value is missing. Following the behavior inherited from anomaly detection, analytics sent `""` when a field was missing. This works fine for numeric fields, but the underlying analytics process actually treats `""` as a distinct category for categorical fields.

Consequently, you end up with this situation in the resulting model:
```
{
              "frequency_encoding" : {
                "field" : "RainToday",
                "feature_name" : "RainToday_frequency",
                "frequency_map" : {
                  "" : 0.009844409027270245,
                  "No" : 0.6472019970785184,
                  "Yes" : 0.6472019970785184
                }
              }
            }
```
For inference this is a problem, because inference treats missing values as `null` and thus does not include them in the infer call against the model.

This PR takes advantage of our new `missing_field_value` option and supplies `\0` as the value.
  • Loading branch information
benwtrent committed Mar 5, 2020
1 parent 360ac19 commit 6c943a3
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ public class DataFrameDataExtractor {
private static final Logger LOGGER = LogManager.getLogger(DataFrameDataExtractor.class);
private static final TimeValue SCROLL_TIMEOUT = new TimeValue(30, TimeUnit.MINUTES);

private static final String EMPTY_STRING = "";
public static final String NULL_VALUE = "\0";

private final Client client;
private final DataFrameDataExtractorContext context;
Expand Down Expand Up @@ -189,7 +189,7 @@ private Row createRow(SearchHit hit) {
} else {
if (values.length == 0 && context.includeRowsWithMissingValues) {
// if values is empty then it means it's a missing value
extractedValues[i] = EMPTY_STRING;
extractedValues[i] = NULL_VALUE;
} else {
// we are here if we have a missing value but the analysis does not support those
// or the value type is not supported (e.g. arrays, etc.)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
package org.elasticsearch.xpack.ml.dataframe.process.customprocessing;

import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
import org.elasticsearch.xpack.ml.dataframe.extractor.DataFrameDataExtractor;

import java.util.List;
import java.util.Random;
Expand All @@ -18,8 +19,6 @@
*/
class DatasetSplittingCustomProcessor implements CustomProcessor {

private static final String EMPTY = "";

private final int dependentVariableIndex;
private final double trainingPercent;
private final Random random;
Expand Down Expand Up @@ -47,7 +46,7 @@ public void process(String[] row) {
// Let's make sure we have at least one training row
isFirstRow = false;
} else if (isRandomlyExcludedFromTraining()) {
row[dependentVariableIndex] = EMPTY;
row[dependentVariableIndex] = DataFrameDataExtractor.NULL_VALUE;
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,8 @@ public void testMissingValues_GivenShouldInclude() throws IOException {
assertThat(rows.get().size(), equalTo(3));

assertThat(rows.get().get(0).getValues(), equalTo(new String[] {"11", "21"}));
assertThat(rows.get().get(1).getValues(), equalTo(new String[] {"", "22"}));
assertThat(rows.get().get(1).getValues()[0], equalTo(DataFrameDataExtractor.NULL_VALUE));
assertThat(rows.get().get(1).getValues()[1], equalTo("22"));
assertThat(rows.get().get(2).getValues(), equalTo(new String[] {"13", "23"}));

assertThat(rows.get().get(0).shouldSkip(), is(false));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
package org.elasticsearch.xpack.ml.dataframe.process.customprocessing;

import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.xpack.ml.dataframe.extractor.DataFrameDataExtractor;
import org.junit.Before;

import java.util.ArrayList;
Expand Down Expand Up @@ -98,7 +99,7 @@ public void testProcess_GivenRowsWithDependentVariableValue_AndTrainingPercentIs
assertThat(processedRow[fieldIndex], equalTo(row[fieldIndex]));
}
}
if (processedRow[dependentVariableIndex].length() > 0) {
if (DataFrameDataExtractor.NULL_VALUE.equals(processedRow[dependentVariableIndex]) == false) {
assertThat(processedRow[dependentVariableIndex], equalTo(row[dependentVariableIndex]));
trainingRows++;
}
Expand Down

0 comments on commit 6c943a3

Please sign in to comment.