Skip to content

Commit

Permalink
[ML] Lift limit of max number of classes for classification to 100 (e…
Browse files Browse the repository at this point in the history
…lastic#89755)

Limit was previously set to `30`. After the improvements in elastic/ml-cpp#2395
we now raist the limit to `100`.
  • Loading branch information
dimitris-athanasiou authored Sep 1, 2022
1 parent 936cde0 commit b5504ea
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=class-assignment-objective]
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=dependent-variable]
+
The data type of the field must be numeric (`integer`, `short`, `long`, `byte`),
categorical (`ip` or `keyword`), or boolean. There must be no more than 30
categorical (`ip` or `keyword`), or boolean. There must be no more than 100
different values in this field.

`downsample_factor`::::
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ public class Classification implements DataFrameAnalysis {
/**
* The max number of classes classification supports
*/
public static final int MAX_DEPENDENT_VARIABLE_CARDINALITY = 30;
public static final int MAX_DEPENDENT_VARIABLE_CARDINALITY = 100;

@SuppressWarnings("unchecked")
private static ConstructingObjectParser<Classification, Void> createParser(boolean lenient) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ public void testFieldCardinalityLimitsIsNonEmpty() {
assertThat(constraints.size(), equalTo(1));
assertThat(constraints.get(0).getField(), equalTo(classification.getDependentVariable()));
assertThat(constraints.get(0).getLowerBound(), equalTo(2L));
assertThat(constraints.get(0).getUpperBound(), equalTo(30L));
assertThat(constraints.get(0).getUpperBound(), equalTo(100L));
}

public void testGetResultMappings_DependentVariableMappingIsAbsent() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.delete.DeleteResponse;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.index.IndexAction;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.support.WriteRequest;
Expand Down Expand Up @@ -642,17 +641,24 @@ public void testDependentVariableCardinalityTooHighError() throws Exception {

ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, () -> startAnalytics(jobId));
assertThat(e.status().getStatus(), equalTo(400));
assertThat(e.getMessage(), equalTo("Field [keyword-field] must have at most [30] distinct values but there were at least [31]"));
assertThat(e.getMessage(), equalTo("Field [keyword-field] must have at most [100] distinct values but there were at least [101]"));
}

public void testDependentVariableCardinalityTooHighButWithQueryMakesItWithinRange() throws Exception {
initialize("cardinality_too_high_with_query");
indexData(sourceIndex, 6, 5, KEYWORD_FIELD);
// Index one more document with a class different than the two already used.
client().execute(
IndexAction.INSTANCE,
new IndexRequest(sourceIndex).source(KEYWORD_FIELD, "fox").setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE)
).actionGet();

// Index enough documents to have more classes than the allowed limit
BulkRequestBuilder bulkRequestBuilder = client().prepareBulk().setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
for (int i = 0; i < Classification.MAX_DEPENDENT_VARIABLE_CARDINALITY - 1; i++) {
IndexRequest indexRequest = new IndexRequest(sourceIndex).source(KEYWORD_FIELD, "fox-" + i);
bulkRequestBuilder.add(indexRequest);
}
BulkResponse bulkResponse = bulkRequestBuilder.get();
if (bulkResponse.hasFailures()) {
fail("Failed to index data: " + bulkResponse.buildFailureMessage());
}

QueryBuilder query = QueryBuilders.boolQuery().filter(QueryBuilders.termsQuery(KEYWORD_FIELD, KEYWORD_FIELD_VALUES));

DataFrameAnalyticsConfig config = buildAnalytics(jobId, sourceIndex, destIndex, null, new Classification(KEYWORD_FIELD), query);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -409,11 +409,11 @@ public void testDetect_GivenClassificationAndDependentVariableHasInvalidCardinal
buildClassificationConfig("some_keyword"),
100,
fieldCapabilities,
Collections.singletonMap("some_keyword", 31L)
Collections.singletonMap("some_keyword", 101L)
);
ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect);

assertThat(e.getMessage(), equalTo("Field [some_keyword] must have at most [30] distinct values but there were at least [31]"));
assertThat(e.getMessage(), equalTo("Field [some_keyword] must have at most [100] distinct values but there were at least [101]"));
}

public void testDetect_GivenIgnoredField() {
Expand Down

0 comments on commit b5504ea

Please sign in to comment.