-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
HLRC support for string_stats (#52163) (#52297)
This adds a builder and parsed results for the `string_stats` aggregation directly to the high level rest client. Without this, the HLRC can only access the `string_stats` API by depending on the Elastic-licensed `analytics` module. While I'm in there, this also adds a few of our usual unit tests and modernizes the parsing.
- Loading branch information
Showing
17 changed files
with
727 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
172 changes: 172 additions & 0 deletions
172
...t/rest-high-level/src/main/java/org/elasticsearch/client/analytics/ParsedStringStats.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.elasticsearch.client.analytics; | ||
|
||
import org.elasticsearch.common.ParseField; | ||
import org.elasticsearch.common.xcontent.ConstructingObjectParser; | ||
import org.elasticsearch.common.xcontent.XContentBuilder; | ||
import org.elasticsearch.common.xcontent.XContentParser; | ||
import org.elasticsearch.search.aggregations.ParsedAggregation; | ||
|
||
import java.io.IOException; | ||
import java.util.HashMap; | ||
import java.util.Map; | ||
|
||
import static java.util.Collections.unmodifiableMap; | ||
import static org.elasticsearch.common.xcontent.ConstructingObjectParser.constructorArg; | ||
import static org.elasticsearch.common.xcontent.ConstructingObjectParser.optionalConstructorArg; | ||
|
||
/** | ||
* Results from the {@code string_stats} aggregation. | ||
*/ | ||
public class ParsedStringStats extends ParsedAggregation { | ||
private static final ParseField COUNT_FIELD = new ParseField("count"); | ||
private static final ParseField MIN_LENGTH_FIELD = new ParseField("min_length"); | ||
private static final ParseField MAX_LENGTH_FIELD = new ParseField("max_length"); | ||
private static final ParseField AVG_LENGTH_FIELD = new ParseField("avg_length"); | ||
private static final ParseField ENTROPY_FIELD = new ParseField("entropy"); | ||
private static final ParseField DISTRIBUTION_FIELD = new ParseField("distribution"); | ||
|
||
private final long count; | ||
private final int minLength; | ||
private final int maxLength; | ||
private final double avgLength; | ||
private final double entropy; | ||
private final boolean showDistribution; | ||
private final Map<String, Double> distribution; | ||
|
||
private ParsedStringStats(String name, long count, int minLength, int maxLength, double avgLength, double entropy, | ||
boolean showDistribution, Map<String, Double> distribution) { | ||
setName(name); | ||
this.count = count; | ||
this.minLength = minLength; | ||
this.maxLength = maxLength; | ||
this.avgLength = avgLength; | ||
this.entropy = entropy; | ||
this.showDistribution = showDistribution; | ||
this.distribution = distribution; | ||
} | ||
|
||
/** | ||
* The number of non-empty fields counted. | ||
*/ | ||
public long getCount() { | ||
return count; | ||
} | ||
|
||
/** | ||
* The length of the shortest term. | ||
*/ | ||
public int getMinLength() { | ||
return minLength; | ||
} | ||
|
||
/** | ||
* The length of the longest term. | ||
*/ | ||
public int getMaxLength() { | ||
return maxLength; | ||
} | ||
|
||
/** | ||
* The average length computed over all terms. | ||
*/ | ||
public double getAvgLength() { | ||
return avgLength; | ||
} | ||
|
||
/** | ||
* The <a href="https://en.wikipedia.org/wiki/Entropy_(information_theory)">Shannon Entropy</a> | ||
* value computed over all terms collected by the aggregation. | ||
* Shannon entropy quantifies the amount of information contained in | ||
* the field. It is a very useful metric for measuring a wide range of | ||
* properties of a data set, such as diversity, similarity, | ||
* randomness etc. | ||
*/ | ||
public double getEntropy() { | ||
return entropy; | ||
} | ||
|
||
/** | ||
* The probability distribution for all characters. {@code null} unless | ||
* explicitly requested with {@link StringStatsAggregationBuilder#showDistribution(boolean)}. | ||
*/ | ||
public Map<String, Double> getDistribution() { | ||
return distribution; | ||
} | ||
|
||
@Override | ||
public String getType() { | ||
return StringStatsAggregationBuilder.NAME; | ||
} | ||
|
||
private static final Object NULL_DISTRIBUTION_MARKER = new Object(); | ||
public static final ConstructingObjectParser<ParsedStringStats, String> PARSER = new ConstructingObjectParser<>( | ||
StringStatsAggregationBuilder.NAME, true, (args, name) -> { | ||
long count = (long) args[0]; | ||
boolean disributionWasExplicitNull = args[5] == NULL_DISTRIBUTION_MARKER; | ||
if (count == 0) { | ||
return new ParsedStringStats(name, count, 0, 0, 0, 0, disributionWasExplicitNull, null); | ||
} | ||
int minLength = (int) args[1]; | ||
int maxLength = (int) args[2]; | ||
double averageLength = (double) args[3]; | ||
double entropy = (double) args[4]; | ||
if (disributionWasExplicitNull) { | ||
return new ParsedStringStats(name, count, minLength, maxLength, averageLength, entropy, | ||
disributionWasExplicitNull, null); | ||
} else { | ||
@SuppressWarnings("unchecked") | ||
Map<String, Double> distribution = (Map<String, Double>) args[5]; | ||
return new ParsedStringStats(name, count, minLength, maxLength, averageLength, entropy, | ||
distribution != null, distribution); | ||
} | ||
}); | ||
static { | ||
PARSER.declareLong(constructorArg(), COUNT_FIELD); | ||
PARSER.declareIntOrNull(constructorArg(), 0, MIN_LENGTH_FIELD); | ||
PARSER.declareIntOrNull(constructorArg(), 0, MAX_LENGTH_FIELD); | ||
PARSER.declareDoubleOrNull(constructorArg(), 0, AVG_LENGTH_FIELD); | ||
PARSER.declareDoubleOrNull(constructorArg(), 0, ENTROPY_FIELD); | ||
PARSER.declareObjectOrNull(optionalConstructorArg(), (p, c) -> unmodifiableMap(p.map(HashMap::new, XContentParser::doubleValue)), | ||
NULL_DISTRIBUTION_MARKER, DISTRIBUTION_FIELD); | ||
ParsedAggregation.declareAggregationFields(PARSER); | ||
} | ||
|
||
@Override | ||
protected XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { | ||
builder.field(COUNT_FIELD.getPreferredName(), count); | ||
if (count == 0) { | ||
builder.nullField(MIN_LENGTH_FIELD.getPreferredName()); | ||
builder.nullField(MAX_LENGTH_FIELD.getPreferredName()); | ||
builder.nullField(AVG_LENGTH_FIELD.getPreferredName()); | ||
builder.field(ENTROPY_FIELD.getPreferredName(), 0.0); | ||
} else { | ||
builder.field(MIN_LENGTH_FIELD.getPreferredName(), minLength); | ||
builder.field(MAX_LENGTH_FIELD.getPreferredName(), maxLength); | ||
builder.field(AVG_LENGTH_FIELD.getPreferredName(), avgLength); | ||
builder.field(ENTROPY_FIELD.getPreferredName(), entropy); | ||
} | ||
if (showDistribution) { | ||
builder.field(DISTRIBUTION_FIELD.getPreferredName(), distribution); | ||
} | ||
return builder; | ||
} | ||
} |
116 changes: 116 additions & 0 deletions
116
...level/src/main/java/org/elasticsearch/client/analytics/StringStatsAggregationBuilder.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.elasticsearch.client.analytics; | ||
|
||
import org.elasticsearch.common.ParseField; | ||
import org.elasticsearch.common.io.stream.StreamOutput; | ||
import org.elasticsearch.common.io.stream.Writeable; | ||
import org.elasticsearch.common.xcontent.XContentBuilder; | ||
import org.elasticsearch.index.query.QueryRewriteContext; | ||
import org.elasticsearch.index.query.QueryShardContext; | ||
import org.elasticsearch.search.aggregations.AbstractAggregationBuilder; | ||
import org.elasticsearch.search.aggregations.AggregationBuilder; | ||
import org.elasticsearch.search.aggregations.AggregatorFactories.Builder; | ||
import org.elasticsearch.search.aggregations.AggregatorFactory; | ||
import org.elasticsearch.search.aggregations.support.CoreValuesSourceType; | ||
import org.elasticsearch.search.aggregations.support.ValueType; | ||
import org.elasticsearch.search.aggregations.support.ValuesSource; | ||
import org.elasticsearch.search.aggregations.support.ValuesSource.Bytes; | ||
import org.elasticsearch.search.aggregations.support.ValuesSourceAggregationBuilder; | ||
import org.elasticsearch.search.aggregations.support.ValuesSourceAggregatorFactory; | ||
import org.elasticsearch.search.aggregations.support.ValuesSourceConfig; | ||
import org.elasticsearch.search.builder.SearchSourceBuilder; | ||
|
||
import java.io.IOException; | ||
import java.util.Map; | ||
import java.util.Objects; | ||
|
||
/** | ||
* Builds the {@code string_stats} aggregation request. | ||
* <p> | ||
* NOTE: This extends {@linkplain AbstractAggregationBuilder} for compatibility | ||
* with {@link SearchSourceBuilder#aggregation(AggregationBuilder)} but it | ||
* doesn't support any "server" side things like | ||
* {@linkplain Writeable#writeTo(StreamOutput)}, | ||
* {@linkplain AggregationBuilder#rewrite(QueryRewriteContext)}, or | ||
* {@linkplain AbstractAggregationBuilder#build(QueryShardContext, AggregatorFactory)}. | ||
*/ | ||
public class StringStatsAggregationBuilder extends ValuesSourceAggregationBuilder<ValuesSource.Bytes, StringStatsAggregationBuilder> { | ||
public static final String NAME = "string_stats"; | ||
private static final ParseField SHOW_DISTRIBUTION_FIELD = new ParseField("show_distribution"); | ||
|
||
private boolean showDistribution = false; | ||
|
||
public StringStatsAggregationBuilder(String name) { | ||
super(name, CoreValuesSourceType.BYTES, ValueType.STRING); | ||
} | ||
|
||
/** | ||
* Compute the distribution of each character. Disabled by default. | ||
* @return this for chaining | ||
*/ | ||
public StringStatsAggregationBuilder showDistribution(boolean showDistribution) { | ||
this.showDistribution = showDistribution; | ||
return this; | ||
} | ||
|
||
@Override | ||
public String getType() { | ||
return NAME; | ||
} | ||
|
||
@Override | ||
public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { | ||
return builder.field(StringStatsAggregationBuilder.SHOW_DISTRIBUTION_FIELD.getPreferredName(), showDistribution); | ||
} | ||
|
||
@Override | ||
protected void innerWriteTo(StreamOutput out) throws IOException { | ||
throw new UnsupportedOperationException(); | ||
} | ||
|
||
@Override | ||
protected ValuesSourceAggregatorFactory<Bytes> innerBuild(QueryShardContext queryShardContext, ValuesSourceConfig<Bytes> config, | ||
AggregatorFactory parent, Builder subFactoriesBuilder) throws IOException { | ||
throw new UnsupportedOperationException(); | ||
} | ||
|
||
@Override | ||
protected AggregationBuilder shallowCopy(Builder factoriesBuilder, Map<String, Object> metaData) { | ||
throw new UnsupportedOperationException(); | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
return Objects.hash(super.hashCode(), showDistribution); | ||
} | ||
|
||
@Override | ||
public boolean equals(Object obj) { | ||
if (obj == null || getClass() != obj.getClass()) { | ||
return false; | ||
} | ||
if (false == super.equals(obj)) { | ||
return false; | ||
} | ||
StringStatsAggregationBuilder other = (StringStatsAggregationBuilder) obj; | ||
return showDistribution == other.showDistribution; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
58 changes: 58 additions & 0 deletions
58
client/rest-high-level/src/test/java/org/elasticsearch/client/analytics/AnalyticsAggsIT.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.elasticsearch.client.analytics; | ||
|
||
import org.elasticsearch.action.bulk.BulkRequest; | ||
import org.elasticsearch.action.index.IndexRequest; | ||
import org.elasticsearch.action.search.SearchRequest; | ||
import org.elasticsearch.action.search.SearchResponse; | ||
import org.elasticsearch.action.support.WriteRequest.RefreshPolicy; | ||
import org.elasticsearch.client.ESRestHighLevelClientTestCase; | ||
import org.elasticsearch.client.RequestOptions; | ||
import org.elasticsearch.common.xcontent.XContentType; | ||
|
||
import java.io.IOException; | ||
|
||
import static org.hamcrest.Matchers.aMapWithSize; | ||
import static org.hamcrest.Matchers.closeTo; | ||
import static org.hamcrest.Matchers.equalTo; | ||
import static org.hamcrest.Matchers.hasEntry; | ||
|
||
public class AnalyticsAggsIT extends ESRestHighLevelClientTestCase { | ||
public void testBasic() throws IOException { | ||
BulkRequest bulk = new BulkRequest("test").setRefreshPolicy(RefreshPolicy.IMMEDIATE); | ||
bulk.add(new IndexRequest().source(XContentType.JSON, "message", "trying out elasticsearch")); | ||
bulk.add(new IndexRequest().source(XContentType.JSON, "message", "more words")); | ||
highLevelClient().bulk(bulk, RequestOptions.DEFAULT); | ||
SearchRequest search = new SearchRequest("test"); | ||
search.source().aggregation(new StringStatsAggregationBuilder("test").field("message.keyword").showDistribution(true)); | ||
SearchResponse response = highLevelClient().search(search, RequestOptions.DEFAULT); | ||
ParsedStringStats stats = response.getAggregations().get("test"); | ||
assertThat(stats.getCount(), equalTo(2L)); | ||
assertThat(stats.getMinLength(), equalTo(10)); | ||
assertThat(stats.getMaxLength(), equalTo(24)); | ||
assertThat(stats.getAvgLength(), equalTo(17.0)); | ||
assertThat(stats.getEntropy(), closeTo(4, .1)); | ||
assertThat(stats.getDistribution(), aMapWithSize(18)); | ||
assertThat(stats.getDistribution(), hasEntry(equalTo("o"), closeTo(.09, .005))); | ||
assertThat(stats.getDistribution(), hasEntry(equalTo("r"), closeTo(.12, .005))); | ||
assertThat(stats.getDistribution(), hasEntry(equalTo("t"), closeTo(.09, .005))); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.