[Analysis] Support normalizer in request param (elastic#24767)
* [Analysis] Support normalizer in request param

Support normalizer param
Support custom normalizer with char_filter/filter param

Closes elastic#23347
johtani authored Jul 4, 2017
1 parent 5200665 commit 6894ef6
Showing 13 changed files with 342 additions and 19 deletions.
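
For orientation, here is a hedged usage sketch of what this commit enables (not part of the diff; the index name "my_index" and normalizer name "my_normalizer" are illustrative and assume the normalizer is declared in the index settings):

    // A normalizer is essentially a keyword-tokenized analyzer, so the
    // response carries the whole input back as a single normalized token.
    AnalyzeRequest request = new AnalyzeRequest("my_index")   // index is required for a named normalizer
        .normalizer("my_normalizer")
        .text("Foo BAR");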
@@ -18,6 +18,7 @@
*/
package org.elasticsearch.action.admin.indices.analyze;

import org.elasticsearch.Version;
import org.elasticsearch.action.ActionRequestValidationException;
import org.elasticsearch.action.support.single.shard.SingleShardRequest;
import org.elasticsearch.common.Strings;
@@ -59,6 +60,8 @@ public class AnalyzeRequest extends SingleShardRequest<AnalyzeRequest> {

private String[] attributes = Strings.EMPTY_ARRAY;

private String normalizer;

public static class NameOrDefinition implements Writeable {
// exactly one of these two members is not null
public final String name;
@@ -202,12 +205,27 @@ public String[] attributes() {
return this.attributes;
}

public String normalizer() {
return this.normalizer;
}

public AnalyzeRequest normalizer(String normalizer) {
this.normalizer = normalizer;
return this;
}

@Override
public ActionRequestValidationException validate() {
ActionRequestValidationException validationException = null;
if (text == null || text.length == 0) {
validationException = addValidationError("text is missing", validationException);
}
if ((index == null || index.length() == 0) && normalizer != null) {
validationException = addValidationError("index is required if normalizer is specified", validationException);
}
if (normalizer != null && (tokenizer != null || analyzer != null)) {
validationException = addValidationError("tokenizer/analyze should be null if normalizer is specified", validationException);
}
return validationException;
}
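
For illustration, a minimal sketch exercising the two new checks (the no-arg and single-index constructors are used as in the tests further down; names are illustrative):

    // A named normalizer lives in index settings, so the index is mandatory.
    AnalyzeRequest missingIndex = new AnalyzeRequest()
        .normalizer("my_normalizer")
        .text("Foo");
    assert missingIndex.validate() != null;   // "index is required if normalizer is specified"

    // normalizer is mutually exclusive with analyzer and tokenizer.
    AnalyzeRequest conflicting = new AnalyzeRequest("my_index")
        .normalizer("my_normalizer")
        .text("Foo");
    conflicting.analyzer("standard");
    assert conflicting.validate() != null;    // "tokenizer/analyzer should be null if normalizer is specified"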

@@ -222,6 +240,9 @@ public void readFrom(StreamInput in) throws IOException {
field = in.readOptionalString();
explain = in.readBoolean();
attributes = in.readStringArray();
if (in.getVersion().onOrAfter(Version.V_6_0_0_alpha3)) {
normalizer = in.readOptionalString();
}
}

@Override
@@ -235,5 +256,8 @@ public void writeTo(StreamOutput out) throws IOException {
out.writeOptionalString(field);
out.writeBoolean(explain);
out.writeStringArray(attributes);
if (out.getVersion().onOrAfter(Version.V_6_0_0_alpha3)) {
out.writeOptionalString(normalizer);
}
}
}
@@ -125,4 +125,13 @@ public AnalyzeRequestBuilder setText(String... texts) {
request.text(texts);
return this;
}

/**
* Sets the normalizer by name, instead of an analyzer or tokenizer
*/
public AnalyzeRequestBuilder setNormalizer(String normalizer) {
request.normalizer(normalizer);
return this;
}

}
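
A hedged sketch of the new builder method in use (assumes a connected transport client named "client"; index and normalizer names are illustrative):

    AnalyzeResponse response = client.admin().indices()
        .prepareAnalyze("my_index", "Foo BAR")   // prepareAnalyze(index, text)
        .setNormalizer("my_normalizer")
        .get();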
@@ -51,6 +51,7 @@
import org.elasticsearch.index.analysis.CustomAnalyzer;
import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
@@ -60,6 +61,7 @@
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.indices.IndicesService;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.indices.analysis.PreBuiltTokenizers;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.TransportService;

@@ -178,21 +180,46 @@ public static AnalyzeResponse analyze(AnalyzeRequest request, String field, Anal
throw new IllegalArgumentException("failed to find analyzer [" + request.analyzer() + "]");
}
}

} else if (request.tokenizer() != null) {
final IndexSettings indexSettings = indexAnalyzers == null ? null : indexAnalyzers.getIndexSettings();
Tuple<String, TokenizerFactory> tokenizerFactory = parseTokenizerFactory(request, indexAnalyzers,
analysisRegistry, environment);

-            List<CharFilterFactory> charFilterFactoryList = parseCharFilterFactories(request, indexSettings, analysisRegistry, environment);
+            List<CharFilterFactory> charFilterFactoryList =
+                parseCharFilterFactories(request, indexSettings, analysisRegistry, environment, false);

List<TokenFilterFactory> tokenFilterFactoryList = parseTokenFilterFactories(request, indexSettings, analysisRegistry,
-                environment, tokenizerFactory, charFilterFactoryList);
+                environment, tokenizerFactory, charFilterFactoryList, false);

analyzer = new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(),
charFilterFactoryList.toArray(new CharFilterFactory[charFilterFactoryList.size()]),
tokenFilterFactoryList.toArray(new TokenFilterFactory[tokenFilterFactoryList.size()]));
closeAnalyzer = true;
} else if (request.normalizer() != null) {
// Get normalizer from indexAnalyzers
analyzer = indexAnalyzers.getNormalizer(request.normalizer());
if (analyzer == null) {
throw new IllegalArgumentException("failed to find normalizer under [" + request.normalizer() + "]");
}
} else if (((request.tokenFilters() != null && request.tokenFilters().size() > 0)
|| (request.charFilters() != null && request.charFilters().size() > 0))) {
final IndexSettings indexSettings = indexAnalyzers == null ? null : indexAnalyzers.getIndexSettings();
// custom normalizer: no normalizer/tokenizer/analyzer was specified, but filter or char_filter was
// read the char_filter and filter definitions from the request
List<CharFilterFactory> charFilterFactoryList =
parseCharFilterFactories(request, indexSettings, analysisRegistry, environment, true);

final String keywordTokenizerName = "keyword";
TokenizerFactory keywordTokenizerFactory = getTokenizerFactory(analysisRegistry, environment, keywordTokenizerName);

List<TokenFilterFactory> tokenFilterFactoryList =
parseTokenFilterFactories(request, indexSettings, analysisRegistry, environment, new Tuple<>(keywordTokenizerName, keywordTokenizerFactory), charFilterFactoryList, true);

analyzer = new CustomAnalyzer("keyword_for_normalizer",
keywordTokenizerFactory,
charFilterFactoryList.toArray(new CharFilterFactory[charFilterFactoryList.size()]),
tokenFilterFactoryList.toArray(new TokenFilterFactory[tokenFilterFactoryList.size()]));
closeAnalyzer = true;
} else if (analyzer == null) {
if (indexAnalyzers == null) {
analyzer = analysisRegistry.getAnalyzer("standard");
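
To summarize the dispatch above: a named normalizer is looked up via indexAnalyzers.getNormalizer(); if the request names no analyzer, tokenizer, or normalizer but does carry filter/char_filter entries, a throwaway "keyword_for_normalizer" analyzer is assembled around the built-in keyword tokenizer. A hedged request sketch for that second path (names illustrative):

    // No analyzer/tokenizer/normalizer: the filters are wrapped around the
    // implicit "keyword" tokenizer and applied as an ad-hoc normalizer.
    AnalyzeRequest request = new AnalyzeRequest("my_index").text("ABc");
    request.addTokenFilter("lowercase");   // must be multi-term aware, see the parse* checks below
    // expected result: a single token, "abc"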
@@ -465,7 +492,7 @@ private static Map<String, Object> extractExtendedAttributes(TokenStream stream,
}

private static List<CharFilterFactory> parseCharFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry,
-        Environment environment) throws IOException {
+        Environment environment, boolean normalizer) throws IOException {
List<CharFilterFactory> charFilterFactoryList = new ArrayList<>();
if (request.charFilters() != null && request.charFilters().size() > 0) {
List<AnalyzeRequest.NameOrDefinition> charFilters = request.charFilters();
Expand Down Expand Up @@ -506,6 +533,13 @@ private static List<CharFilterFactory> parseCharFilterFactories(AnalyzeRequest r
if (charFilterFactory == null) {
throw new IllegalArgumentException("failed to find char filter under [" + charFilter.name + "]");
}
if (normalizer) {
if (charFilterFactory instanceof MultiTermAwareComponent == false) {
throw new IllegalArgumentException("Custom normalizer may not use char filter ["
+ charFilterFactory.name() + "]");
}
charFilterFactory = (CharFilterFactory) ((MultiTermAwareComponent) charFilterFactory).getMultiTermComponent();
}
charFilterFactoryList.add(charFilterFactory);
}
}
@@ -514,7 +548,7 @@ private static List<CharFilterFactory> parseCharFilterFactories(AnalyzeRequest r

private static List<TokenFilterFactory> parseTokenFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry,
Environment environment, Tuple<String, TokenizerFactory> tokenizerFactory,
-        List<CharFilterFactory> charFilterFactoryList) throws IOException {
+        List<CharFilterFactory> charFilterFactoryList, boolean normalizer) throws IOException {
List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
if (request.tokenFilters() != null && request.tokenFilters().size() > 0) {
List<AnalyzeRequest.NameOrDefinition> tokenFilters = request.tokenFilters();
@@ -561,6 +595,13 @@ private static List<TokenFilterFactory> parseTokenFilterFactories(AnalyzeRequest
if (tokenFilterFactory == null) {
throw new IllegalArgumentException("failed to find or create token filter under [" + tokenFilter.name + "]");
}
if (normalizer) {
if (tokenFilterFactory instanceof MultiTermAwareComponent == false) {
throw new IllegalArgumentException("Custom normalizer may not use filter ["
+ tokenFilterFactory.name() + "]");
}
tokenFilterFactory = (TokenFilterFactory) ((MultiTermAwareComponent) tokenFilterFactory).getMultiTermComponent();
}
tokenFilterFactoryList.add(tokenFilterFactory);
}
}
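
The guard above (and its char_filter twin) enforces the same constraint that index-settings normalizers obey: every component must be safe to apply to single terms. A hedged sketch of the failure mode (assuming, as I understand it, that a stemmer such as "porter_stem" is not a MultiTermAwareComponent):

    AnalyzeRequest request = new AnalyzeRequest("my_index").text("Running");
    request.addTokenFilter("porter_stem");   // not multi-term aware
    // expected: IllegalArgumentException("Custom normalizer may not use filter [porter_stem]")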
@@ -590,12 +631,8 @@ private static Tuple<String, TokenizerFactory> parseTokenizerFactory(AnalyzeRequ
} else {
AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory;
if (indexAnalzyers == null) {
-                tokenizerFactoryFactory = analysisRegistry.getTokenizerProvider(tokenizer.name);
-                if (tokenizerFactoryFactory == null) {
-                    throw new IllegalArgumentException("failed to find global tokenizer under [" + tokenizer.name + "]");
-                }
+                tokenizerFactory = getTokenizerFactory(analysisRegistry, environment, tokenizer.name);
name = tokenizer.name;
-                tokenizerFactory = tokenizerFactoryFactory.get(environment, tokenizer.name);
} else {
tokenizerFactoryFactory = analysisRegistry.getTokenizerProvider(tokenizer.name, indexAnalzyers.getIndexSettings());
if (tokenizerFactoryFactory == null) {
@@ -610,6 +647,17 @@ private static Tuple<String, TokenizerFactory> parseTokenizerFactory(AnalyzeRequ
return new Tuple<>(name, tokenizerFactory);
}

private static TokenizerFactory getTokenizerFactory(AnalysisRegistry analysisRegistry, Environment environment, String name) throws IOException {
AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory;
TokenizerFactory tokenizerFactory;
tokenizerFactoryFactory = analysisRegistry.getTokenizerProvider(name);
if (tokenizerFactoryFactory == null) {
throw new IllegalArgumentException("failed to find global tokenizer under [" + name + "]");
}
tokenizerFactory = tokenizerFactoryFactory.get(environment, name);
return tokenizerFactory;
}

private static IndexSettings getNaIndexSettings(Settings settings) {
IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
return new IndexSettings(metaData, Settings.EMPTY);
@@ -46,6 +46,7 @@ public static class Fields {
public static final ParseField CHAR_FILTERS = new ParseField("char_filter");
public static final ParseField EXPLAIN = new ParseField("explain");
public static final ParseField ATTRIBUTES = new ParseField("attributes");
public static final ParseField NORMALIZER = new ParseField("normalizer");
}

public RestAnalyzeAction(Settings settings, RestController controller) {
@@ -147,6 +148,12 @@ static void buildFromContent(XContentParser parser, AnalyzeRequest analyzeReques
attributes.add(parser.text());
}
analyzeRequest.attributes(attributes.toArray(new String[attributes.size()]));
} else if (Fields.NORMALIZER.match(currentFieldName)) {
if (token == XContentParser.Token.VALUE_STRING) {
analyzeRequest.normalizer(parser.text());
} else {
throw new IllegalArgumentException(currentFieldName + " should be normalizer's name");
}
} else {
throw new IllegalArgumentException("Unknown parameter ["
+ currentFieldName + "] in request body or parameter is of the wrong type[" + token + "] ");
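
For reference, the body shape this new branch parses, sketched with XContentBuilder (a hedged equivalent of the JSON {"text": "...", "normalizer": "..."}; names illustrative):

    XContentBuilder body = XContentFactory.jsonBuilder()
        .startObject()
            .field("text", "Foo BAR")
            .field("normalizer", "my_normalizer")   // must be a string value, else IllegalArgumentException
        .endObject();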
@@ -72,7 +72,9 @@ public void setUp() throws Exception {
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
.put("index.analysis.analyzer.custom_analyzer.filter", "mock").build();
.put("index.analysis.analyzer.custom_analyzer.filter", "mock")
.put("index.analysis.normalizer.my_normalizer.type", "custom")
.putArray("index.analysis.normalizer.my_normalizer.filter", "lowercase").build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
environment = new Environment(settings);
AnalysisPlugin plugin = new AnalysisPlugin() {
@@ -304,6 +306,14 @@ public void testUnknown() throws IOException {
} else {
assertEquals(e.getMessage(), "failed to find global char filter under [foobar]");
}

e = expectThrows(IllegalArgumentException.class,
() -> TransportAnalyzeAction.analyze(
new AnalyzeRequest()
.normalizer("foobar")
.text("the qu1ck brown fox"),
AllFieldMapper.NAME, null, indexAnalyzers, registry, environment));
assertEquals(e.getMessage(), "failed to find normalizer under [foobar]");
}

public void testNonPreBuildTokenFilter() throws IOException {
@@ -317,6 +327,16 @@ public void testNonPreBuildTokenFilter() throws IOException {
int default_bucket_size = 512;
int default_hash_set_size = 1;
assertEquals(default_hash_count * default_bucket_size * default_hash_set_size, tokens.size());
}

public void testNormalizerWithIndex() throws IOException {
AnalyzeRequest request = new AnalyzeRequest("index");
request.normalizer("my_normalizer");
request.text("ABc");
AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
List<AnalyzeResponse.AnalyzeToken> tokens = analyze.getTokens();

assertEquals(1, tokens.size());
assertEquals("abc", tokens.get(0).getTerm());
}
}