diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java
index 1c2808c70ffcf..fd0fde76e6883 100644
--- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java
+++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java
@@ -5,14 +5,8 @@
  */
 package org.elasticsearch.xpack.core.ml.job.config;
 
-import org.apache.lucene.analysis.Analyzer;
-import org.elasticsearch.Version;
-import org.elasticsearch.action.admin.indices.analyze.TransportAnalyzeAction;
-import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.Strings;
-import org.elasticsearch.common.UUIDs;
-import org.elasticsearch.common.collect.Tuple;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.common.io.stream.Writeable;
@@ -22,15 +16,6 @@
 import org.elasticsearch.common.xcontent.XContentFactory;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.common.xcontent.XContentType;
-import org.elasticsearch.env.Environment;
-import org.elasticsearch.index.IndexSettings;
-import org.elasticsearch.index.analysis.AnalysisRegistry;
-import org.elasticsearch.index.analysis.CharFilterFactory;
-import org.elasticsearch.index.analysis.CustomAnalyzer;
-import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
-import org.elasticsearch.index.analysis.TokenFilterFactory;
-import org.elasticsearch.index.analysis.TokenizerFactory;
-import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.rest.action.admin.indices.RestAnalyzeAction;
 import org.elasticsearch.xpack.core.ml.MlParserType;
 
@@ -42,12 +27,11 @@
 import java.util.Map;
 import java.util.Objects;
 
-
 /**
  * Configuration for the categorization analyzer.
  *
  * The syntax is a subset of what can be supplied to the {@linkplain RestAnalyzeAction _analyze endpoint}.
- * To summarise, the first option is to specify the name of an out-of-the-box analyzer:
+ * To summarize, the first option is to specify the name of an out-of-the-box analyzer:
  *
  * "categorization_analyzer" : "standard"
  *
@@ -66,11 +50,6 @@
  *     { "type" : "pattern_replace", "pattern": "^[0-9].*" }
  *   ]
  *
- *
- * Unfortunately there is no easy way to reuse a subset of the _analyze action implementation, so much
- * of the code in this file is copied from {@link TransportAnalyzeAction}. Unfortunately the logic required here is
- * not quite identical to that of {@link TransportAnalyzeAction}, and the required code is hard to partially reuse.
- * TODO: consider refactoring ES core to allow more reuse.
  */
 public class CategorizationAnalyzerConfig implements ToXContentFragment, Writeable {
 
@@ -350,175 +329,6 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
         return builder;
     }
 
-    /**
-     * Convert the config to an {@link Analyzer}. This may be a global analyzer or a newly created custom analyzer.
-     * In the case of a global analyzer the caller must NOT close it when they have finished with it. In the case of
-     * a newly created custom analyzer the caller is responsible for closing it.
-     * @return The first tuple member is the {@link Analyzer}; the second indicates whether the caller is responsible
-     *         for closing it.
-     */
-    public Tuple<Analyzer, Boolean> toAnalyzer(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
-        if (analyzer != null) {
-            Analyzer globalAnalyzer = analysisRegistry.getAnalyzer(analyzer);
-            if (globalAnalyzer == null) {
-                throw new IllegalArgumentException("Failed to find global analyzer [" + analyzer + "]");
-            }
-            return new Tuple<>(globalAnalyzer, Boolean.FALSE);
-        } else {
-            List<CharFilterFactory> charFilterFactoryList =
-                parseCharFilterFactories(analysisRegistry, environment);
-
-            Tuple<String, TokenizerFactory> tokenizerFactory = parseTokenizerFactory(analysisRegistry,
-                environment);
-
-            List<TokenFilterFactory> tokenFilterFactoryList = parseTokenFilterFactories(analysisRegistry,
-                environment, tokenizerFactory, charFilterFactoryList);
-
-            return new Tuple<>(new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(),
-                charFilterFactoryList.toArray(new CharFilterFactory[charFilterFactoryList.size()]),
-                tokenFilterFactoryList.toArray(new TokenFilterFactory[tokenFilterFactoryList.size()])), Boolean.TRUE);
-        }
-    }
-
-
-    /**
-     * Get char filter factories for each configured char filter. Each configuration
-     * element can be the name of an out-of-the-box char filter, or a custom definition.
-     */
-    private List<CharFilterFactory> parseCharFilterFactories(AnalysisRegistry analysisRegistry,
-                                                             Environment environment) throws IOException {
-        final List<CharFilterFactory> charFilterFactoryList = new ArrayList<>();
-        for (NameOrDefinition charFilter : charFilters) {
-            final CharFilterFactory charFilterFactory;
-            if (charFilter.name != null) {
-                AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
-                    analysisRegistry.getCharFilterProvider(charFilter.name);
-                if (charFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global char filter under [" + charFilter.name + "]");
-                }
-                charFilterFactory = charFilterFactoryFactory.get(environment, charFilter.name);
-            } else {
-                String charFilterTypeName = charFilter.definition.get("type");
-                if (charFilterTypeName == null) {
-                    throw new IllegalArgumentException("Missing [type] setting for char filter: " + charFilter.definition);
-                }
-                AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
-                    analysisRegistry.getCharFilterProvider(charFilterTypeName);
-                if (charFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global char filter under [" + charFilterTypeName + "]");
-                }
-                Settings settings = augmentSettings(charFilter.definition);
-                // Need to set anonymous "name" of char_filter
-                charFilterFactory = charFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment,
-                    "_anonymous_charfilter", settings);
-            }
-            if (charFilterFactory == null) {
-                throw new IllegalArgumentException("Failed to find char filter [" + charFilter + "]");
-            }
-            charFilterFactoryList.add(charFilterFactory);
-        }
-        return charFilterFactoryList;
-    }
-
-    /**
-     * Get the tokenizer factory for the configured tokenizer. The configuration
-     * can be the name of an out-of-the-box tokenizer, or a custom definition.
-     */
-    private Tuple<String, TokenizerFactory> parseTokenizerFactory(AnalysisRegistry analysisRegistry,
                                                                  Environment environment) throws IOException {
-        final String name;
-        final TokenizerFactory tokenizerFactory;
-        if (tokenizer.name != null) {
-            name = tokenizer.name;
-            AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory = analysisRegistry.getTokenizerProvider(name);
-            if (tokenizerFactoryFactory == null) {
-                throw new IllegalArgumentException("Failed to find global tokenizer under [" + name + "]");
-            }
-            tokenizerFactory = tokenizerFactoryFactory.get(environment, name);
-        } else {
-            String tokenizerTypeName = tokenizer.definition.get("type");
-            if (tokenizerTypeName == null) {
-                throw new IllegalArgumentException("Missing [type] setting for tokenizer: " + tokenizer.definition);
-            }
-            AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory =
-                analysisRegistry.getTokenizerProvider(tokenizerTypeName);
-            if (tokenizerFactoryFactory == null) {
-                throw new IllegalArgumentException("Failed to find global tokenizer under [" + tokenizerTypeName + "]");
-            }
-            Settings settings = augmentSettings(tokenizer.definition);
-            // Need to set anonymous "name" of tokenizer
-            name = "_anonymous_tokenizer";
-            tokenizerFactory = tokenizerFactoryFactory.get(buildDummyIndexSettings(settings), environment, name, settings);
-        }
-        return new Tuple<>(name, tokenizerFactory);
-    }
-
-    /**
-     * Get token filter factories for each configured token filter. Each configuration
-     * element can be the name of an out-of-the-box token filter, or a custom definition.
-     */
-    private List<TokenFilterFactory> parseTokenFilterFactories(AnalysisRegistry analysisRegistry, Environment environment,
-                                                               Tuple<String, TokenizerFactory> tokenizerFactory,
-                                                               List<CharFilterFactory> charFilterFactoryList) throws IOException {
-        final List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
-        for (NameOrDefinition tokenFilter : tokenFilters) {
-            TokenFilterFactory tokenFilterFactory;
-            if (tokenFilter.name != null) {
-                AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory;
-                tokenFilterFactoryFactory = analysisRegistry.getTokenFilterProvider(tokenFilter.name);
-                if (tokenFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global token filter under [" + tokenFilter.name + "]");
-                }
-                tokenFilterFactory = tokenFilterFactoryFactory.get(environment, tokenFilter.name);
-            } else {
-                String filterTypeName = tokenFilter.definition.get("type");
-                if (filterTypeName == null) {
-                    throw new IllegalArgumentException("Missing [type] setting for token filter: " + tokenFilter.definition);
-                }
-                AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory =
-                    analysisRegistry.getTokenFilterProvider(filterTypeName);
-                if (tokenFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global token filter under [" + filterTypeName + "]");
-                }
-                Settings settings = augmentSettings(tokenFilter.definition);
-                // Need to set anonymous "name" of token_filter
-                tokenFilterFactory = tokenFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment,
-                    "_anonymous_tokenfilter", settings);
-                tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(),
-                    tokenizerFactory.v2(), tokenFilterFactoryList, charFilterFactoryList, environment);
-            }
-            if (tokenFilterFactory == null) {
-                throw new IllegalArgumentException("Failed to find or create token filter [" + tokenFilter + "]");
-            }
-            tokenFilterFactoryList.add(tokenFilterFactory);
-        }
-        return tokenFilterFactoryList;
-    }
-
-    /**
-     * The Elasticsearch analysis functionality is designed to work with indices. For
-     * categorization we have to pretend we've got some index settings.
-     */
-    private IndexSettings buildDummyIndexSettings(Settings settings) {
-        IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
-        return new IndexSettings(metaData, Settings.EMPTY);
-    }
-
-    /**
-     * The behaviour of Elasticsearch analyzers can vary between versions.
-     * For categorization we'll always use the latest version of the text analysis.
-     * The other settings are just to stop classes that expect to be associated with
-     * an index from complaining.
-     */
-    private Settings augmentSettings(Settings settings) {
-        return Settings.builder().put(settings)
-            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
-            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
-            .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
-            .put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
-            .build();
-    }
-
     @Override
     public boolean equals(Object o) {
         if (this == o) return true;
@@ -609,17 +419,5 @@ public CategorizationAnalyzerConfig build() {
             }
             return new CategorizationAnalyzerConfig(analyzer, charFilters, tokenizer, tokenFilters);
         }
-
-        /**
-         * Verify that the builder will build a valid config. This is not done as part of the basic build
-         * because it verifies that the names of analyzers/tokenizers/filters referenced by the config are
-         * known, and the validity of these names could change over time.
-         */
-        public void verify(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
-            Tuple<Analyzer, Boolean> tuple = build().toAnalyzer(analysisRegistry, environment);
-            if (tuple.v2()) {
-                tuple.v1().close();
-            }
-        }
     }
 }
diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/Job.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/Job.java
index f3c68542fa4f4..227b8612839e8 100644
--- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/Job.java
+++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/Job.java
@@ -21,8 +21,6 @@
 import org.elasticsearch.common.xcontent.ToXContentObject;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentParser.Token;
-import org.elasticsearch.env.Environment;
-import org.elasticsearch.index.analysis.AnalysisRegistry;
 import org.elasticsearch.xpack.core.ml.MlParserType;
 import org.elasticsearch.xpack.core.ml.job.messages.Messages;
 import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndexFields;
@@ -777,8 +775,8 @@ public Builder setAnalysisConfig(AnalysisConfig.Builder configBuilder) {
             return this;
         }
 
-        public AnalysisLimits getAnalysisLimits() {
-            return analysisLimits;
+        public AnalysisConfig getAnalysisConfig() {
+            return analysisConfig;
         }
 
         public Builder setAnalysisLimits(AnalysisLimits analysisLimits) {
@@ -1081,18 +1079,6 @@ public void validateAnalysisLimitsAndSetDefaults(@Nullable ByteSizeValue maxMode
                 AnalysisLimits.DEFAULT_MODEL_MEMORY_LIMIT_MB);
         }
 
-        /**
-         * Validate the char filter/tokenizer/token filter names used in the categorization analyzer config (if any).
-         * The overall structure can be validated at parse time, but the exact names need to be checked separately,
-         * as plugins that provide the functionality can be installed/uninstalled.
-         */
-        public void validateCategorizationAnalyzer(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
-            CategorizationAnalyzerConfig categorizationAnalyzerConfig = analysisConfig.getCategorizationAnalyzerConfig();
-            if (categorizationAnalyzerConfig != null) {
-                new CategorizationAnalyzerConfig.Builder(categorizationAnalyzerConfig).verify(analysisRegistry, environment);
-            }
-        }
-
         private void validateGroups() {
             for (String group : this.groups) {
                 if (MlStrings.isValidId(group) == false) {
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/JobManager.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/JobManager.java
index a812b7b464c69..081b9a07db207 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/JobManager.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/JobManager.java
@@ -39,6 +39,7 @@
 import org.elasticsearch.xpack.core.ml.action.UpdateJobAction;
 import org.elasticsearch.xpack.core.ml.action.util.QueryPage;
 import org.elasticsearch.xpack.core.ml.job.config.AnalysisLimits;
+import org.elasticsearch.xpack.core.ml.job.config.CategorizationAnalyzerConfig;
 import org.elasticsearch.xpack.core.ml.job.config.DataDescription;
 import org.elasticsearch.xpack.core.ml.job.config.Job;
 import org.elasticsearch.xpack.core.ml.job.config.JobState;
@@ -50,6 +51,7 @@
 import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.ModelSnapshot;
 import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
 import org.elasticsearch.xpack.ml.MachineLearning;
+import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer;
 import org.elasticsearch.xpack.ml.job.persistence.JobProvider;
 import org.elasticsearch.xpack.ml.job.persistence.JobResultsPersister;
 import org.elasticsearch.xpack.ml.job.process.autodetect.UpdateParams;
@@ -170,6 +172,22 @@ public JobState getJobState(String jobId) {
         return MlMetadata.getJobState(jobId, tasks);
     }
 
+    /**
+     * Validate the char filter/tokenizer/token filter names used in the categorization analyzer config (if any).
+     * This validation has to be done server-side; it cannot be done in a client as that won't have loaded the
+     * appropriate analysis modules/plugins.
+     * The overall structure can be validated at parse time, but the exact names need to be checked separately,
+     * as plugins that provide the functionality can be installed/uninstalled.
+     */
+    static void validateCategorizationAnalyzer(Job.Builder jobBuilder, AnalysisRegistry analysisRegistry, Environment environment)
+        throws IOException {
+        CategorizationAnalyzerConfig categorizationAnalyzerConfig = jobBuilder.getAnalysisConfig().getCategorizationAnalyzerConfig();
+        if (categorizationAnalyzerConfig != null) {
+            CategorizationAnalyzer.verifyConfigBuilder(new CategorizationAnalyzerConfig.Builder(categorizationAnalyzerConfig),
+                analysisRegistry, environment);
+        }
+    }
+
     /**
      * Stores a job in the cluster state
      */
@@ -177,7 +195,7 @@ public void putJob(PutJobAction.Request request, AnalysisRegistry analysisRegist
             ActionListener<PutJobAction.Response> actionListener) throws IOException {
 
         request.getJobBuilder().validateAnalysisLimitsAndSetDefaults(maxModelMemoryLimit);
-        request.getJobBuilder().validateCategorizationAnalyzer(analysisRegistry, environment);
+        validateCategorizationAnalyzer(request.getJobBuilder(), analysisRegistry, environment);
 
         Job job = request.getJobBuilder().build(new Date());
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzer.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzer.java
index a0101b999d5dc..6111fa139f97f 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzer.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzer.java
@@ -9,9 +9,21 @@
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.Version;
+import org.elasticsearch.action.admin.indices.analyze.TransportAnalyzeAction;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
+import org.elasticsearch.common.UUIDs;
 import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AnalysisRegistry;
+import org.elasticsearch.index.analysis.CharFilterFactory;
+import org.elasticsearch.index.analysis.CustomAnalyzer;
+import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.index.analysis.TokenizerFactory;
+import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.xpack.core.ml.job.config.CategorizationAnalyzerConfig;
 
 import java.io.Closeable;
@@ -19,11 +31,16 @@
 import java.util.ArrayList;
 import java.util.List;
 
-
 /**
  * The categorization analyzer.
  *
  * Converts messages to lists of tokens that will be fed to the ML categorization algorithm.
+ *
+ * The code in {@link #makeAnalyzer} and the methods it calls is largely copied from {@link TransportAnalyzeAction}.
+ * Unfortunately there is no easy way to reuse a subset of the _analyze action implementation, as the
+ * logic required here is not quite identical to that of {@link TransportAnalyzeAction}, and the required code is
+ * hard to partially reuse.
+ * TODO: consider refactoring ES core to allow more reuse.
  */
 public class CategorizationAnalyzer implements Closeable {
 
@@ -33,7 +50,7 @@ public class CategorizationAnalyzer implements Closeable {
 
     public CategorizationAnalyzer(AnalysisRegistry analysisRegistry, Environment environment,
                                   CategorizationAnalyzerConfig categorizationAnalyzerConfig) throws IOException {
-        Tuple<Analyzer, Boolean> tuple = categorizationAnalyzerConfig.toAnalyzer(analysisRegistry, environment);
+        Tuple<Analyzer, Boolean> tuple = makeAnalyzer(categorizationAnalyzerConfig, analysisRegistry, environment);
         analyzer = tuple.v1();
         closeAnalyzer = tuple.v2();
     }
@@ -69,4 +86,193 @@ public List<String> tokenizeField(String fieldName, String fieldValue) {
         }
         return tokens;
     }
+
+    /**
+     * Verify that the config builder will build a valid config. This is not done as part of the basic build
+     * because it verifies that the names of analyzers/tokenizers/filters referenced by the config are
+     * known, and the validity of these names could change over time. Additionally, it has to be done
+     * server-side rather than client-side, as the client will not have loaded the appropriate analysis
+     * modules/plugins.
+     */
+    public static void verifyConfigBuilder(CategorizationAnalyzerConfig.Builder configBuilder, AnalysisRegistry analysisRegistry,
+                                           Environment environment) throws IOException {
+        Tuple<Analyzer, Boolean> tuple = makeAnalyzer(configBuilder.build(), analysisRegistry, environment);
+        if (tuple.v2()) {
+            tuple.v1().close();
+        }
+    }
+
+    /**
+     * Convert a config to an {@link Analyzer}. This may be a global analyzer or a newly created custom analyzer.
+     * In the case of a global analyzer the caller must NOT close it when they have finished with it. In the case of
+     * a newly created custom analyzer the caller is responsible for closing it.
+     * @return The first tuple member is the {@link Analyzer}; the second indicates whether the caller is responsible
+     *         for closing it.
+     */
+    private static Tuple<Analyzer, Boolean> makeAnalyzer(CategorizationAnalyzerConfig config, AnalysisRegistry analysisRegistry,
+                                                         Environment environment) throws IOException {
+        String analyzer = config.getAnalyzer();
+        if (analyzer != null) {
+            Analyzer globalAnalyzer = analysisRegistry.getAnalyzer(analyzer);
+            if (globalAnalyzer == null) {
+                throw new IllegalArgumentException("Failed to find global analyzer [" + analyzer + "]");
+            }
+            return new Tuple<>(globalAnalyzer, Boolean.FALSE);
+        } else {
+            List<CharFilterFactory> charFilterFactoryList = parseCharFilterFactories(config, analysisRegistry, environment);
+
+            Tuple<String, TokenizerFactory> tokenizerFactory = parseTokenizerFactory(config, analysisRegistry, environment);
+
+            List<TokenFilterFactory> tokenFilterFactoryList = parseTokenFilterFactories(config, analysisRegistry, environment,
+                tokenizerFactory, charFilterFactoryList);
+
+            return new Tuple<>(new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(),
+                charFilterFactoryList.toArray(new CharFilterFactory[charFilterFactoryList.size()]),
+                tokenFilterFactoryList.toArray(new TokenFilterFactory[tokenFilterFactoryList.size()])), Boolean.TRUE);
+        }
+    }
+
+
+    /**
+     * Get char filter factories for each configured char filter. Each configuration
+     * element can be the name of an out-of-the-box char filter, or a custom definition.
+     */
+    private static List<CharFilterFactory> parseCharFilterFactories(CategorizationAnalyzerConfig config, AnalysisRegistry analysisRegistry,
+                                                                    Environment environment) throws IOException {
+        List<CategorizationAnalyzerConfig.NameOrDefinition> charFilters = config.getCharFilters();
+        final List<CharFilterFactory> charFilterFactoryList = new ArrayList<>();
+        for (CategorizationAnalyzerConfig.NameOrDefinition charFilter : charFilters) {
+            final CharFilterFactory charFilterFactory;
+            if (charFilter.name != null) {
+                AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
+                    analysisRegistry.getCharFilterProvider(charFilter.name);
+                if (charFilterFactoryFactory == null) {
+                    throw new IllegalArgumentException("Failed to find global char filter under [" + charFilter.name + "]");
+                }
+                charFilterFactory = charFilterFactoryFactory.get(environment, charFilter.name);
+            } else {
+                String charFilterTypeName = charFilter.definition.get("type");
+                if (charFilterTypeName == null) {
+                    throw new IllegalArgumentException("Missing [type] setting for char filter: " + charFilter.definition);
+                }
+                AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
+                    analysisRegistry.getCharFilterProvider(charFilterTypeName);
+                if (charFilterFactoryFactory == null) {
+                    throw new IllegalArgumentException("Failed to find global char filter under [" + charFilterTypeName + "]");
+                }
+                Settings settings = augmentSettings(charFilter.definition);
+                // Need to set anonymous "name" of char_filter
+                charFilterFactory = charFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment, "_anonymous_charfilter",
+                    settings);
+            }
+            if (charFilterFactory == null) {
+                throw new IllegalArgumentException("Failed to find char filter [" + charFilter + "]");
+            }
+            charFilterFactoryList.add(charFilterFactory);
+        }
+        return charFilterFactoryList;
+    }
+
+    /**
+     * Get the tokenizer factory for the configured tokenizer. The configuration
+     * can be the name of an out-of-the-box tokenizer, or a custom definition.
+     */
+    private static Tuple<String, TokenizerFactory> parseTokenizerFactory(CategorizationAnalyzerConfig config,
+                                                                         AnalysisRegistry analysisRegistry, Environment environment)
+        throws IOException {
+        CategorizationAnalyzerConfig.NameOrDefinition tokenizer = config.getTokenizer();
+        final String name;
+        final TokenizerFactory tokenizerFactory;
+        if (tokenizer.name != null) {
+            name = tokenizer.name;
+            AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory = analysisRegistry.getTokenizerProvider(name);
+            if (tokenizerFactoryFactory == null) {
+                throw new IllegalArgumentException("Failed to find global tokenizer under [" + name + "]");
+            }
+            tokenizerFactory = tokenizerFactoryFactory.get(environment, name);
+        } else {
+            String tokenizerTypeName = tokenizer.definition.get("type");
+            if (tokenizerTypeName == null) {
+                throw new IllegalArgumentException("Missing [type] setting for tokenizer: " + tokenizer.definition);
+            }
+            AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory =
+                analysisRegistry.getTokenizerProvider(tokenizerTypeName);
+            if (tokenizerFactoryFactory == null) {
+                throw new IllegalArgumentException("Failed to find global tokenizer under [" + tokenizerTypeName + "]");
+            }
+            Settings settings = augmentSettings(tokenizer.definition);
+            // Need to set anonymous "name" of tokenizer
+            name = "_anonymous_tokenizer";
+            tokenizerFactory = tokenizerFactoryFactory.get(buildDummyIndexSettings(settings), environment, name, settings);
+        }
+        return new Tuple<>(name, tokenizerFactory);
+    }
+
+    /**
+     * Get token filter factories for each configured token filter. Each configuration
+     * element can be the name of an out-of-the-box token filter, or a custom definition.
+     */
+    private static List<TokenFilterFactory> parseTokenFilterFactories(CategorizationAnalyzerConfig config,
+                                                                      AnalysisRegistry analysisRegistry, Environment environment,
+                                                                      Tuple<String, TokenizerFactory> tokenizerFactory,
+                                                                      List<CharFilterFactory> charFilterFactoryList) throws IOException {
+        List<CategorizationAnalyzerConfig.NameOrDefinition> tokenFilters = config.getTokenFilters();
+        final List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
+        for (CategorizationAnalyzerConfig.NameOrDefinition tokenFilter : tokenFilters) {
+            TokenFilterFactory tokenFilterFactory;
+            if (tokenFilter.name != null) {
+                AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory;
+                tokenFilterFactoryFactory = analysisRegistry.getTokenFilterProvider(tokenFilter.name);
+                if (tokenFilterFactoryFactory == null) {
+                    throw new IllegalArgumentException("Failed to find global token filter under [" + tokenFilter.name + "]");
+                }
+                tokenFilterFactory = tokenFilterFactoryFactory.get(environment, tokenFilter.name);
+            } else {
+                String filterTypeName = tokenFilter.definition.get("type");
+                if (filterTypeName == null) {
+                    throw new IllegalArgumentException("Missing [type] setting for token filter: " + tokenFilter.definition);
+                }
+                AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory =
+                    analysisRegistry.getTokenFilterProvider(filterTypeName);
+                if (tokenFilterFactoryFactory == null) {
+                    throw new IllegalArgumentException("Failed to find global token filter under [" + filterTypeName + "]");
+                }
+                Settings settings = augmentSettings(tokenFilter.definition);
+                // Need to set anonymous "name" of token_filter
+                tokenFilterFactory = tokenFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment, "_anonymous_tokenfilter",
+                    settings);
+                tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(),
+                    tokenizerFactory.v2(), tokenFilterFactoryList, charFilterFactoryList, environment);
+            }
+            if (tokenFilterFactory == null) {
+                throw new IllegalArgumentException("Failed to find or create token filter [" + tokenFilter + "]");
+            }
+            tokenFilterFactoryList.add(tokenFilterFactory);
+        }
+        return tokenFilterFactoryList;
+    }
+
+    /**
+     * The Elasticsearch analysis functionality is designed to work with indices. For
+     * categorization we have to pretend we've got some index settings.
+     */
+    private static IndexSettings buildDummyIndexSettings(Settings settings) {
+        IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
+        return new IndexSettings(metaData, Settings.EMPTY);
+    }
+
+    /**
+     * The behaviour of Elasticsearch analyzers can vary between versions.
+     * For categorization we'll always use the latest version of the text analysis.
+     * The other settings are just to stop classes that expect to be associated with
+     * an index from complaining.
+     */
+    private static Settings augmentSettings(Settings settings) {
+        return Settings.builder().put(settings)
+            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
+            .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
+            .put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
+            .build();
+    }
 }
diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzerTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzerTests.java
index 9f03952165c13..59413f6a61879 100644
--- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzerTests.java
+++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzerTests.java
@@ -22,7 +22,6 @@
 import java.util.HashMap;
 import java.util.Map;
 
-
 public class CategorizationAnalyzerTests extends ESTestCase {
 
     private AnalysisRegistry analysisRegistry;
@@ -41,6 +40,158 @@ public void setup() throws Exception {
         analysisRegistry = buildTestAnalysisRegistry(environment);
     }
 
+    public void testVerifyConfigBuilder_GivenNoConfig() {
+        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder();
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+            () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+        assertEquals("categorization_analyzer that is not a global analyzer must specify a [tokenizer] field", e.getMessage());
+    }
+
+    public void testVerifyConfigBuilder_GivenDefault() throws IOException {
+        CategorizationAnalyzerConfig defaultConfig = CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(null);
+        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder(defaultConfig);
+        CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment);
+    }
+
+    public void testVerifyConfigBuilder_GivenValidAnalyzer() throws IOException {
+        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder().setAnalyzer("standard");
+        CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment);
+    }
+
+    public void testVerifyConfigBuilder_GivenInvalidAnalyzer() {
+        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder().setAnalyzer("does not exist");
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+            () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+        assertEquals("Failed to find global analyzer [does not exist]", e.getMessage());
+    }
+
+    public void testVerifyConfigBuilder_GivenValidCustomConfig() throws IOException {
+        Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
+        ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
+        ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
+        Map<String, Object> ignoreStuffThatBeginsWithADigit = new HashMap<>();
+        ignoreStuffThatBeginsWithADigit.put("type", "pattern_replace");
+        ignoreStuffThatBeginsWithADigit.put("pattern", "^[0-9].*");
+        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
+            .addCharFilter(ignoreStuffInSqaureBrackets)
+            .setTokenizer("classic")
+            .addTokenFilter("lowercase")
+            .addTokenFilter(ignoreStuffThatBeginsWithADigit)
+            .addTokenFilter("snowball");
+        CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment);
+    }
+
+    public void testVerifyConfigBuilder_GivenCustomConfigWithInvalidCharFilter() {
+        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
+            .addCharFilter("wrong!")
+            .setTokenizer("classic")
+            .addTokenFilter("lowercase")
+            .addTokenFilter("snowball");
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+            () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+        assertEquals("Failed to find global char filter under [wrong!]", e.getMessage());
+    }
+
+    public void testVerifyConfigBuilder_GivenCustomConfigWithMisconfiguredCharFilter() {
+        Map<String, Object> noPattern = new HashMap<>();
+        noPattern.put("type", "pattern_replace");
+        noPattern.put("attern", "should have been pattern");
+        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
+            .addCharFilter(noPattern)
+            .setTokenizer("classic")
+            .addTokenFilter("lowercase")
+            .addTokenFilter("snowball");
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+            () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+        assertEquals("pattern is missing for [_anonymous_charfilter] char filter of type 'pattern_replace'", e.getMessage());
+    }
+
+    public void testVerifyConfigBuilder_GivenCustomConfigWithInvalidTokenizer() {
+        Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
+        ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
+        ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
+        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
+            .addCharFilter(ignoreStuffInSqaureBrackets)
+            .setTokenizer("oops!")
+            .addTokenFilter("lowercase")
+            .addTokenFilter("snowball");
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+            () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+        assertEquals("Failed to find global tokenizer under [oops!]", e.getMessage());
+    }
+
+    public void testVerifyConfigBuilder_GivenNoTokenizer() {
+        Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
+        ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
+        ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
+        Map<String, Object> ignoreStuffThatBeginsWithADigit = new HashMap<>();
+        ignoreStuffThatBeginsWithADigit.put("type", "pattern_replace");
+        ignoreStuffThatBeginsWithADigit.put("pattern", "^[0-9].*");
+        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
+            .addCharFilter(ignoreStuffInSqaureBrackets)
+            .addTokenFilter("lowercase")
+            .addTokenFilter(ignoreStuffThatBeginsWithADigit)
+            .addTokenFilter("snowball");
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+            () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+        assertEquals("categorization_analyzer that is not a global analyzer must specify a [tokenizer] field", e.getMessage());
+    }
+
+    public void testVerifyConfigBuilder_GivenCustomConfigWithInvalidTokenFilter() {
+        Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
+        ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
+        ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
+        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
+            .addCharFilter(ignoreStuffInSqaureBrackets)
+            .setTokenizer("classic")
+            .addTokenFilter("lowercase")
+            .addTokenFilter("oh dear!");
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+            () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+        assertEquals("Failed to find global token filter under [oh dear!]", e.getMessage());
+    }
+
+    public void testVerifyConfigBuilder_GivenCustomConfigWithMisconfiguredTokenFilter() {
+        Map<String, Object> noPattern = new HashMap<>();
+        noPattern.put("type", "pattern_replace");
+        noPattern.put("attern", "should have been pattern");
+        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
+            .addCharFilter("html_strip")
+            .setTokenizer("classic")
+            .addTokenFilter("lowercase")
+            .addTokenFilter(noPattern);
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+            () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+        assertEquals("pattern is missing for [_anonymous_tokenfilter] token filter of type 'pattern_replace'", e.getMessage());
+    }
+
+    public void testVerifyConfigBuilder_GivenAnalyzerAndCharFilter() {
+        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
+            .setAnalyzer("standard")
+            .addCharFilter("html_strip");
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+            () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+        assertEquals("categorization_analyzer that is a global analyzer cannot also specify a [char_filter] field", e.getMessage());
+    }
+
+    public void testVerifyConfigBuilder_GivenAnalyzerAndTokenizer() {
+        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
+            .setAnalyzer("standard")
+            .setTokenizer("classic");
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+            () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+        assertEquals("categorization_analyzer that is a global analyzer cannot also specify a [tokenizer] field", e.getMessage());
+    }
+
+    public void testVerifyConfigBuilder_GivenAnalyzerAndTokenFilter() {
+        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
+            .setAnalyzer("standard")
+            .addTokenFilter("lowercase");
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+            () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+        assertEquals("categorization_analyzer that is a global analyzer cannot also specify a [filter] field", e.getMessage());
+    }
+
     // The default categorization analyzer matches what the analyzer in the ML C++ does
     public void testDefaultCategorizationAnalyzer() throws IOException {
         CategorizationAnalyzerConfig defaultConfig = CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(null);
diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/config/CategorizationAnalyzerConfigTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/config/CategorizationAnalyzerConfigTests.java
index 9c725fe76292a..2fe2c0b334c4e 100644
--- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/config/CategorizationAnalyzerConfigTests.java
+++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/config/CategorizationAnalyzerConfigTests.java
@@ -6,175 +6,17 @@
 package org.elasticsearch.xpack.ml.job.config;
 
 import org.elasticsearch.common.io.stream.Writeable;
-import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.xcontent.XContentParser;
-import org.elasticsearch.env.Environment;
-import org.elasticsearch.env.TestEnvironment;
-import org.elasticsearch.index.analysis.AnalysisRegistry;
 import org.elasticsearch.test.AbstractSerializingTestCase;
 import org.elasticsearch.xpack.core.ml.job.config.CategorizationAnalyzerConfig;
 import org.elasticsearch.xpack.core.ml.MlParserType;
-import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzerTests;
-import org.junit.Before;
 
 import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;
 
-
 public class CategorizationAnalyzerConfigTests extends AbstractSerializingTestCase<CategorizationAnalyzerConfig> {
 
-    private AnalysisRegistry analysisRegistry;
-    private Environment environment;
-
-    @Before
-    public void setup() throws Exception {
-        Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()).build();
-        environment = TestEnvironment.newEnvironment(settings);
-        analysisRegistry = CategorizationAnalyzerTests.buildTestAnalysisRegistry(environment);
-    }
-
-    public void testVerify_GivenNoConfig() {
-        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder();
-        IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
-        assertEquals("categorization_analyzer that is not a global analyzer must specify a [tokenizer] field", e.getMessage());
-    }
-
-    public void testVerify_GivenDefault() throws IOException {
-        CategorizationAnalyzerConfig defaultConfig = CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(null);
-        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder(defaultConfig);
-        builder.verify(analysisRegistry, environment);
-    }
-
-    public void testVerify_GivenValidAnalyzer() throws IOException {
-        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder().setAnalyzer("standard");
-        builder.verify(analysisRegistry, environment);
-    }
-
-    public void testVerify_GivenInvalidAnalyzer() {
-        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder().setAnalyzer("does not exist");
-        IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
-        assertEquals("Failed to find global analyzer [does not exist]", e.getMessage());
-    }
-
-    public void testVerify_GivenValidCustomConfig() throws IOException {
-        Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
-        ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
-        ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
-        Map<String, Object> ignoreStuffThatBeginsWithADigit = new HashMap<>();
-        ignoreStuffThatBeginsWithADigit.put("type", "pattern_replace");
-        ignoreStuffThatBeginsWithADigit.put("pattern", "^[0-9].*");
-        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
-            .addCharFilter(ignoreStuffInSqaureBrackets)
-            .setTokenizer("classic")
-            .addTokenFilter("lowercase")
-            .addTokenFilter(ignoreStuffThatBeginsWithADigit)
-            .addTokenFilter("snowball");
-        builder.verify(analysisRegistry, environment);
-    }
-
-    public void testVerify_GivenCustomConfigWithInvalidCharFilter() {
-        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
-            .addCharFilter("wrong!")
-            .setTokenizer("classic")
-            .addTokenFilter("lowercase")
-            .addTokenFilter("snowball");
-        IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
-        assertEquals("Failed to find global char filter under [wrong!]", e.getMessage());
-    }
-
-    public void testVerify_GivenCustomConfigWithMisconfiguredCharFilter() {
-        Map<String, Object> noPattern = new HashMap<>();
-        noPattern.put("type", "pattern_replace");
-        noPattern.put("attern", "should have been pattern");
-        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
-            .addCharFilter(noPattern)
-            .setTokenizer("classic")
-            .addTokenFilter("lowercase")
-            .addTokenFilter("snowball");
-        IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
-        assertEquals("pattern is missing for [_anonymous_charfilter] char filter of type 'pattern_replace'", e.getMessage());
-    }
-
-    public void testVerify_GivenCustomConfigWithInvalidTokenizer() {
-        Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
-        ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
-        ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
-        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
-            .addCharFilter(ignoreStuffInSqaureBrackets)
-            .setTokenizer("oops!")
-            .addTokenFilter("lowercase")
-            .addTokenFilter("snowball");
-        IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
-        assertEquals("Failed to find global tokenizer under [oops!]", e.getMessage());
-    }
-
-    public void testVerify_GivenNoTokenizer() {
-        Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
-        ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
-        ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
-        Map<String, Object> ignoreStuffThatBeginsWithADigit = new HashMap<>();
-        ignoreStuffThatBeginsWithADigit.put("type", "pattern_replace");
-        ignoreStuffThatBeginsWithADigit.put("pattern", "^[0-9].*");
-        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
-            .addCharFilter(ignoreStuffInSqaureBrackets)
-            .addTokenFilter("lowercase")
-            .addTokenFilter(ignoreStuffThatBeginsWithADigit)
-            .addTokenFilter("snowball");
-        IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
-        assertEquals("categorization_analyzer that is not a global analyzer must specify a [tokenizer] field", e.getMessage());
-    }
-
-    public void testVerify_GivenCustomConfigWithInvalidTokenFilter() {
-        Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
-        ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
-        ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
-        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
-            .addCharFilter(ignoreStuffInSqaureBrackets)
-            .setTokenizer("classic")
-            .addTokenFilter("lowercase")
-            .addTokenFilter("oh dear!");
-        IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
-        assertEquals("Failed to find global token filter under [oh dear!]", e.getMessage());
-    }
-
-    public void testVerify_GivenCustomConfigWithMisconfiguredTokenFilter() {
-        Map<String, Object> noPattern = new HashMap<>();
-        noPattern.put("type", "pattern_replace");
-        noPattern.put("attern", "should have been pattern");
-        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
-            .addCharFilter("html_strip")
-            .setTokenizer("classic")
-            .addTokenFilter("lowercase")
-            .addTokenFilter(noPattern);
-        IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
-        assertEquals("pattern is missing for [_anonymous_tokenfilter] token filter of type 'pattern_replace'", e.getMessage());
-    }
-
-    public void testVerify_GivenAnalyzerAndCharFilter() {
-        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
-            .setAnalyzer("standard")
-            .addCharFilter("html_strip");
-        IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
-        assertEquals("categorization_analyzer that is a global analyzer cannot also specify a [char_filter] field", e.getMessage());
-    }
-
-    public void testVerify_GivenAnalyzerAndTokenizer() {
-        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
-            .setAnalyzer("standard")
-            .setTokenizer("classic");
-        IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
-        assertEquals("categorization_analyzer that is a global analyzer cannot also specify a [tokenizer] field", e.getMessage());
-    }
-
-    public void testVerify_GivenAnalyzerAndTokenFilter() {
-        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
-            .setAnalyzer("standard")
-            .addTokenFilter("lowercase");
-        IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
-        assertEquals("categorization_analyzer that is a global analyzer cannot also specify a [filter] field", e.getMessage());
-    }
-
     @Override
     protected CategorizationAnalyzerConfig createTestInstance() {
         return createRandomized().build();
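
A minimal sketch of the verification flow this patch introduces, mirroring the tests above. It assumes an AnalysisRegistry and Environment are already available (in production they come from the node's loaded analysis modules/plugins, in the tests from buildTestAnalysisRegistry); the example class and method names are hypothetical, but CategorizationAnalyzerConfig.Builder and CategorizationAnalyzer.verifyConfigBuilder are the APIs from the patch.

import java.io.IOException;

import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.xpack.core.ml.job.config.CategorizationAnalyzerConfig;
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer;

// Hypothetical illustration class; not part of the patch.
public final class VerifyCategorizationConfigExample {

    static void verifyCustomConfig(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
        // A custom (non-global) config must name a tokenizer; char/token filters are optional.
        CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
            .setTokenizer("classic")
            .addTokenFilter("lowercase");
        // Throws IllegalArgumentException if any referenced analysis component is unknown on this node.
        // Any analyzer built purely for verification is closed by verifyConfigBuilder itself.
        CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment);
    }
}

The split is deliberate: the pure-syntax checks stay with the config class in core, while name resolution moves to the ml plugin, which is the only place an AnalysisRegistry with the installed analysis plugins actually exists.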