diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java
index 1c2808c70ffcf..fd0fde76e6883 100644
--- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java
+++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java
@@ -5,14 +5,8 @@
*/
package org.elasticsearch.xpack.core.ml.job.config;
-import org.apache.lucene.analysis.Analyzer;
-import org.elasticsearch.Version;
-import org.elasticsearch.action.admin.indices.analyze.TransportAnalyzeAction;
-import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.Strings;
-import org.elasticsearch.common.UUIDs;
-import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;
@@ -22,15 +16,6 @@
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.XContentType;
-import org.elasticsearch.env.Environment;
-import org.elasticsearch.index.IndexSettings;
-import org.elasticsearch.index.analysis.AnalysisRegistry;
-import org.elasticsearch.index.analysis.CharFilterFactory;
-import org.elasticsearch.index.analysis.CustomAnalyzer;
-import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
-import org.elasticsearch.index.analysis.TokenFilterFactory;
-import org.elasticsearch.index.analysis.TokenizerFactory;
-import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.rest.action.admin.indices.RestAnalyzeAction;
import org.elasticsearch.xpack.core.ml.MlParserType;
@@ -42,12 +27,11 @@
import java.util.Map;
import java.util.Objects;
-
/**
* Configuration for the categorization analyzer.
*
* The syntax is a subset of what can be supplied to the {@linkplain RestAnalyzeAction _analyze endpoint}.
- * To summarise, the first option is to specify the name of an out-of-the-box analyzer:
+ * To summarize, the first option is to specify the name of an out-of-the-box analyzer:
*
* "categorization_analyzer" : "standard"
*
@@ -66,11 +50,6 @@
* { "type" : "pattern_replace", "pattern": "^[0-9].*" }
* ]
*
- *
- * Unfortunately there is no easy to to reuse a subset of the _analyze action implementation, so much
- * of the code in this file is copied from {@link TransportAnalyzeAction}. Unfortunately the logic required here is
- * not quite identical to that of {@link TransportAnalyzeAction}, and the required code is hard to partially reuse.
- * TODO: consider refactoring ES core to allow more reuse.
*/
public class CategorizationAnalyzerConfig implements ToXContentFragment, Writeable {
@@ -350,175 +329,6 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
return builder;
}
- /**
- * Convert the config to an {@link Analyzer}. This may be a global analyzer or a newly created custom analyzer.
- * In the case of a global analyzer the caller must NOT close it when they have finished with it. In the case of
- * a newly created custom analyzer the caller is responsible for closing it.
- * @return The first tuple member is the {@link Analyzer}; the second indicates whether the caller is responsible
- * for closing it.
- */
- public Tuple<Analyzer, Boolean> toAnalyzer(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
- if (analyzer != null) {
- Analyzer globalAnalyzer = analysisRegistry.getAnalyzer(analyzer);
- if (globalAnalyzer == null) {
- throw new IllegalArgumentException("Failed to find global analyzer [" + analyzer + "]");
- }
- return new Tuple<>(globalAnalyzer, Boolean.FALSE);
- } else {
- List<CharFilterFactory> charFilterFactoryList =
- parseCharFilterFactories(analysisRegistry, environment);
-
- Tuple<String, TokenizerFactory> tokenizerFactory = parseTokenizerFactory(analysisRegistry,
- environment);
-
- List<TokenFilterFactory> tokenFilterFactoryList = parseTokenFilterFactories(analysisRegistry,
- environment, tokenizerFactory, charFilterFactoryList);
-
- return new Tuple<>(new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(),
- charFilterFactoryList.toArray(new CharFilterFactory[charFilterFactoryList.size()]),
- tokenFilterFactoryList.toArray(new TokenFilterFactory[tokenFilterFactoryList.size()])), Boolean.TRUE);
- }
- }
-
-
- /**
- * Get char filter factories for each configured char filter. Each configuration
- * element can be the name of an out-of-the-box char filter, or a custom definition.
- */
- private List<CharFilterFactory> parseCharFilterFactories(AnalysisRegistry analysisRegistry,
- Environment environment) throws IOException {
- final List<CharFilterFactory> charFilterFactoryList = new ArrayList<>();
- for (NameOrDefinition charFilter : charFilters) {
- final CharFilterFactory charFilterFactory;
- if (charFilter.name != null) {
- AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
- analysisRegistry.getCharFilterProvider(charFilter.name);
- if (charFilterFactoryFactory == null) {
- throw new IllegalArgumentException("Failed to find global char filter under [" + charFilter.name + "]");
- }
- charFilterFactory = charFilterFactoryFactory.get(environment, charFilter.name);
- } else {
- String charFilterTypeName = charFilter.definition.get("type");
- if (charFilterTypeName == null) {
- throw new IllegalArgumentException("Missing [type] setting for char filter: " + charFilter.definition);
- }
- AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
- analysisRegistry.getCharFilterProvider(charFilterTypeName);
- if (charFilterFactoryFactory == null) {
- throw new IllegalArgumentException("Failed to find global char filter under [" + charFilterTypeName + "]");
- }
- Settings settings = augmentSettings(charFilter.definition);
- // Need to set anonymous "name" of char_filter
- charFilterFactory = charFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment,
- "_anonymous_charfilter", settings);
- }
- if (charFilterFactory == null) {
- throw new IllegalArgumentException("Failed to find char filter [" + charFilter + "]");
- }
- charFilterFactoryList.add(charFilterFactory);
- }
- return charFilterFactoryList;
- }
-
- /**
- * Get the tokenizer factory for the configured tokenizer. The configuration
- * can be the name of an out-of-the-box tokenizer, or a custom definition.
- */
- private Tuple<String, TokenizerFactory> parseTokenizerFactory(AnalysisRegistry analysisRegistry,
- Environment environment) throws IOException {
- final String name;
- final TokenizerFactory tokenizerFactory;
- if (tokenizer.name != null) {
- name = tokenizer.name;
- AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory = analysisRegistry.getTokenizerProvider(name);
- if (tokenizerFactoryFactory == null) {
- throw new IllegalArgumentException("Failed to find global tokenizer under [" + name + "]");
- }
- tokenizerFactory = tokenizerFactoryFactory.get(environment, name);
- } else {
- String tokenizerTypeName = tokenizer.definition.get("type");
- if (tokenizerTypeName == null) {
- throw new IllegalArgumentException("Missing [type] setting for tokenizer: " + tokenizer.definition);
- }
- AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory =
- analysisRegistry.getTokenizerProvider(tokenizerTypeName);
- if (tokenizerFactoryFactory == null) {
- throw new IllegalArgumentException("Failed to find global tokenizer under [" + tokenizerTypeName + "]");
- }
- Settings settings = augmentSettings(tokenizer.definition);
- // Need to set anonymous "name" of tokenizer
- name = "_anonymous_tokenizer";
- tokenizerFactory = tokenizerFactoryFactory.get(buildDummyIndexSettings(settings), environment, name, settings);
- }
- return new Tuple<>(name, tokenizerFactory);
- }
-
- /**
- * Get token filter factories for each configured token filter. Each configuration
- * element can be the name of an out-of-the-box token filter, or a custom definition.
- */
- private List<TokenFilterFactory> parseTokenFilterFactories(AnalysisRegistry analysisRegistry, Environment environment,
- Tuple<String, TokenizerFactory> tokenizerFactory,
- List<CharFilterFactory> charFilterFactoryList) throws IOException {
- final List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
- for (NameOrDefinition tokenFilter : tokenFilters) {
- TokenFilterFactory tokenFilterFactory;
- if (tokenFilter.name != null) {
- AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory;
- tokenFilterFactoryFactory = analysisRegistry.getTokenFilterProvider(tokenFilter.name);
- if (tokenFilterFactoryFactory == null) {
- throw new IllegalArgumentException("Failed to find global token filter under [" + tokenFilter.name + "]");
- }
- tokenFilterFactory = tokenFilterFactoryFactory.get(environment, tokenFilter.name);
- } else {
- String filterTypeName = tokenFilter.definition.get("type");
- if (filterTypeName == null) {
- throw new IllegalArgumentException("Missing [type] setting for token filter: " + tokenFilter.definition);
- }
- AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory =
- analysisRegistry.getTokenFilterProvider(filterTypeName);
- if (tokenFilterFactoryFactory == null) {
- throw new IllegalArgumentException("Failed to find global token filter under [" + filterTypeName + "]");
- }
- Settings settings = augmentSettings(tokenFilter.definition);
- // Need to set anonymous "name" of token_filter
- tokenFilterFactory = tokenFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment,
- "_anonymous_tokenfilter", settings);
- tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(),
- tokenizerFactory.v2(), tokenFilterFactoryList, charFilterFactoryList, environment);
- }
- if (tokenFilterFactory == null) {
- throw new IllegalArgumentException("Failed to find or create token filter [" + tokenFilter + "]");
- }
- tokenFilterFactoryList.add(tokenFilterFactory);
- }
- return tokenFilterFactoryList;
- }
-
- /**
- * The Elasticsearch analysis functionality is designed to work with indices. For
- * categorization we have to pretend we've got some index settings.
- */
- private IndexSettings buildDummyIndexSettings(Settings settings) {
- IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
- return new IndexSettings(metaData, Settings.EMPTY);
- }
-
- /**
- * The behaviour of Elasticsearch analyzers can vary between versions.
- * For categorization we'll always use the latest version of the text analysis.
- * The other settings are just to stop classes that expect to be associated with
- * an index from complaining.
- */
- private Settings augmentSettings(Settings settings) {
- return Settings.builder().put(settings)
- .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
- .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
- .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
- .put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
- .build();
- }
-
@Override
public boolean equals(Object o) {
if (this == o) return true;
@@ -609,17 +419,5 @@ public CategorizationAnalyzerConfig build() {
}
return new CategorizationAnalyzerConfig(analyzer, charFilters, tokenizer, tokenFilters);
}
-
- /**
- * Verify that the builder will build a valid config. This is not done as part of the basic build
- * because it verifies that the names of analyzers/tokenizers/filters referenced by the config are
- * known, and the validity of these names could change over time.
- */
- public void verify(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
- Tuple<Analyzer, Boolean> tuple = build().toAnalyzer(analysisRegistry, environment);
- if (tuple.v2()) {
- tuple.v1().close();
- }
- }
}
}
diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/Job.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/Job.java
index f3c68542fa4f4..227b8612839e8 100644
--- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/Job.java
+++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/Job.java
@@ -21,8 +21,6 @@
import org.elasticsearch.common.xcontent.ToXContentObject;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser.Token;
-import org.elasticsearch.env.Environment;
-import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.xpack.core.ml.MlParserType;
import org.elasticsearch.xpack.core.ml.job.messages.Messages;
import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndexFields;
@@ -777,8 +775,8 @@ public Builder setAnalysisConfig(AnalysisConfig.Builder configBuilder) {
return this;
}
- public AnalysisLimits getAnalysisLimits() {
- return analysisLimits;
+ public AnalysisConfig getAnalysisConfig() {
+ return analysisConfig;
}
public Builder setAnalysisLimits(AnalysisLimits analysisLimits) {
@@ -1081,18 +1079,6 @@ public void validateAnalysisLimitsAndSetDefaults(@Nullable ByteSizeValue maxMode
AnalysisLimits.DEFAULT_MODEL_MEMORY_LIMIT_MB);
}
- /**
- * Validate the char filter/tokenizer/token filter names used in the categorization analyzer config (if any).
- * The overall structure can be validated at parse time, but the exact names need to be checked separately,
- * as plugins that provide the functionality can be installed/uninstalled.
- */
- public void validateCategorizationAnalyzer(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
- CategorizationAnalyzerConfig categorizationAnalyzerConfig = analysisConfig.getCategorizationAnalyzerConfig();
- if (categorizationAnalyzerConfig != null) {
- new CategorizationAnalyzerConfig.Builder(categorizationAnalyzerConfig).verify(analysisRegistry, environment);
- }
- }
-
private void validateGroups() {
for (String group : this.groups) {
if (MlStrings.isValidId(group) == false) {
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/JobManager.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/JobManager.java
index a812b7b464c69..081b9a07db207 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/JobManager.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/JobManager.java
@@ -39,6 +39,7 @@
import org.elasticsearch.xpack.core.ml.action.UpdateJobAction;
import org.elasticsearch.xpack.core.ml.action.util.QueryPage;
import org.elasticsearch.xpack.core.ml.job.config.AnalysisLimits;
+import org.elasticsearch.xpack.core.ml.job.config.CategorizationAnalyzerConfig;
import org.elasticsearch.xpack.core.ml.job.config.DataDescription;
import org.elasticsearch.xpack.core.ml.job.config.Job;
import org.elasticsearch.xpack.core.ml.job.config.JobState;
@@ -50,6 +51,7 @@
import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.ModelSnapshot;
import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
import org.elasticsearch.xpack.ml.MachineLearning;
+import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer;
import org.elasticsearch.xpack.ml.job.persistence.JobProvider;
import org.elasticsearch.xpack.ml.job.persistence.JobResultsPersister;
import org.elasticsearch.xpack.ml.job.process.autodetect.UpdateParams;
@@ -170,6 +172,22 @@ public JobState getJobState(String jobId) {
return MlMetadata.getJobState(jobId, tasks);
}
+ /**
+ * Validate the char filter/tokenizer/token filter names used in the categorization analyzer config (if any).
+ * This validation has to be done server-side; it cannot be done in a client as that won't have loaded the
+ * appropriate analysis modules/plugins.
+ * The overall structure can be validated at parse time, but the exact names need to be checked separately,
+ * as plugins that provide the functionality can be installed/uninstalled.
+ */
+ static void validateCategorizationAnalyzer(Job.Builder jobBuilder, AnalysisRegistry analysisRegistry, Environment environment)
+ throws IOException {
+ CategorizationAnalyzerConfig categorizationAnalyzerConfig = jobBuilder.getAnalysisConfig().getCategorizationAnalyzerConfig();
+ if (categorizationAnalyzerConfig != null) {
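+ // Re-building via a fresh Builder repeats the structural checks before the analyzer/tokenizer/filter names are verified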
+ CategorizationAnalyzer.verifyConfigBuilder(new CategorizationAnalyzerConfig.Builder(categorizationAnalyzerConfig),
+ analysisRegistry, environment);
+ }
+ }
+
/**
* Stores a job in the cluster state
*/
@@ -177,7 +195,7 @@ public void putJob(PutJobAction.Request request, AnalysisRegistry analysisRegist
ActionListener<PutJobAction.Response> actionListener) throws IOException {
request.getJobBuilder().validateAnalysisLimitsAndSetDefaults(maxModelMemoryLimit);
- request.getJobBuilder().validateCategorizationAnalyzer(analysisRegistry, environment);
+ validateCategorizationAnalyzer(request.getJobBuilder(), analysisRegistry, environment);
Job job = request.getJobBuilder().build(new Date());
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzer.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzer.java
index a0101b999d5dc..6111fa139f97f 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzer.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzer.java
@@ -9,9 +9,21 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.Version;
+import org.elasticsearch.action.admin.indices.analyze.TransportAnalyzeAction;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
+import org.elasticsearch.common.UUIDs;
import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AnalysisRegistry;
+import org.elasticsearch.index.analysis.CharFilterFactory;
+import org.elasticsearch.index.analysis.CustomAnalyzer;
+import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.index.analysis.TokenizerFactory;
+import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.xpack.core.ml.job.config.CategorizationAnalyzerConfig;
import java.io.Closeable;
@@ -19,11 +31,16 @@
import java.util.ArrayList;
import java.util.List;
-
/**
* The categorization analyzer.
*
* Converts messages to lists of tokens that will be fed to the ML categorization algorithm.
+ *
+ * The code in {@link #makeAnalyzer} and the methods it calls is largely copied from {@link TransportAnalyzeAction}.
+ * Unfortunately there is no easy way to reuse a subset of the _analyze action implementation, as the
+ * logic required here is not quite identical to that of {@link TransportAnalyzeAction}, and the required code is
+ * hard to partially reuse.
+ * TODO: consider refactoring ES core to allow more reuse.
*/
public class CategorizationAnalyzer implements Closeable {
@@ -33,7 +50,7 @@ public class CategorizationAnalyzer implements Closeable {
public CategorizationAnalyzer(AnalysisRegistry analysisRegistry, Environment environment,
CategorizationAnalyzerConfig categorizationAnalyzerConfig) throws IOException {
- Tuple<Analyzer, Boolean> tuple = categorizationAnalyzerConfig.toAnalyzer(analysisRegistry, environment);
+ Tuple<Analyzer, Boolean> tuple = makeAnalyzer(categorizationAnalyzerConfig, analysisRegistry, environment);
analyzer = tuple.v1();
closeAnalyzer = tuple.v2();
}
@@ -69,4 +86,193 @@ public List<String> tokenizeField(String fieldName, String fieldValue) {
}
return tokens;
}
+
+ /**
+ * Verify that the config builder will build a valid config. This is not done as part of the basic build
+ * because it verifies that the names of analyzers/tokenizers/filters referenced by the config are
+ * known, and the validity of these names could change over time. Additionally, it has to be done
+ * server-side rather than client-side, as the client will not have loaded the appropriate analysis
+ * modules/plugins.
+ */
+ public static void verifyConfigBuilder(CategorizationAnalyzerConfig.Builder configBuilder, AnalysisRegistry analysisRegistry,
+ Environment environment) throws IOException {
+ Tuple<Analyzer, Boolean> tuple = makeAnalyzer(configBuilder.build(), analysisRegistry, environment);
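+ // Only close the analyzer if makeAnalyzer created it; global analyzers are shared and must stay open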
+ if (tuple.v2()) {
+ tuple.v1().close();
+ }
+ }
+
+ /**
+ * Convert a config to an {@link Analyzer}. This may be a global analyzer or a newly created custom analyzer.
+ * In the case of a global analyzer the caller must NOT close it when they have finished with it. In the case of
+ * a newly created custom analyzer the caller is responsible for closing it.
+ * @return The first tuple member is the {@link Analyzer}; the second indicates whether the caller is responsible
+ * for closing it.
+ */
+ private static Tuple<Analyzer, Boolean> makeAnalyzer(CategorizationAnalyzerConfig config, AnalysisRegistry analysisRegistry,
+ Environment environment) throws IOException {
+ String analyzer = config.getAnalyzer();
+ if (analyzer != null) {
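+ // A named analyzer refers to a shared global analyzer owned by the registry, so the caller must not close it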
+ Analyzer globalAnalyzer = analysisRegistry.getAnalyzer(analyzer);
+ if (globalAnalyzer == null) {
+ throw new IllegalArgumentException("Failed to find global analyzer [" + analyzer + "]");
+ }
+ return new Tuple<>(globalAnalyzer, Boolean.FALSE);
+ } else {
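+ // No global analyzer was named, so assemble a custom analyzer from the configured char filters, tokenizer and token filters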
+ List<CharFilterFactory> charFilterFactoryList = parseCharFilterFactories(config, analysisRegistry, environment);
+
+ Tuple<String, TokenizerFactory> tokenizerFactory = parseTokenizerFactory(config, analysisRegistry, environment);
+
+ List<TokenFilterFactory> tokenFilterFactoryList = parseTokenFilterFactories(config, analysisRegistry, environment,
+ tokenizerFactory, charFilterFactoryList);
+
+ return new Tuple<>(new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(),
+ charFilterFactoryList.toArray(new CharFilterFactory[charFilterFactoryList.size()]),
+ tokenFilterFactoryList.toArray(new TokenFilterFactory[tokenFilterFactoryList.size()])), Boolean.TRUE);
+ }
+ }
+
+
+ /**
+ * Get char filter factories for each configured char filter. Each configuration
+ * element can be the name of an out-of-the-box char filter, or a custom definition.
+ */
+ private static List<CharFilterFactory> parseCharFilterFactories(CategorizationAnalyzerConfig config, AnalysisRegistry analysisRegistry,
+ Environment environment) throws IOException {
+ List<CategorizationAnalyzerConfig.NameOrDefinition> charFilters = config.getCharFilters();
+ final List<CharFilterFactory> charFilterFactoryList = new ArrayList<>();
+ for (CategorizationAnalyzerConfig.NameOrDefinition charFilter : charFilters) {
+ final CharFilterFactory charFilterFactory;
+ if (charFilter.name != null) {
+ AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
+ analysisRegistry.getCharFilterProvider(charFilter.name);
+ if (charFilterFactoryFactory == null) {
+ throw new IllegalArgumentException("Failed to find global char filter under [" + charFilter.name + "]");
+ }
+ charFilterFactory = charFilterFactoryFactory.get(environment, charFilter.name);
+ } else {
+ String charFilterTypeName = charFilter.definition.get("type");
+ if (charFilterTypeName == null) {
+ throw new IllegalArgumentException("Missing [type] setting for char filter: " + charFilter.definition);
+ }
+ AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
+ analysisRegistry.getCharFilterProvider(charFilterTypeName);
+ if (charFilterFactoryFactory == null) {
+ throw new IllegalArgumentException("Failed to find global char filter under [" + charFilterTypeName + "]");
+ }
+ Settings settings = augmentSettings(charFilter.definition);
+ // Need to set anonymous "name" of char_filter
+ charFilterFactory = charFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment, "_anonymous_charfilter",
+ settings);
+ }
+ if (charFilterFactory == null) {
+ throw new IllegalArgumentException("Failed to find char filter [" + charFilter + "]");
+ }
+ charFilterFactoryList.add(charFilterFactory);
+ }
+ return charFilterFactoryList;
+ }
+
+ /**
+ * Get the tokenizer factory for the configured tokenizer. The configuration
+ * can be the name of an out-of-the-box tokenizer, or a custom definition.
+ */
+ private static Tuple<String, TokenizerFactory> parseTokenizerFactory(CategorizationAnalyzerConfig config,
+ AnalysisRegistry analysisRegistry, Environment environment)
+ throws IOException {
+ CategorizationAnalyzerConfig.NameOrDefinition tokenizer = config.getTokenizer();
+ final String name;
+ final TokenizerFactory tokenizerFactory;
+ if (tokenizer.name != null) {
+ name = tokenizer.name;
+ AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory = analysisRegistry.getTokenizerProvider(name);
+ if (tokenizerFactoryFactory == null) {
+ throw new IllegalArgumentException("Failed to find global tokenizer under [" + name + "]");
+ }
+ tokenizerFactory = tokenizerFactoryFactory.get(environment, name);
+ } else {
+ String tokenizerTypeName = tokenizer.definition.get("type");
+ if (tokenizerTypeName == null) {
+ throw new IllegalArgumentException("Missing [type] setting for tokenizer: " + tokenizer.definition);
+ }
+ AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory =
+ analysisRegistry.getTokenizerProvider(tokenizerTypeName);
+ if (tokenizerFactoryFactory == null) {
+ throw new IllegalArgumentException("Failed to find global tokenizer under [" + tokenizerTypeName + "]");
+ }
+ Settings settings = augmentSettings(tokenizer.definition);
+ // Need to set anonymous "name" of tokenizer
+ name = "_anonymous_tokenizer";
+ tokenizerFactory = tokenizerFactoryFactory.get(buildDummyIndexSettings(settings), environment, name, settings);
+ }
+ return new Tuple<>(name, tokenizerFactory);
+ }
+
+ /**
+ * Get token filter factories for each configured token filter. Each configuration
+ * element can be the name of an out-of-the-box token filter, or a custom definition.
+ */
+ private static List<TokenFilterFactory> parseTokenFilterFactories(CategorizationAnalyzerConfig config,
+ AnalysisRegistry analysisRegistry, Environment environment,
+ Tuple<String, TokenizerFactory> tokenizerFactory,
+ List<CharFilterFactory> charFilterFactoryList) throws IOException {
+ List<CategorizationAnalyzerConfig.NameOrDefinition> tokenFilters = config.getTokenFilters();
+ final List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
+ for (CategorizationAnalyzerConfig.NameOrDefinition tokenFilter : tokenFilters) {
+ TokenFilterFactory tokenFilterFactory;
+ if (tokenFilter.name != null) {
+ AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory;
+ tokenFilterFactoryFactory = analysisRegistry.getTokenFilterProvider(tokenFilter.name);
+ if (tokenFilterFactoryFactory == null) {
+ throw new IllegalArgumentException("Failed to find global token filter under [" + tokenFilter.name + "]");
+ }
+ tokenFilterFactory = tokenFilterFactoryFactory.get(environment, tokenFilter.name);
+ } else {
+ String filterTypeName = tokenFilter.definition.get("type");
+ if (filterTypeName == null) {
+ throw new IllegalArgumentException("Missing [type] setting for token filter: " + tokenFilter.definition);
+ }
+ AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory =
+ analysisRegistry.getTokenFilterProvider(filterTypeName);
+ if (tokenFilterFactoryFactory == null) {
+ throw new IllegalArgumentException("Failed to find global token filter under [" + filterTypeName + "]");
+ }
+ Settings settings = augmentSettings(tokenFilter.definition);
+ // Need to set anonymous "name" of token_filter
+ tokenFilterFactory = tokenFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment, "_anonymous_tokenfilter",
+ settings);
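+ // Synonym filters are special-cased: they must be rebuilt with knowledge of the tokenizer and the filters that precede them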
+ tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(),
+ tokenizerFactory.v2(), tokenFilterFactoryList, charFilterFactoryList, environment);
+ }
+ if (tokenFilterFactory == null) {
+ throw new IllegalArgumentException("Failed to find or create token filter [" + tokenFilter + "]");
+ }
+ tokenFilterFactoryList.add(tokenFilterFactory);
+ }
+ return tokenFilterFactoryList;
+ }
+
+ /**
+ * The Elasticsearch analysis functionality is designed to work with indices. For
+ * categorization we have to pretend we've got some index settings.
+ */
+ private static IndexSettings buildDummyIndexSettings(Settings settings) {
+ IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
+ return new IndexSettings(metaData, Settings.EMPTY);
+ }
+
+ /**
+ * The behaviour of Elasticsearch analyzers can vary between versions.
+ * For categorization we'll always use the latest version of the text analysis.
+ * The other settings are just to stop classes that expect to be associated with
+ * an index from complaining.
+ */
+ private static Settings augmentSettings(Settings settings) {
+ return Settings.builder().put(settings)
+ .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+ .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
+ .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
+ .put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
+ .build();
+ }
}
diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzerTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzerTests.java
index 9f03952165c13..59413f6a61879 100644
--- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzerTests.java
+++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzerTests.java
@@ -22,7 +22,6 @@
import java.util.HashMap;
import java.util.Map;
-
public class CategorizationAnalyzerTests extends ESTestCase {
private AnalysisRegistry analysisRegistry;
@@ -41,6 +40,158 @@ public void setup() throws Exception {
analysisRegistry = buildTestAnalysisRegistry(environment);
}
+ public void testVerifyConfigBuilder_GivenNoConfig() {
+ CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder();
+ IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+ () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+ assertEquals("categorization_analyzer that is not a global analyzer must specify a [tokenizer] field", e.getMessage());
+ }
+
+ public void testVerifyConfigBuilder_GivenDefault() throws IOException {
+ CategorizationAnalyzerConfig defaultConfig = CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(null);
+ CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder(defaultConfig);
+ CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment);
+ }
+
+ public void testVerifyConfigBuilder_GivenValidAnalyzer() throws IOException {
+ CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder().setAnalyzer("standard");
+ CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment);
+ }
+
+ public void testVerifyConfigBuilder_GivenInvalidAnalyzer() {
+ CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder().setAnalyzer("does not exist");
+ IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+ () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+ assertEquals("Failed to find global analyzer [does not exist]", e.getMessage());
+ }
+
+ public void testVerifyConfigBuilder_GivenValidCustomConfig() throws IOException {
+ Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
+ ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
+ ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
+ Map<String, Object> ignoreStuffThatBeginsWithADigit = new HashMap<>();
+ ignoreStuffThatBeginsWithADigit.put("type", "pattern_replace");
+ ignoreStuffThatBeginsWithADigit.put("pattern", "^[0-9].*");
+ CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
+ .addCharFilter(ignoreStuffInSqaureBrackets)
+ .setTokenizer("classic")
+ .addTokenFilter("lowercase")
+ .addTokenFilter(ignoreStuffThatBeginsWithADigit)
+ .addTokenFilter("snowball");
+ CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment);
+ }
+
+ public void testVerifyConfigBuilder_GivenCustomConfigWithInvalidCharFilter() {
+ CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
+ .addCharFilter("wrong!")
+ .setTokenizer("classic")
+ .addTokenFilter("lowercase")
+ .addTokenFilter("snowball");
+ IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+ () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+ assertEquals("Failed to find global char filter under [wrong!]", e.getMessage());
+ }
+
+ public void testVerifyConfigBuilder_GivenCustomConfigWithMisconfiguredCharFilter() {
+ Map<String, Object> noPattern = new HashMap<>();
+ noPattern.put("type", "pattern_replace");
+ noPattern.put("attern", "should have been pattern");
+ CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
+ .addCharFilter(noPattern)
+ .setTokenizer("classic")
+ .addTokenFilter("lowercase")
+ .addTokenFilter("snowball");
+ IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+ () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+ assertEquals("pattern is missing for [_anonymous_charfilter] char filter of type 'pattern_replace'", e.getMessage());
+ }
+
+ public void testVerifyConfigBuilder_GivenCustomConfigWithInvalidTokenizer() {
+ Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
+ ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
+ ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
+ CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
+ .addCharFilter(ignoreStuffInSqaureBrackets)
+ .setTokenizer("oops!")
+ .addTokenFilter("lowercase")
+ .addTokenFilter("snowball");
+ IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+ () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+ assertEquals("Failed to find global tokenizer under [oops!]", e.getMessage());
+ }
+
+ public void testVerifyConfigBuilder_GivenNoTokenizer() {
+ Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
+ ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
+ ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
+ Map<String, Object> ignoreStuffThatBeginsWithADigit = new HashMap<>();
+ ignoreStuffThatBeginsWithADigit.put("type", "pattern_replace");
+ ignoreStuffThatBeginsWithADigit.put("pattern", "^[0-9].*");
+ CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
+ .addCharFilter(ignoreStuffInSqaureBrackets)
+ .addTokenFilter("lowercase")
+ .addTokenFilter(ignoreStuffThatBeginsWithADigit)
+ .addTokenFilter("snowball");
+ IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+ () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+ assertEquals("categorization_analyzer that is not a global analyzer must specify a [tokenizer] field", e.getMessage());
+ }
+
+ public void testVerifyConfigBuilder_GivenCustomConfigWithInvalidTokenFilter() {
+ Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
+ ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
+ ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
+ CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
+ .addCharFilter(ignoreStuffInSqaureBrackets)
+ .setTokenizer("classic")
+ .addTokenFilter("lowercase")
+ .addTokenFilter("oh dear!");
+ IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+ () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+ assertEquals("Failed to find global token filter under [oh dear!]", e.getMessage());
+ }
+
+ public void testVerifyConfigBuilder_GivenCustomConfigWithMisconfiguredTokenFilter() {
+ Map<String, Object> noPattern = new HashMap<>();
+ noPattern.put("type", "pattern_replace");
+ noPattern.put("attern", "should have been pattern");
+ CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
+ .addCharFilter("html_strip")
+ .setTokenizer("classic")
+ .addTokenFilter("lowercase")
+ .addTokenFilter(noPattern);
+ IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+ () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+ assertEquals("pattern is missing for [_anonymous_tokenfilter] token filter of type 'pattern_replace'", e.getMessage());
+ }
+
+ public void testVerifyConfigBuilder_GivenAnalyzerAndCharFilter() {
+ CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
+ .setAnalyzer("standard")
+ .addCharFilter("html_strip");
+ IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+ () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+ assertEquals("categorization_analyzer that is a global analyzer cannot also specify a [char_filter] field", e.getMessage());
+ }
+
+ public void testVerifyConfigBuilder_GivenAnalyzerAndTokenizer() {
+ CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
+ .setAnalyzer("standard")
+ .setTokenizer("classic");
+ IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+ () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+ assertEquals("categorization_analyzer that is a global analyzer cannot also specify a [tokenizer] field", e.getMessage());
+ }
+
+ public void testVerifyConfigBuilder_GivenAnalyzerAndTokenFilter() {
+ CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
+ .setAnalyzer("standard")
+ .addTokenFilter("lowercase");
+ IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+ () -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
+ assertEquals("categorization_analyzer that is a global analyzer cannot also specify a [filter] field", e.getMessage());
+ }
+
// The default categorization analyzer matches what the analyzer in the ML C++ does
public void testDefaultCategorizationAnalyzer() throws IOException {
CategorizationAnalyzerConfig defaultConfig = CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(null);
diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/config/CategorizationAnalyzerConfigTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/config/CategorizationAnalyzerConfigTests.java
index 9c725fe76292a..2fe2c0b334c4e 100644
--- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/config/CategorizationAnalyzerConfigTests.java
+++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/config/CategorizationAnalyzerConfigTests.java
@@ -6,175 +6,17 @@
package org.elasticsearch.xpack.ml.job.config;
import org.elasticsearch.common.io.stream.Writeable;
-import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentParser;
-import org.elasticsearch.env.Environment;
-import org.elasticsearch.env.TestEnvironment;
-import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.test.AbstractSerializingTestCase;
import org.elasticsearch.xpack.core.ml.job.config.CategorizationAnalyzerConfig;
import org.elasticsearch.xpack.core.ml.MlParserType;
-import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzerTests;
-import org.junit.Before;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
-
public class CategorizationAnalyzerConfigTests extends AbstractSerializingTestCase<CategorizationAnalyzerConfig> {
- private AnalysisRegistry analysisRegistry;
- private Environment environment;
-
- @Before
- public void setup() throws Exception {
- Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()).build();
- environment = TestEnvironment.newEnvironment(settings);
- analysisRegistry = CategorizationAnalyzerTests.buildTestAnalysisRegistry(environment);
- }
-
- public void testVerify_GivenNoConfig() {
- CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder();
- IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
- assertEquals("categorization_analyzer that is not a global analyzer must specify a [tokenizer] field", e.getMessage());
- }
-
- public void testVerify_GivenDefault() throws IOException {
- CategorizationAnalyzerConfig defaultConfig = CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(null);
- CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder(defaultConfig);
- builder.verify(analysisRegistry, environment);
- }
-
- public void testVerify_GivenValidAnalyzer() throws IOException {
- CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder().setAnalyzer("standard");
- builder.verify(analysisRegistry, environment);
- }
-
- public void testVerify_GivenInvalidAnalyzer() {
- CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder().setAnalyzer("does not exist");
- IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
- assertEquals("Failed to find global analyzer [does not exist]", e.getMessage());
- }
-
- public void testVerify_GivenValidCustomConfig() throws IOException {
- Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
- ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
- ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
- Map<String, Object> ignoreStuffThatBeginsWithADigit = new HashMap<>();
- ignoreStuffThatBeginsWithADigit.put("type", "pattern_replace");
- ignoreStuffThatBeginsWithADigit.put("pattern", "^[0-9].*");
- CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
- .addCharFilter(ignoreStuffInSqaureBrackets)
- .setTokenizer("classic")
- .addTokenFilter("lowercase")
- .addTokenFilter(ignoreStuffThatBeginsWithADigit)
- .addTokenFilter("snowball");
- builder.verify(analysisRegistry, environment);
- }
-
- public void testVerify_GivenCustomConfigWithInvalidCharFilter() {
- CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
- .addCharFilter("wrong!")
- .setTokenizer("classic")
- .addTokenFilter("lowercase")
- .addTokenFilter("snowball");
- IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
- assertEquals("Failed to find global char filter under [wrong!]", e.getMessage());
- }
-
- public void testVerify_GivenCustomConfigWithMisconfiguredCharFilter() {
- Map<String, Object> noPattern = new HashMap<>();
- noPattern.put("type", "pattern_replace");
- noPattern.put("attern", "should have been pattern");
- CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
- .addCharFilter(noPattern)
- .setTokenizer("classic")
- .addTokenFilter("lowercase")
- .addTokenFilter("snowball");
- IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
- assertEquals("pattern is missing for [_anonymous_charfilter] char filter of type 'pattern_replace'", e.getMessage());
- }
-
- public void testVerify_GivenCustomConfigWithInvalidTokenizer() {
- Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
- ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
- ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
- CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
- .addCharFilter(ignoreStuffInSqaureBrackets)
- .setTokenizer("oops!")
- .addTokenFilter("lowercase")
- .addTokenFilter("snowball");
- IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
- assertEquals("Failed to find global tokenizer under [oops!]", e.getMessage());
- }
-
- public void testVerify_GivenNoTokenizer() {
- Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
- ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
- ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
- Map<String, Object> ignoreStuffThatBeginsWithADigit = new HashMap<>();
- ignoreStuffThatBeginsWithADigit.put("type", "pattern_replace");
- ignoreStuffThatBeginsWithADigit.put("pattern", "^[0-9].*");
- CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
- .addCharFilter(ignoreStuffInSqaureBrackets)
- .addTokenFilter("lowercase")
- .addTokenFilter(ignoreStuffThatBeginsWithADigit)
- .addTokenFilter("snowball");
- IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
- assertEquals("categorization_analyzer that is not a global analyzer must specify a [tokenizer] field", e.getMessage());
- }
-
- public void testVerify_GivenCustomConfigWithInvalidTokenFilter() {
- Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
- ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
- ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
- CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
- .addCharFilter(ignoreStuffInSqaureBrackets)
- .setTokenizer("classic")
- .addTokenFilter("lowercase")
- .addTokenFilter("oh dear!");
- IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
- assertEquals("Failed to find global token filter under [oh dear!]", e.getMessage());
- }
-
- public void testVerify_GivenCustomConfigWithMisconfiguredTokenFilter() {
- Map<String, Object> noPattern = new HashMap<>();
- noPattern.put("type", "pattern_replace");
- noPattern.put("attern", "should have been pattern");
- CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
- .addCharFilter("html_strip")
- .setTokenizer("classic")
- .addTokenFilter("lowercase")
- .addTokenFilter(noPattern);
- IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
- assertEquals("pattern is missing for [_anonymous_tokenfilter] token filter of type 'pattern_replace'", e.getMessage());
- }
-
- public void testVerify_GivenAnalyzerAndCharFilter() {
- CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
- .setAnalyzer("standard")
- .addCharFilter("html_strip");
- IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
- assertEquals("categorization_analyzer that is a global analyzer cannot also specify a [char_filter] field", e.getMessage());
- }
-
- public void testVerify_GivenAnalyzerAndTokenizer() {
- CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
- .setAnalyzer("standard")
- .setTokenizer("classic");
- IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
- assertEquals("categorization_analyzer that is a global analyzer cannot also specify a [tokenizer] field", e.getMessage());
- }
-
- public void testVerify_GivenAnalyzerAndTokenFilter() {
- CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
- .setAnalyzer("standard")
- .addTokenFilter("lowercase");
- IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
- assertEquals("categorization_analyzer that is a global analyzer cannot also specify a [filter] field", e.getMessage());
- }
-
@Override
protected CategorizationAnalyzerConfig createTestInstance() {
return createRandomized().build();