[ML] Move analyzer dependencies out of categorization config (#32123)
The ML config classes will shortly be moved to the X-Pack protocol
library to allow the ML APIs to be moved to the high level REST
client.  Dependencies on server functionality should be removed
from the config classes before this is done.

This change is entirely about moving code between packages.  It
does not add or remove any functionality or tests.
droberts195 committed Jul 17, 2018
1 parent 5018c47 commit e87afdf
Showing 6 changed files with 382 additions and 381 deletions.
@@ -5,14 +5,8 @@
 */
package org.elasticsearch.xpack.core.ml.job.config;

-import org.apache.lucene.analysis.Analyzer;
-import org.elasticsearch.Version;
-import org.elasticsearch.action.admin.indices.analyze.TransportAnalyzeAction;
-import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.Strings;
-import org.elasticsearch.common.UUIDs;
-import org.elasticsearch.common.collect.Tuple;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.common.io.stream.Writeable;
@@ -22,15 +16,6 @@
 import org.elasticsearch.common.xcontent.XContentFactory;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.common.xcontent.XContentType;
-import org.elasticsearch.env.Environment;
-import org.elasticsearch.index.IndexSettings;
-import org.elasticsearch.index.analysis.AnalysisRegistry;
-import org.elasticsearch.index.analysis.CharFilterFactory;
-import org.elasticsearch.index.analysis.CustomAnalyzer;
-import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
-import org.elasticsearch.index.analysis.TokenFilterFactory;
-import org.elasticsearch.index.analysis.TokenizerFactory;
-import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.rest.action.admin.indices.RestAnalyzeAction;
 import org.elasticsearch.xpack.core.ml.MlParserType;

@@ -42,12 +27,11 @@
 import java.util.Map;
 import java.util.Objects;

-
 /**
  * Configuration for the categorization analyzer.
  *
  * The syntax is a subset of what can be supplied to the {@linkplain RestAnalyzeAction <code>_analyze</code> endpoint}.
- * To summarise, the first option is to specify the name of an out-of-the-box analyzer:
+ * To summarize, the first option is to specify the name of an out-of-the-box analyzer:
  * <code>
  * "categorization_analyzer" : "standard"
  * </code>
@@ -66,11 +50,6 @@
  * { "type" : "pattern_replace", "pattern": "^[0-9].*" }
  * ]
  * </code>
- *
- * Unfortunately there is no easy to to reuse a subset of the <code>_analyze</code> action implementation, so much
- * of the code in this file is copied from {@link TransportAnalyzeAction}. Unfortunately the logic required here is
- * not quite identical to that of {@link TransportAnalyzeAction}, and the required code is hard to partially reuse.
- * TODO: consider refactoring ES core to allow more reuse.
  */
 public class CategorizationAnalyzerConfig implements ToXContentFragment, Writeable {

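Read together, the fragments above give the full shape of a custom definition: a tokenizer plus optional char filters and token filters, using the same field names as the _analyze API. A combined example might look like the following sketch (the specific components chosen here are illustrative, not taken from this commit):

    "categorization_analyzer" : {
        "char_filter" : [
            "html_strip",
            { "type" : "pattern_replace", "pattern": "^[0-9].*" }
        ],
        "tokenizer" : "whitespace",
        "filter" : [
            "lowercase",
            { "type" : "pattern_replace", "pattern": "[0-9]+" }
        ]
    }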
@@ -350,175 +329,6 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
         return builder;
     }

-    /**
-     * Convert the config to an {@link Analyzer}. This may be a global analyzer or a newly created custom analyzer.
-     * In the case of a global analyzer the caller must NOT close it when they have finished with it. In the case of
-     * a newly created custom analyzer the caller is responsible for closing it.
-     * @return The first tuple member is the {@link Analyzer}; the second indicates whether the caller is responsible
-     *         for closing it.
-     */
-    public Tuple<Analyzer, Boolean> toAnalyzer(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
-        if (analyzer != null) {
-            Analyzer globalAnalyzer = analysisRegistry.getAnalyzer(analyzer);
-            if (globalAnalyzer == null) {
-                throw new IllegalArgumentException("Failed to find global analyzer [" + analyzer + "]");
-            }
-            return new Tuple<>(globalAnalyzer, Boolean.FALSE);
-        } else {
-            List<CharFilterFactory> charFilterFactoryList =
-                    parseCharFilterFactories(analysisRegistry, environment);
-
-            Tuple<String, TokenizerFactory> tokenizerFactory = parseTokenizerFactory(analysisRegistry,
-                    environment);
-
-            List<TokenFilterFactory> tokenFilterFactoryList = parseTokenFilterFactories(analysisRegistry,
-                    environment, tokenizerFactory, charFilterFactoryList);
-
-            return new Tuple<>(new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(),
-                    charFilterFactoryList.toArray(new CharFilterFactory[charFilterFactoryList.size()]),
-                    tokenFilterFactoryList.toArray(new TokenFilterFactory[tokenFilterFactoryList.size()])), Boolean.TRUE);
-        }
-    }
-
-
-    /**
-     * Get char filter factories for each configured char filter. Each configuration
-     * element can be the name of an out-of-the-box char filter, or a custom definition.
-     */
-    private List<CharFilterFactory> parseCharFilterFactories(AnalysisRegistry analysisRegistry,
-                                                             Environment environment) throws IOException {
-        final List<CharFilterFactory> charFilterFactoryList = new ArrayList<>();
-        for (NameOrDefinition charFilter : charFilters) {
-            final CharFilterFactory charFilterFactory;
-            if (charFilter.name != null) {
-                AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
-                        analysisRegistry.getCharFilterProvider(charFilter.name);
-                if (charFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global char filter under [" + charFilter.name + "]");
-                }
-                charFilterFactory = charFilterFactoryFactory.get(environment, charFilter.name);
-            } else {
-                String charFilterTypeName = charFilter.definition.get("type");
-                if (charFilterTypeName == null) {
-                    throw new IllegalArgumentException("Missing [type] setting for char filter: " + charFilter.definition);
-                }
-                AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
-                        analysisRegistry.getCharFilterProvider(charFilterTypeName);
-                if (charFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global char filter under [" + charFilterTypeName + "]");
-                }
-                Settings settings = augmentSettings(charFilter.definition);
-                // Need to set anonymous "name" of char_filter
-                charFilterFactory = charFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment,
-                        "_anonymous_charfilter", settings);
-            }
-            if (charFilterFactory == null) {
-                throw new IllegalArgumentException("Failed to find char filter [" + charFilter + "]");
-            }
-            charFilterFactoryList.add(charFilterFactory);
-        }
-        return charFilterFactoryList;
-    }
-
-    /**
-     * Get the tokenizer factory for the configured tokenizer. The configuration
-     * can be the name of an out-of-the-box tokenizer, or a custom definition.
-     */
-    private Tuple<String, TokenizerFactory> parseTokenizerFactory(AnalysisRegistry analysisRegistry,
-                                                                  Environment environment) throws IOException {
-        final String name;
-        final TokenizerFactory tokenizerFactory;
-        if (tokenizer.name != null) {
-            name = tokenizer.name;
-            AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory = analysisRegistry.getTokenizerProvider(name);
-            if (tokenizerFactoryFactory == null) {
-                throw new IllegalArgumentException("Failed to find global tokenizer under [" + name + "]");
-            }
-            tokenizerFactory = tokenizerFactoryFactory.get(environment, name);
-        } else {
-            String tokenizerTypeName = tokenizer.definition.get("type");
-            if (tokenizerTypeName == null) {
-                throw new IllegalArgumentException("Missing [type] setting for tokenizer: " + tokenizer.definition);
-            }
-            AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory =
-                    analysisRegistry.getTokenizerProvider(tokenizerTypeName);
-            if (tokenizerFactoryFactory == null) {
-                throw new IllegalArgumentException("Failed to find global tokenizer under [" + tokenizerTypeName + "]");
-            }
-            Settings settings = augmentSettings(tokenizer.definition);
-            // Need to set anonymous "name" of tokenizer
-            name = "_anonymous_tokenizer";
-            tokenizerFactory = tokenizerFactoryFactory.get(buildDummyIndexSettings(settings), environment, name, settings);
-        }
-        return new Tuple<>(name, tokenizerFactory);
-    }
-
-    /**
-     * Get token filter factories for each configured token filter. Each configuration
-     * element can be the name of an out-of-the-box token filter, or a custom definition.
-     */
-    private List<TokenFilterFactory> parseTokenFilterFactories(AnalysisRegistry analysisRegistry, Environment environment,
-                                                               Tuple<String, TokenizerFactory> tokenizerFactory,
-                                                               List<CharFilterFactory> charFilterFactoryList) throws IOException {
-        final List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
-        for (NameOrDefinition tokenFilter : tokenFilters) {
-            TokenFilterFactory tokenFilterFactory;
-            if (tokenFilter.name != null) {
-                AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory;
-                tokenFilterFactoryFactory = analysisRegistry.getTokenFilterProvider(tokenFilter.name);
-                if (tokenFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global token filter under [" + tokenFilter.name + "]");
-                }
-                tokenFilterFactory = tokenFilterFactoryFactory.get(environment, tokenFilter.name);
-            } else {
-                String filterTypeName = tokenFilter.definition.get("type");
-                if (filterTypeName == null) {
-                    throw new IllegalArgumentException("Missing [type] setting for token filter: " + tokenFilter.definition);
-                }
-                AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory =
-                        analysisRegistry.getTokenFilterProvider(filterTypeName);
-                if (tokenFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global token filter under [" + filterTypeName + "]");
-                }
-                Settings settings = augmentSettings(tokenFilter.definition);
-                // Need to set anonymous "name" of token_filter
-                tokenFilterFactory = tokenFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment,
-                        "_anonymous_tokenfilter", settings);
-                tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(),
-                        tokenizerFactory.v2(), tokenFilterFactoryList, charFilterFactoryList, environment);
-            }
-            if (tokenFilterFactory == null) {
-                throw new IllegalArgumentException("Failed to find or create token filter [" + tokenFilter + "]");
-            }
-            tokenFilterFactoryList.add(tokenFilterFactory);
-        }
-        return tokenFilterFactoryList;
-    }
-
-    /**
-     * The Elasticsearch analysis functionality is designed to work with indices. For
-     * categorization we have to pretend we've got some index settings.
-     */
-    private IndexSettings buildDummyIndexSettings(Settings settings) {
-        IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
-        return new IndexSettings(metaData, Settings.EMPTY);
-    }
-
-    /**
-     * The behaviour of Elasticsearch analyzers can vary between versions.
-     * For categorization we'll always use the latest version of the text analysis.
-     * The other settings are just to stop classes that expect to be associated with
-     * an index from complaining.
-     */
-    private Settings augmentSettings(Settings settings) {
-        return Settings.builder().put(settings)
-                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
-                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
-                .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
-                .put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
-                .build();
-    }
-
     @Override
     public boolean equals(Object o) {
         if (this == o) return true;
@@ -609,17 +419,5 @@ public CategorizationAnalyzerConfig build() {
             }
             return new CategorizationAnalyzerConfig(analyzer, charFilters, tokenizer, tokenFilters);
         }
-
-        /**
-         * Verify that the builder will build a valid config. This is not done as part of the basic build
-         * because it verifies that the names of analyzers/tokenizers/filters referenced by the config are
-         * known, and the validity of these names could change over time.
-         */
-        public void verify(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
-            Tuple<Analyzer, Boolean> tuple = build().toAnalyzer(analysisRegistry, environment);
-            if (tuple.v2()) {
-                tuple.v1().close();
-            }
-        }
     }
 }
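The verify() method removed just above is itself a compact example of the ownership contract documented on toAnalyzer(): close the analyzer only when the second tuple member says the caller owns it. A minimal caller sketch, assuming config, analysisRegistry and environment are in scope:

    // Minimal sketch of a toAnalyzer() caller, following the javadoc contract above.
    // Global analyzers are shared and must NOT be closed; a newly created custom
    // analyzer must be closed by the caller, hence the tuple.v2() check.
    Tuple<Analyzer, Boolean> tuple = config.toAnalyzer(analysisRegistry, environment);
    try {
        Analyzer analyzer = tuple.v1();
        // ... tokenize categorization field values with the analyzer ...
    } finally {
        if (tuple.v2()) {
            tuple.v1().close();
        }
    }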
@@ -21,8 +21,6 @@
 import org.elasticsearch.common.xcontent.ToXContentObject;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentParser.Token;
-import org.elasticsearch.env.Environment;
-import org.elasticsearch.index.analysis.AnalysisRegistry;
 import org.elasticsearch.xpack.core.ml.MlParserType;
 import org.elasticsearch.xpack.core.ml.job.messages.Messages;
 import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndexFields;
@@ -777,8 +775,8 @@ public Builder setAnalysisConfig(AnalysisConfig.Builder configBuilder) {
             return this;
         }

-        public AnalysisLimits getAnalysisLimits() {
-            return analysisLimits;
+        public AnalysisConfig getAnalysisConfig() {
+            return analysisConfig;
         }

         public Builder setAnalysisLimits(AnalysisLimits analysisLimits) {
@@ -1081,18 +1079,6 @@ public void validateAnalysisLimitsAndSetDefaults(@Nullable ByteSizeValue maxMode
                     AnalysisLimits.DEFAULT_MODEL_MEMORY_LIMIT_MB);
         }
-
-        /**
-         * Validate the char filter/tokenizer/token filter names used in the categorization analyzer config (if any).
-         * The overall structure can be validated at parse time, but the exact names need to be checked separately,
-         * as plugins that provide the functionality can be installed/uninstalled.
-         */
-        public void validateCategorizationAnalyzer(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
-            CategorizationAnalyzerConfig categorizationAnalyzerConfig = analysisConfig.getCategorizationAnalyzerConfig();
-            if (categorizationAnalyzerConfig != null) {
-                new CategorizationAnalyzerConfig.Builder(categorizationAnalyzerConfig).verify(analysisRegistry, environment);
-            }
-        }

         private void validateGroups() {
             for (String group : this.groups) {
                 if (MlStrings.isValidId(group) == false) {
@@ -39,6 +39,7 @@
 import org.elasticsearch.xpack.core.ml.action.UpdateJobAction;
 import org.elasticsearch.xpack.core.ml.action.util.QueryPage;
 import org.elasticsearch.xpack.core.ml.job.config.AnalysisLimits;
+import org.elasticsearch.xpack.core.ml.job.config.CategorizationAnalyzerConfig;
 import org.elasticsearch.xpack.core.ml.job.config.DataDescription;
 import org.elasticsearch.xpack.core.ml.job.config.Job;
 import org.elasticsearch.xpack.core.ml.job.config.JobState;
@@ -50,6 +51,7 @@
 import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.ModelSnapshot;
 import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
 import org.elasticsearch.xpack.ml.MachineLearning;
+import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer;
 import org.elasticsearch.xpack.ml.job.persistence.JobProvider;
 import org.elasticsearch.xpack.ml.job.persistence.JobResultsPersister;
 import org.elasticsearch.xpack.ml.job.process.autodetect.UpdateParams;
@@ -170,14 +172,30 @@ public JobState getJobState(String jobId) {
         return MlMetadata.getJobState(jobId, tasks);
     }

+    /**
+     * Validate the char filter/tokenizer/token filter names used in the categorization analyzer config (if any).
+     * This validation has to be done server-side; it cannot be done in a client as that won't have loaded the
+     * appropriate analysis modules/plugins.
+     * The overall structure can be validated at parse time, but the exact names need to be checked separately,
+     * as plugins that provide the functionality can be installed/uninstalled.
+     */
+    static void validateCategorizationAnalyzer(Job.Builder jobBuilder, AnalysisRegistry analysisRegistry, Environment environment)
+            throws IOException {
+        CategorizationAnalyzerConfig categorizationAnalyzerConfig = jobBuilder.getAnalysisConfig().getCategorizationAnalyzerConfig();
+        if (categorizationAnalyzerConfig != null) {
+            CategorizationAnalyzer.verifyConfigBuilder(new CategorizationAnalyzerConfig.Builder(categorizationAnalyzerConfig),
+                    analysisRegistry, environment);
+        }
+    }
+
     /**
      * Stores a job in the cluster state
      */
     public void putJob(PutJobAction.Request request, AnalysisRegistry analysisRegistry, ClusterState state,
                        ActionListener<PutJobAction.Response> actionListener) throws IOException {

         request.getJobBuilder().validateAnalysisLimitsAndSetDefaults(maxModelMemoryLimit);
-        request.getJobBuilder().validateCategorizationAnalyzer(analysisRegistry, environment);
+        validateCategorizationAnalyzer(request.getJobBuilder(), analysisRegistry, environment);

         Job job = request.getJobBuilder().build(new Date());

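The destination of the moved code, CategorizationAnalyzer.verifyConfigBuilder, sits in one of the changed files that is not shown here. From the call site above and the Builder.verify() method removed from CategorizationAnalyzerConfig, it plausibly keeps the same build-then-close-if-owned logic. A hedged sketch, in which the makeAnalyzer helper name is an assumption rather than something visible in this diff:

    // Hypothetical sketch of the moved check in CategorizationAnalyzer; that file's
    // diff is not loaded above. It mirrors the removed Builder.verify(): build the
    // config, construct the analyzer, and close it if the caller owns it.
    public static void verifyConfigBuilder(CategorizationAnalyzerConfig.Builder configBuilder,
                                           AnalysisRegistry analysisRegistry,
                                           Environment environment) throws IOException {
        Tuple<Analyzer, Boolean> tuple = makeAnalyzer(configBuilder.build(), analysisRegistry, environment); // assumed helper
        if (tuple.v2()) {
            tuple.v1().close();
        }
    }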