[ML] Move analyzer dependencies out of categorization config (#32123)
The ML config classes will shortly be moved to the X-Pack protocol
library to allow the ML APIs to be moved to the high level REST
client.  Dependencies on server functionality should be removed
from the config classes before this is done.

This change is entirely about moving code between packages.  It
does not add or remove any functionality or tests.
droberts195 committed Jul 17, 2018
1 parent 5018c47 commit e87afdf
Showing 6 changed files with 382 additions and 381 deletions.
@@ -5,14 +5,8 @@
 */
package org.elasticsearch.xpack.core.ml.job.config;

-import org.apache.lucene.analysis.Analyzer;
-import org.elasticsearch.Version;
-import org.elasticsearch.action.admin.indices.analyze.TransportAnalyzeAction;
-import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.Strings;
-import org.elasticsearch.common.UUIDs;
-import org.elasticsearch.common.collect.Tuple;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.common.io.stream.Writeable;
@@ -22,15 +16,6 @@
 import org.elasticsearch.common.xcontent.XContentFactory;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.common.xcontent.XContentType;
-import org.elasticsearch.env.Environment;
-import org.elasticsearch.index.IndexSettings;
-import org.elasticsearch.index.analysis.AnalysisRegistry;
-import org.elasticsearch.index.analysis.CharFilterFactory;
-import org.elasticsearch.index.analysis.CustomAnalyzer;
-import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
-import org.elasticsearch.index.analysis.TokenFilterFactory;
-import org.elasticsearch.index.analysis.TokenizerFactory;
-import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.rest.action.admin.indices.RestAnalyzeAction;
 import org.elasticsearch.xpack.core.ml.MlParserType;

@@ -42,12 +27,11 @@
 import java.util.Map;
 import java.util.Objects;

-
 /**
  * Configuration for the categorization analyzer.
  *
  * The syntax is a subset of what can be supplied to the {@linkplain RestAnalyzeAction <code>_analyze</code> endpoint}.
- * To summarise, the first option is to specify the name of an out-of-the-box analyzer:
+ * To summarize, the first option is to specify the name of an out-of-the-box analyzer:
  * <code>
  * "categorization_analyzer" : "standard"
  * </code>
@@ -66,11 +50,6 @@
  * { "type" : "pattern_replace", "pattern": "^[0-9].*" }
  * ]
  * </code>
- *
- * Unfortunately there is no easy to to reuse a subset of the <code>_analyze</code> action implementation, so much
- * of the code in this file is copied from {@link TransportAnalyzeAction}. Unfortunately the logic required here is
- * not quite identical to that of {@link TransportAnalyzeAction}, and the required code is hard to partially reuse.
- * TODO: consider refactoring ES core to allow more reuse.
  */
 public class CategorizationAnalyzerConfig implements ToXContentFragment, Writeable {

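Read together, the fragments above give the full shape of a custom definition: a tokenizer plus optional char filters and token filters, using the same field names as the _analyze API. A combined example might look like the following sketch (the specific components chosen here are illustrative, not taken from this commit):

    "categorization_analyzer" : {
        "char_filter" : [
            "html_strip",
            { "type" : "pattern_replace", "pattern": "^[0-9].*" }
        ],
        "tokenizer" : "whitespace",
        "filter" : [
            "lowercase",
            { "type" : "pattern_replace", "pattern": "[0-9]+" }
        ]
    }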
@@ -350,175 +329,6 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
         return builder;
     }

-    /**
-     * Convert the config to an {@link Analyzer}. This may be a global analyzer or a newly created custom analyzer.
-     * In the case of a global analyzer the caller must NOT close it when they have finished with it. In the case of
-     * a newly created custom analyzer the caller is responsible for closing it.
-     * @return The first tuple member is the {@link Analyzer}; the second indicates whether the caller is responsible
-     *         for closing it.
-     */
-    public Tuple<Analyzer, Boolean> toAnalyzer(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
-        if (analyzer != null) {
-            Analyzer globalAnalyzer = analysisRegistry.getAnalyzer(analyzer);
-            if (globalAnalyzer == null) {
-                throw new IllegalArgumentException("Failed to find global analyzer [" + analyzer + "]");
-            }
-            return new Tuple<>(globalAnalyzer, Boolean.FALSE);
-        } else {
-            List<CharFilterFactory> charFilterFactoryList =
-                    parseCharFilterFactories(analysisRegistry, environment);
-
-            Tuple<String, TokenizerFactory> tokenizerFactory = parseTokenizerFactory(analysisRegistry,
-                    environment);
-
-            List<TokenFilterFactory> tokenFilterFactoryList = parseTokenFilterFactories(analysisRegistry,
-                    environment, tokenizerFactory, charFilterFactoryList);
-
-            return new Tuple<>(new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(),
-                    charFilterFactoryList.toArray(new CharFilterFactory[charFilterFactoryList.size()]),
-                    tokenFilterFactoryList.toArray(new TokenFilterFactory[tokenFilterFactoryList.size()])), Boolean.TRUE);
-        }
-    }
-
-
-    /**
-     * Get char filter factories for each configured char filter. Each configuration
-     * element can be the name of an out-of-the-box char filter, or a custom definition.
-     */
-    private List<CharFilterFactory> parseCharFilterFactories(AnalysisRegistry analysisRegistry,
-                                                             Environment environment) throws IOException {
-        final List<CharFilterFactory> charFilterFactoryList = new ArrayList<>();
-        for (NameOrDefinition charFilter : charFilters) {
-            final CharFilterFactory charFilterFactory;
-            if (charFilter.name != null) {
-                AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
-                        analysisRegistry.getCharFilterProvider(charFilter.name);
-                if (charFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global char filter under [" + charFilter.name + "]");
-                }
-                charFilterFactory = charFilterFactoryFactory.get(environment, charFilter.name);
-            } else {
-                String charFilterTypeName = charFilter.definition.get("type");
-                if (charFilterTypeName == null) {
-                    throw new IllegalArgumentException("Missing [type] setting for char filter: " + charFilter.definition);
-                }
-                AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
-                        analysisRegistry.getCharFilterProvider(charFilterTypeName);
-                if (charFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global char filter under [" + charFilterTypeName + "]");
-                }
-                Settings settings = augmentSettings(charFilter.definition);
-                // Need to set anonymous "name" of char_filter
-                charFilterFactory = charFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment,
-                        "_anonymous_charfilter", settings);
-            }
-            if (charFilterFactory == null) {
-                throw new IllegalArgumentException("Failed to find char filter [" + charFilter + "]");
-            }
-            charFilterFactoryList.add(charFilterFactory);
-        }
-        return charFilterFactoryList;
-    }
-
-    /**
-     * Get the tokenizer factory for the configured tokenizer. The configuration
-     * can be the name of an out-of-the-box tokenizer, or a custom definition.
-     */
-    private Tuple<String, TokenizerFactory> parseTokenizerFactory(AnalysisRegistry analysisRegistry,
-                                                                  Environment environment) throws IOException {
-        final String name;
-        final TokenizerFactory tokenizerFactory;
-        if (tokenizer.name != null) {
-            name = tokenizer.name;
-            AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory = analysisRegistry.getTokenizerProvider(name);
-            if (tokenizerFactoryFactory == null) {
-                throw new IllegalArgumentException("Failed to find global tokenizer under [" + name + "]");
-            }
-            tokenizerFactory = tokenizerFactoryFactory.get(environment, name);
-        } else {
-            String tokenizerTypeName = tokenizer.definition.get("type");
-            if (tokenizerTypeName == null) {
-                throw new IllegalArgumentException("Missing [type] setting for tokenizer: " + tokenizer.definition);
-            }
-            AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory =
-                    analysisRegistry.getTokenizerProvider(tokenizerTypeName);
-            if (tokenizerFactoryFactory == null) {
-                throw new IllegalArgumentException("Failed to find global tokenizer under [" + tokenizerTypeName + "]");
-            }
-            Settings settings = augmentSettings(tokenizer.definition);
-            // Need to set anonymous "name" of tokenizer
-            name = "_anonymous_tokenizer";
-            tokenizerFactory = tokenizerFactoryFactory.get(buildDummyIndexSettings(settings), environment, name, settings);
-        }
-        return new Tuple<>(name, tokenizerFactory);
-    }
-
-    /**
-     * Get token filter factories for each configured token filter. Each configuration
-     * element can be the name of an out-of-the-box token filter, or a custom definition.
-     */
-    private List<TokenFilterFactory> parseTokenFilterFactories(AnalysisRegistry analysisRegistry, Environment environment,
-                                                               Tuple<String, TokenizerFactory> tokenizerFactory,
-                                                               List<CharFilterFactory> charFilterFactoryList) throws IOException {
-        final List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
-        for (NameOrDefinition tokenFilter : tokenFilters) {
-            TokenFilterFactory tokenFilterFactory;
-            if (tokenFilter.name != null) {
-                AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory;
-                tokenFilterFactoryFactory = analysisRegistry.getTokenFilterProvider(tokenFilter.name);
-                if (tokenFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global token filter under [" + tokenFilter.name + "]");
-                }
-                tokenFilterFactory = tokenFilterFactoryFactory.get(environment, tokenFilter.name);
-            } else {
-                String filterTypeName = tokenFilter.definition.get("type");
-                if (filterTypeName == null) {
-                    throw new IllegalArgumentException("Missing [type] setting for token filter: " + tokenFilter.definition);
-                }
-                AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory =
-                        analysisRegistry.getTokenFilterProvider(filterTypeName);
-                if (tokenFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global token filter under [" + filterTypeName + "]");
-                }
-                Settings settings = augmentSettings(tokenFilter.definition);
-                // Need to set anonymous "name" of token_filter
-                tokenFilterFactory = tokenFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment,
-                        "_anonymous_tokenfilter", settings);
-                tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(),
-                        tokenizerFactory.v2(), tokenFilterFactoryList, charFilterFactoryList, environment);
-            }
-            if (tokenFilterFactory == null) {
-                throw new IllegalArgumentException("Failed to find or create token filter [" + tokenFilter + "]");
-            }
-            tokenFilterFactoryList.add(tokenFilterFactory);
-        }
-        return tokenFilterFactoryList;
-    }
-
-    /**
-     * The Elasticsearch analysis functionality is designed to work with indices. For
-     * categorization we have to pretend we've got some index settings.
-     */
-    private IndexSettings buildDummyIndexSettings(Settings settings) {
-        IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
-        return new IndexSettings(metaData, Settings.EMPTY);
-    }
-
-    /**
-     * The behaviour of Elasticsearch analyzers can vary between versions.
-     * For categorization we'll always use the latest version of the text analysis.
-     * The other settings are just to stop classes that expect to be associated with
-     * an index from complaining.
-     */
-    private Settings augmentSettings(Settings settings) {
-        return Settings.builder().put(settings)
-                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
-                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
-                .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
-                .put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
-                .build();
-    }
-
     @Override
     public boolean equals(Object o) {
         if (this == o) return true;
@@ -609,17 +419,5 @@ public CategorizationAnalyzerConfig build() {
             }
             return new CategorizationAnalyzerConfig(analyzer, charFilters, tokenizer, tokenFilters);
         }
-
-        /**
-         * Verify that the builder will build a valid config. This is not done as part of the basic build
-         * because it verifies that the names of analyzers/tokenizers/filters referenced by the config are
-         * known, and the validity of these names could change over time.
-         */
-        public void verify(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
-            Tuple<Analyzer, Boolean> tuple = build().toAnalyzer(analysisRegistry, environment);
-            if (tuple.v2()) {
-                tuple.v1().close();
-            }
-        }
     }
 }
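The verify() method removed just above is itself a compact example of the ownership contract documented on toAnalyzer(): close the analyzer only when the second tuple member says the caller owns it. A minimal caller sketch, assuming config, analysisRegistry and environment are in scope:

    // Minimal sketch of a toAnalyzer() caller, following the javadoc contract above.
    // Global analyzers are shared and must NOT be closed; a newly created custom
    // analyzer must be closed by the caller, hence the tuple.v2() check.
    Tuple<Analyzer, Boolean> tuple = config.toAnalyzer(analysisRegistry, environment);
    try {
        Analyzer analyzer = tuple.v1();
        // ... tokenize categorization field values with the analyzer ...
    } finally {
        if (tuple.v2()) {
            tuple.v1().close();
        }
    }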
@@ -21,8 +21,6 @@
 import org.elasticsearch.common.xcontent.ToXContentObject;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentParser.Token;
-import org.elasticsearch.env.Environment;
-import org.elasticsearch.index.analysis.AnalysisRegistry;
 import org.elasticsearch.xpack.core.ml.MlParserType;
 import org.elasticsearch.xpack.core.ml.job.messages.Messages;
 import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndexFields;
@@ -777,8 +775,8 @@ public Builder setAnalysisConfig(AnalysisConfig.Builder configBuilder) {
             return this;
         }

-        public AnalysisLimits getAnalysisLimits() {
-            return analysisLimits;
+        public AnalysisConfig getAnalysisConfig() {
+            return analysisConfig;
         }

         public Builder setAnalysisLimits(AnalysisLimits analysisLimits) {
@@ -1081,18 +1079,6 @@ public void validateAnalysisLimitsAndSetDefaults(@Nullable ByteSizeValue maxMode
                     AnalysisLimits.DEFAULT_MODEL_MEMORY_LIMIT_MB);
         }
-
-        /**
-         * Validate the char filter/tokenizer/token filter names used in the categorization analyzer config (if any).
-         * The overall structure can be validated at parse time, but the exact names need to be checked separately,
-         * as plugins that provide the functionality can be installed/uninstalled.
-         */
-        public void validateCategorizationAnalyzer(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
-            CategorizationAnalyzerConfig categorizationAnalyzerConfig = analysisConfig.getCategorizationAnalyzerConfig();
-            if (categorizationAnalyzerConfig != null) {
-                new CategorizationAnalyzerConfig.Builder(categorizationAnalyzerConfig).verify(analysisRegistry, environment);
-            }
-        }

         private void validateGroups() {
             for (String group : this.groups) {
                 if (MlStrings.isValidId(group) == false) {
@@ -39,6 +39,7 @@
 import org.elasticsearch.xpack.core.ml.action.UpdateJobAction;
 import org.elasticsearch.xpack.core.ml.action.util.QueryPage;
 import org.elasticsearch.xpack.core.ml.job.config.AnalysisLimits;
+import org.elasticsearch.xpack.core.ml.job.config.CategorizationAnalyzerConfig;
 import org.elasticsearch.xpack.core.ml.job.config.DataDescription;
 import org.elasticsearch.xpack.core.ml.job.config.Job;
 import org.elasticsearch.xpack.core.ml.job.config.JobState;
@@ -50,6 +51,7 @@
 import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.ModelSnapshot;
 import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
 import org.elasticsearch.xpack.ml.MachineLearning;
+import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer;
 import org.elasticsearch.xpack.ml.job.persistence.JobProvider;
 import org.elasticsearch.xpack.ml.job.persistence.JobResultsPersister;
 import org.elasticsearch.xpack.ml.job.process.autodetect.UpdateParams;
@@ -170,14 +172,30 @@ public JobState getJobState(String jobId) {
         return MlMetadata.getJobState(jobId, tasks);
     }

+    /**
+     * Validate the char filter/tokenizer/token filter names used in the categorization analyzer config (if any).
+     * This validation has to be done server-side; it cannot be done in a client as that won't have loaded the
+     * appropriate analysis modules/plugins.
+     * The overall structure can be validated at parse time, but the exact names need to be checked separately,
+     * as plugins that provide the functionality can be installed/uninstalled.
+     */
+    static void validateCategorizationAnalyzer(Job.Builder jobBuilder, AnalysisRegistry analysisRegistry, Environment environment)
+            throws IOException {
+        CategorizationAnalyzerConfig categorizationAnalyzerConfig = jobBuilder.getAnalysisConfig().getCategorizationAnalyzerConfig();
+        if (categorizationAnalyzerConfig != null) {
+            CategorizationAnalyzer.verifyConfigBuilder(new CategorizationAnalyzerConfig.Builder(categorizationAnalyzerConfig),
+                    analysisRegistry, environment);
+        }
+    }
+
     /**
      * Stores a job in the cluster state
      */
     public void putJob(PutJobAction.Request request, AnalysisRegistry analysisRegistry, ClusterState state,
                        ActionListener<PutJobAction.Response> actionListener) throws IOException {

         request.getJobBuilder().validateAnalysisLimitsAndSetDefaults(maxModelMemoryLimit);
-        request.getJobBuilder().validateCategorizationAnalyzer(analysisRegistry, environment);
+        validateCategorizationAnalyzer(request.getJobBuilder(), analysisRegistry, environment);

         Job job = request.getJobBuilder().build(new Date());

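The destination of the moved code, CategorizationAnalyzer.verifyConfigBuilder, sits in one of the changed files that is not shown here. From the call site above and the Builder.verify() method removed from CategorizationAnalyzerConfig, it plausibly keeps the same build-then-close-if-owned logic. A hedged sketch, in which the makeAnalyzer helper name is an assumption rather than something visible in this diff:

    // Hypothetical sketch of the moved check in CategorizationAnalyzer; that file's
    // diff is not loaded above. It mirrors the removed Builder.verify(): build the
    // config, construct the analyzer, and close it if the caller owns it.
    public static void verifyConfigBuilder(CategorizationAnalyzerConfig.Builder configBuilder,
                                           AnalysisRegistry analysisRegistry,
                                           Environment environment) throws IOException {
        Tuple<Analyzer, Boolean> tuple = makeAnalyzer(configBuilder.build(), analysisRegistry, environment); // assumed helper
        if (tuple.v2()) {
            tuple.v1().close();
        }
    }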