Allow plugins to build pre-configured token filters (#24223)
This changes the way we register pre-configured token filters so that
plugins can declare them and starts to move all of the pre-configured
token filters out of core. It doesn't finish the job because doing
so would make the change unreviewably large. So this PR includes
a shim that keeps the "old" way of registering pre-configured token
filters around.

The lowercase token filter is special because of the interaction
between it and the lowercase tokenizer. I'm not sure exactly what
to do about it, so for now I'm leaving it alone with the intent of
figuring out what to do with it in a follow-up.

This also renames these pre-configured token filters from
"pre-built" to "pre-configured" because that seemed like a more
descriptive name.

This is a part of #23658
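
For a concrete sense of the plugin side of this change, here is a minimal sketch of a plugin declaring one pre-configured token filter through the AnalysisPlugin#getPreConfiguredTokenFilters hook that AnalysisModule consumes below. The plugin class and filter name are hypothetical, the List return type is inferred from the registration loop in AnalysisModule, and ReverseStringFilter is assumed available from lucene-analyzers-common:

import java.util.List;

import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

import static java.util.Collections.singletonList;

/** Hypothetical plugin declaring one pre-configured token filter. */
public class ExampleAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
        // No version-specific behavior and not multi-term aware, so the
        // Function-based convenience constructor is sufficient here.
        return singletonList(new PreConfiguredTokenFilter(
                "reverse_example", false, CachingStrategy.ONE, ReverseStringFilter::new));
    }
}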
nik9000 authored May 9, 2017
1 parent 4283908 · commit bb06d8e
Showing 23 changed files with 579 additions and 427 deletions.
@@ -36,7 +36,6 @@
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.indices.analysis.PreBuiltAnalyzers;
import org.elasticsearch.indices.analysis.PreBuiltCharFilters;
import org.elasticsearch.indices.analysis.PreBuiltTokenFilters;
import org.elasticsearch.indices.analysis.PreBuiltTokenizers;

import java.io.Closeable;
@@ -59,7 +58,7 @@ public final class AnalysisRegistry implements Closeable {
public static final String INDEX_ANALYSIS_CHAR_FILTER = "index.analysis.char_filter";
public static final String INDEX_ANALYSIS_FILTER = "index.analysis.filter";
public static final String INDEX_ANALYSIS_TOKENIZER = "index.analysis.tokenizer";
private final PrebuiltAnalysis prebuiltAnalysis = new PrebuiltAnalysis();
private final PrebuiltAnalysis prebuiltAnalysis;
private final Map<String, Analyzer> cachedAnalyzer = new ConcurrentHashMap<>();

private final Environment environment;
@@ -74,13 +73,15 @@ public AnalysisRegistry(Environment environment,
Map<String, AnalysisProvider<TokenFilterFactory>> tokenFilters,
Map<String, AnalysisProvider<TokenizerFactory>> tokenizers,
Map<String, AnalysisProvider<AnalyzerProvider<?>>> analyzers,
Map<String, AnalysisProvider<AnalyzerProvider<?>>> normalizers) {
Map<String, AnalysisProvider<AnalyzerProvider<?>>> normalizers,
Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters) {
this.environment = environment;
this.charFilters = unmodifiableMap(charFilters);
this.tokenFilters = unmodifiableMap(tokenFilters);
this.tokenizers = unmodifiableMap(tokenizers);
this.analyzers = unmodifiableMap(analyzers);
this.normalizers = unmodifiableMap(normalizers);
prebuiltAnalysis = new PrebuiltAnalysis(preConfiguredTokenFilters);
}

/**
@@ -305,8 +306,8 @@ public String toString() {
}

private <T> Map<String, T> buildMapping(Component component, IndexSettings settings, Map<String, Settings> settingsMap,
Map<String, AnalysisModule.AnalysisProvider<T>> providerMap, Map<String, AnalysisModule.AnalysisProvider<T>> defaultInstance)
throws IOException {
Map<String, ? extends AnalysisModule.AnalysisProvider<T>> providerMap,
Map<String, ? extends AnalysisModule.AnalysisProvider<T>> defaultInstance) throws IOException {
Settings defaultSettings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, settings.getIndexVersionCreated()).build();
Map<String, T> factories = new HashMap<>();
for (Map.Entry<String, Settings> entry : settingsMap.entrySet()) {
@@ -344,7 +345,7 @@ private <T> Map<String, T> buildMapping(Component component, IndexSettings settings,

}
// go over the char filters in the bindings and register the ones that are not configured
for (Map.Entry<String, AnalysisModule.AnalysisProvider<T>> entry : providerMap.entrySet()) {
for (Map.Entry<String, ? extends AnalysisModule.AnalysisProvider<T>> entry : providerMap.entrySet()) {
String name = entry.getKey();
AnalysisModule.AnalysisProvider<T> provider = entry.getValue();
// we don't want to re-register one that already exists
@@ -365,7 +366,7 @@ private <T> Map<String, T> buildMapping(Component component, IndexSettings settings,
factories.put(name, instance);
}

for (Map.Entry<String, AnalysisModule.AnalysisProvider<T>> entry : defaultInstance.entrySet()) {
for (Map.Entry<String, ? extends AnalysisModule.AnalysisProvider<T>> entry : defaultInstance.entrySet()) {
final String name = entry.getKey();
final AnalysisModule.AnalysisProvider<T> provider = entry.getValue();
if (factories.containsKey(name) == false) {
@@ -378,7 +379,8 @@ private <T> Map<String, T> buildMapping(Component component, IndexSettings settings,
return factories;
}

private <T> AnalysisProvider<T> getAnalysisProvider(Component component, Map<String, AnalysisProvider<T>> providerMap, String name, String typeName) {
private <T> AnalysisProvider<T> getAnalysisProvider(Component component, Map<String, ? extends AnalysisProvider<T>> providerMap,
String name, String typeName) {
if (typeName == null) {
throw new IllegalArgumentException(component + " [" + name + "] must specify either an analyzer type, or a tokenizer");
}
@@ -393,13 +395,12 @@ private static class PrebuiltAnalysis implements Closeable {

final Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<?>>> analyzerProviderFactories;
final Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> tokenizerFactories;
final Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> tokenFilterFactories;
final Map<String, ? extends AnalysisProvider<TokenFilterFactory>> tokenFilterFactories;
final Map<String, AnalysisModule.AnalysisProvider<CharFilterFactory>> charFilterFactories;

private PrebuiltAnalysis() {
private PrebuiltAnalysis(Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters) {
Map<String, PreBuiltAnalyzerProviderFactory> analyzerProviderFactories = new HashMap<>();
Map<String, PreBuiltTokenizerFactoryFactory> tokenizerFactories = new HashMap<>();
Map<String, PreBuiltTokenFilterFactoryFactory> tokenFilterFactories = new HashMap<>();
Map<String, PreBuiltCharFilterFactoryFactory> charFilterFactories = new HashMap<>();
// Analyzers
for (PreBuiltAnalyzers preBuiltAnalyzerEnum : PreBuiltAnalyzers.values()) {
@@ -418,28 +419,18 @@ private PrebuiltAnalysis() {
tokenizerFactories.put("edgeNGram", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.EDGE_NGRAM.getTokenizerFactory(Version.CURRENT)));
tokenizerFactories.put("PathHierarchy", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.PATH_HIERARCHY.getTokenizerFactory(Version.CURRENT)));


// Token filters
for (PreBuiltTokenFilters preBuiltTokenFilter : PreBuiltTokenFilters.values()) {
String name = preBuiltTokenFilter.name().toLowerCase(Locale.ROOT);
tokenFilterFactories.put(name, new PreBuiltTokenFilterFactoryFactory(preBuiltTokenFilter.getTokenFilterFactory(Version.CURRENT)));
}
// Token filter aliases
tokenFilterFactories.put("nGram", new PreBuiltTokenFilterFactoryFactory(PreBuiltTokenFilters.NGRAM.getTokenFilterFactory(Version.CURRENT)));
tokenFilterFactories.put("edgeNGram", new PreBuiltTokenFilterFactoryFactory(PreBuiltTokenFilters.EDGE_NGRAM.getTokenFilterFactory(Version.CURRENT)));


// Char Filters
for (PreBuiltCharFilters preBuiltCharFilter : PreBuiltCharFilters.values()) {
String name = preBuiltCharFilter.name().toLowerCase(Locale.ROOT);
charFilterFactories.put(name, new PreBuiltCharFilterFactoryFactory(preBuiltCharFilter.getCharFilterFactory(Version.CURRENT)));
}
// Char filter aliases
charFilterFactories.put("htmlStrip", new PreBuiltCharFilterFactoryFactory(PreBuiltCharFilters.HTML_STRIP.getCharFilterFactory(Version.CURRENT)));

this.analyzerProviderFactories = Collections.unmodifiableMap(analyzerProviderFactories);
this.charFilterFactories = Collections.unmodifiableMap(charFilterFactories);
this.tokenFilterFactories = Collections.unmodifiableMap(tokenFilterFactories);
this.tokenizerFactories = Collections.unmodifiableMap(tokenizerFactories);
tokenFilterFactories = preConfiguredTokenFilters;
}

public AnalysisModule.AnalysisProvider<CharFilterFactory> getCharFilterFactory(String name) {

This file was deleted.

@@ -0,0 +1,123 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.Version;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;

import java.io.IOException;
import java.util.function.BiFunction;
import java.util.function.Function;

/**
* Provides pre-configured, shared {@link TokenFilter}s.
*/
public final class PreConfiguredTokenFilter implements AnalysisModule.AnalysisProvider<TokenFilterFactory> {
private final String name;
private final boolean useFilterForMultitermQueries;
private final PreBuiltCacheFactory.PreBuiltCache<TokenFilterFactory> cache;
private final BiFunction<TokenStream, Version, TokenStream> create;

/**
* Standard ctor with all the power.
*/
public PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
PreBuiltCacheFactory.CachingStrategy cachingStrategy, BiFunction<TokenStream, Version, TokenStream> create) {
this.name = name;
this.useFilterForMultitermQueries = useFilterForMultitermQueries;
cache = PreBuiltCacheFactory.getCache(cachingStrategy);
this.create = create;
}

/**
* Convenience ctor for token streams that don't vary based on version.
*/
public PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
PreBuiltCacheFactory.CachingStrategy cachingStrategy, Function<TokenStream, TokenStream> create) {
this(name, useFilterForMultitermQueries, cachingStrategy, (input, version) -> create.apply(input));
// TODO why oh why aren't these all CachingStrategy.ONE? They *can't* vary based on version because they don't get it, right?!
}

@Override
public TokenFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
return getTokenFilterFactory(Version.indexCreated(settings));
}

/**
* The name of the {@link TokenFilter} in the API.
*/
public String getName() {
return name;
}

/**
* Can this {@link TokenFilter} be used in multi-term queries?
*/
public boolean shouldUseFilterForMultitermQueries() {
return useFilterForMultitermQueries;
}

private interface MultiTermAwareTokenFilterFactory extends TokenFilterFactory, MultiTermAwareComponent {}

private synchronized TokenFilterFactory getTokenFilterFactory(final Version version) {
TokenFilterFactory factory = cache.get(version);
if (factory == null) {
if (useFilterForMultitermQueries) {
factory = new MultiTermAwareTokenFilterFactory() {
@Override
public String name() {
return name;
}

@Override
public TokenStream create(TokenStream tokenStream) {
return create.apply(tokenStream, version);
}

@Override
public Object getMultiTermComponent() {
return this;
}
};
} else {
factory = new TokenFilterFactory() {
@Override
public String name() {
return name;
}

@Override
public TokenStream create(TokenStream tokenStream) {
return create.apply(tokenStream, version);
}
};
}
cache.put(version, factory);
}

return factory;
}
}
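
To illustrate how the class above is meant to be used, here is a small sketch constructing a filter with the version-aware constructor. The filter name is hypothetical and TrimFilter is assumed from lucene-analyzers-common; getTokenFilterFactory then caches one TokenFilterFactory per index-created version according to the chosen CachingStrategy:

import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;

public class PreConfiguredTokenFilterExample {
    public static void main(String[] args) {
        // The BiFunction receives the index-created version, so the produced
        // TokenStream could differ across Lucene versions if it needed to.
        PreConfiguredTokenFilter trim = new PreConfiguredTokenFilter(
                "trim_example", false, CachingStrategy.LUCENE,
                (tokenStream, version) -> new TrimFilter(tokenStream));
        System.out.println(trim.getName()); // prints "trim_example"
    }
}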
@@ -19,6 +19,8 @@

package org.elasticsearch.indices.analysis;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.NamedRegistry;
@@ -101,6 +103,7 @@
import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
import org.elasticsearch.index.analysis.PorterStemTokenFilterFactory;
import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
import org.elasticsearch.index.analysis.RomanianAnalyzerProvider;
import org.elasticsearch.index.analysis.RussianAnalyzerProvider;
@@ -138,11 +141,15 @@
import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
import org.elasticsearch.plugins.AnalysisPlugin;

import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import static java.util.Collections.unmodifiableMap;
import static org.elasticsearch.plugins.AnalysisPlugin.requriesAnalysisSettings;

/**
@@ -169,8 +176,11 @@ public AnalysisModule(Environment environment, List<AnalysisPlugin> plugins) throws IOException {
NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = setupTokenizers(plugins);
NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> analyzers = setupAnalyzers(plugins);
NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> normalizers = setupNormalizers(plugins);

Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters = setupPreConfiguredTokenFilters(plugins);

analysisRegistry = new AnalysisRegistry(environment, charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers
.getRegistry(), analyzers.getRegistry(), normalizers.getRegistry());
.getRegistry(), analyzers.getRegistry(), normalizers.getRegistry(), preConfiguredTokenFilters);
}

HunspellService getHunspellService() {
@@ -258,6 +268,40 @@ private NamedRegistry<AnalysisProvider<TokenFilterFactory>> setupTokenFilters(List<AnalysisPlugin> plugins) {
return tokenFilters;
}

static Map<String, PreConfiguredTokenFilter> setupPreConfiguredTokenFilters(List<AnalysisPlugin> plugins) {
NamedRegistry<PreConfiguredTokenFilter> preConfiguredTokenFilters = new NamedRegistry<>("pre-configured token_filter");

// Add filters available in lucene-core
preConfiguredTokenFilters.register("lowercase",
new PreConfiguredTokenFilter("lowercase", true, CachingStrategy.LUCENE, LowerCaseFilter::new));
preConfiguredTokenFilters.register("standard",
new PreConfiguredTokenFilter("standard", false, CachingStrategy.LUCENE, StandardFilter::new));
/* Note that "stop" is available in lucene-core but its pre-built
* version uses a set of English stop words that are in
* lucene-analyzers-common so "stop" is defined in the analysis-common
* module. */

// Add token filters declared in PreBuiltTokenFilters until they have all been migrated
for (PreBuiltTokenFilters preBuilt : PreBuiltTokenFilters.values()) {
switch (preBuilt) {
case LOWERCASE:
// This has been migrated but has to stick around until PreBuiltTokenizers is removed.
continue;
default:
String name = preBuilt.name().toLowerCase(Locale.ROOT);
preConfiguredTokenFilters.register(name,
new PreConfiguredTokenFilter(name, preBuilt.isMultiTermAware(), preBuilt.getCachingStrategy(), preBuilt::create));
}
}

for (AnalysisPlugin plugin: plugins) {
for (PreConfiguredTokenFilter filter : plugin.getPreConfiguredTokenFilters()) {
preConfiguredTokenFilters.register(filter.getName(), filter);
}
}
return unmodifiableMap(preConfiguredTokenFilters.getRegistry());
}

private NamedRegistry<AnalysisProvider<TokenizerFactory>> setupTokenizers(List<AnalysisPlugin> plugins) {
NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = new NamedRegistry<>("tokenizer");
tokenizers.register("standard", StandardTokenizerFactory::new);
@@ -42,7 +42,7 @@ public interface PreBuiltCache<T> {

private PreBuiltCacheFactory() {}

static <T> PreBuiltCache<T> getCache(CachingStrategy cachingStrategy) {
public static <T> PreBuiltCache<T> getCache(CachingStrategy cachingStrategy) {
switch (cachingStrategy) {
case ONE:
return new PreBuiltCacheStrategyOne<>();