Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make PreBuiltAnalyzerProviderFactory pluggable via AnalysisPlugin and #31095

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
Expand Down Expand Up @@ -79,14 +80,17 @@
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.index.analysis.AnalyzerProvider;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory;
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;
import org.tartarus.snowball.ext.DutchStemmer;
Expand All @@ -103,6 +107,15 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {

private static final DeprecationLogger DEPRECATION_LOGGER = new DeprecationLogger(Loggers.getLogger(CommonAnalysisPlugin.class));

/**
 * Lists the analyzer providers contributed by this plugin, keyed by analyzer name.
 *
 * @return a newly created sorted map with entries for {@code "fingerprint"},
 *         {@code "pattern"} and {@code "standard_html_strip"}, each mapped to the
 *         constructor reference of its provider class
 */
@Override
public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
    final Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> providersByName = new TreeMap<>();
    // Insertion order is irrelevant: the TreeMap keeps its keys sorted.
    providersByName.put("pattern", PatternAnalyzerProvider::new);
    providersByName.put("standard_html_strip", StandardHtmlStripAnalyzerProvider::new);
    providersByName.put("fingerprint", FingerprintAnalyzerProvider::new);
    return providersByName;
}

@Override
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
Map<String, AnalysisProvider<TokenFilterFactory>> filters = new TreeMap<>();
Expand Down Expand Up @@ -197,6 +210,16 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
return tokenizers;
}

/**
 * Lists the pre-built analyzer factories contributed by this plugin.
 *
 * @return a newly created list holding the {@code "standard_html_strip"} factory
 *         followed by the {@code "pattern"} factory
 */
@Override
public List<PreBuiltAnalyzerProviderFactory> getPreBuiltAnalyzerProviderFactories() {
    // HTML-strip analyzer built with an empty stop-word set.
    final PreBuiltAnalyzerProviderFactory htmlStripFactory = new PreBuiltAnalyzerProviderFactory(
        "standard_html_strip",
        CachingStrategy.LUCENE,
        version -> new StandardHtmlStripAnalyzer(CharArraySet.EMPTY_SET));

    // Pattern analyzer splitting on "\\W+", with the lowercase flag set and an empty stop-word set.
    final PreBuiltAnalyzerProviderFactory patternFactory = new PreBuiltAnalyzerProviderFactory(
        "pattern",
        CachingStrategy.ELASTICSEARCH,
        version -> new PatternAnalyzer(
            Regex.compile("\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/, null), true, CharArraySet.EMPTY_SET));

    final List<PreBuiltAnalyzerProviderFactory> factories = new ArrayList<>();
    factories.add(htmlStripFactory);
    factories.add(patternFactory);
    return factories;
}

@Override
public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
List<PreConfiguredCharFilter> filters = new ArrayList<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
Expand All @@ -35,7 +35,7 @@ public final class FingerprintAnalyzer extends Analyzer {
private final int maxOutputSize;
private final CharArraySet stopWords;

public FingerprintAnalyzer(CharArraySet stopWords, char separator, int maxOutputSize) {
FingerprintAnalyzer(CharArraySet stopWords, char separator, int maxOutputSize) {
this.separator = separator;
this.maxOutputSize = maxOutputSize;
this.stopWords = stopWords;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,16 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;


/**
Expand All @@ -42,7 +44,7 @@ public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<A

private final FingerprintAnalyzer analyzer;

public FingerprintAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
FingerprintAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);

char separator = parseSeparator(settings);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,13 @@

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.FingerprintAnalyzerProvider;

import static org.elasticsearch.index.analysis.FingerprintAnalyzerProvider.DEFAULT_MAX_OUTPUT_SIZE;
import static org.elasticsearch.index.analysis.FingerprintAnalyzerProvider.MAX_OUTPUT_SIZE;
import static org.elasticsearch.analysis.common.FingerprintAnalyzerProvider.DEFAULT_MAX_OUTPUT_SIZE;
import static org.elasticsearch.analysis.common.FingerprintAnalyzerProvider.MAX_OUTPUT_SIZE;

public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
Expand All @@ -35,7 +35,7 @@ public final class PatternAnalyzer extends Analyzer {
private final boolean lowercase;
private final CharArraySet stopWords;

public PatternAnalyzer(Pattern pattern, boolean lowercase, CharArraySet stopWords) {
PatternAnalyzer(Pattern pattern, boolean lowercase, CharArraySet stopWords) {
this.pattern = pattern;
this.lowercase = lowercase;
this.stopWords = stopWords;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,24 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

import java.util.regex.Pattern;

public class PatternAnalyzerProvider extends AbstractIndexAnalyzerProvider<Analyzer> {

private final PatternAnalyzer analyzer;

public PatternAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
PatternAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);

final CharArraySet defaultStopwords = CharArraySet.EMPTY_SET;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
Expand All @@ -39,7 +39,7 @@ public StandardHtmlStripAnalyzer() {
super(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}

public StandardHtmlStripAnalyzer(CharArraySet stopwords) {
StandardHtmlStripAnalyzer(CharArraySet stopwords) {
super(stopwords);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,22 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.elasticsearch.Version;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

public class StandardHtmlStripAnalyzerProvider extends AbstractIndexAnalyzerProvider<StandardHtmlStripAnalyzer> {

private final StandardHtmlStripAnalyzer analyzer;

public StandardHtmlStripAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
StandardHtmlStripAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
final CharArraySet defaultStopwords = CharArraySet.EMPTY_SET;
CharArraySet stopWords = Analysis.parseStopWords(env, settings, defaultStopwords);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

/*
* Licensed to Elasticsearch under one or more contributor
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

/*
* Licensed to Elasticsearch under one or more contributor
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,35 @@
analyzer: bengali
- length: { tokens: 1 }
- match: { tokens.0.token: বার }

---
"fingerprint":
- do:
indices.analyze:
body:
text: A1 B2 A1 D4 C3
analyzer: fingerprint
- length: { tokens: 1 }
- match: { tokens.0.token: a1 b2 c3 d4 }

---
"standard_html_strip":
- do:
indices.analyze:
body:
text: <bold/> <italic/>
analyzer: standard_html_strip
- length: { tokens: 2 }
- match: { tokens.0.token: bold }
- match: { tokens.1.token: italic }

---
"pattern":
- do:
indices.analyze:
body:
text: foo bar
analyzer: pattern
- length: { tokens: 2 }
- match: { tokens.0.token: foo }
- match: { tokens.1.token: bar }
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.core.internal.io.IOUtils;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
Expand Down Expand Up @@ -70,14 +69,16 @@ public AnalysisRegistry(Environment environment,
Map<String, AnalysisProvider<AnalyzerProvider<?>>> normalizers,
Map<String, PreConfiguredCharFilter> preConfiguredCharFilters,
Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters,
Map<String, PreConfiguredTokenizer> preConfiguredTokenizers) {
Map<String, PreConfiguredTokenizer> preConfiguredTokenizers,
Map<String, PreBuiltAnalyzerProviderFactory> preConfiguredAnalyzers) {
this.environment = environment;
this.charFilters = unmodifiableMap(charFilters);
this.tokenFilters = unmodifiableMap(tokenFilters);
this.tokenizers = unmodifiableMap(tokenizers);
this.analyzers = unmodifiableMap(analyzers);
this.normalizers = unmodifiableMap(normalizers);
prebuiltAnalysis = new PrebuiltAnalysis(preConfiguredCharFilters, preConfiguredTokenFilters, preConfiguredTokenizers);
prebuiltAnalysis =
new PrebuiltAnalysis(preConfiguredCharFilters, preConfiguredTokenFilters, preConfiguredTokenizers, preConfiguredAnalyzers);
}

/**
Expand Down Expand Up @@ -398,13 +399,15 @@ private static class PrebuiltAnalysis implements Closeable {
private PrebuiltAnalysis(
Map<String, PreConfiguredCharFilter> preConfiguredCharFilters,
Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters,
Map<String, PreConfiguredTokenizer> preConfiguredTokenizers) {
Map<String, PreBuiltAnalyzerProviderFactory> analyzerProviderFactories = new HashMap<>();
Map<String, PreConfiguredTokenizer> preConfiguredTokenizers,
Map<String, PreBuiltAnalyzerProviderFactory> preConfiguredAnalyzers) {

// Analyzers
Map<String, PreBuiltAnalyzerProviderFactory> analyzerProviderFactories = new HashMap<>();
analyzerProviderFactories.putAll(preConfiguredAnalyzers);
// Pre-build analyzers
for (PreBuiltAnalyzers preBuiltAnalyzerEnum : PreBuiltAnalyzers.values()) {
String name = preBuiltAnalyzerEnum.name().toLowerCase(Locale.ROOT);
analyzerProviderFactories.put(name, new PreBuiltAnalyzerProviderFactory(name, AnalyzerScope.INDICES, preBuiltAnalyzerEnum.getAnalyzer(Version.CURRENT)));
analyzerProviderFactories.put(name, new PreBuiltAnalyzerProviderFactory(name, preBuiltAnalyzerEnum));
}

this.analyzerProviderFactories = Collections.unmodifiableMap(analyzerProviderFactories);
Expand All @@ -429,17 +432,10 @@ public AnalysisModule.AnalysisProvider<AnalyzerProvider<?>> getAnalyzerProvider(
return analyzerProviderFactories.get(name);
}

/**
 * Looks up the analyzer held by the pre-built factory registered under {@code name}.
 *
 * @param name the analyzer name used as the lookup key
 * @return the analyzer of the matching factory, or {@code null} when no factory is
 *         registered under {@code name}
 */
Analyzer analyzer(String name) {
    final PreBuiltAnalyzerProviderFactory factory = (PreBuiltAnalyzerProviderFactory) analyzerProviderFactories.get(name);
    return factory != null ? factory.analyzer() : null;
}

@Override
public void close() throws IOException {
IOUtils.close(analyzerProviderFactories.values().stream().map((a) -> ((PreBuiltAnalyzerProviderFactory)a).analyzer()).collect(Collectors.toList()));
IOUtils.close(analyzerProviderFactories.values().stream()
.map((a) -> ((PreBuiltAnalyzerProviderFactory)a)).collect(Collectors.toList()));
}
}

Expand Down
Loading