From 7bdeed0e970374ae4a199f9ec62e40c33bb100e4 Mon Sep 17 00:00:00 2001
From: Alan Woodward
Date: Tue, 25 Jun 2019 09:50:36 +0100
Subject: [PATCH 1/7] Use preconfig filters when no index and no config

---
 .../index/analysis/AnalysisRegistry.java      | 14 ++++----
 .../indices/TransportAnalyzeActionTests.java  | 33 +++++--------------
 2 files changed, 17 insertions(+), 30 deletions(-)

diff --git a/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java b/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java
index b198a66d24a49..d65bbb0c109da 100644
--- a/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java
+++ b/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java
@@ -111,6 +111,7 @@ private static Settings getSettingsFromIndexSettings(IndexSettings indexSettings
     private <T> T getComponentFactory(IndexSettings settings, NameOrDefinition nod,
                                       String componentType,
                                       Function<String, AnalysisProvider<T>> globalComponentProvider,
+                                      Function<String, AnalysisProvider<T>> prebuiltComponentProvider,
                                       BiFunction<String, IndexSettings, AnalysisProvider<T>> indexComponentProvider) throws IOException {
         if (nod.definition != null) {
             // custom component, so we build it from scratch
@@ -128,8 +129,8 @@ private <T> T getComponentFactory(IndexSettings settings, NameOrDefinition nod,
             return factory.get(settings, environment, "__anonymous__" + type, nod.definition);
         }
         if (settings == null) {
-            // no index provided, so we use global analysis components only
-            AnalysisProvider<T> factory = globalComponentProvider.apply(nod.name);
+            // no index provided, so we use prebuilt analysis components only
+            AnalysisProvider<T> factory = prebuiltComponentProvider.apply(nod.name);
             if (factory == null) {
                 throw new IllegalArgumentException("failed to find global " + componentType + " under [" + nod.name + "]");
             }
@@ -217,25 +218,26 @@ public IndexAnalyzers build(IndexSettings indexSettings) throws IOException {
     public NamedAnalyzer buildCustomAnalyzer(IndexSettings indexSettings, boolean normalizer, NameOrDefinition tokenizer,
                                              List<NameOrDefinition> charFilters, List<NameOrDefinition> tokenFilters) throws IOException {
         TokenizerFactory tokenizerFactory
-            = getComponentFactory(indexSettings, tokenizer, "tokenizer", this::getTokenizerProvider, this::getTokenizerProvider);
+            = getComponentFactory(indexSettings, tokenizer, "tokenizer",
+                this::getTokenizerProvider, prebuiltAnalysis::getTokenizerFactory, this::getTokenizerProvider);
 
         List<CharFilterFactory> charFilterFactories = new ArrayList<>();
         for (NameOrDefinition nod : charFilters) {
             charFilterFactories.add(getComponentFactory(indexSettings, nod, "char_filter",
-                this::getCharFilterProvider, this::getCharFilterProvider));
+                this::getCharFilterProvider, prebuiltAnalysis::getCharFilterFactory, this::getCharFilterProvider));
         }
 
         List<TokenFilterFactory> tokenFilterFactories = new ArrayList<>();
         for (NameOrDefinition nod : tokenFilters) {
             TokenFilterFactory tff = getComponentFactory(indexSettings, nod, "filter",
-                this::getTokenFilterProvider, this::getTokenFilterProvider);
+                this::getTokenFilterProvider, prebuiltAnalysis::getTokenFilterFactory, this::getTokenFilterProvider);
             if (normalizer && tff instanceof NormalizingTokenFilterFactory == false) {
                 throw new IllegalArgumentException("Custom normalizer may not use filter [" + tff.name() + "]");
             }
             tff = tff.getChainAwareTokenFilterFactory(tokenizerFactory, charFilterFactories, tokenFilterFactories, name -> {
                 try {
                     return getComponentFactory(indexSettings, new NameOrDefinition(name), "filter",
-                        this::getTokenFilterProvider, this::getTokenFilterProvider);
+                        this::getTokenFilterProvider, prebuiltAnalysis::getTokenFilterFactory, this::getTokenFilterProvider);
                 } catch (IOException e) {
                     throw new UncheckedIOException(e);
                 }
diff --git a/server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java b/server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java
index 1ffd7410fa66a..3f4b54af47395 100644
--- a/server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java
+++ b/server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java
@@ -141,7 +141,7 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
 
             @Override
             public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
-                return singletonList(PreConfiguredCharFilter.singleton("append_foo", false, reader -> new AppendCharFilter(reader, "foo")));
+                return singletonList(PreConfiguredCharFilter.singleton("append", false, reader -> new AppendCharFilter(reader, "foo")));
             }
         };
         registry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry();
@@ -170,24 +170,11 @@ public void testNoIndexAnalyzers() throws IOException {
         List<AnalyzeAction.AnalyzeToken> tokens = analyze.getTokens();
         assertEquals(4, tokens.size());
 
-        // Refer to a token filter by its type so we get its default configuration
-        request = new AnalyzeAction.Request();
-        request.text("the qu1ck brown fox");
-        request.tokenizer("standard");
-        request.addTokenFilter("mock");
-        analyze
-            = TransportAnalyzeAction.analyze(request, registry, null, maxTokenCount);
-        tokens = analyze.getTokens();
-        assertEquals(3, tokens.size());
-        assertEquals("qu1ck", tokens.get(0).getTerm());
-        assertEquals("brown", tokens.get(1).getTerm());
-        assertEquals("fox", tokens.get(2).getTerm());
-
         // We can refer to a pre-configured token filter by its name to get it
         request = new AnalyzeAction.Request();
         request.text("the qu1ck brown fox");
         request.tokenizer("standard");
-        request.addCharFilter("append_foo");
+        request.addCharFilter("append"); // <-- no config, so use preconfigured filter
         analyze
             = TransportAnalyzeAction.analyze(request, registry, null, maxTokenCount);
         tokens = analyze.getTokens();
@@ -197,35 +184,33 @@ public void testNoIndexAnalyzers() throws IOException {
         assertEquals("brown", tokens.get(2).getTerm());
         assertEquals("foxfoo", tokens.get(3).getTerm());
 
-        // We can refer to a token filter by its type to get its default configuration
+        // We can build a new char filter to get default values
         request = new AnalyzeAction.Request();
         request.text("the qu1ck brown fox");
         request.tokenizer("standard");
-        request.addCharFilter("append");
-        request.text("the qu1ck brown fox");
+        request.addTokenFilter(Map.of("type", "mock", "stopword", "brown"));
+        request.addCharFilter(Map.of("type", "append")); // <-- basic config, uses defaults
         analyze
             = TransportAnalyzeAction.analyze(request, registry, null, maxTokenCount);
         tokens = analyze.getTokens();
-        assertEquals(4, tokens.size());
+        assertEquals(3, tokens.size());
         assertEquals("the", tokens.get(0).getTerm());
         assertEquals("qu1ck", tokens.get(1).getTerm());
-        assertEquals("brown", tokens.get(2).getTerm());
-        assertEquals("foxbar", tokens.get(3).getTerm());
+        assertEquals("foxbar", tokens.get(2).getTerm());
 
         // We can pass a new configuration
         request = new AnalyzeAction.Request();
         request.text("the qu1ck brown fox");
         request.tokenizer("standard");
         request.addTokenFilter(Map.of("type", "mock", "stopword", "brown"));
-        request.addCharFilter("append");
-        request.text("the qu1ck brown fox");
+        request.addCharFilter(Map.of("type", "append", "suffix", "baz"));
         analyze
             = TransportAnalyzeAction.analyze(request, registry, null, maxTokenCount);
         tokens = analyze.getTokens();
         assertEquals(3, tokens.size());
         assertEquals("the", tokens.get(0).getTerm());
         assertEquals("qu1ck", tokens.get(1).getTerm());
-        assertEquals("foxbar", tokens.get(2).getTerm());
+        assertEquals("foxbaz", tokens.get(2).getTerm());
     }
 
     public void testFillsAttributes() throws IOException {
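Aside: to make the behaviour change in PATCH 1 above concrete, the resolution order that getComponentFactory now implements can be sketched as below. This is an illustrative, simplified sketch, not code from the patch; ComponentResolver, prebuiltProvider and indexProvider are hypothetical stand-ins for the real provider arguments.

    import java.util.Map;
    import java.util.function.Function;

    class ComponentResolver<T> {
        Function<String, T> prebuiltProvider; // stand-in for prebuiltComponentProvider
        Function<String, T> indexProvider;    // stand-in for indexComponentProvider

        T resolve(Map<String, String> definition, String name, boolean hasIndex) {
            if (definition != null) {
                return buildAnonymous(definition); // inline config: build from scratch
            }
            if (hasIndex == false) {
                // no index in scope: resolve against prebuilt (preconfigured) components
                T factory = prebuiltProvider.apply(name);
                if (factory == null) {
                    throw new IllegalArgumentException("failed to find component under [" + name + "]");
                }
                return factory;
            }
            return indexProvider.apply(name); // index in scope: resolve against its settings
        }

        private T buildAnonymous(Map<String, String> definition) {
            throw new UnsupportedOperationException("elided in this sketch");
        }
    }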
From 9442e4160dc2500bb192a8ba10d6c483941b2d0d Mon Sep 17 00:00:00 2001
From: Alan Woodward
Date: Tue, 25 Jun 2019 11:35:06 +0100
Subject: [PATCH 2/7] Fall back to global components if there are no prebuilt
 ones under that name

---
 .../elasticsearch/index/analysis/AnalysisRegistry.java | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java b/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java
index d65bbb0c109da..496b1eb3bfaea 100644
--- a/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java
+++ b/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java
@@ -129,10 +129,14 @@ private <T> T getComponentFactory(IndexSettings settings, NameOrDefinition nod,
             return factory.get(settings, environment, "__anonymous__" + type, nod.definition);
         }
         if (settings == null) {
-            // no index provided, so we use prebuilt analysis components only
+            // no index provided, so we use prebuilt analysis components
             AnalysisProvider<T> factory = prebuiltComponentProvider.apply(nod.name);
             if (factory == null) {
-                throw new IllegalArgumentException("failed to find global " + componentType + " under [" + nod.name + "]");
+                // if there's no prebuilt component, try loading a global one to build with no settings
+                factory = globalComponentProvider.apply(nod.name);
+                if (factory == null) {
+                    throw new IllegalArgumentException("failed to find global " + componentType + " under [" + nod.name + "]");
+                }
             }
             return factory.get(environment, nod.name);
         } else {

From 04d51552d601233f5406dc0ffd6a78ea1523b051 Mon Sep 17 00:00:00 2001
From: Alan Woodward
Date: Tue, 25 Jun 2019 11:39:13 +0100
Subject: [PATCH 3/7] Add unit-test for fallback

---
 .../admin/indices/TransportAnalyzeActionTests.java | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java b/server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java
index 3f4b54af47395..72830f79889c0 100644
--- a/server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java
+++ b/server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java
@@ -184,6 +184,19 @@ public void testNoIndexAnalyzers() throws IOException {
         assertEquals("brown", tokens.get(2).getTerm());
         assertEquals("foxfoo", tokens.get(3).getTerm());
 
+        // If the preconfigured filter doesn't exist, we use a global filter with no settings
+        request = new AnalyzeAction.Request();
+        request.text("the qu1ck brown fox");
+        request.tokenizer("standard");
+        request.addTokenFilter("mock"); // <-- not preconfigured, but a global one available
+        analyze
+            = TransportAnalyzeAction.analyze(request, registry, null, maxTokenCount);
+        tokens = analyze.getTokens();
+        assertEquals(3, tokens.size());
+        assertEquals("qu1ck", tokens.get(0).getTerm());
+        assertEquals("brown", tokens.get(1).getTerm());
+        assertEquals("fox", tokens.get(2).getTerm());
+
         // We can build a new char filter to get default values
         request = new AnalyzeAction.Request();
         request.text("the qu1ck brown fox");
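Aside: PATCH 2 above softens the rule introduced in PATCH 1: when no index is provided, a missing prebuilt component no longer fails immediately but falls back to a global component built with no settings. A minimal sketch of that two-step lookup, assuming hypothetical provider functions that return null when nothing is registered under the name:

    import java.util.function.Function;

    class FallbackLookup {
        static <T> T lookup(String name, String componentType,
                            Function<String, T> prebuilt, Function<String, T> global) {
            T factory = prebuilt.apply(name);  // 1. try the preconfigured component
            if (factory == null) {
                factory = global.apply(name);  // 2. fall back to a global one, built with no settings
            }
            if (factory == null) {             // 3. only then fail, as before
                throw new IllegalArgumentException(
                    "failed to find global " + componentType + " under [" + name + "]");
            }
            return factory;
        }
    }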
brown fox"); From fd4086bfa1cbd69bbbd22833695162e52def97bc Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Tue, 25 Jun 2019 14:56:49 +0100 Subject: [PATCH 4/7] Add fix for #43582 --- .../analysis/common/CommonAnalysisPlugin.java | 17 ++- .../common/EdgeNGramTokenizerTests.java | 130 ++++++++++++++++++ 2 files changed, 143 insertions(+), 4 deletions(-) create mode 100644 modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index f095b766ee1d5..5303d47103cc1 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -110,6 +110,7 @@ import org.apache.lucene.analysis.tr.TurkishAnalyzer; import org.apache.lucene.analysis.util.ElisionFilter; import org.apache.lucene.util.SetOnce; +import org.elasticsearch.Version; import org.elasticsearch.client.Client; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.io.stream.NamedWriteableRegistry; @@ -475,8 +476,12 @@ public List getPreConfiguredTokenizers() { tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new)); tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new)); tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new)); - tokenizers.add(PreConfiguredTokenizer.singleton("edge_ngram", - () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE))); + tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("edge_ngram", (version) -> { + if (version.onOrAfter(Version.V_7_3_0)) { + return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE); + } + return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE); + })); tokenizers.add(PreConfiguredTokenizer.singleton("pattern", () -> new PatternTokenizer(Regex.compile("\\W+", null), -1))); tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new)); // TODO deprecate and remove in API @@ -485,8 +490,12 @@ public List getPreConfiguredTokenizers() { // Temporary shim for aliases. 
TODO deprecate after they are moved tokenizers.add(PreConfiguredTokenizer.singleton("nGram", NGramTokenizer::new)); - tokenizers.add(PreConfiguredTokenizer.singleton("edgeNGram", - () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE))); + tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("edgeNGram", (version) -> { + if (version.onOrAfter(Version.V_7_3_0)) { + return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE); + } + return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE); + })); tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new)); return tokenizers; diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java new file mode 100644 index 0000000000000..87580306607da --- /dev/null +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java @@ -0,0 +1,130 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.analysis.common; + +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.TestEnvironment; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.IndexAnalyzers; +import org.elasticsearch.index.analysis.NamedAnalyzer; +import org.elasticsearch.indices.analysis.AnalysisModule; +import org.elasticsearch.test.ESTokenStreamTestCase; +import org.elasticsearch.test.IndexSettingsModule; +import org.elasticsearch.test.VersionUtils; + +import java.io.IOException; +import java.util.Collections; + +public class EdgeNGramTokenizerTests extends ESTokenStreamTestCase { + + public void testPreConfiguredTokenizer() throws IOException { + + // Before 7.3 we return ngrams of length 1 only + { + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + Settings indexSettings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, + VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, VersionUtils.getPreviousVersion(Version.V_7_3_0))) + .put("index.analysis.analyzer.my_analyzer.tokenizer", "edge_ngram") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); + + try (IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings), + Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings)) { + + NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer"); + assertNotNull(analyzer); + assertAnalyzesTo(analyzer, "test", new String[]{"t"}); + + } + } + + // Check deprecated name as well + { + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + Settings indexSettings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, + VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, VersionUtils.getPreviousVersion(Version.V_7_3_0))) + .put("index.analysis.analyzer.my_analyzer.tokenizer", "edgeNGram") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); + + try (IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings), + Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings)) { + + NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer"); + assertNotNull(analyzer); + assertAnalyzesTo(analyzer, "test", new String[]{"t"}); + + } + } + + // Afterwards, we return ngrams of length 1 and 2, to match the default factory settings + { + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + Settings indexSettings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put("index.analysis.analyzer.my_analyzer.tokenizer", "edge_ngram") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); + + try (IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings), + Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings)) { + + NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer"); + assertNotNull(analyzer); + assertAnalyzesTo(analyzer, "test", new String[]{"t", 
"te"}); + + } + } + + // Check deprecated name as well + { + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + Settings indexSettings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put("index.analysis.analyzer.my_analyzer.tokenizer", "edgeNGram") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); + + try (IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings), + Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings)) { + + NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer"); + assertNotNull(analyzer); + assertAnalyzesTo(analyzer, "test", new String[]{"t", "te"}); + + } + } + + } + +} From 8c01ed3a521052a3ce24dc7dcb03ef0908bb7556 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Wed, 26 Jun 2019 09:01:10 +0100 Subject: [PATCH 5/7] Fixes #43621 --- .../analysis/common/CommonAnalysisPlugin.java | 9 ++- ...DelimiterGraphTokenFilterFactoryTests.java | 57 +++++++++++++++++++ 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index 5303d47103cc1..44eeace79be43 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -83,6 +83,7 @@ import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter; import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter; +import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer; import org.apache.lucene.analysis.ngram.NGramTokenFilter; @@ -456,13 +457,15 @@ public List getPreConfiguredTokenFilters() { | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterFilter.SPLIT_ON_NUMERICS | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null))); - filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, false, input -> - new WordDelimiterGraphFilter(input, + filters.add(PreConfiguredTokenFilter.singletonWithVersion("word_delimiter_graph", false, false, (input, version) -> { + boolean adjustOffsets = version.onOrAfter(Version.V_7_3_0); + return new WordDelimiterGraphFilter(input, adjustOffsets, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, WordDelimiterGraphFilter.GENERATE_WORD_PARTS | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS - | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null))); + | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null); + })); return filters; } diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java index d799674f231a1..c8e3699ea840d 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java +++ 
From 8c01ed3a521052a3ce24dc7dcb03ef0908bb7556 Mon Sep 17 00:00:00 2001
From: Alan Woodward
Date: Wed, 26 Jun 2019 09:01:10 +0100
Subject: [PATCH 5/7] Fixes #43621

---
 .../analysis/common/CommonAnalysisPlugin.java |  9 ++-
 ...DelimiterGraphTokenFilterFactoryTests.java | 57 +++++++++++++++++++
 2 files changed, 63 insertions(+), 3 deletions(-)

diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
index 5303d47103cc1..44eeace79be43 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@@ -83,6 +83,7 @@ import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
 import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
 import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
+import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
@@ -456,13 +457,15 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
                 | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
                 | WordDelimiterFilter.SPLIT_ON_NUMERICS
                 | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
-        filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, false, input ->
-            new WordDelimiterGraphFilter(input,
+        filters.add(PreConfiguredTokenFilter.singletonWithVersion("word_delimiter_graph", false, false, (input, version) -> {
+            boolean adjustOffsets = version.onOrAfter(Version.V_7_3_0);
+            return new WordDelimiterGraphFilter(input, adjustOffsets, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
                 WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                 | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                 | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
                 | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
-                | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
+                | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null);
+        }));
         return filters;
     }
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java
index d799674f231a1..c8e3699ea840d 100644
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java
@@ -20,14 +20,24 @@
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
+import org.elasticsearch.env.TestEnvironment;
+import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.IndexAnalyzers;
+import org.elasticsearch.index.analysis.NamedAnalyzer;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.test.IndexSettingsModule;
+import org.elasticsearch.test.VersionUtils;
 
 import java.io.IOException;
 import java.io.StringReader;
+import java.util.Collections;
 
 public class WordDelimiterGraphTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase {
@@ -107,4 +117,51 @@ public void testAdjustingOffsets() throws IOException {
         assertTokenStreamContents(tokenFilter.create(tokenizer), expected, expectedStartOffsets, expectedEndOffsets, null,
             expectedIncr, expectedPosLen, null);
     }
+
+    public void testPreconfiguredFilter() throws IOException {
+        // Before 7.3 we don't adjust offsets
+        {
+            Settings settings = Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .build();
+            Settings indexSettings = Settings.builder()
+                .put(IndexMetaData.SETTING_VERSION_CREATED,
+                    VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, VersionUtils.getPreviousVersion(Version.V_7_3_0)))
+                .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
+                .putList("index.analysis.analyzer.my_analyzer.filter", "word_delimiter_graph")
+                .build();
+            IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
+
+            try (IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
+                Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings)) {
+
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertAnalyzesTo(analyzer, "h100", new String[]{"h", "100"}, new int[]{ 0, 0 }, new int[]{ 4, 4 });
+
+            }
+        }
+
+        // After 7.3 we do adjust offsets
+        {
+            Settings settings = Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .build();
+            Settings indexSettings = Settings.builder()
+                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+                .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
+                .putList("index.analysis.analyzer.my_analyzer.filter", "word_delimiter_graph")
+                .build();
+            IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
+
+            try (IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
+                Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings)) {
+
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertAnalyzesTo(analyzer, "h100", new String[]{"h", "100"}, new int[]{ 0, 1 }, new int[]{ 1, 4 });
+
+            }
+        }
+    }
 }
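Aside: PATCH 5 makes the preconfigured word_delimiter_graph filter pass adjustOffsets=true for indices created on or after 7.3, which appears intended to match the behaviour of a filter built from index settings (compare testAdjustingOffsets above). The effect shows in the test: splitting the single token "h100" into "h" and "100" only reports per-part offsets when adjustment is on. A hypothetical helper sketching that bookkeeping (not the Lucene implementation):

    class WordDelimiterOffsets {
        // returns {start, end} pairs for the two parts of a token split at 'cut'
        static int[][] offsets(String token, int start, int cut, boolean adjustOffsets) {
            int end = start + token.length();
            if (adjustOffsets) {
                // 7.3+ behaviour: each part reports its own slice of the input
                return new int[][]{{start, start + cut}, {start + cut, end}};
            }
            // pre-7.3 behaviour: every part reports the parent token's offsets
            return new int[][]{{start, end}, {start, end}};
        }
    }

    // offsets("h100", 0, 1, false) -> {{0, 4}, {0, 4}}
    // offsets("h100", 0, 1, true)  -> {{0, 1}, {1, 4}}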
From b5934964d8f286274ef8324c24d34e848f036470 Mon Sep 17 00:00:00 2001
From: Alan Woodward
Date: Wed, 26 Jun 2019 15:49:32 +0100
Subject: [PATCH 6/7] dry up tests a bit

---
 .../common/EdgeNGramTokenizerTests.java | 72 +++++--------------
 1 file changed, 19 insertions(+), 53 deletions(-)

diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java
index 87580306607da..2b92ecdaf459b 100644
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java
@@ -37,87 +37,53 @@
 
 public class EdgeNGramTokenizerTests extends ESTokenStreamTestCase {
 
+    private IndexAnalyzers buildAnalyzers(Version version, String tokenizer) throws IOException {
+        Settings settings = Settings.builder()
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .build();
+        Settings indexSettings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED, version)
+            .put("index.analysis.analyzer.my_analyzer.tokenizer", tokenizer)
+            .build();
+        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
+        return new AnalysisModule(TestEnvironment.newEnvironment(settings),
+            Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings);
+    }
+
     public void testPreConfiguredTokenizer() throws IOException {
 
         // Before 7.3 we return ngrams of length 1 only
         {
-            Settings settings = Settings.builder()
-                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-                .build();
-            Settings indexSettings = Settings.builder()
-                .put(IndexMetaData.SETTING_VERSION_CREATED,
-                    VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, VersionUtils.getPreviousVersion(Version.V_7_3_0)))
-                .put("index.analysis.analyzer.my_analyzer.tokenizer", "edge_ngram")
-                .build();
-            IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
-
-            try (IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
-                Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings)) {
-
+            Version version = VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, VersionUtils.getPreviousVersion(Version.V_7_3_0));
+            try (IndexAnalyzers indexAnalyzers = buildAnalyzers(version, "edge_ngram")) {
                 NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
                 assertNotNull(analyzer);
                 assertAnalyzesTo(analyzer, "test", new String[]{"t"});
-
             }
         }
 
         // Check deprecated name as well
         {
-            Settings settings = Settings.builder()
-                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-                .build();
-            Settings indexSettings = Settings.builder()
-                .put(IndexMetaData.SETTING_VERSION_CREATED,
-                    VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, VersionUtils.getPreviousVersion(Version.V_7_3_0)))
-                .put("index.analysis.analyzer.my_analyzer.tokenizer", "edgeNGram")
-                .build();
-            IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
-
-            try (IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
-                Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings)) {
-
+            Version version = VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, VersionUtils.getPreviousVersion(Version.V_7_3_0));
+            try (IndexAnalyzers indexAnalyzers = buildAnalyzers(version, "edgeNGram")) {
                 NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
                 assertNotNull(analyzer);
                 assertAnalyzesTo(analyzer, "test", new String[]{"t"});
-
             }
         }
 
         // Afterwards, we return ngrams of length 1 and 2, to match the default factory settings
         {
-            Settings settings = Settings.builder()
-                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-                .build();
-            Settings indexSettings = Settings.builder()
-                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
-                .put("index.analysis.analyzer.my_analyzer.tokenizer", "edge_ngram")
-                .build();
-            IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
-
-            try (IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
-                Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings)) {
-
+            try (IndexAnalyzers indexAnalyzers = buildAnalyzers(Version.CURRENT, "edge_ngram")) {
                 NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
                 assertNotNull(analyzer);
                 assertAnalyzesTo(analyzer, "test", new String[]{"t", "te"});
-
             }
         }
 
         // Check deprecated name as well
         {
-            Settings settings = Settings.builder()
-                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-                .build();
-            Settings indexSettings = Settings.builder()
-                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
-                .put("index.analysis.analyzer.my_analyzer.tokenizer", "edgeNGram")
-                .build();
-            IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
-
-            try (IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
-                Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings)) {
-
+            try (IndexAnalyzers indexAnalyzers = buildAnalyzers(Version.CURRENT, "edgeNGram")) {
                 NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
                 assertNotNull(analyzer);
                 assertAnalyzesTo(analyzer, "test", new String[]{"t", "te"});
From 62e9822a2d27cf48fb9546fed4af1130e3d60adb Mon Sep 17 00:00:00 2001
From: Alan Woodward
Date: Wed, 26 Jun 2019 16:15:19 +0100
Subject: [PATCH 7/7] checkstyle

---
 .../analysis/common/EdgeNGramTokenizerTests.java | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java
index 2b92ecdaf459b..0172f7cbc2657 100644
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java
@@ -54,7 +54,8 @@ public void testPreConfiguredTokenizer() throws IOException {
 
         // Before 7.3 we return ngrams of length 1 only
         {
-            Version version = VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, VersionUtils.getPreviousVersion(Version.V_7_3_0));
+            Version version = VersionUtils.randomVersionBetween(random(), Version.V_7_0_0,
+                VersionUtils.getPreviousVersion(Version.V_7_3_0));
             try (IndexAnalyzers indexAnalyzers = buildAnalyzers(version, "edge_ngram")) {
                 NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
                 assertNotNull(analyzer);
@@ -64,7 +65,8 @@ public void testPreConfiguredTokenizer() throws IOException {
 
         // Check deprecated name as well
         {
-            Version version = VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, VersionUtils.getPreviousVersion(Version.V_7_3_0));
+            Version version = VersionUtils.randomVersionBetween(random(), Version.V_7_0_0,
+                VersionUtils.getPreviousVersion(Version.V_7_3_0));
             try (IndexAnalyzers indexAnalyzers = buildAnalyzers(version, "edgeNGram")) {
                 NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
                 assertNotNull(analyzer);