Commit 544822c

Showing 21 changed files with 228 additions and 75 deletions.
@@ -193,6 +193,7 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
tokenizers.put("pattern", PatternTokenizerFactory::new);
tokenizers.put("uax_url_email", UAX29URLEmailTokenizerFactory::new);
tokenizers.put("whitespace", WhitespaceTokenizerFactory::new);
tokenizers.put("keyword", KeywordTokenizerFactory::new);
return tokenizers;
}
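
This hunk registers the keyword tokenizer in the analysis-common module's plugin (presumably CommonAnalysisPlugin, given the package rename below), the counterpart of its removal from core in the AnalysisModule hunk further down. A minimal sketch of the registration pattern, assuming the 6.x AnalysisPlugin SPI visible in this diff; MyAnalysisPlugin is a hypothetical name:

import java.util.HashMap;
import java.util.Map;

import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

// Assumes KeywordTokenizerFactory is in the same package; it becomes
// package-private in this commit, so it is not reachable from outside.
public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
        Map<String, AnalysisProvider<TokenizerFactory>> tokenizers = new HashMap<>();
        // Each provider receives (indexSettings, environment, name, settings)
        // and returns a TokenizerFactory, so a constructor reference fits.
        tokenizers.put("keyword", KeywordTokenizerFactory::new);
        return tokenizers;
    }
}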

@@ -17,7 +17,7 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
@@ -30,7 +30,7 @@ public class KeywordTokenizerFactory extends AbstractTokenizerFactory {

private final int bufferSize;

public KeywordTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
KeywordTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
bufferSize = settings.getAsInt("buffer_size", 256);
}
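
The constructor drops public now that the factory is package-private inside the module. For context, the factory's create() method (outside this hunk) presumably builds the Lucene tokenizer from the buffer_size setting, roughly:

@Override
public Tokenizer create() {
    // KeywordTokenizer emits the entire input as one token; bufferSize only
    // sets the initial term-buffer capacity, it is not a length limit.
    return new KeywordTokenizer(bufferSize);
}
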
@@ -24,7 +24,6 @@
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
import org.elasticsearch.index.analysis.SynonymTokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase;
@@ -56,6 +55,7 @@ protected Map<String, Class<?>> getTokenizers() {
tokenizers.put("pattern", PatternTokenizerFactory.class);
tokenizers.put("uax29urlemail", UAX29URLEmailTokenizerFactory.class);
tokenizers.put("whitespace", WhitespaceTokenizerFactory.class);
tokenizers.put("keyword", KeywordTokenizerFactory.class);
return tokenizers;
}

@@ -5,9 +5,22 @@
indices.analyze:
body:
text: Foo Bar!
explain: true
tokenizer: keyword
- length: { tokens: 1 }
- match: { tokens.0.token: Foo Bar! }
- length: { detail.tokenizer.tokens: 1 }
- match: { detail.tokenizer.name: keyword }
- match: { detail.tokenizer.tokens.0.token: Foo Bar! }

- do:
indices.analyze:
body:
text: Foo Bar!
explain: true
tokenizer:
type: keyword
- length: { detail.tokenizer.tokens: 1 }
- match: { detail.tokenizer.name: _anonymous_tokenizer }
- match: { detail.tokenizer.tokens.0.token: Foo Bar! }
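
Both tests pin the same behavior, whether the tokenizer is referenced by name or defined anonymously (hence _anonymous_tokenizer): the keyword tokenizer emits the entire input as a single token. The same check against Lucene directly, as a standalone sketch outside this commit:

import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class KeywordTokenizerDemo {
    public static void main(String[] args) throws Exception {
        try (Tokenizer tokenizer = new KeywordTokenizer()) {
            tokenizer.setReader(new StringReader("Foo Bar!"));
            CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                System.out.println(term); // exactly one token: "Foo Bar!"
            }
            tokenizer.end();
        }
    }
}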

---
"nGram":
@@ -97,3 +97,19 @@
- length: { tokens: 2 }
- match: { tokens.0.token: sha }
- match: { tokens.1.token: hay }

---
"Custom normalizer in request":
- do:
indices.analyze:
body:
text: ABc
explain: true
filter: ["lowercase"]

- length: { detail.tokenizer.tokens: 1 }
- length: { detail.tokenfilters.0.tokens: 1 }
- match: { detail.tokenizer.name: keyword_for_normalizer }
- match: { detail.tokenizer.tokens.0.token: ABc }
- match: { detail.tokenfilters.0.name: lowercase }
- match: { detail.tokenfilters.0.tokens.0.token: abc }
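
This test (moved here from the core REST tests, whose copy is deleted below) covers normalizer support in _analyze: with no tokenizer or analyzer given, the request is treated as a normalizer chain on top of a keyword tokenizer, hence the keyword_for_normalizer name. In plain Lucene terms the chain is roughly the following sketch; package locations assume Lucene 7.x:

import java.io.StringReader;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class NormalizerChainDemo {
    public static void main(String[] args) throws Exception {
        Tokenizer keyword = new KeywordTokenizer();
        keyword.setReader(new StringReader("ABc"));
        // A normalizer never splits: the keyword tokenizer passes "ABc"
        // through whole, and the filter lowercases it to "abc".
        try (TokenStream stream = new LowerCaseFilter(keyword)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term); // abc
            }
            stream.end();
        }
    }
}
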
@@ -16,29 +16,35 @@
body:
filter: [icu_normalizer]
text: Foo Bar Ruß
tokenizer: keyword
- length: { tokens: 1 }
- match: { tokens.0.token: foo bar russ }
tokenizer: standard
- length: { tokens: 3 }
- match: { tokens.0.token: foo }
- match: { tokens.1.token: bar }
- match: { tokens.2.token: russ }
---
"Normalization charfilter":
- do:
indices.analyze:
body:
char_filter: [icu_normalizer]
text: Foo Bar Ruß
tokenizer: keyword
- length: { tokens: 1 }
- match: { tokens.0.token: foo bar russ }
tokenizer: standard
- length: { tokens: 3 }
- match: { tokens.0.token: foo }
- match: { tokens.1.token: bar }
- match: { tokens.2.token: russ }
---
"Folding filter":
- do:
indices.analyze:
body:
filter: [icu_folding]
text: Foo Bar résumé
tokenizer: keyword
- length: { tokens: 1 }
- match: { tokens.0.token: foo bar resume }
tokenizer: standard
- length: { tokens: 3 }
- match: { tokens.0.token: foo }
- match: { tokens.1.token: bar }
- match: { tokens.2.token: resume }
---
"Normalization with a UnicodeSet Filter":
- do:
@@ -64,25 +70,34 @@
index: test
body:
char_filter: ["charfilter_icu_normalizer"]
tokenizer: keyword
tokenizer: standard
text: charfilter Föo Bâr Ruß
- length: { tokens: 1 }
- match: { tokens.0.token: charfilter föo bâr ruß }
- length: { tokens: 4 }
- match: { tokens.0.token: charfilter }
- match: { tokens.1.token: föo }
- match: { tokens.2.token: bâr }
- match: { tokens.3.token: ruß }
- do:
indices.analyze:
index: test
body:
tokenizer: keyword
tokenizer: standard
filter: ["tokenfilter_icu_normalizer"]
text: tokenfilter Föo Bâr Ruß
- length: { tokens: 1 }
- match: { tokens.0.token: tokenfilter föo Bâr ruß }
- length: { tokens: 4 }
- match: { tokens.0.token: tokenfilter }
- match: { tokens.1.token: föo }
- match: { tokens.2.token: Bâr }
- match: { tokens.3.token: ruß }
- do:
indices.analyze:
index: test
body:
tokenizer: keyword
tokenizer: standard
filter: ["tokenfilter_icu_folding"]
text: icufolding Föo Bâr Ruß
- length: { tokens: 1 }
- match: { tokens.0.token: icufolding foo bâr russ }
- length: { tokens: 4 }
- match: { tokens.0.token: icufolding }
- match: { tokens.1.token: foo }
- match: { tokens.2.token: bâr }
- match: { tokens.3.token: russ }
@@ -5,7 +5,7 @@
indices.analyze:
body:
text: studenci
tokenizer: keyword
tokenizer: standard
filter: [polish_stem]
- length: { tokens: 1 }
- match: { tokens.0.token: student }
@@ -75,19 +75,3 @@
- match: { detail.tokenizer.tokens.2.token: buzz }
- match: { detail.tokenfilters.0.name: "_anonymous_tokenfilter" }
- match: { detail.tokenfilters.0.tokens.0.token: bar }

---
"Custom normalizer in request":
- do:
indices.analyze:
body:
text: ABc
explain: true
filter: ["lowercase"]

- length: { detail.tokenizer.tokens: 1 }
- length: { detail.tokenfilters.0.tokens: 1 }
- match: { detail.tokenizer.name: keyword_for_normalizer }
- match: { detail.tokenizer.tokens.0.token: ABc }
- match: { detail.tokenfilters.0.name: lowercase }
- match: { detail.tokenfilters.0.tokens.0.token: abc }
@@ -548,6 +548,10 @@ private void processNormalizerFactory(
TokenizerFactory keywordTokenizerFactory,
Map<String, TokenFilterFactory> tokenFilters,
Map<String, CharFilterFactory> charFilters) {
if (keywordTokenizerFactory == null) {
throw new IllegalStateException("keyword tokenizer factory is null, normalizers require analysis-common module");
}

if (normalizerFactory instanceof CustomNormalizerProvider) {
((CustomNormalizerProvider) normalizerFactory).build(keywordTokenizerFactory, charFilters, tokenFilters);
}
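
The new guard fails fast instead of hitting a NullPointerException later when no "keyword" tokenizer factory was contributed, which is now possible if the analysis-common module is missing. Index settings like the following would reach this path; a sketch using the Settings builder seen elsewhere in this commit, with a hypothetical normalizer name and putList assumed available in this 6.x codebase:

import org.elasticsearch.common.settings.Settings;

class NormalizerSettingsSketch {
    static Settings normalizerSettings() {
        // A custom normalizer is built on an implicit keyword tokenizer,
        // so resolving it requires the keyword tokenizer factory above.
        return Settings.builder()
            .put("index.analysis.normalizer.my_normalizer.type", "custom")
            .putList("index.analysis.normalizer.my_normalizer.filter", "lowercase")
            .build();
    }
}
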
@@ -56,7 +56,6 @@
import org.elasticsearch.index.analysis.IrishAnalyzerProvider;
import org.elasticsearch.index.analysis.ItalianAnalyzerProvider;
import org.elasticsearch.index.analysis.KeywordAnalyzerProvider;
import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
import org.elasticsearch.index.analysis.LatvianAnalyzerProvider;
import org.elasticsearch.index.analysis.LithuanianAnalyzerProvider;
import org.elasticsearch.index.analysis.NorwegianAnalyzerProvider;
@@ -225,7 +224,6 @@ static Map<String, PreConfiguredTokenizer> setupPreConfiguredTokenizers(List<Ana
private NamedRegistry<AnalysisProvider<TokenizerFactory>> setupTokenizers(List<AnalysisPlugin> plugins) {
NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = new NamedRegistry<>("tokenizer");
tokenizers.register("standard", StandardTokenizerFactory::new);
tokenizers.register("keyword", KeywordTokenizerFactory::new);
tokenizers.extractAndRegister(plugins, AnalysisPlugin::getTokenizers);
return tokenizers;
}
@@ -19,6 +19,7 @@
package org.elasticsearch.action.admin.indices;

import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.Version;
import org.elasticsearch.action.admin.indices.analyze.AnalyzeRequest;
@@ -37,6 +38,7 @@
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.indices.analysis.AnalysisModuleTests.AppendCharFilter;
@@ -107,6 +109,12 @@ public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
return singletonMap("append", AppendCharFilterFactory::new);
}

@Override
public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
return singletonMap("keyword", (indexSettings, environment, name, settings) ->
() -> new MockTokenizer(MockTokenizer.KEYWORD, false));
}

@Override
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
return singletonMap("mock", MockFactory::new);
@@ -37,10 +37,13 @@
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.engine.VersionConflictEngineException;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.MockKeywordPlugin;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
@@ -58,6 +61,12 @@
import static org.hamcrest.Matchers.nullValue;

public class GetTermVectorsIT extends AbstractTermVectorsTestCase {

@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
return Collections.singleton(MockKeywordPlugin.class);
}
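
MockKeywordPlugin gives the test cluster a keyword tokenizer now that core no longer registers one. Judging from the identical registrations elsewhere in this commit, its shape is presumably close to:

public class MockKeywordPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
        // Test-only stand-in for the keyword tokenizer moved to analysis-common.
        return singletonMap("keyword", (indexSettings, environment, name, settings) ->
            () -> new MockTokenizer(MockTokenizer.KEYWORD, false));
    }
}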

public void testNoSuchDoc() throws Exception {
XContentBuilder mapping = jsonBuilder().startObject().startObject("type1")
.startObject("properties")
@@ -432,7 +432,7 @@ public void testRecoverMissingAnalyzer() throws Exception {
logger.info("--> starting one node");
internalCluster().startNode();
prepareCreate("test").setSettings(Settings.builder()
.put("index.analysis.analyzer.test.tokenizer", "keyword")
.put("index.analysis.analyzer.test.tokenizer", "standard")
.put("index.number_of_shards", "1"))
.addMapping("type1", "{\n" +
" \"type1\": {\n" +
@@ -20,6 +20,8 @@
package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.MockLowerCaseFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
@@ -71,7 +73,7 @@ public void testTokenizer() throws IOException {
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings));
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, MOCK_ANALYSIS_PLUGIN));
assertEquals("Custom normalizer [my_normalizer] cannot configure a tokenizer", e.getMessage());
}

@@ -135,7 +137,7 @@ public Reader create(Reader reader) {
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
int result = reader.read(cbuf, off, len);
for (int i = off; i < result; i++) {
for (int i = off; i < off + len; i++) {
if (cbuf[i] == 'a') {
cbuf[i] = 'z';
}
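
The loop-bound fix stops treating read's return value as an end offset, which was wrong whenever off > 0. The new bound scans the whole requested range even on a short read, which is harmless for this test filter; a stricter variant that only rewrites characters actually read would be the following sketch, not what the commit does:

@Override
public int read(char[] cbuf, int off, int len) throws IOException {
    int result = reader.read(cbuf, off, len); // -1 on EOF, may be < len
    for (int i = off; i < off + Math.max(result, 0); i++) {
        if (cbuf[i] == 'a') {
            cbuf[i] = 'z';
        }
    }
    return result;
}
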
@@ -157,5 +159,11 @@ public Object getMultiTermComponent() {
return new Factory();
});
}

@Override
public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
return singletonMap("keyword", (indexSettings, environment, name, settings) ->
() -> new MockTokenizer(MockTokenizer.KEYWORD, false));
}
}
}
@@ -20,6 +20,8 @@
package org.elasticsearch.index.mapper;

import org.apache.lucene.analysis.MockLowerCaseFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
@@ -33,7 +35,9 @@
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.index.mapper.MapperService.MergeReason;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESSingleNodeTestCase;
@@ -44,8 +48,10 @@
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;

import static java.util.Collections.singletonList;
import static java.util.Collections.singletonMap;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;

@@ -58,6 +64,21 @@ public static class MockAnalysisPlugin extends Plugin implements AnalysisPlugin
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
return singletonList(PreConfiguredTokenFilter.singleton("mock_other_lowercase", true, MockLowerCaseFilter::new));
}

@Override
public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
return singletonMap("keyword", (indexSettings, environment, name, settings) -> {
class Factory implements TokenizerFactory {

@Override
public Tokenizer create() {
return new MockTokenizer(MockTokenizer.KEYWORD, false);
}
}
return new Factory();
});
}

};

@Override