Add a prebuilt ICU Analyzer #34958

Merged
10 commits merged on Nov 21, 2018
18 changes: 18 additions & 0 deletions docs/plugins/analysis-icu.asciidoc
@@ -26,6 +26,24 @@ characters.
:plugin_name: analysis-icu
include::install_remove.asciidoc[]

[[analysis-icu-analyzer]]
==== ICU Analyzer

Performs basic normalization, tokenization and character folding, using the
`icu_normalizer` char filter, `icu_tokenizer` and `icu_folding` token filter.

The following parameters are accepted:

[horizontal]
`method`::

Normalization method. Accepts `nfkc`, `nfc` or `nfkc_cf` (default).

`mode`::

Normalization mode. Accepts `compose` (default) or `decompose`.
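
A minimal sketch of an index that configures the analyzer with non-default
settings might look like the following (the index and analyzer names here are
purely illustrative):

[source,js]
--------------------------------------------------
PUT icu_sample
{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "my_icu_analyzer": {
            "type": "icu_analyzer",
            "method": "nfkc",
            "mode": "decompose"
          }
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE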

[[analysis-icu-normalization-charfilter]]
==== ICU Normalization Character Filter

3 changes: 3 additions & 0 deletions docs/reference/analysis/analyzers/lang-analyzer.asciidoc
@@ -421,6 +421,9 @@ PUT /catalan_example
[[cjk-analyzer]]
===== `cjk` analyzer

NOTE: You may find that `icu_analyzer` in the ICU analysis plugin works better
for CJK text than the `cjk` analyzer. Experiment with your text and queries.
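
For instance, one could make `icu_analyzer` the default analyzer for an index
of CJK documents (an illustrative sketch, not part of this change):

[source,js]
--------------------------------------------------
PUT cjk_sample
{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "default": {
            "type": "icu_analyzer"
          }
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE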

The `cjk` analyzer could be reimplemented as a `custom` analyzer as follows:

[source,js]
@@ -0,0 +1,67 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.analysis;

import com.ibm.icu.text.Normalizer2;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.ICUFoldingFilter;
import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;

import java.io.Reader;

public class IcuAnalyzerProvider extends AbstractIndexAnalyzerProvider<Analyzer> {

private final Normalizer2 normalizer;

public IcuAnalyzerProvider(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
String method = settings.get("method", "nfkc_cf");
String mode = settings.get("mode", "compose");
if (!"compose".equals(mode) && !"decompose".equals(mode)) {
throw new IllegalArgumentException("Unknown mode [" + mode + "] in analyzer [" + name +
"], expected one of [compose, decompose]");
}
Normalizer2 normalizer = Normalizer2.getInstance(
null, method, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);
this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(indexSettings, normalizer, settings);
}

@Override
public Analyzer get() {
return new Analyzer() {

@Override
protected Reader initReader(String fieldName, Reader reader) {
return new ICUNormalizer2CharFilter(reader, normalizer);
}

@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new ICUTokenizer();
return new TokenStreamComponents(source, new ICUFoldingFilter(source));
}
};
}
}
@@ -21,8 +21,11 @@

import static java.util.Collections.singletonMap;

import org.apache.lucene.analysis.Analyzer;
import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
import org.elasticsearch.index.analysis.AnalyzerProvider;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.IcuAnalyzerProvider;
import org.elasticsearch.index.analysis.IcuCollationTokenFilterFactory;
import org.elasticsearch.index.analysis.IcuFoldingTokenFilterFactory;
import org.elasticsearch.index.analysis.IcuNormalizerCharFilterFactory;
@@ -60,6 +63,11 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
return extra;
}

@Override
public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
return singletonMap("icu_analyzer", IcuAnalyzerProvider::new);
}

@Override
public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
return singletonMap("icu_tokenizer", IcuTokenizerFactory::new);
@@ -0,0 +1,96 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin;
import org.elasticsearch.test.IndexSettingsModule;

import java.io.IOException;

import static org.hamcrest.Matchers.containsString;

public class IcuAnalyzerTests extends BaseTokenStreamTestCase {

public void testMixedAlphabetTokenization() throws IOException {

Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);

String input = "안녕은하철도999극장판2.1981년8월8일.일본개봉작1999년재더빙video판";

AnalysisICUPlugin plugin = new AnalysisICUPlugin();
Analyzer analyzer = plugin.getAnalyzers().get("icu_analyzer").get(idxSettings, null, "icu", settings).get();
assertAnalyzesTo(analyzer, input,
new String[]{"안녕은하철도", "999", "극장판", "2.1981", "년", "8", "월", "8", "일", "일본개봉작", "1999", "년재더빙", "video", "판"});

}

public void testMiddleDots() throws IOException {
Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);

String input = "경승지·산악·협곡·해협·곶·심연·폭포·호수·급류";

Analyzer analyzer = new IcuAnalyzerProvider(idxSettings, null, "icu", settings).get();
assertAnalyzesTo(analyzer, input,
new String[]{"경승지", "산악", "협곡", "해협", "곶", "심연", "폭포", "호수", "급류"});
}

public void testUnicodeNumericCharacters() throws IOException {

Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);

String input = "① ② ③ ⑴ ⑵ ⑶ ¼ ⅓ ⅜ ¹ ² ³ ₁ ₂ ₃";

Analyzer analyzer = new IcuAnalyzerProvider(idxSettings, null, "icu", settings).get();
assertAnalyzesTo(analyzer, input,
new String[]{"1", "2", "3", "1", "2", "3", "1/4", "1/3", "3/8", "1", "2", "3", "1", "2", "3"});
}

public void testBadSettings() {

Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put("mode", "wrong")
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);

IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> {
new IcuAnalyzerProvider(idxSettings, null, "icu", settings);
});

assertThat(e.getMessage(), containsString("Unknown mode [wrong] in analyzer [icu], expected one of [compose, decompose]"));

}

}