From 61486680a25fc16299f666385970fd7dc310271c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Tue, 17 Jul 2018 09:04:41 +0200 Subject: [PATCH] Add exclusion option to `keep_types` token filter (#32012) Currently the `keep_types` token filter includes all token types specified using its `types` parameter. Lucenes TypeTokenFilter also provides a second mode where instead of keeping the specified tokens (include) they are filtered out (exclude). This change exposes this option as a new `mode` parameter that can either take the values `include` (the default, if not specified) or `exclude`. Closes #29277 --- .../keep-types-tokenfilter.asciidoc | 74 ++++++++++++++++++- .../common/KeepTypesFilterFactory.java | 36 +++++++-- .../common/KeepTypesFilterFactoryTests.java | 48 ++++++++++-- 3 files changed, 142 insertions(+), 16 deletions(-) diff --git a/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc index afaf4f8fa8c46..05687f8669155 100644 --- a/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc @@ -8,8 +8,9 @@ contained in a predefined set. [float] === Options [horizontal] -types:: a list of types to keep - +types:: a list of types to include (default mode) or exclude +mode:: if set to `include` (default) the specified token types will be kept, +if set to `exclude` the specified token types will be removed from the stream [float] === Settings example @@ -53,7 +54,7 @@ POST /keep_types_example/_analyze // CONSOLE // TEST[continued] -And it'd respond: +The response will be: [source,js] -------------------------------------------------- @@ -72,3 +73,70 @@ And it'd respond: // TESTRESPONSE Note how only the `` token is in the output. + +=== Exclude mode settings example + +If the `mode` parameter is set to `exclude` like in the following example: + +[source,js] +-------------------------------------------------- +PUT /keep_types_exclude_example +{ + "settings" : { + "analysis" : { + "analyzer" : { + "my_analyzer" : { + "tokenizer" : "standard", + "filter" : ["standard", "lowercase", "remove_numbers"] + } + }, + "filter" : { + "remove_numbers" : { + "type" : "keep_types", + "mode" : "exclude", + "types" : [ "" ] + } + } + } + } +} +-------------------------------------------------- +// CONSOLE + +And we test it like: + +[source,js] +-------------------------------------------------- +POST /keep_types_exclude_example/_analyze +{ + "analyzer" : "my_analyzer", + "text" : "hello 101 world" +} +-------------------------------------------------- +// CONSOLE +// TEST[continued] + +The response will be: + +[source,js] +-------------------------------------------------- +{ + "tokens": [ + { + "token": "hello", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "world", + "start_offset": 10, + "end_offset": 15, + "type": "", + "position": 2 + } + ] +} +-------------------------------------------------- +// TESTRESPONSE \ No newline at end of file diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java index 0f94b521e4b7d..b6b8b45fabfc2 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java @@ -29,21 +29,47 @@ import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Set; /** * A {@link TokenFilterFactory} for {@link TypeTokenFilter}. This filter only * keep tokens that are contained in the set configured via - * {@value #KEEP_TYPES_KEY} setting. + * {@value #KEEP_TYPES_MODE_KEY} setting. *

* Configuration options: *

    - *
  • {@value #KEEP_TYPES_KEY} the array of words / tokens to keep.
  • + *
  • {@value #KEEP_TYPES_KEY} the array of words / tokens.
  • + *
  • {@value #KEEP_TYPES_MODE_KEY} whether to keep ("include") or discard + * ("exclude") the specified token types.
  • *
*/ public class KeepTypesFilterFactory extends AbstractTokenFilterFactory { private final Set keepTypes; - private static final String KEEP_TYPES_KEY = "types"; + private final KeepTypesMode includeMode; + static final String KEEP_TYPES_KEY = "types"; + static final String KEEP_TYPES_MODE_KEY = "mode"; + + enum KeepTypesMode { + INCLUDE, EXCLUDE; + + @Override + public String toString() { + return this.name().toLowerCase(Locale.ROOT); + } + + private static KeepTypesMode fromString(String modeString) { + String lc = modeString.toLowerCase(Locale.ROOT); + if (lc.equals("include")) { + return INCLUDE; + } else if (lc.equals("exclude")) { + return EXCLUDE; + } else { + throw new IllegalArgumentException("`keep_types` tokenfilter mode can only be [" + KeepTypesMode.INCLUDE + "] or [" + + KeepTypesMode.EXCLUDE + "] but was [" + modeString + "]."); + } + } + }; KeepTypesFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); @@ -52,12 +78,12 @@ public class KeepTypesFilterFactory extends AbstractTokenFilterFactory { if ((arrayKeepTypes == null)) { throw new IllegalArgumentException("keep_types requires `" + KEEP_TYPES_KEY + "` to be configured"); } - + this.includeMode = KeepTypesMode.fromString(settings.get(KEEP_TYPES_MODE_KEY, "include")); this.keepTypes = new HashSet<>(arrayKeepTypes); } @Override public TokenStream create(TokenStream tokenStream) { - return new TypeTokenFilter(tokenStream, keepTypes, true); + return new TypeTokenFilter(tokenStream, keepTypes, includeMode == KeepTypesMode.INCLUDE); } } diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java index a19882d6faa00..d0c7723457ff3 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java @@ -34,19 +34,51 @@ import static org.hamcrest.Matchers.instanceOf; public class KeepTypesFilterFactoryTests extends ESTokenStreamTestCase { - public void testKeepTypes() throws IOException { - Settings settings = Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.keep_numbers.type", "keep_types") - .putList("index.analysis.filter.keep_numbers.types", new String[] {"", ""}) - .build(); + + private static final String BASE_SETTING = "index.analysis.filter.keep_numbers"; + + public void testKeepTypesInclude() throws IOException { + Settings.Builder settingsBuilder = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put(BASE_SETTING + ".type", "keep_types") + .putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "", "" }); + // either use default mode or set "include" mode explicitly + if (random().nextBoolean()) { + settingsBuilder.put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY, + KeepTypesFilterFactory.KeepTypesMode.INCLUDE); + } + Settings settings = settingsBuilder.build(); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers"); + assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class)); + String source = "Hello 123 world"; + String[] expected = new String[] { "123" }; + Tokenizer tokenizer = new StandardTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] { 2 }); + } + + public void testKeepTypesExclude() throws IOException { + Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put(BASE_SETTING + ".type", "keep_types") + .putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "", "" }) + .put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY, KeepTypesFilterFactory.KeepTypesMode.EXCLUDE).build(); ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers"); assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class)); String source = "Hello 123 world"; - String[] expected = new String[]{"123"}; + String[] expected = new String[] { "Hello", "world" }; Tokenizer tokenizer = new StandardTokenizer(); tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{2}); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] { 1, 2 }); + } + + public void testKeepTypesException() throws IOException { + Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put(BASE_SETTING + ".type", "keep_types") + .putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "", "" }) + .put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY, "bad_parameter").build(); + IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, + () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin())); + assertEquals("`keep_types` tokenfilter mode can only be [include] or [exclude] but was [bad_parameter].", ex.getMessage()); } }