[Feature] Adding a char_group tokenizer #24186

Merged 5 commits on May 22, 2018
7 changes: 7 additions & 0 deletions docs/reference/analysis/tokenizers.asciidoc
@@ -103,6 +103,11 @@
The `simple_pattern` tokenizer uses a regular expression to capture matching
text as terms. It uses a restricted subset of regular expression features
and is generally faster than the `pattern` tokenizer.

<<analysis-chargroup-tokenizer,Char Group Tokenizer>>::

The `char_group` tokenizer is configured with sets of characters to split
on, which is usually less expensive than running regular expressions.

<<analysis-simplepatternsplit-tokenizer,Simple Pattern Split Tokenizer>>::

The `simple_pattern_split` tokenizer uses the same restricted regular expression
@@ -143,6 +148,8 @@
include::tokenizers/keyword-tokenizer.asciidoc[]

include::tokenizers/pattern-tokenizer.asciidoc[]

include::tokenizers/chargroup-tokenizer.asciidoc[]

include::tokenizers/simplepattern-tokenizer.asciidoc[]

include::tokenizers/simplepatternsplit-tokenizer.asciidoc[]
80 changes: 80 additions & 0 deletions docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc
@@ -0,0 +1,80 @@
[[analysis-chargroup-tokenizer]]
=== Char Group Tokenizer

The `char_group` tokenizer breaks text into terms whenever it encounters a
character that is in a defined set. It is mostly useful for cases where simple
custom tokenization is desired and the overhead of using the <<analysis-pattern-tokenizer, `pattern` tokenizer>>
is not acceptable.

[float]
=== Configuration

The `char_group` tokenizer accepts one parameter:

[horizontal]
`tokenize_on_chars`::
A list of characters to tokenize the string on. Whenever a character
from this list is encountered, a new token is started. The list accepts either single
characters, e.g. `-`, or character groups: `whitespace`, `letter`, `digit`,
`punctuation`, `symbol`.

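For example, a custom analyzer can be built on this tokenizer in the index
settings. The sketch below is illustrative and not part of this change; the
index name `my_index` and the names `my_analyzer` and `my_tokenizer` are
placeholders:

[source,js]
---------------------------
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "char_group",
          "tokenize_on_chars": [
            "whitespace",
            "-"
          ]
        }
      }
    }
  }
}
---------------------------
// NOTCONSOLE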

[float]
=== Example output

[source,js]
---------------------------
POST _analyze
{
"tokenizer": {
"type": "char_group",
"tokenize_on_chars": [
"whitespace",
"-",
"\n"
]
},
"text": "The QUICK brown-fox"
}
---------------------------
// CONSOLE

returns

[source,js]
---------------------------
{
"tokens": [
{
"token": "The",
"start_offset": 0,
"end_offset": 3,
"type": "word",
"position": 0
},
{
"token": "QUICK",
"start_offset": 4,
"end_offset": 9,
"type": "word",
"position": 1
},
{
"token": "brown",
"start_offset": 10,
"end_offset": 15,
"type": "word",
"position": 2
},
{
"token": "fox",
"start_offset": 16,
"end_offset": 19,
"type": "word",
"position": 3
}
]
}
---------------------------
// TESTRESPONSE

135 changes: 135 additions & 0 deletions CharGroupTokenizerFactory.java
@@ -0,0 +1,135 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;

import java.util.HashSet;
import java.util.Set;

public class CharGroupTokenizerFactory extends AbstractTokenizerFactory {

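// code points to split on, plus flags for each supported character group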
private final Set<Integer> tokenizeOnChars = new HashSet<>();
private boolean tokenizeOnSpace = false;
private boolean tokenizeOnLetter = false;
private boolean tokenizeOnDigit = false;
private boolean tokenizeOnPunctuation = false;
private boolean tokenizeOnSymbol = false;

public CharGroupTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);

for (final String c : settings.getAsList("tokenize_on_chars")) {
if (c == null || c.isEmpty()) {
throw new RuntimeException("[tokenize_on_chars] cannot contain empty entries");
}

if (c.length() == 1) {
tokenizeOnChars.add((int) c.charAt(0));
} else if (c.charAt(0) == '\\') {
tokenizeOnChars.add((int) parseEscapedChar(c));
} else {
switch (c) {
case "letter":
tokenizeOnLetter = true;
break;
case "digit":
tokenizeOnDigit = true;
break;
case "whitespace":
tokenizeOnSpace = true;
break;
case "punctuation":
tokenizeOnPunctuation = true;
break;
case "symbol":
tokenizeOnSymbol = true;
break;
default:
throw new RuntimeException("Invalid character group name [" + c + "]");
}
}
}
}

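/**
 * Parses an escape sequence of the form {@code \\}, {@code \n}, {@code \t},
 * {@code \r}, {@code \b}, {@code \f} or {@code \uXXXX} into the character it
 * denotes, rejecting anything else.
 */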
private char parseEscapedChar(final String s) {
int len = s.length();
char c = s.charAt(0);
if (c == '\\') {
if (len <= 1) {
throw new RuntimeException("Invalid escaped char in [" + s + "]");
}
c = s.charAt(1);
switch (c) {
case '\\':
return '\\';
case 'n':
return '\n';
case 't':
return '\t';
case 'r':
return '\r';
case 'b':
return '\b';
case 'f':
return '\f';
case 'u':
// a \uXXXX escape must be exactly six characters long
if (len != 6) {
throw new RuntimeException("Invalid escaped char in [" + s + "]");
}
return (char) Integer.parseInt(s.substring(2), 16);
default:
throw new RuntimeException("Invalid escaped char " + c + " in [" + s + "]");
}
} else {
throw new RuntimeException("Invalid escaped char [" + s + "]");
}
}

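// CharTokenizer emits maximal runs of characters for which isTokenChar()
// returns true, so returning false for a configured character splits there.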
@Override
public Tokenizer create() {
return new CharTokenizer() {
@Override
protected boolean isTokenChar(int c) {
if (tokenizeOnSpace && Character.isWhitespace(c)) {
return false;
}
if (tokenizeOnLetter && Character.isLetter(c)) {
return false;
}
if (tokenizeOnDigit && Character.isDigit(c)) {
return false;
}
if (tokenizeOnPunctuation && CharMatcher.Basic.PUNCTUATION.isTokenChar(c)) {
return false;
}
if (tokenizeOnSymbol && CharMatcher.Basic.SYMBOL.isTokenChar(c)) {
return false;
}
return !tokenizeOnChars.contains(c);
}
};
}
}
1 change: 1 addition & 0 deletions CommonAnalysisPlugin.java
@@ -184,6 +184,7 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
tokenizers.put("ngram", NGramTokenizerFactory::new);
tokenizers.put("edgeNGram", EdgeNGramTokenizerFactory::new);
tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
tokenizers.put("char_group", CharGroupTokenizerFactory::new);
tokenizers.put("classic", ClassicTokenizerFactory::new);
tokenizers.put("letter", LetterTokenizerFactory::new);
tokenizers.put("lowercase", LowerCaseTokenizerFactory::new);
74 changes: 74 additions & 0 deletions CharGroupTokenizerFactoryTests.java
@@ -0,0 +1,74 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.test.ESTokenStreamTestCase;
import org.elasticsearch.test.IndexSettingsModule;

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;


public class CharGroupTokenizerFactoryTests extends ESTokenStreamTestCase {
public void testParseTokenChars() {
final Index index = new Index("test", "_na_");
final Settings indexSettings = newAnalysisSettingsBuilder().build();
IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
final String name = "cg";
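// each of these configurations is invalid and should be rejected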
for (String[] conf : Arrays.asList(
new String[] { "\\v" },
new String[] { "\\u00245" },
new String[] { "commas" },
new String[] { "a", "b", "c", "\\$" })) {
final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", conf).build();
expectThrows(RuntimeException.class, () -> new CharGroupTokenizerFactory(indexProperties, null, name, settings).create());
}

for (String[] conf : Arrays.asList(
new String[0],
new String[] { "\\n" },
new String[] { "\\u0024" },
new String[] { "whitespace" },
new String[] { "a", "b", "c" },
new String[] { "a", "b", "c", "\\r" },
new String[] { "\\r" },
new String[] { "f", "o", "o", "symbol" })) {
final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", Arrays.asList(conf)).build();
new CharGroupTokenizerFactory(indexProperties, null, name, settings).create();
// no exception
}
}

public void testTokenization() throws IOException {
final Index index = new Index("test", "_na_");
final String name = "cg";
final Settings indexSettings = newAnalysisSettingsBuilder().build();
final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", "whitespace", ":", "\\u0024").build();
Tokenizer tokenizer = new CharGroupTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings),
null, name, settings).create();
tokenizer.setReader(new StringReader("foo bar $34 test:test2"));
assertTokenStreamContents(tokenizer, new String[] {"foo", "bar", "34", "test", "test2"});
}
}