[Feature] Support hashtag in token filter type_table rules parser (opensearch-project#10220) (opensearch-project#10257)

* Support hashtag in token filter type_table rules parser

* Address comment from @reta

---------


(cherry picked from commit 8f4f995)

Signed-off-by: Louis Chu <[email protected]>
Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent eaccdf5 commit b12f908
Showing 6 changed files with 141 additions and 27 deletions.
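In short: Analysis.parseWordList previously dropped every line beginning with "#" as a comment unless a caller opted out via a removeComments flag (and callers that opted out, such as the mapping char filter, then could not use comment lines at all). This commit removes the flag and instead keeps a "#"-prefixed line whenever it matches the rule syntax: "# => ALPHANUM" is now parsed, while "# This is a comment" is still skipped. With such a rule in place, the word_delimiter filters keep "#text2" as a single token rather than splitting at the hash sign. A minimal, self-contained sketch of the new decision logic (the pattern and predicate mirror the Analysis.java diff below; the class and method names here are illustrative only, not part of the commit):

    import java.util.regex.Pattern;

    public class HashTagRuleSketch {
        // Same pattern the commit adds to Analysis.java: '#' (with optional
        // surrounding whitespace) followed immediately by the "=>" separator.
        private static final Pattern HASH_TAG_RULE_PATTERN = Pattern.compile("^\\s*#\\s*=>");

        // A '#'-prefixed line is now parsed as a rule when it matches the
        // pattern; any other '#'-prefixed line is still treated as a comment.
        static boolean isParsedAsRule(String line) {
            return !line.startsWith("#") || HASH_TAG_RULE_PATTERN.matcher(line).find();
        }

        public static void main(String[] args) {
            System.out.println(isParsedAsRule("# This is a comment")); // false -> skipped
            System.out.println(isParsedAsRule("# => ALPHANUM"));       // true  -> parsed
            System.out.println(isParsedAsRule("$ => DIGIT"));          // true  -> parsed
        }
    }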
@@ -54,7 +54,7 @@ public class MappingCharFilterFactory extends AbstractCharFilterFactory implemen
MappingCharFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name);

- List<MappingRule<String, String>> rules = Analysis.parseWordList(env, settings, "mappings", this::parse, false);
+ List<MappingRule<String, String>> rules = Analysis.parseWordList(env, settings, "mappings", this::parse);
if (rules == null) {
throw new IllegalArgumentException("mapping requires either `mappings` or `mappings_path` to be configured");
}
@@ -211,8 +211,8 @@ private void createTokenFilterFactoryWithTypeTable(String[] rules) throws IOException
}

public void testTypeTableParsingError() {
String[] rules = { "# This is a comment", "$ => DIGIT", "\\u200D => ALPHANUM", "abc => ALPHA" };
String[] rules = { "# This is a comment", "# => ALPHANUM", "$ => DIGIT", "\\u200D => ALPHANUM", "abc => ALPHA" };
RuntimeException ex = expectThrows(RuntimeException.class, () -> createTokenFilterFactoryWithTypeTable(rules));
assertEquals("Line [4]: Invalid mapping rule: [abc => ALPHA]. Only a single character is allowed.", ex.getMessage());
assertEquals("Line [5]: Invalid mapping rule: [abc => ALPHA]. Only a single character is allowed.", ex.getMessage());
}
}
@@ -37,6 +37,7 @@ public static CharFilterFactory create(String... rules) throws IOException {

public void testRulesOk() throws IOException {
MappingCharFilterFactory mappingCharFilterFactory = (MappingCharFilterFactory) create(
"# This is a comment",
"# => _hashtag_",
":) => _happy_",
":( => _sad_"
@@ -64,7 +65,10 @@ public void testRuleError() {
}

public void testRulePartError() {
- RuntimeException ex = expectThrows(RuntimeException.class, () -> create("# => _hashtag_", ":) => _happy_", "a:b"));
- assertEquals("Line [3]: Invalid mapping rule : [a:b]", ex.getMessage());
+ RuntimeException ex = expectThrows(
+     RuntimeException.class,
+     () -> create("# This is a comment", "# => _hashtag_", ":) => _happy_", "a:b")
+ );
+ assertEquals("Line [4]: Invalid mapping rule : [a:b]", ex.getMessage());
}
}
@@ -127,6 +127,69 @@
  - match: { tokens.2.token: brown }
  - match: { tokens.3.token: fox }

  - do:
      indices.analyze:
        body:
          text: 'text1 #text2'
          tokenizer: whitespace
          filter:
            - type: word_delimiter
              split_on_numerics: false
              type_table:
                - "\\u0023 => ALPHANUM"
  - length: { tokens: 2 }
  - match: { tokens.0.token: text1 }
  - match: { tokens.0.start_offset: 0 }
  - match: { tokens.0.end_offset: 5 }
  - match: { tokens.0.position: 0 }
  - match: { tokens.1.token: "#text2" }
  - match: { tokens.1.start_offset: 6 }
  - match: { tokens.1.end_offset: 12 }
  - match: { tokens.1.position: 1 }

  - do:
      indices.analyze:
        body:
          text: 'text1 #text2'
          tokenizer: whitespace
          filter:
            - type: word_delimiter
              split_on_numerics: false
              type_table:
                - "# This is a comment"
                - "# => ALPHANUM"
  - length: { tokens: 2 }
  - match: { tokens.0.token: text1 }
  - match: { tokens.0.start_offset: 0 }
  - match: { tokens.0.end_offset: 5 }
  - match: { tokens.0.position: 0 }
  - match: { tokens.1.token: "#text2" }
  - match: { tokens.1.start_offset: 6 }
  - match: { tokens.1.end_offset: 12 }
  - match: { tokens.1.position: 1 }

  - do:
      indices.analyze:
        body:
          text: 'text1 #text2'
          tokenizer: whitespace
          filter:
            - type: word_delimiter
              split_on_numerics: false
              type_table:
                - "# This is a comment"
                - "# => ALPHANUM"
                - "@ => ALPHANUM"
  - length: { tokens: 2 }
  - match: { tokens.0.token: text1 }
  - match: { tokens.0.start_offset: 0 }
  - match: { tokens.0.end_offset: 5 }
  - match: { tokens.0.position: 0 }
  - match: { tokens.1.token: "#text2" }
  - match: { tokens.1.start_offset: 6 }
  - match: { tokens.1.end_offset: 12 }
  - match: { tokens.1.position: 1 }

---
"word_delimiter_graph":
  - do:
@@ -231,6 +294,69 @@
  - match: { detail.tokenfilters.0.tokens.5.end_offset: 19 }
  - match: { detail.tokenfilters.0.tokens.5.position: 5 }

  - do:
      indices.analyze:
        body:
          text: 'text1 #text2'
          tokenizer: whitespace
          filter:
            - type: word_delimiter_graph
              split_on_numerics: false
              type_table:
                - "\\u0023 => ALPHANUM"
  - length: { tokens: 2 }
  - match: { tokens.0.token: text1 }
  - match: { tokens.0.start_offset: 0 }
  - match: { tokens.0.end_offset: 5 }
  - match: { tokens.0.position: 0 }
  - match: { tokens.1.token: "#text2" }
  - match: { tokens.1.start_offset: 6 }
  - match: { tokens.1.end_offset: 12 }
  - match: { tokens.1.position: 1 }

  - do:
      indices.analyze:
        body:
          text: 'text1 #text2'
          tokenizer: whitespace
          filter:
            - type: word_delimiter_graph
              split_on_numerics: false
              type_table:
                - "# This is a comment"
                - "# => ALPHANUM"
  - length: { tokens: 2 }
  - match: { tokens.0.token: text1 }
  - match: { tokens.0.start_offset: 0 }
  - match: { tokens.0.end_offset: 5 }
  - match: { tokens.0.position: 0 }
  - match: { tokens.1.token: "#text2" }
  - match: { tokens.1.start_offset: 6 }
  - match: { tokens.1.end_offset: 12 }
  - match: { tokens.1.position: 1 }

  - do:
      indices.analyze:
        body:
          text: 'text1 #text2'
          tokenizer: whitespace
          filter:
            - type: word_delimiter_graph
              split_on_numerics: false
              type_table:
                - "# This is a comment"
                - "# => ALPHANUM"
                - "@ => ALPHANUM"
  - length: { tokens: 2 }
  - match: { tokens.0.token: text1 }
  - match: { tokens.0.start_offset: 0 }
  - match: { tokens.0.end_offset: 5 }
  - match: { tokens.0.position: 0 }
  - match: { tokens.1.token: "#text2" }
  - match: { tokens.1.start_offset: 6 }
  - match: { tokens.1.end_offset: 12 }
  - match: { tokens.1.position: 1 }

---
"unique":
  - do:
@@ -69,6 +69,7 @@
          char_filter:
            - type: mapping
              mappings:
+               - "# This is a comment"
                - "# => _hashsign_"
                - "@ => _atsign_"
  - length: { tokens: 3 }
27 changes: 5 additions & 22 deletions server/src/main/java/org/opensearch/index/analysis/Analysis.java
@@ -87,6 +87,7 @@
import java.util.Locale;
import java.util.Map;
import java.util.Set;
+ import java.util.regex.Pattern;

import static java.util.Collections.unmodifiableMap;

@@ -98,6 +99,9 @@
public class Analysis {
private static final Logger LOGGER = LogManager.getLogger(Analysis.class);

+ // Regular expression to support hashtag tokenization
+ private static final Pattern HASH_TAG_RULE_PATTERN = Pattern.compile("^\\s*#\\s*=>");

public static CharArraySet parseStemExclusion(Settings settings, CharArraySet defaultStemExclusion) {
String value = settings.get("stem_exclusion");
if ("_none_".equals(value)) {
@@ -222,16 +226,6 @@ public static <T> List<T> parseWordList(Environment env, Settings settings, Stri
return parseWordList(env, settings, settingPrefix + "_path", settingPrefix, parser);
}

- public static <T> List<T> parseWordList(
-     Environment env,
-     Settings settings,
-     String settingPrefix,
-     CustomMappingRuleParser<T> parser,
-     boolean removeComments
- ) {
-     return parseWordList(env, settings, settingPrefix + "_path", settingPrefix, parser, removeComments);
- }

/**
* Parses a list of words from the specified settings or from a file, with the given parser.
*
@@ -246,17 +240,6 @@ public static <T> List<T> parseWordList(
String settingPath,
String settingList,
CustomMappingRuleParser<T> parser
- ) {
-     return parseWordList(env, settings, settingPath, settingList, parser, true);
- }
-
- public static <T> List<T> parseWordList(
-     Environment env,
-     Settings settings,
-     String settingPath,
-     String settingList,
-     CustomMappingRuleParser<T> parser,
-     boolean removeComments
) {
List<String> words = getWordList(env, settings, settingPath, settingList);
if (words == null) {
@@ -266,7 +249,7 @@ public static <T> List<T> parseWordList(
int lineNum = 0;
for (String word : words) {
lineNum++;
- if (removeComments == false || word.startsWith("#") == false) {
+ if (word.startsWith("#") == false || HASH_TAG_RULE_PATTERN.matcher(word).find() == true) {
try {
rules.add(parser.apply(word));
} catch (RuntimeException ex) {
