From b12f908fd1fa1cb43bd88bf6f401a61af346218c Mon Sep 17 00:00:00 2001 From: "opensearch-trigger-bot[bot]" <98922864+opensearch-trigger-bot[bot]@users.noreply.github.com> Date: Thu, 28 Sep 2023 10:39:42 -0400 Subject: [PATCH] [Feature] Support hashtag in token filter type_table rules parser (#10220) (#10257) * Support hashtag in token filter type_table rules parser * Address comment from @reta --------- (cherry picked from commit 8f4f9951de817a58d18815c5dd35896dae2bced1) Signed-off-by: Louis Chu <clingzhi@amazon.com> Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> --- .../common/MappingCharFilterFactory.java | 2 +- ...rdDelimiterTokenFilterFactoryTestCase.java | 4 +- .../common/MappingCharFilterFactoryTests.java | 8 +- .../test/analysis-common/40_token_filters.yml | 126 ++++++++++++++++++ .../test/analysis-common/50_char_filters.yml | 1 + .../opensearch/index/analysis/Analysis.java | 27 +--- 6 files changed, 141 insertions(+), 27 deletions(-) diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MappingCharFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MappingCharFilterFactory.java index bd241de749f11..d6d9f8975f2fc 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MappingCharFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MappingCharFilterFactory.java @@ -54,7 +54,7 @@ public class MappingCharFilterFactory extends AbstractCharFilterFactory implemen MappingCharFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name); - List<MappingRule<String, String>> rules = Analysis.parseWordList(env, settings, "mappings", this::parse, false); + List<MappingRule<String, String>> rules = Analysis.parseWordList(env, settings, "mappings", this::parse); if (rules == null) { throw new IllegalArgumentException("mapping requires either `mappings` or `mappings_path` to be configured"); } diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java index 2c3864a36fd22..d28155272a9db 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java @@ -211,8 +211,8 @@ private void createTokenFilterFactoryWithTypeTable(String[] rules) throws IOExce } public void testTypeTableParsingError() { - String[] rules = { "# This is a comment", "$ => DIGIT", "\\u200D => ALPHANUM", "abc => ALPHA" }; + String[] rules = { "# This is a comment", "# => ALPHANUM", "$ => DIGIT", "\\u200D => ALPHANUM", "abc => ALPHA" }; RuntimeException ex = expectThrows(RuntimeException.class, () -> createTokenFilterFactoryWithTypeTable(rules)); - assertEquals("Line [4]: Invalid mapping rule: [abc => ALPHA]. Only a single character is allowed.", ex.getMessage()); + assertEquals("Line [5]: Invalid mapping rule: [abc => ALPHA]. Only a single character is allowed.", ex.getMessage()); } } diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MappingCharFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MappingCharFilterFactoryTests.java index 843a3cecd56c0..2ef9f4e91655f 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MappingCharFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MappingCharFilterFactoryTests.java @@ -37,6 +37,7 @@ public static CharFilterFactory create(String... rules) throws IOException { public void testRulesOk() throws IOException { MappingCharFilterFactory mappingCharFilterFactory = (MappingCharFilterFactory) create( + "# This is a comment", "# => _hashtag_", ":) => _happy_", ":( => _sad_" @@ -64,7 +65,10 @@ public void testRuleError() { } public void testRulePartError() { - RuntimeException ex = expectThrows(RuntimeException.class, () -> create("# => _hashtag_", ":) => _happy_", "a:b")); - assertEquals("Line [3]: Invalid mapping rule : [a:b]", ex.getMessage()); + RuntimeException ex = expectThrows( + RuntimeException.class, + () -> create("# This is a comment", "# => _hashtag_", ":) => _happy_", "a:b") + ); + assertEquals("Line [4]: Invalid mapping rule : [a:b]", ex.getMessage()); } } diff --git a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml index e92cc0c4838c7..802c79c780689 100644 --- a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml +++ b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml @@ -127,6 +127,69 @@ - match: { tokens.2.token: brown } - match: { tokens.3.token: fox } + - do: + indices.analyze: + body: + text: 'text1 #text2' + tokenizer: whitespace + filter: + - type: word_delimiter + split_on_numerics: false + type_table: + - "\\u0023 => ALPHANUM" + - length: { tokens: 2 } + - match: { tokens.0.token: text1 } + - match: { tokens.0.start_offset: 0 } + - match: { tokens.0.end_offset: 5 } + - match: { tokens.0.position: 0 } + - match: { tokens.1.token: "#text2" } + - match: { tokens.1.start_offset: 6 } + - match: { tokens.1.end_offset: 12 } + - match: { tokens.1.position: 1 } + + - do: + indices.analyze: + body: + text: 'text1 #text2' + tokenizer: whitespace + filter: + - type: word_delimiter + split_on_numerics: false + type_table: + - "# This is a comment" + - "# => ALPHANUM" + - length: { tokens: 2 } + - match: { tokens.0.token: text1 } + - match: { tokens.0.start_offset: 0 } + - match: { tokens.0.end_offset: 5 } + - match: { tokens.0.position: 0 } + - match: { tokens.1.token: "#text2" } + - match: { tokens.1.start_offset: 6 } + - match: { tokens.1.end_offset: 12 } + - match: { tokens.1.position: 1 } + + - do: + indices.analyze: + body: + text: 'text1 #text2' + tokenizer: whitespace + filter: + - type: word_delimiter + split_on_numerics: false + type_table: + - "# This is a comment" + - "# => ALPHANUM" + - "@ => ALPHANUM" + - length: { tokens: 2 } + - match: { tokens.0.token: text1 } + - match: { tokens.0.start_offset: 0 } + - match: { tokens.0.end_offset: 5 } + - match: { tokens.0.position: 0 } + - match: { tokens.1.token: "#text2" } + - match: { tokens.1.start_offset: 6 } + - match: { tokens.1.end_offset: 12 } + - match: { tokens.1.position: 1 } + --- "word_delimiter_graph": - do: @@ -231,6 +294,69 @@ - match: { detail.tokenfilters.0.tokens.5.end_offset: 19 } - match: { detail.tokenfilters.0.tokens.5.position: 5 } + - do: + indices.analyze: + body: + text: 'text1 #text2' + tokenizer: whitespace + filter: + - type: word_delimiter_graph + split_on_numerics: false + type_table: + - "\\u0023 => ALPHANUM" + - length: { tokens: 2 } + - match: { tokens.0.token: text1 } + - match: { tokens.0.start_offset: 0 } + - match: { tokens.0.end_offset: 5 } + - match: { tokens.0.position: 0 } + - match: { tokens.1.token: "#text2" } + - match: { tokens.1.start_offset: 6 } + - match: { tokens.1.end_offset: 12 } + - match: { tokens.1.position: 1 } + + - do: + indices.analyze: + body: + text: 'text1 #text2' + tokenizer: whitespace + filter: + - type: word_delimiter_graph + split_on_numerics: false + type_table: + - "# This is a comment" + - "# => ALPHANUM" + - length: { tokens: 2 } + - match: { tokens.0.token: text1 } + - match: { tokens.0.start_offset: 0 } + - match: { tokens.0.end_offset: 5 } + - match: { tokens.0.position: 0 } + - match: { tokens.1.token: "#text2" } + - match: { tokens.1.start_offset: 6 } + - match: { tokens.1.end_offset: 12 } + - match: { tokens.1.position: 1 } + + - do: + indices.analyze: + body: + text: 'text1 #text2' + tokenizer: whitespace + filter: + - type: word_delimiter_graph + split_on_numerics: false + type_table: + - "# This is a comment" + - "# => ALPHANUM" + - "@ => ALPHANUM" + - length: { tokens: 2 } + - match: { tokens.0.token: text1 } + - match: { tokens.0.start_offset: 0 } + - match: { tokens.0.end_offset: 5 } + - match: { tokens.0.position: 0 } + - match: { tokens.1.token: "#text2" } + - match: { tokens.1.start_offset: 6 } + - match: { tokens.1.end_offset: 12 } + - match: { tokens.1.position: 1 } + --- "unique": - do: diff --git a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/50_char_filters.yml b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/50_char_filters.yml index 0078575ae8e57..5e266c10cba8f 100644 --- a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/50_char_filters.yml +++ b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/50_char_filters.yml @@ -69,6 +69,7 @@ char_filter: - type: mapping mappings: + - "# This is a comment" - "# => _hashsign_" - "@ => _atsign_" - length: { tokens: 3 } diff --git a/server/src/main/java/org/opensearch/index/analysis/Analysis.java b/server/src/main/java/org/opensearch/index/analysis/Analysis.java index 0062e4d8fbe05..6294230a02ada 100644 --- a/server/src/main/java/org/opensearch/index/analysis/Analysis.java +++ b/server/src/main/java/org/opensearch/index/analysis/Analysis.java @@ -87,6 +87,7 @@ import java.util.Locale; import java.util.Map; import java.util.Set; +import java.util.regex.Pattern; import static java.util.Collections.unmodifiableMap; @@ -98,6 +99,9 @@ public class Analysis { private static final Logger LOGGER = LogManager.getLogger(Analysis.class); + // Regular expression to support hashtag tokenization + private static final Pattern HASH_TAG_RULE_PATTERN = Pattern.compile("^\\s*#\\s*=>"); + public static CharArraySet parseStemExclusion(Settings settings, CharArraySet defaultStemExclusion) { String value = settings.get("stem_exclusion"); if ("_none_".equals(value)) { @@ -222,16 +226,6 @@ public static <T> List<T> parseWordList(Environment env, Settings settings, Stri return parseWordList(env, settings, settingPrefix + "_path", settingPrefix, parser); } - public static <T> List<T> parseWordList( - Environment env, - Settings settings, - String settingPrefix, - CustomMappingRuleParser<T> parser, - boolean removeComments - ) { - return parseWordList(env, settings, settingPrefix + "_path", settingPrefix, parser, removeComments); - } - /** * Parses a list of words from the specified settings or from a file, with the given parser. * @@ -246,17 +240,6 @@ public static <T> List<T> parseWordList( String settingPath, String settingList, CustomMappingRuleParser<T> parser - ) { - return parseWordList(env, settings, settingPath, settingList, parser, true); - } - - public static <T> List<T> parseWordList( - Environment env, - Settings settings, - String settingPath, - String settingList, - CustomMappingRuleParser<T> parser, - boolean removeComments ) { List<String> words = getWordList(env, settings, settingPath, settingList); if (words == null) { @@ -266,7 +249,7 @@ public static <T> List<T> parseWordList( int lineNum = 0; for (String word : words) { lineNum++; - if (removeComments == false || word.startsWith("#") == false) { + if (word.startsWith("#") == false || HASH_TAG_RULE_PATTERN.matcher(word).find() == true) { try { rules.add(parser.apply(word)); } catch (RuntimeException ex) {