From 5ff9c8d03f233d8bf4a0c6f709e3cbff0f617c05 Mon Sep 17 00:00:00 2001 From: Louis Chu Date: Wed, 27 Sep 2023 16:41:37 -0400 Subject: [PATCH] [Feature] Support hashtag in token filter type_table rules parser (#10220) * Support hashtag in token filter type_table rules parser Signed-off-by: Louis Chu * Address comment from @reta Signed-off-by: Louis Chu --------- Signed-off-by: Louis Chu --- .../common/MappingCharFilterFactory.java | 2 +- ...rdDelimiterTokenFilterFactoryTestCase.java | 4 +- .../common/MappingCharFilterFactoryTests.java | 8 +- .../test/analysis-common/40_token_filters.yml | 126 ++++++++++++++++++ .../test/analysis-common/50_char_filters.yml | 1 + .../opensearch/index/analysis/Analysis.java | 27 +--- 6 files changed, 141 insertions(+), 27 deletions(-) diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MappingCharFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MappingCharFilterFactory.java index bd241de749f11..d6d9f8975f2fc 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MappingCharFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MappingCharFilterFactory.java @@ -54,7 +54,7 @@ public class MappingCharFilterFactory extends AbstractCharFilterFactory implemen MappingCharFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name); - List> rules = Analysis.parseWordList(env, settings, "mappings", this::parse, false); + List> rules = Analysis.parseWordList(env, settings, "mappings", this::parse); if (rules == null) { throw new IllegalArgumentException("mapping requires either `mappings` or `mappings_path` to be configured"); } diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java index 94c7d63f2bee7..f37d5862b9d3f 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java @@ -211,8 +211,8 @@ private void createTokenFilterFactoryWithTypeTable(String[] rules) throws IOExce } public void testTypeTableParsingError() { - String[] rules = { "# This is a comment", "$ => DIGIT", "\\u200D => ALPHANUM", "abc => ALPHA" }; + String[] rules = { "# This is a comment", "# => ALPHANUM", "$ => DIGIT", "\\u200D => ALPHANUM", "abc => ALPHA" }; RuntimeException ex = expectThrows(RuntimeException.class, () -> createTokenFilterFactoryWithTypeTable(rules)); - assertEquals("Line [4]: Invalid mapping rule: [abc => ALPHA]. Only a single character is allowed.", ex.getMessage()); + assertEquals("Line [5]: Invalid mapping rule: [abc => ALPHA]. Only a single character is allowed.", ex.getMessage()); } } diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MappingCharFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MappingCharFilterFactoryTests.java index 387eb4a377007..28e041ac8c92d 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MappingCharFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MappingCharFilterFactoryTests.java @@ -37,6 +37,7 @@ public static CharFilterFactory create(String... rules) throws IOException { public void testRulesOk() throws IOException { MappingCharFilterFactory mappingCharFilterFactory = (MappingCharFilterFactory) create( + "# This is a comment", "# => _hashtag_", ":) => _happy_", ":( => _sad_" @@ -64,7 +65,10 @@ public void testRuleError() { } public void testRulePartError() { - RuntimeException ex = expectThrows(RuntimeException.class, () -> create("# => _hashtag_", ":) => _happy_", "a:b")); - assertEquals("Line [3]: Invalid mapping rule : [a:b]", ex.getMessage()); + RuntimeException ex = expectThrows( + RuntimeException.class, + () -> create("# This is a comment", "# => _hashtag_", ":) => _happy_", "a:b") + ); + assertEquals("Line [4]: Invalid mapping rule : [a:b]", ex.getMessage()); } } diff --git a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml index e92cc0c4838c7..802c79c780689 100644 --- a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml +++ b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml @@ -127,6 +127,69 @@ - match: { tokens.2.token: brown } - match: { tokens.3.token: fox } + - do: + indices.analyze: + body: + text: 'text1 #text2' + tokenizer: whitespace + filter: + - type: word_delimiter + split_on_numerics: false + type_table: + - "\\u0023 => ALPHANUM" + - length: { tokens: 2 } + - match: { tokens.0.token: text1 } + - match: { tokens.0.start_offset: 0 } + - match: { tokens.0.end_offset: 5 } + - match: { tokens.0.position: 0 } + - match: { tokens.1.token: "#text2" } + - match: { tokens.1.start_offset: 6 } + - match: { tokens.1.end_offset: 12 } + - match: { tokens.1.position: 1 } + + - do: + indices.analyze: + body: + text: 'text1 #text2' + tokenizer: whitespace + filter: + - type: word_delimiter + split_on_numerics: false + type_table: + - "# This is a comment" + - "# => ALPHANUM" + - length: { tokens: 2 } + - match: { tokens.0.token: text1 } + - match: { tokens.0.start_offset: 0 } + - match: { tokens.0.end_offset: 5 } + - match: { tokens.0.position: 0 } + - match: { tokens.1.token: "#text2" } + - match: { tokens.1.start_offset: 6 } + - match: { tokens.1.end_offset: 12 } + - match: { tokens.1.position: 1 } + + - do: + indices.analyze: + body: + text: 'text1 #text2' + tokenizer: whitespace + filter: + - type: word_delimiter + split_on_numerics: false + type_table: + - "# This is a comment" + - "# => ALPHANUM" + - "@ => ALPHANUM" + - length: { tokens: 2 } + - match: { tokens.0.token: text1 } + - match: { tokens.0.start_offset: 0 } + - match: { tokens.0.end_offset: 5 } + - match: { tokens.0.position: 0 } + - match: { tokens.1.token: "#text2" } + - match: { tokens.1.start_offset: 6 } + - match: { tokens.1.end_offset: 12 } + - match: { tokens.1.position: 1 } + --- "word_delimiter_graph": - do: @@ -231,6 +294,69 @@ - match: { detail.tokenfilters.0.tokens.5.end_offset: 19 } - match: { detail.tokenfilters.0.tokens.5.position: 5 } + - do: + indices.analyze: + body: + text: 'text1 #text2' + tokenizer: whitespace + filter: + - type: word_delimiter_graph + split_on_numerics: false + type_table: + - "\\u0023 => ALPHANUM" + - length: { tokens: 2 } + - match: { tokens.0.token: text1 } + - match: { tokens.0.start_offset: 0 } + - match: { tokens.0.end_offset: 5 } + - match: { tokens.0.position: 0 } + - match: { tokens.1.token: "#text2" } + - match: { tokens.1.start_offset: 6 } + - match: { tokens.1.end_offset: 12 } + - match: { tokens.1.position: 1 } + + - do: + indices.analyze: + body: + text: 'text1 #text2' + tokenizer: whitespace + filter: + - type: word_delimiter_graph + split_on_numerics: false + type_table: + - "# This is a comment" + - "# => ALPHANUM" + - length: { tokens: 2 } + - match: { tokens.0.token: text1 } + - match: { tokens.0.start_offset: 0 } + - match: { tokens.0.end_offset: 5 } + - match: { tokens.0.position: 0 } + - match: { tokens.1.token: "#text2" } + - match: { tokens.1.start_offset: 6 } + - match: { tokens.1.end_offset: 12 } + - match: { tokens.1.position: 1 } + + - do: + indices.analyze: + body: + text: 'text1 #text2' + tokenizer: whitespace + filter: + - type: word_delimiter_graph + split_on_numerics: false + type_table: + - "# This is a comment" + - "# => ALPHANUM" + - "@ => ALPHANUM" + - length: { tokens: 2 } + - match: { tokens.0.token: text1 } + - match: { tokens.0.start_offset: 0 } + - match: { tokens.0.end_offset: 5 } + - match: { tokens.0.position: 0 } + - match: { tokens.1.token: "#text2" } + - match: { tokens.1.start_offset: 6 } + - match: { tokens.1.end_offset: 12 } + - match: { tokens.1.position: 1 } + --- "unique": - do: diff --git a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/50_char_filters.yml b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/50_char_filters.yml index 0078575ae8e57..5e266c10cba8f 100644 --- a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/50_char_filters.yml +++ b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/50_char_filters.yml @@ -69,6 +69,7 @@ char_filter: - type: mapping mappings: + - "# This is a comment" - "# => _hashsign_" - "@ => _atsign_" - length: { tokens: 3 } diff --git a/server/src/main/java/org/opensearch/index/analysis/Analysis.java b/server/src/main/java/org/opensearch/index/analysis/Analysis.java index f4465c9dffac6..b9a219057f326 100644 --- a/server/src/main/java/org/opensearch/index/analysis/Analysis.java +++ b/server/src/main/java/org/opensearch/index/analysis/Analysis.java @@ -87,6 +87,7 @@ import java.util.Locale; import java.util.Map; import java.util.Set; +import java.util.regex.Pattern; import static java.util.Collections.unmodifiableMap; @@ -98,6 +99,9 @@ public class Analysis { private static final Logger LOGGER = LogManager.getLogger(Analysis.class); + // Regular expression to support hashtag tokenization + private static final Pattern HASH_TAG_RULE_PATTERN = Pattern.compile("^\\s*#\\s*=>"); + public static CharArraySet parseStemExclusion(Settings settings, CharArraySet defaultStemExclusion) { String value = settings.get("stem_exclusion"); if ("_none_".equals(value)) { @@ -222,16 +226,6 @@ public static List parseWordList(Environment env, Settings settings, Stri return parseWordList(env, settings, settingPrefix + "_path", settingPrefix, parser); } - public static List parseWordList( - Environment env, - Settings settings, - String settingPrefix, - CustomMappingRuleParser parser, - boolean removeComments - ) { - return parseWordList(env, settings, settingPrefix + "_path", settingPrefix, parser, removeComments); - } - /** * Parses a list of words from the specified settings or from a file, with the given parser. * @@ -246,17 +240,6 @@ public static List parseWordList( String settingPath, String settingList, CustomMappingRuleParser parser - ) { - return parseWordList(env, settings, settingPath, settingList, parser, true); - } - - public static List parseWordList( - Environment env, - Settings settings, - String settingPath, - String settingList, - CustomMappingRuleParser parser, - boolean removeComments ) { List words = getWordList(env, settings, settingPath, settingList); if (words == null) { @@ -266,7 +249,7 @@ public static List parseWordList( int lineNum = 0; for (String word : words) { lineNum++; - if (removeComments == false || word.startsWith("#") == false) { + if (word.startsWith("#") == false || HASH_TAG_RULE_PATTERN.matcher(word).find() == true) { try { rules.add(parser.apply(word)); } catch (RuntimeException ex) {