From 5ff9c8d03f233d8bf4a0c6f709e3cbff0f617c05 Mon Sep 17 00:00:00 2001
From: Louis Chu <clingzhi@amazon.com>
Date: Wed, 27 Sep 2023 16:41:37 -0400
Subject: [PATCH] [Feature] Support hashtag in token filter type_table rules
 parser (#10220)

* Support hashtag in token filter type_table rules parser

Signed-off-by: Louis Chu <clingzhi@amazon.com>

* Address comment from @reta

Signed-off-by: Louis Chu <clingzhi@amazon.com>

---------

Signed-off-by: Louis Chu <clingzhi@amazon.com>
---
 .../common/MappingCharFilterFactory.java      |   2 +-
 ...rdDelimiterTokenFilterFactoryTestCase.java |   4 +-
 .../common/MappingCharFilterFactoryTests.java |   8 +-
 .../test/analysis-common/40_token_filters.yml | 126 ++++++++++++++++++
 .../test/analysis-common/50_char_filters.yml  |   1 +
 .../opensearch/index/analysis/Analysis.java   |  27 +---
 6 files changed, 141 insertions(+), 27 deletions(-)

diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MappingCharFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MappingCharFilterFactory.java
index bd241de749f11..d6d9f8975f2fc 100644
--- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MappingCharFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MappingCharFilterFactory.java
@@ -54,7 +54,7 @@ public class MappingCharFilterFactory extends AbstractCharFilterFactory implemen
     MappingCharFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, name);
 
-        List<MappingRule<String, String>> rules = Analysis.parseWordList(env, settings, "mappings", this::parse, false);
+        List<MappingRule<String, String>> rules = Analysis.parseWordList(env, settings, "mappings", this::parse);
         if (rules == null) {
             throw new IllegalArgumentException("mapping requires either `mappings` or `mappings_path` to be configured");
         }
diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java
index 94c7d63f2bee7..f37d5862b9d3f 100644
--- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java
+++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java
@@ -211,8 +211,8 @@ private void createTokenFilterFactoryWithTypeTable(String[] rules) throws IOExce
     }
 
     public void testTypeTableParsingError() {
-        String[] rules = { "# This is a comment", "$ => DIGIT", "\\u200D => ALPHANUM", "abc => ALPHA" };
+        String[] rules = { "# This is a comment", "# => ALPHANUM", "$ => DIGIT", "\\u200D => ALPHANUM", "abc => ALPHA" };
         RuntimeException ex = expectThrows(RuntimeException.class, () -> createTokenFilterFactoryWithTypeTable(rules));
-        assertEquals("Line [4]: Invalid mapping rule: [abc => ALPHA]. Only a single character is allowed.", ex.getMessage());
+        assertEquals("Line [5]: Invalid mapping rule: [abc => ALPHA]. Only a single character is allowed.", ex.getMessage());
     }
 }
diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MappingCharFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MappingCharFilterFactoryTests.java
index 387eb4a377007..28e041ac8c92d 100644
--- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MappingCharFilterFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MappingCharFilterFactoryTests.java
@@ -37,6 +37,7 @@ public static CharFilterFactory create(String... rules) throws IOException {
 
     public void testRulesOk() throws IOException {
         MappingCharFilterFactory mappingCharFilterFactory = (MappingCharFilterFactory) create(
+            "# This is a comment",
             "# => _hashtag_",
             ":) => _happy_",
             ":( => _sad_"
@@ -64,7 +65,10 @@ public void testRuleError() {
     }
 
     public void testRulePartError() {
-        RuntimeException ex = expectThrows(RuntimeException.class, () -> create("# => _hashtag_", ":) => _happy_", "a:b"));
-        assertEquals("Line [3]: Invalid mapping rule : [a:b]", ex.getMessage());
+        RuntimeException ex = expectThrows(
+            RuntimeException.class,
+            () -> create("# This is a comment", "# => _hashtag_", ":) => _happy_", "a:b")
+        );
+        assertEquals("Line [4]: Invalid mapping rule : [a:b]", ex.getMessage());
     }
 }
diff --git a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml
index e92cc0c4838c7..802c79c780689 100644
--- a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml
+++ b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml
@@ -127,6 +127,69 @@
     - match:  { tokens.2.token: brown }
     - match:  { tokens.3.token: fox }
 
+    - do:
+        indices.analyze:
+          body:
+            text: 'text1 #text2'
+            tokenizer: whitespace
+            filter:
+              - type: word_delimiter
+                split_on_numerics: false
+                type_table:
+                  - "\\u0023 => ALPHANUM"
+    - length: { tokens: 2 }
+    - match: { tokens.0.token: text1 }
+    - match: { tokens.0.start_offset: 0 }
+    - match: { tokens.0.end_offset: 5 }
+    - match: { tokens.0.position: 0 }
+    - match: { tokens.1.token: "#text2" }
+    - match: { tokens.1.start_offset: 6 }
+    - match: { tokens.1.end_offset: 12 }
+    - match: { tokens.1.position: 1 }
+
+    - do:
+        indices.analyze:
+          body:
+            text: 'text1 #text2'
+            tokenizer: whitespace
+            filter:
+              - type: word_delimiter
+                split_on_numerics: false
+                type_table:
+                  - "# This is a comment"
+                  - "# => ALPHANUM"
+    - length: { tokens: 2 }
+    - match: { tokens.0.token: text1 }
+    - match: { tokens.0.start_offset: 0 }
+    - match: { tokens.0.end_offset: 5 }
+    - match: { tokens.0.position: 0 }
+    - match: { tokens.1.token: "#text2" }
+    - match: { tokens.1.start_offset: 6 }
+    - match: { tokens.1.end_offset: 12 }
+    - match: { tokens.1.position: 1 }
+
+    - do:
+        indices.analyze:
+          body:
+            text: 'text1 #text2'
+            tokenizer: whitespace
+            filter:
+              - type: word_delimiter
+                split_on_numerics: false
+                type_table:
+                  - "# This is a comment"
+                  - "# => ALPHANUM"
+                  - "@ => ALPHANUM"
+    - length: { tokens: 2 }
+    - match: { tokens.0.token: text1 }
+    - match: { tokens.0.start_offset: 0 }
+    - match: { tokens.0.end_offset: 5 }
+    - match: { tokens.0.position: 0 }
+    - match: { tokens.1.token: "#text2" }
+    - match: { tokens.1.start_offset: 6 }
+    - match: { tokens.1.end_offset: 12 }
+    - match: { tokens.1.position: 1 }
+
 ---
 "word_delimiter_graph":
     - do:
@@ -231,6 +294,69 @@
     - match:  { detail.tokenfilters.0.tokens.5.end_offset: 19 }
     - match:  { detail.tokenfilters.0.tokens.5.position: 5 }
 
+    - do:
+        indices.analyze:
+          body:
+            text: 'text1 #text2'
+            tokenizer: whitespace
+            filter:
+              - type: word_delimiter_graph
+                split_on_numerics: false
+                type_table:
+                  - "\\u0023 => ALPHANUM"
+    - length: { tokens: 2 }
+    - match: { tokens.0.token: text1 }
+    - match: { tokens.0.start_offset: 0 }
+    - match: { tokens.0.end_offset: 5 }
+    - match: { tokens.0.position: 0 }
+    - match: { tokens.1.token: "#text2" }
+    - match: { tokens.1.start_offset: 6 }
+    - match: { tokens.1.end_offset: 12 }
+    - match: { tokens.1.position: 1 }
+
+    - do:
+        indices.analyze:
+          body:
+            text: 'text1 #text2'
+            tokenizer: whitespace
+            filter:
+              - type: word_delimiter_graph
+                split_on_numerics: false
+                type_table:
+                  - "# This is a comment"
+                  - "# => ALPHANUM"
+    - length: { tokens: 2 }
+    - match: { tokens.0.token: text1 }
+    - match: { tokens.0.start_offset: 0 }
+    - match: { tokens.0.end_offset: 5 }
+    - match: { tokens.0.position: 0 }
+    - match: { tokens.1.token: "#text2" }
+    - match: { tokens.1.start_offset: 6 }
+    - match: { tokens.1.end_offset: 12 }
+    - match: { tokens.1.position: 1 }
+
+    - do:
+        indices.analyze:
+          body:
+            text: 'text1 #text2'
+            tokenizer: whitespace
+            filter:
+              - type: word_delimiter_graph
+                split_on_numerics: false
+                type_table:
+                  - "# This is a comment"
+                  - "# => ALPHANUM"
+                  - "@ => ALPHANUM"
+    - length: { tokens: 2 }
+    - match: { tokens.0.token: text1 }
+    - match: { tokens.0.start_offset: 0 }
+    - match: { tokens.0.end_offset: 5 }
+    - match: { tokens.0.position: 0 }
+    - match: { tokens.1.token: "#text2" }
+    - match: { tokens.1.start_offset: 6 }
+    - match: { tokens.1.end_offset: 12 }
+    - match: { tokens.1.position: 1 }
+
 ---
 "unique":
     - do:
diff --git a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/50_char_filters.yml b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/50_char_filters.yml
index 0078575ae8e57..5e266c10cba8f 100644
--- a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/50_char_filters.yml
+++ b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/50_char_filters.yml
@@ -69,6 +69,7 @@
           char_filter:
             - type: mapping
               mappings:
+                - "# This is a comment"
                 - "# => _hashsign_"
                 - "@ => _atsign_"
   - length: { tokens: 3 }
diff --git a/server/src/main/java/org/opensearch/index/analysis/Analysis.java b/server/src/main/java/org/opensearch/index/analysis/Analysis.java
index f4465c9dffac6..b9a219057f326 100644
--- a/server/src/main/java/org/opensearch/index/analysis/Analysis.java
+++ b/server/src/main/java/org/opensearch/index/analysis/Analysis.java
@@ -87,6 +87,7 @@
 import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
+import java.util.regex.Pattern;
 
 import static java.util.Collections.unmodifiableMap;
 
@@ -98,6 +99,9 @@
 public class Analysis {
     private static final Logger LOGGER = LogManager.getLogger(Analysis.class);
 
+    // Regular expression to support hashtag tokenization
+    private static final Pattern HASH_TAG_RULE_PATTERN = Pattern.compile("^\\s*#\\s*=>");
+
     public static CharArraySet parseStemExclusion(Settings settings, CharArraySet defaultStemExclusion) {
         String value = settings.get("stem_exclusion");
         if ("_none_".equals(value)) {
@@ -222,16 +226,6 @@ public static <T> List<T> parseWordList(Environment env, Settings settings, Stri
         return parseWordList(env, settings, settingPrefix + "_path", settingPrefix, parser);
     }
 
-    public static <T> List<T> parseWordList(
-        Environment env,
-        Settings settings,
-        String settingPrefix,
-        CustomMappingRuleParser<T> parser,
-        boolean removeComments
-    ) {
-        return parseWordList(env, settings, settingPrefix + "_path", settingPrefix, parser, removeComments);
-    }
-
     /**
      * Parses a list of words from the specified settings or from a file, with the given parser.
      *
@@ -246,17 +240,6 @@ public static <T> List<T> parseWordList(
         String settingPath,
         String settingList,
         CustomMappingRuleParser<T> parser
-    ) {
-        return parseWordList(env, settings, settingPath, settingList, parser, true);
-    }
-
-    public static <T> List<T> parseWordList(
-        Environment env,
-        Settings settings,
-        String settingPath,
-        String settingList,
-        CustomMappingRuleParser<T> parser,
-        boolean removeComments
     ) {
         List<String> words = getWordList(env, settings, settingPath, settingList);
         if (words == null) {
@@ -266,7 +249,7 @@ public static <T> List<T> parseWordList(
         int lineNum = 0;
         for (String word : words) {
             lineNum++;
-            if (removeComments == false || word.startsWith("#") == false) {
+            if (word.startsWith("#") == false || HASH_TAG_RULE_PATTERN.matcher(word).find() == true) {
                 try {
                     rules.add(parser.apply(word));
                 } catch (RuntimeException ex) {