feature: enhance Text.contains("word") (#1652)

apache · Dec 7, 2021 · 3b623bc · 3b623bc
1 parent 83180fb
commit 3b623bc
Show file tree

Hide file tree

Showing 3 changed files with 75 additions and 3 deletions.
diff --git a/hugegraph-core/src/main/java/com/baidu/hugegraph/backend/tx/GraphIndexTransaction.java b/hugegraph-core/src/main/java/com/baidu/hugegraph/backend/tx/GraphIndexTransaction.java
@@ -98,6 +98,10 @@
 
 public class GraphIndexTransaction extends AbstractTransaction {
 
+    public static final String START_SYMBOL = "(";
+    public static final String END_SYMBOL = ")";
+    public static final String WORD_DELIMITER = "|";
+
     private final Analyzer textAnalyzer;
     private final int indexIntersectThresh;
 
@@ -874,11 +878,33 @@ private boolean matchSearchIndexWords(String propValue, String fieldValue) {
     }
 
     private Set<String> segmentWords(String text) {
-        Set<String> words = this.textAnalyzer.segment(text);
+        /*
+         Three query forms are supported:
+         - Text.contains("(word)"): match the single user-specified word;
+         - Text.contains("(word1|word2|word3)"): match the user-specified words;
+         - Text.contains("words"): match the words the analyzer splits out;
+         Note: every form supports exact word matching
+         */
+        if (text.startsWith(START_SYMBOL) && text.endsWith(END_SYMBOL)) {
+            String subText = text.substring(1, text.length() - 1);
+            if (subText.contains(WORD_DELIMITER)) {
+                String[] texts = StringUtils.split(subText, WORD_DELIMITER);
+                return ImmutableSet.copyOf(texts);
+            } else {
+                return ImmutableSet.of(subText);
+            }
+        }
+        Set<String> segments = this.textAnalyzer.segment(text);
+
+        /*
+         * Add the original text to the segments at the insertion stage,
+         * so that the whole text can be matched exactly at query time.
+         */
+        segments.add(text);
 
         // Ignore unicode \u0000 to \u0003
-        words.removeAll(ConditionQuery.IGNORE_SYM_SET);
-        return words;
+        segments.removeAll(ConditionQuery.IGNORE_SYM_SET);
+        return segments;
     }
 
     private boolean needIndexForLabel() {

diff --git a/hugegraph-test/src/main/java/com/baidu/hugegraph/core/VertexCoreTest.java b/hugegraph-test/src/main/java/com/baidu/hugegraph/core/VertexCoreTest.java
@@ -8734,6 +8734,49 @@ public void testQueryBySearchIndexWithSpecialSymbol() {
         }
     }
 
+    @Test
+    public void testEnhanceTextMatch() {
+        HugeGraph graph = graph();
+
+        graph.schema().indexLabel("personByName").onV("person")
+             .by("name").search().ifNotExist().create();
+
+        Vertex vertex1 = graph.addVertex(T.label, "person", "name", "秦始皇",
+                                         "city", "Hongkong", "age", 15);
+        Vertex vertex2 = graph.addVertex(T.label, "person", "name", "始皇",
+                                         "city", "Hongkong", "age", 18);
+        Vertex vertex3 = graph.addVertex(T.label, "person", "name", "秦始皇2",
+                                         "city", "Beijing", "age", 21);
+        Vertex vertex4 = graph.addVertex(T.label, "person", "name", "秦始皇3",
+                                         "city", "Beijing", "age", 23);
+        Vertex vertex5 = graph.addVertex(T.label, "person", "name", "秦始皇帝",
+                                         "city", "Beijing", "age", 29);
+        graph.tx().commit();
+
+        GraphTraversalSource g = graph.traversal();
+
+        List<Vertex> vertices;
+        vertices = g.V().has("name", Text.contains("秦始皇")).toList();
+        Assert.assertEquals(5, vertices.size()); // plain text: all 5 hit
+        Assert.assertTrue(vertices.contains(vertex2)); // even "始皇"
+
+        vertices = g.V().has("name", Text.contains("(秦始皇)")).toList();
+        Assert.assertEquals(4, vertices.size()); // "(word)": vertex2 excluded
+        Assert.assertTrue(vertices.contains(vertex1));
+        Assert.assertTrue(vertices.contains(vertex3));
+        Assert.assertTrue(vertices.contains(vertex4));
+        Assert.assertTrue(vertices.contains(vertex5));
+
+        vertices = g.V().has("name", Text.contains("(秦始皇帝)")).toList();
+        Assert.assertEquals(1, vertices.size()); // only "秦始皇帝"
+        Assert.assertTrue(vertices.contains(vertex5));
+
+        vertices = g.V().has("name", Text.contains("(秦始皇2|秦始皇3)")).toList();
+        Assert.assertEquals(2, vertices.size()); // one hit per listed word
+        Assert.assertTrue(vertices.contains(vertex3));
+        Assert.assertTrue(vertices.contains(vertex4));
+    }
+
     private void init10Vertices() {
         HugeGraph graph = graph();
 

diff --git a/hugegraph-test/src/main/resources/hugegraph.properties b/hugegraph-test/src/main/resources/hugegraph.properties
@@ -63,3 +63,6 @@ task.sync_deletion=true
 expired.delete_batch=1
 
 test.tinkerpop.filter=fast-methods.filter
+
+search.text_analyzer=ikanalyzer
+search.text_analyzer_mode=max_word