Skip to content

Commit

Permalink
feature: enhance Text.contains("word") (#1652)
Browse files Browse the repository at this point in the history
  • Loading branch information
coderzc authored Dec 7, 2021
1 parent 83180fb commit 3b623bc
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,10 @@

public class GraphIndexTransaction extends AbstractTransaction {

// Leading marker of a text that carries user-specified word(s), e.g. "(word)"
public static final String START_SYMBOL = "(";
// Trailing marker of a text that carries user-specified word(s)
public static final String END_SYMBOL = ")";
// Separator between user-specified words, e.g. "(word1|word2|word3)"
public static final String WORD_DELIMITER = "|";

private final Analyzer textAnalyzer;
private final int indexIntersectThresh;

Expand Down Expand Up @@ -874,11 +878,33 @@ private boolean matchSearchIndexWords(String propValue, String fieldValue) {
}

/**
 * Split a text into the set of words used by the search index.
 *
 * <p>Three kinds of text are supported:
 * <ul>
 *   <li>{@code "(word)"}: the user-specified word is returned as is;</li>
 *   <li>{@code "(word1|word2|word3)"}: the user-specified words are
 *       returned as is;</li>
 *   <li>any other text: the words produced by the text analyzer, plus
 *       the original text itself.</li>
 * </ul>
 *
 * @param text the text to be segmented
 * @return the words of the text; an immutable set for the two
 *         user-specified forms, otherwise the analyzer's own set
 */
private Set<String> segmentWords(String text) {
    int wrapperLength = START_SYMBOL.length() + END_SYMBOL.length();
    if (text.length() >= wrapperLength &&
        text.startsWith(START_SYMBOL) && text.endsWith(END_SYMBOL)) {
        // Strip the enclosing symbols to get the user-specified word(s)
        String subText = text.substring(START_SYMBOL.length(),
                                        text.length() - END_SYMBOL.length());
        if (subText.contains(WORD_DELIMITER)) {
            String[] texts = StringUtils.split(subText, WORD_DELIMITER);
            return ImmutableSet.copyOf(texts);
        } else {
            return ImmutableSet.of(subText);
        }
    }

    Set<String> segments = this.textAnalyzer.segment(text);

    /*
     * Also keep the original text as a word, so that the full text
     * can be matched exactly in addition to its segmented words.
     */
    segments.add(text);

    // Ignore unicode \u0000 to \u0003
    segments.removeAll(ConditionQuery.IGNORE_SYM_SET);
    return segments;
}

private boolean needIndexForLabel() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8734,6 +8734,49 @@ public void testQueryBySearchIndexWithSpecialSymbol() {
}
}

/**
 * Checks Text.contains() on a search index for plain text, a single
 * user-specified word "(word)" and several user-specified words
 * "(word1|word2)".
 */
@Test
public void testEnhanceTextMatch() {
    HugeGraph graph = graph();

    // Search index on person.name
    graph.schema()
         .indexLabel("personByName")
         .onV("person")
         .by("name")
         .search()
         .ifNotExist()
         .create();

    Vertex qinShiHuang = graph.addVertex(T.label, "person",
                                         "name", "秦始皇",
                                         "city", "Hongkong",
                                         "age", 15);
    Vertex shiHuang = graph.addVertex(T.label, "person",
                                      "name", "始皇",
                                      "city", "Hongkong",
                                      "age", 18);
    Vertex qinShiHuang2 = graph.addVertex(T.label, "person",
                                          "name", "秦始皇2",
                                          "city", "Beijing",
                                          "age", 21);
    Vertex qinShiHuang3 = graph.addVertex(T.label, "person",
                                          "name", "秦始皇3",
                                          "city", "Beijing",
                                          "age", 23);
    Vertex qinShiHuangDi = graph.addVertex(T.label, "person",
                                           "name", "秦始皇帝",
                                           "city", "Beijing",
                                           "age", 29);
    graph.tx().commit();

    GraphTraversalSource g = graph.traversal();

    // Plain text: all five vertices are expected to match
    List<Vertex> byPlainText = g.V().has("name", Text.contains("秦始皇"))
                                .toList();
    Assert.assertEquals(5, byPlainText.size());
    Assert.assertTrue(byPlainText.contains(shiHuang));

    // Single user-specified word: every vertex except "始皇"
    List<Vertex> byWord = g.V().has("name", Text.contains("(秦始皇)"))
                           .toList();
    Assert.assertEquals(4, byWord.size());
    Assert.assertTrue(byWord.contains(qinShiHuang));
    Assert.assertTrue(byWord.contains(qinShiHuang2));
    Assert.assertTrue(byWord.contains(qinShiHuang3));
    Assert.assertTrue(byWord.contains(qinShiHuangDi));

    // Single user-specified word that only one vertex carries
    List<Vertex> byLongWord = g.V().has("name", Text.contains("(秦始皇帝)"))
                               .toList();
    Assert.assertEquals(1, byLongWord.size());
    Assert.assertTrue(byLongWord.contains(qinShiHuangDi));

    // Several user-specified words separated by '|'
    List<Vertex> byWords = g.V().has("name",
                                     Text.contains("(秦始皇2|秦始皇3)"))
                            .toList();
    Assert.assertEquals(2, byWords.size());
    Assert.assertTrue(byWords.contains(qinShiHuang2));
    Assert.assertTrue(byWords.contains(qinShiHuang3));
}

private void init10Vertices() {
HugeGraph graph = graph();

Expand Down
3 changes: 3 additions & 0 deletions hugegraph-test/src/main/resources/hugegraph.properties
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,6 @@ task.sync_deletion=true
expired.delete_batch=1

test.tinkerpop.filter=fast-methods.filter

search.text_analyzer=ikanalyzer
search.text_analyzer_mode=max_word

0 comments on commit 3b623bc

Please sign in to comment.