HugeGraph-1323: Implement word segmentation for full-text search

Change-Id: I6558fb4cebb7b2bff53aa95665be66ae2ea48192
VGalaxies · Aug 9, 2018 · 1cbab6a · 1cbab6a
1 parent f49d72e
commit 1cbab6a
Show file tree

Hide file tree

Showing 21 changed files with 1,232 additions and 0 deletions.
diff --git a/hugegraph-core/pom.xml b/hugegraph-core/pom.xml
@@ -54,6 +54,63 @@
             <groupId>com.google.guava</groupId>
             <artifactId>guava</artifactId>
         </dependency>
+
+        <dependency>
+            <groupId>org.apdplat</groupId>
+            <artifactId>word</artifactId>
+            <version>1.3</version>
+        </dependency>
+        <dependency>
+            <groupId>edu.stanford.nlp</groupId>
+            <artifactId>stanford-corenlp</artifactId>
+            <version>3.5.2</version>
+        </dependency>
+        <dependency>
+            <groupId>edu.stanford.nlp</groupId>
+            <artifactId>stanford-corenlp</artifactId>
+            <version>3.5.2</version>
+            <classifier>models-chinese</classifier>
+        </dependency>
+        <dependency>
+            <groupId>org.ansj</groupId>
+            <artifactId>ansj_seg</artifactId>
+            <version>5.1.6</version>
+        </dependency>
+        <dependency>
+            <groupId>com.hankcs</groupId>
+            <artifactId>hanlp</artifactId>
+            <version>portable-1.5.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-analyzers-smartcn</artifactId>
+            <version>7.4.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-core</artifactId>
+            <version>7.4.0</version>
+        </dependency>
+        <dependency>
+            <groupId>com.huaban</groupId>
+            <artifactId>jieba-analysis</artifactId>
+            <version>1.0.2</version>
+        </dependency>
+        <dependency>
+            <groupId>org.lionsoul</groupId>
+            <artifactId>jcseg-core</artifactId>
+            <version>2.2.0</version>
+        </dependency>
+        <dependency>
+            <groupId>com.chenlb.mmseg4j</groupId>
+            <artifactId>mmseg4j-core</artifactId>
+            <version>1.10.0</version>
+        </dependency>
+        <dependency>
+            <groupId>com.janeluo</groupId>
+            <artifactId>ikanalyzer</artifactId>
+            <version>2012_u6</version>
+        </dependency>
     </dependencies>
 
     <build>

diff --git a/hugegraph-core/src/main/java/com/baidu/hugegraph/HugeGraph.java b/hugegraph-core/src/main/java/com/baidu/hugegraph/HugeGraph.java
@@ -36,6 +36,7 @@
 import org.apache.tinkerpop.gremlin.structure.util.StringFactory;
 import org.slf4j.Logger;
 
+import com.baidu.hugegraph.analyzer.Analyzer;
 import com.baidu.hugegraph.backend.BackendException;
 import com.baidu.hugegraph.backend.cache.CachedGraphTransaction;
 import com.baidu.hugegraph.backend.cache.CachedSchemaTransaction;
@@ -59,6 +60,7 @@
 import com.baidu.hugegraph.schema.SchemaElement;
 import com.baidu.hugegraph.schema.SchemaManager;
 import com.baidu.hugegraph.schema.VertexLabel;
+import com.baidu.hugegraph.analyzer.AnalyzerFactory;
 import com.baidu.hugegraph.structure.HugeFeatures;
 import com.baidu.hugegraph.task.HugeTaskManager;
 import com.baidu.hugegraph.traversal.optimize.HugeGraphStepStrategy;
@@ -282,6 +284,10 @@ public AbstractSerializer serializer() {
         return serializer;
     }
 
+    /**
+     * Obtain the text analyzer selected by this graph's configuration.
+     * The lookup is delegated to {@link AnalyzerFactory} on every call.
+     *
+     * @return the analyzer returned by the factory
+     */
+    public Analyzer analyzer() {
+        final Analyzer configured = AnalyzerFactory.analyzer(
+                                    this.configuration);
+        return configured;
+    }
+
     @Override
     public Vertex addVertex(Object... keyValues) {
         return this.graphTransaction().addVertex(keyValues);

diff --git a/hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/Analyzer.java b/hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/Analyzer.java
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2017 HugeGraph Authors
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+
+package com.baidu.hugegraph.analyzer;
+
+import java.util.Set;
+
+/**
+ * A text analyzer that breaks a piece of text into segments.
+ */
+public interface Analyzer {
+
+    /**
+     * Segment the given text.
+     *
+     * @param text the text to be segmented
+     * @return the set of segments produced from the text
+     */
+    Set<String> segment(String text);
+}
diff --git a/hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/AnalyzerFactory.java b/hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/AnalyzerFactory.java
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2017 HugeGraph Authors
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+
+package com.baidu.hugegraph.analyzer;
+
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+import com.baidu.hugegraph.HugeException;
+import com.baidu.hugegraph.backend.serializer.SerializerFactory;
+import com.baidu.hugegraph.config.CoreOptions;
+import com.baidu.hugegraph.config.HugeConfig;
+
+/**
+ * Factory that creates {@link Analyzer} instances by name.
+ *
+ * A fixed set of built-in names is handled directly; any other name is
+ * looked up among the classes added through
+ * {@link #register(String, String)}. Names are matched case-insensitively.
+ */
+public class AnalyzerFactory {
+
+    // Customized analyzer classes, keyed by normalized (lower-case) name
+    private static final Map<String, Class<? extends Analyzer>> analyzers =
+            new ConcurrentHashMap<>();
+
+    /**
+     * Create the analyzer described by the given configuration.
+     *
+     * @param config the config providing {@link CoreOptions#TEXT_ANALYZER}
+     *               and {@link CoreOptions#TEXT_ANALYZER_MODE}
+     * @return a new analyzer instance
+     */
+    public static Analyzer analyzer(HugeConfig config) {
+        String name = config.get(CoreOptions.TEXT_ANALYZER);
+        String mode = config.get(CoreOptions.TEXT_ANALYZER_MODE);
+        return analyzer(name, mode);
+    }
+
+    /**
+     * Create an analyzer by name.
+     *
+     * @param name the analyzer name, either a built-in one or a name
+     *             previously passed to {@link #register(String, String)};
+     *             letter case is ignored
+     * @param mode the segment mode passed to the analyzer constructor
+     * @return a new analyzer instance
+     * @throws HugeException if the name is unknown or a customized
+     *                       analyzer can't be constructed
+     */
+    public static Analyzer analyzer(String name, String mode) {
+        // Normalize here so that every entry point matches the same way
+        String key = normalize(name);
+        switch (key) {
+            case "word":
+                return new WordAnalyzer(mode);
+            case "stanford":
+                return new StanfordAnalyzer(mode);
+            case "ansj":
+                return new AnsjAnalyzer(mode);
+            case "hanlp":
+                return new HanLPAnalyzer(mode);
+            case "smartcn":
+                return new SmartCNAnalyzer(mode);
+            case "jieba":
+                return new JiebaAnalyzer(mode);
+            case "jcseg":
+                return new JcsegAnalyzer(mode);
+            case "mmseg4j":
+                return new MMSeg4JAnalyzer(mode);
+            case "ikanalyzer":
+                return new IKAnalyzer(mode);
+            default:
+                return customizedAnalyzer(key, mode);
+        }
+    }
+
+    /**
+     * Instantiate a registered analyzer class through its
+     * single-String-argument constructor.
+     */
+    private static Analyzer customizedAnalyzer(String name, String mode) {
+        Class<? extends Analyzer> clazz = analyzers.get(name);
+        if (clazz == null) {
+            throw new HugeException("Not exists analyzer: %s", name);
+        }
+
+        assert Analyzer.class.isAssignableFrom(clazz);
+        try {
+            return clazz.getConstructor(String.class).newInstance(mode);
+        } catch (Exception e) {
+            throw new HugeException(
+                      "Failed to construct analyzer '%s' with mode '%s'",
+                      e, name, mode);
+        }
+    }
+
+    /**
+     * Register a customized analyzer class under the given name.
+     *
+     * @param name      the name used to look the analyzer up later;
+     *                  letter case is ignored
+     * @param classPath the fully-qualified name of a class implementing
+     *                  {@link Analyzer}
+     * @throws HugeException if the class can't be loaded, doesn't
+     *                       implement {@link Analyzer}, or the name is
+     *                       already registered
+     */
+    public static void register(String name, String classPath) {
+        ClassLoader classLoader = AnalyzerFactory.class.getClassLoader();
+        Class<?> clazz;
+        try {
+            clazz = classLoader.loadClass(classPath);
+        } catch (Exception e) {
+            throw new HugeException("Load class path '%s' failed",
+                                    e, classPath);
+        }
+
+        // Check subclass
+        if (!Analyzer.class.isAssignableFrom(clazz)) {
+            throw new HugeException("Class '%s' is not a subclass of " +
+                                    "class Analyzer", classPath);
+        }
+
+        // Register class, atomically refusing an already-taken name
+        Class<? extends Analyzer> existed = analyzers.putIfAbsent(
+                normalize(name), clazz.asSubclass(Analyzer.class));
+        if (existed != null) {
+            throw new HugeException("Exists analyzer: %s(%s)",
+                                    name, existed.getName());
+        }
+    }
+
+    /**
+     * Lower-case a name independent of the default locale, so that e.g.
+     * "IKAnalyzer" still maps to "ikanalyzer" under a Turkish locale.
+     */
+    private static String normalize(String name) {
+        return name.toLowerCase(java.util.Locale.ROOT);
+    }
+}
diff --git a/hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/AnsjAnalyzer.java b/hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/AnsjAnalyzer.java
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2017 HugeGraph Authors
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+
+package com.baidu.hugegraph.analyzer;
+
+import java.util.List;
+import java.util.Set;
+
+import org.ansj.domain.Result;
+import org.ansj.domain.Term;
+import org.ansj.splitWord.analysis.BaseAnalysis;
+import org.ansj.splitWord.analysis.IndexAnalysis;
+import org.ansj.splitWord.analysis.NlpAnalysis;
+import org.ansj.splitWord.analysis.ToAnalysis;
+
+import com.baidu.hugegraph.config.ConfigException;
+import com.baidu.hugegraph.util.InsertionOrderUtil;
+import com.google.common.collect.ImmutableList;
+
+/**
+ * Reference from https://my.oschina.net/apdplat/blog/412921
+ */
+public class AnsjAnalyzer implements Analyzer {
+
+    public static final List<String> SUPPORT_MODES = ImmutableList.of(
+            "BaseAnalysis",
+            "IndexAnalysis",
+            "ToAnalysis",
+            "NlpAnalysis"
+    );
+
+    private String analysis;
+
+    public AnsjAnalyzer(String mode) {
+        if (!SUPPORT_MODES.contains(mode)) {
+            throw new ConfigException(
+                      "Unsupported segment mode '%s' for ansj analyzer, " +
+                      "the available values are %s", mode, SUPPORT_MODES);
+        }
+        this.analysis = mode;
+    }
+
+    @Override
+    public Set<String> segment(String text) {
+        Result terms = null;
+        switch (this.analysis) {
+            case "BaseAnalysis":
+                terms = BaseAnalysis.parse(text);
+                break;
+            case "ToAnalysis":
+                terms = ToAnalysis.parse(text);
+                break;
+            case "NlpAnalysis":
+                terms = NlpAnalysis.parse(text);
+                break;
+            case "IndexAnalysis":
+                terms = IndexAnalysis.parse(text);
+                break;
+        }
+
+        assert terms != null;
+        Set<String> result = InsertionOrderUtil.newSet();
+        for (Term term : terms) {
+            result.add(term.getName());
+        }
+        return result;
+    }
+}