Skip to content

Commit

Permalink
HugeGraph-1323: Implement the full-text search word segmentation
Browse files Browse the repository at this point in the history
Change-Id: I6558fb4cebb7b2bff53aa95665be66ae2ea48192
  • Loading branch information
Linary committed Aug 9, 2018
1 parent f49d72e commit 1cbab6a
Show file tree
Hide file tree
Showing 21 changed files with 1,232 additions and 0 deletions.
57 changes: 57 additions & 0 deletions hugegraph-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,63 @@
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</dependency>

<!--
  Third-party word segmentation libraries. Presumably each one backs an
  analyzer implementation in com.baidu.hugegraph.analyzer (e.g. ansj_seg is
  used by AnsjAnalyzer); verify the mapping against AnalyzerFactory.
-->
<dependency>
<groupId>org.apdplat</groupId>
<artifactId>word</artifactId>
<version>1.3</version>
</dependency>
<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>3.5.2</version>
</dependency>
<!-- Same stanford-corenlp artifact and version, with the models-chinese classifier -->
<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>3.5.2</version>
<classifier>models-chinese</classifier>
</dependency>
<dependency>
<groupId>org.ansj</groupId>
<artifactId>ansj_seg</artifactId>
<version>5.1.6</version>
</dependency>
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.5.0</version>
</dependency>
<!-- The two Lucene artifacts below declare the same version (7.4.0); keep them in sync -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-smartcn</artifactId>
<version>7.4.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>7.4.0</version>
</dependency>
<dependency>
<groupId>com.huaban</groupId>
<artifactId>jieba-analysis</artifactId>
<version>1.0.2</version>
</dependency>
<dependency>
<groupId>org.lionsoul</groupId>
<artifactId>jcseg-core</artifactId>
<version>2.2.0</version>
</dependency>
<dependency>
<groupId>com.chenlb.mmseg4j</groupId>
<artifactId>mmseg4j-core</artifactId>
<version>1.10.0</version>
</dependency>
<dependency>
<groupId>com.janeluo</groupId>
<artifactId>ikanalyzer</artifactId>
<version>2012_u6</version>
</dependency>
</dependencies>

<build>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import org.apache.tinkerpop.gremlin.structure.util.StringFactory;
import org.slf4j.Logger;

import com.baidu.hugegraph.analyzer.Analyzer;
import com.baidu.hugegraph.backend.BackendException;
import com.baidu.hugegraph.backend.cache.CachedGraphTransaction;
import com.baidu.hugegraph.backend.cache.CachedSchemaTransaction;
Expand All @@ -59,6 +60,7 @@
import com.baidu.hugegraph.schema.SchemaElement;
import com.baidu.hugegraph.schema.SchemaManager;
import com.baidu.hugegraph.schema.VertexLabel;
import com.baidu.hugegraph.analyzer.AnalyzerFactory;
import com.baidu.hugegraph.structure.HugeFeatures;
import com.baidu.hugegraph.task.HugeTaskManager;
import com.baidu.hugegraph.traversal.optimize.HugeGraphStepStrategy;
Expand Down Expand Up @@ -282,6 +284,10 @@ public AbstractSerializer serializer() {
return serializer;
}

/**
 * Get the text analyzer selected by this graph's configuration.
 *
 * @return the analyzer returned by
 *         {@code AnalyzerFactory.analyzer(HugeConfig)} for this graph's
 *         configuration
 */
public Analyzer analyzer() {
    Analyzer configured = AnalyzerFactory.analyzer(this.configuration);
    return configured;
}

@Override
public Vertex addVertex(Object... keyValues) {
return this.graphTransaction().addVertex(keyValues);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
* Copyright 2017 HugeGraph Authors
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/

package com.baidu.hugegraph.analyzer;

import java.util.Set;

public interface Analyzer {

public Set<String> segment(String text);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
/*
* Copyright 2017 HugeGraph Authors
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/

package com.baidu.hugegraph.analyzer;

import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import com.baidu.hugegraph.HugeException;
import com.baidu.hugegraph.backend.serializer.SerializerFactory;
import com.baidu.hugegraph.config.CoreOptions;
import com.baidu.hugegraph.config.HugeConfig;

/**
 * Factory that creates {@link Analyzer} instances by name.
 *
 * Built-in analyzers are selected by a fixed lower-case name ("word",
 * "stanford", "ansj", "hanlp", "smartcn", "jieba", "jcseg", "mmseg4j",
 * "ikanalyzer"). Any other name is looked up among the customized analyzer
 * classes added through {@link #register(String, String)}.
 *
 * Every call to an {@code analyzer(...)} method constructs a new instance.
 */
public class AnalyzerFactory {

    /** Customized analyzer classes, keyed by the name they were registered with */
    private static final Map<String, Class<? extends Analyzer>> analyzers =
                         new ConcurrentHashMap<>();

    /**
     * Create the analyzer selected by the given configuration.
     *
     * The configured analyzer name is lower-cased before the lookup; the
     * configured mode is passed through unchanged.
     *
     * @param config the configuration providing
     *               {@code CoreOptions.TEXT_ANALYZER} and
     *               {@code CoreOptions.TEXT_ANALYZER_MODE}
     * @return a new analyzer instance
     * @throws HugeException if no analyzer exists with the configured name
     */
    public static Analyzer analyzer(HugeConfig config) {
        /*
         * Use Locale.ROOT so that the result doesn't depend on the JVM
         * default locale (e.g. 'I' is lower-cased to a dotless 'i' under a
         * Turkish locale, which would break the "ikanalyzer" match).
         */
        String name = config.get(CoreOptions.TEXT_ANALYZER)
                            .toLowerCase(Locale.ROOT);
        String mode = config.get(CoreOptions.TEXT_ANALYZER_MODE);
        return analyzer(name, mode);
    }

    /**
     * Create an analyzer by name.
     *
     * @param name the analyzer name, matched case-sensitively: one of the
     *             built-in lower-case names or a registered customized name
     * @param mode the segment mode passed to the analyzer constructor
     * @return a new analyzer instance
     * @throws HugeException if {@code name} is neither built-in nor
     *                       registered, or a customized analyzer can't be
     *                       constructed
     */
    public static Analyzer analyzer(String name, String mode) {
        switch (name) {
            case "word":
                return new WordAnalyzer(mode);
            case "stanford":
                return new StanfordAnalyzer(mode);
            case "ansj":
                return new AnsjAnalyzer(mode);
            case "hanlp":
                return new HanLPAnalyzer(mode);
            case "smartcn":
                return new SmartCNAnalyzer(mode);
            case "jieba":
                return new JiebaAnalyzer(mode);
            case "jcseg":
                return new JcsegAnalyzer(mode);
            case "mmseg4j":
                return new MMSeg4JAnalyzer(mode);
            case "ikanalyzer":
                return new IKAnalyzer(mode);
            default:
                return customizedAnalyzer(name, mode);
        }
    }

    /**
     * Instantiate a registered customized analyzer through its public
     * constructor taking a single String (the mode).
     */
    private static Analyzer customizedAnalyzer(String name, String mode) {
        Class<? extends Analyzer> clazz = analyzers.get(name);
        if (clazz == null) {
            throw new HugeException("Not exists analyzer: %s", name);
        }

        assert Analyzer.class.isAssignableFrom(clazz);
        try {
            return clazz.getConstructor(String.class).newInstance(mode);
        } catch (Exception e) {
            throw new HugeException(
                      "Failed to construct analyzer '%s' with mode '%s'",
                      e, name, mode);
        }
    }

    /**
     * Register a customized analyzer class under the given name.
     *
     * The name is stored as-is, while {@link #analyzer(HugeConfig)}
     * lower-cases the configured name before the lookup, so a name containing
     * upper-case letters can only be selected through
     * {@link #analyzer(String, String)}.
     *
     * @param name      the name to register the analyzer with
     * @param classPath the fully-qualified name of a class implementing
     *                  {@link Analyzer}
     * @throws HugeException if the class can't be loaded, doesn't implement
     *                       {@link Analyzer}, or the name is already
     *                       registered
     */
    public static void register(String name, String classPath) {
        ClassLoader classLoader = AnalyzerFactory.class.getClassLoader();
        Class<?> clazz;
        try {
            clazz = classLoader.loadClass(classPath);
        } catch (Exception e) {
            throw new HugeException("Load class path '%s' failed",
                                    e, classPath);
        }

        // Check subclass
        if (!Analyzer.class.isAssignableFrom(clazz)) {
            throw new HugeException("Class '%s' is not a subclass of " +
                                    "class Analyzer", classPath);
        }
        // Checked above, so this typed view never throws ClassCastException
        Class<? extends Analyzer> analyzerClass =
                                  clazz.asSubclass(Analyzer.class);

        /*
         * Check exists and register in a single atomic step, so that two
         * concurrent registrations of the same name can't overwrite each other
         */
        Class<? extends Analyzer> existing = analyzers.putIfAbsent(
                                             name, analyzerClass);
        if (existing != null) {
            throw new HugeException("Exists analyzer: %s(%s)",
                                    name, existing.getName());
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
* Copyright 2017 HugeGraph Authors
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/

package com.baidu.hugegraph.analyzer;

import java.util.List;
import java.util.Set;

import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.BaseAnalysis;
import org.ansj.splitWord.analysis.IndexAnalysis;
import org.ansj.splitWord.analysis.NlpAnalysis;
import org.ansj.splitWord.analysis.ToAnalysis;

import com.baidu.hugegraph.config.ConfigException;
import com.baidu.hugegraph.util.InsertionOrderUtil;
import com.google.common.collect.ImmutableList;

/**
 * {@link Analyzer} implementation backed by the ansj word segmentation
 * library.
 *
 * Reference from https://my.oschina.net/apdplat/blog/412921
 */
public class AnsjAnalyzer implements Analyzer {

    /** Segment modes accepted by the constructor */
    public static final List<String> SUPPORT_MODES = ImmutableList.of(
            "BaseAnalysis",
            "IndexAnalysis",
            "ToAnalysis",
            "NlpAnalysis"
    );

    /** The selected segment mode, always one of {@link #SUPPORT_MODES} */
    private final String analysis;

    /**
     * Create an ansj analyzer using the given segment mode.
     *
     * @param mode one of {@link #SUPPORT_MODES}
     * @throws ConfigException if {@code mode} is not supported
     */
    public AnsjAnalyzer(String mode) {
        boolean supported = SUPPORT_MODES.contains(mode);
        if (!supported) {
            throw new ConfigException(
                      "Unsupported segment mode '%s' for ansj analyzer, " +
                      "the available values are %s", mode, SUPPORT_MODES);
        }
        this.analysis = mode;
    }

    /**
     * Segment the text with the ansj analysis selected at construction.
     *
     * @param text the text to be segmented
     * @return the names of the parsed terms, collected into a set created by
     *         {@code InsertionOrderUtil.newSet()}
     */
    @Override
    public Set<String> segment(String text) {
        Result parsed = this.parse(text);

        assert parsed != null;
        Set<String> words = InsertionOrderUtil.newSet();
        for (Term token : parsed) {
            words.add(token.getName());
        }
        return words;
    }

    /**
     * Parse the text with the ansj analysis matching the selected mode,
     * or return null if the mode matches none of them.
     */
    private Result parse(String text) {
        switch (this.analysis) {
            case "BaseAnalysis":
                return BaseAnalysis.parse(text);
            case "IndexAnalysis":
                return IndexAnalysis.parse(text);
            case "ToAnalysis":
                return ToAnalysis.parse(text);
            case "NlpAnalysis":
                return NlpAnalysis.parse(text);
            default:
                return null;
        }
    }
}
Loading

0 comments on commit 1cbab6a

Please sign in to comment.