forked from apache/incubator-hugegraph
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
HugeGraph-1323: Implement the full-text search word segmentation
Change-Id: I6558fb4cebb7b2bff53aa95665be66ae2ea48192
- Loading branch information
Showing 21 changed files with 1,232 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
27 changes: 27 additions & 0 deletions
27
hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/Analyzer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
/* | ||
* Copyright 2017 HugeGraph Authors | ||
* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with this | ||
* work for additional information regarding copyright ownership. The ASF | ||
* licenses this file to You under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | ||
* License for the specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package com.baidu.hugegraph.analyzer; | ||
|
||
import java.util.Set; | ||
|
||
public interface Analyzer { | ||
|
||
public Set<String> segment(String text); | ||
} |
111 changes: 111 additions & 0 deletions
111
hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/AnalyzerFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
/* | ||
* Copyright 2017 HugeGraph Authors | ||
* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with this | ||
* work for additional information regarding copyright ownership. The ASF | ||
* licenses this file to You under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | ||
* License for the specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package com.baidu.hugegraph.analyzer; | ||
|
||
import java.util.Map; | ||
import java.util.concurrent.ConcurrentHashMap; | ||
|
||
import com.baidu.hugegraph.HugeException; | ||
import com.baidu.hugegraph.backend.serializer.SerializerFactory; | ||
import com.baidu.hugegraph.config.CoreOptions; | ||
import com.baidu.hugegraph.config.HugeConfig; | ||
|
||
public class AnalyzerFactory { | ||
|
||
private static Map<String, Class<? extends Analyzer>> analyzers; | ||
|
||
static { | ||
analyzers = new ConcurrentHashMap<>(); | ||
} | ||
|
||
public static Analyzer analyzer(HugeConfig config) { | ||
String name = config.get(CoreOptions.TEXT_ANALYZER).toLowerCase(); | ||
String mode = config.get(CoreOptions.TEXT_ANALYZER_MODE); | ||
return analyzer(name, mode); | ||
} | ||
|
||
public static Analyzer analyzer(String name, String mode) { | ||
switch (name) { | ||
case "word": | ||
return new WordAnalyzer(mode); | ||
case "stanford": | ||
return new StanfordAnalyzer(mode); | ||
case "ansj": | ||
return new AnsjAnalyzer(mode); | ||
case "hanlp": | ||
return new HanLPAnalyzer(mode); | ||
case "smartcn": | ||
return new SmartCNAnalyzer(mode); | ||
case "jieba": | ||
return new JiebaAnalyzer(mode); | ||
case "jcseg": | ||
return new JcsegAnalyzer(mode); | ||
case "mmseg4j": | ||
return new MMSeg4JAnalyzer(mode); | ||
case "ikanalyzer": | ||
return new IKAnalyzer(mode); | ||
default: | ||
return customizedAnalyzer(name, mode); | ||
} | ||
} | ||
|
||
private static Analyzer customizedAnalyzer(String name, String mode) { | ||
Class<? extends Analyzer> clazz = analyzers.get(name); | ||
if (clazz == null) { | ||
throw new HugeException("Not exists analyzer: %s", name); | ||
} | ||
|
||
assert Analyzer.class.isAssignableFrom(clazz); | ||
try { | ||
return clazz.getConstructor(String.class).newInstance(mode); | ||
} catch (Exception e) { | ||
throw new HugeException( | ||
"Failed to construct analyzer '%s' with mode '%s'", | ||
e, name, mode); | ||
} | ||
} | ||
|
||
@SuppressWarnings({ "rawtypes", "unchecked" }) | ||
public static void register(String name, String classPath) { | ||
ClassLoader classLoader = SerializerFactory.class.getClassLoader(); | ||
Class<?> clazz; | ||
try { | ||
clazz = classLoader.loadClass(classPath); | ||
} catch (Exception e) { | ||
throw new HugeException("Load class path '%s' failed", | ||
e, classPath); | ||
} | ||
|
||
// Check subclass | ||
if (!Analyzer.class.isAssignableFrom(clazz)) { | ||
throw new HugeException("Class '%s' is not a subclass of " + | ||
"class Analyzer", classPath); | ||
} | ||
|
||
// Check exists | ||
if (analyzers.containsKey(name)) { | ||
throw new HugeException("Exists analyzer: %s(%s)", | ||
name, analyzers.get(name).getName()); | ||
} | ||
|
||
// Register class | ||
analyzers.put(name, (Class) clazz); | ||
} | ||
} |
84 changes: 84 additions & 0 deletions
84
hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/AnsjAnalyzer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
/* | ||
* Copyright 2017 HugeGraph Authors | ||
* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with this | ||
* work for additional information regarding copyright ownership. The ASF | ||
* licenses this file to You under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | ||
* License for the specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package com.baidu.hugegraph.analyzer; | ||
|
||
import java.util.List; | ||
import java.util.Set; | ||
|
||
import org.ansj.domain.Result; | ||
import org.ansj.domain.Term; | ||
import org.ansj.splitWord.analysis.BaseAnalysis; | ||
import org.ansj.splitWord.analysis.IndexAnalysis; | ||
import org.ansj.splitWord.analysis.NlpAnalysis; | ||
import org.ansj.splitWord.analysis.ToAnalysis; | ||
|
||
import com.baidu.hugegraph.config.ConfigException; | ||
import com.baidu.hugegraph.util.InsertionOrderUtil; | ||
import com.google.common.collect.ImmutableList; | ||
|
||
/** | ||
* Reference from https://my.oschina.net/apdplat/blog/412921 | ||
*/ | ||
public class AnsjAnalyzer implements Analyzer { | ||
|
||
public static final List<String> SUPPORT_MODES = ImmutableList.of( | ||
"BaseAnalysis", | ||
"IndexAnalysis", | ||
"ToAnalysis", | ||
"NlpAnalysis" | ||
); | ||
|
||
private String analysis; | ||
|
||
public AnsjAnalyzer(String mode) { | ||
if (!SUPPORT_MODES.contains(mode)) { | ||
throw new ConfigException( | ||
"Unsupported segment mode '%s' for ansj analyzer, " + | ||
"the available values are %s", mode, SUPPORT_MODES); | ||
} | ||
this.analysis = mode; | ||
} | ||
|
||
@Override | ||
public Set<String> segment(String text) { | ||
Result terms = null; | ||
switch (this.analysis) { | ||
case "BaseAnalysis": | ||
terms = BaseAnalysis.parse(text); | ||
break; | ||
case "ToAnalysis": | ||
terms = ToAnalysis.parse(text); | ||
break; | ||
case "NlpAnalysis": | ||
terms = NlpAnalysis.parse(text); | ||
break; | ||
case "IndexAnalysis": | ||
terms = IndexAnalysis.parse(text); | ||
break; | ||
} | ||
|
||
assert terms != null; | ||
Set<String> result = InsertionOrderUtil.newSet(); | ||
for (Term term : terms) { | ||
result.add(term.getName()); | ||
} | ||
return result; | ||
} | ||
} |
Oops, something went wrong.