diff --git a/src/main/java/com/hankcs/hanlp/seg/common/WordNet.java b/src/main/java/com/hankcs/hanlp/seg/common/WordNet.java index 83747da1e..717978a5a 100644 --- a/src/main/java/com/hankcs/hanlp/seg/common/WordNet.java +++ b/src/main/java/com/hankcs/hanlp/seg/common/WordNet.java @@ -283,7 +283,7 @@ public void add(int line, List atomSegment) break; } // 这些通用符的量级都在10万左右 - add(line + offset, new Vertex(sWord, atomNode.sWord, new CoreDictionary.Attribute(nature, 10000), id)); + add(line + offset, new Vertex(sWord, atomNode.sWord, new CoreDictionary.Attribute(nature, Predefine.OOV_DEFAULT_FREQUENCY), id)); offset += atomNode.sWord.length(); } } diff --git a/src/main/java/com/hankcs/hanlp/utility/Predefine.java b/src/main/java/com/hankcs/hanlp/utility/Predefine.java index b989cd363..fd4773e7a 100644 --- a/src/main/java/com/hankcs/hanlp/utility/Predefine.java +++ b/src/main/java/com/hankcs/hanlp/utility/Predefine.java @@ -56,6 +56,10 @@ public class Predefine public static int MAX_SEGMENT_NUM = 10; public static int TOTAL_FREQUENCY = 25146057; // 现在总词频25146057 + /** + * 未登录词的默认词频 + */ + public static int OOV_DEFAULT_FREQUENCY = 10000; /** * Bigram 平滑因子 */ @@ -141,5 +145,6 @@ public static void setTotalFrequency(int totalFrequency) { TOTAL_FREQUENCY = totalFrequency; myu = 1 - ((double) 1 / TOTAL_FREQUENCY + 0.00001); + OOV_DEFAULT_FREQUENCY = Math.max(1, Math.min(OOV_DEFAULT_FREQUENCY / 100, TOTAL_FREQUENCY)); // 默认百分之一 } }