Skip to content

Commit

Permalink
根据总词频动态决定未登录词的默认词频
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Nov 5, 2021
1 parent 61cc753 commit 6cff689
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/main/java/com/hankcs/hanlp/seg/common/WordNet.java
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ public void add(int line, List<AtomNode> atomSegment)
break;
}
// 这些通用符的量级都在10万左右
add(line + offset, new Vertex(sWord, atomNode.sWord, new CoreDictionary.Attribute(nature, 10000), id));
add(line + offset, new Vertex(sWord, atomNode.sWord, new CoreDictionary.Attribute(nature, Predefine.OOV_DEFAULT_FREQUENCY), id));
offset += atomNode.sWord.length();
}
}
Expand Down
5 changes: 5 additions & 0 deletions src/main/java/com/hankcs/hanlp/utility/Predefine.java
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ public class Predefine
public static int MAX_SEGMENT_NUM = 10;

public static int TOTAL_FREQUENCY = 25146057; // 现在总词频25146057
/**
* 未登录词的默认词频
*/
public static int OOV_DEFAULT_FREQUENCY = 10000;
/**
* Bigram 平滑因子
*/
Expand Down Expand Up @@ -141,5 +145,6 @@ public static void setTotalFrequency(int totalFrequency)
{
    // Stores the new total word frequency and refreshes the values derived from it:
    // myu and the default frequency assigned to out-of-vocabulary (OOV) words.
    TOTAL_FREQUENCY = totalFrequency;
    myu = 1 - ((double) 1 / TOTAL_FREQUENCY + 0.00001);
    // Derive the OOV default from a fixed base instead of from its own current value.
    // Dividing the current value by 100 compounded on every call (10000 -> 100 -> 1),
    // so calling this setter again with the same total silently changed the result.
    // The base equals the declared initial value of OOV_DEFAULT_FREQUENCY.
    // NOTE(review): a value assigned to OOV_DEFAULT_FREQUENCY by a caller beforehand is
    // now replaced rather than scaled down - confirm no caller relies on that.
    final int oovBaseFrequency = 10000;
    // One percent of the base, never above the total frequency and never below 1.
    OOV_DEFAULT_FREQUENCY = Math.max(1, Math.min(oovBaseFrequency / 100, TOTAL_FREQUENCY));
}
}

2 comments on commit 6cff689

@hanlpbot
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This commit has been mentioned on Butterfly Effect. There might be relevant details there:

https://bbs.hankcs.com/t/topic/1352/10

@hanlpbot
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This commit has been mentioned on Butterfly Effect. There might be relevant details there:

https://bbs.hankcs.com/t/topic/1352/10

Please sign in to comment.