Skip to content

Commit

Permalink
Fix hotwords OOV log
Browse files: browse the repository at this point in the history
  • Loading branch information
pkufool committed Jul 16, 2024
1 parent 960eb75 commit 48feb68
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 11 deletions.
6 changes: 3 additions & 3 deletions sherpa-onnx/csrc/utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,9 @@ static bool EncodeBase(const std::vector<std::string> &lines,
break;
default:
SHERPA_ONNX_LOGE(
"Cannot find ID for token %s at line: %s. (Hint: words on "
"the same line are separated by spaces)",
word.c_str(), line.c_str());
"Cannot find ID for token %s at line: %s. (Hint: Check the "
"tokens.txt see if %s in it)",
word.c_str(), line.c_str(), word.c_str());
has_oov = true;
break;
}
Expand Down
20 changes: 12 additions & 8 deletions sherpa-onnx/python/sherpa_onnx/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path
from typing import List, Optional, Union


def text2token(
texts: List[str],
tokens: str,
Expand Down Expand Up @@ -33,20 +34,20 @@ def text2token(
is True, or it is a list of list of tokens.
"""
try:
import sentencepiece as spm
import sentencepiece as spm
except ImportError:
print('Please run')
print(' pip install sentencepiece')
print('before you continue')
print("Please run")
print(" pip install sentencepiece")
print("before you continue")
raise

try:
from pypinyin import pinyin
from pypinyin.contrib.tone_convert import to_initials, to_finals_tone
except ImportError:
print('Please run')
print(' pip install pypinyin')
print('before you continue')
print("Please run")
print(" pip install pypinyin")
print("before you continue")
raise

assert Path(tokens).is_file(), f"File not exists, {tokens}"
Expand Down Expand Up @@ -119,7 +120,10 @@ def text2token(
if txt in tokens_table:
text_list.append(tokens_table[txt] if output_ids else txt)
else:
print(f"OOV token : {txt}, skipping text : {text}.")
print(
f"Can't find token {txt} in token table, check your "
f"tokens.txt see if {txt} in it. skipping text : {text}."
)
contain_oov = True
break
if contain_oov:
Expand Down

0 comments on commit 48feb68

Please sign in to comment.