[fix]: fix capitalization problem in tokenization
jianganbai committed Aug 30, 2023
1 parent c2dc047 · commit a1db2c9
Showing 1 changed file with 22 additions and 5 deletions.
wenet/dataset/processor.py (22 additions & 5 deletions)

--- a/wenet/dataset/processor.py
+++ b/wenet/dataset/processor.py
@@ -321,15 +321,15 @@ def compute_mfcc(data,
         yield dict(key=sample['key'], label=sample['label'], feat=mat)
 
 
-def __tokenize_by_bpe_model(sp, txt):
+def __tokenize_by_bpe_model(sp, txt, special_tokens):
     tokens = []
     # CJK (China, Japan, Korea) unicode range is [U+4E00, U+9FFF], ref:
     # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
     pattern = re.compile(r'([\u4e00-\u9fff])')
     # Example:
     #   txt   = "你好 IT'S OKAY 的"
     #   chars = ["你", "好", " IT'S OKAY ", "的"]
-    chars = pattern.split(txt.upper())
+    chars = pattern.split(get_upper_txt(txt, special_tokens))
     mix_chars = [w for w in chars if len(w.strip()) > 0]
     for ch_or_w in mix_chars:
         # ch_or_w is a single CJK character (i.e., "你"), do nothing.
@@ -348,7 +348,8 @@ def tokenize(data,
              symbol_table,
              bpe_model=None,
              non_lang_syms=None,
-             split_with_space=False):
+             split_with_space=False,
+             special_tokens=['<unk>']):
     """ Decode text to chars or BPE
         Inplace operation
@@ -375,7 +376,7 @@
         assert 'txt' in sample
         txt = sample['txt'].strip()
         if non_lang_syms_pattern is not None:
-            parts = non_lang_syms_pattern.split(txt.upper())
+            parts = non_lang_syms_pattern.split(get_upper_txt(txt, special_tokens))
             parts = [w for w in parts if len(w.strip()) > 0]
         else:
             parts = [txt]
@@ -387,7 +388,7 @@
                 tokens.append(part)
             else:
                 if bpe_model is not None:
-                    tokens.extend(__tokenize_by_bpe_model(sp, part))
+                    tokens.extend(__tokenize_by_bpe_model(sp, part, special_tokens))
                 else:
                     if split_with_space:
                         part = part.split(" ")
@@ -407,6 +408,22 @@
         yield sample
 
 
+def get_upper_txt(txt, special_tokens):
+    """Uppercase txt except for special tokens
+    Args:
+        txt (str)
+        special_tokens (List[str])
+    Returns:
+        str
+    """
+    txt = txt.upper()
+    for token in special_tokens:
+        # re.sub, not re.subn (which returns a (str, count) tuple);
+        # escape the token in case it contains regex metacharacters
+        txt = re.sub(re.escape(token.upper()), token, txt)
+    return txt
+
+
 def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80):
     """ Do spec augmentation
         Inplace operation
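For reference, a minimal standalone sketch of the behavior this commit fixes. This is not part of the commit: the sample strings are illustrative, and the helper is re-declared locally rather than imported from wenet.dataset.processor.

import re

def get_upper_txt(txt, special_tokens):
    # Same idea as the patched helper: uppercase the transcript, then
    # restore each special token's original casing so it still matches
    # the symbol table.
    txt = txt.upper()
    for token in special_tokens:
        txt = re.sub(re.escape(token.upper()), token, txt)
    return txt

# Old behavior: plain str.upper() mangles the special token.
print("hello <unk> world".upper())                    # HELLO <UNK> WORLD
print(get_upper_txt("hello <unk> world", ["<unk>"]))  # HELLO <unk> WORLD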
