# Patch to pinyin.py: store every reading of a character and change how
# hanzi2pinyin looks characters up.
#
# Hunk 1 (module level): adds `import re` and a module-level list INDEX of
#   11 integers.  hanzi2pinyin uses INDEX[d] as the start offset into the
#   sorted key list for a 4-digit key whose first digit is d, and INDEX[10]
#   for every key that is not 4 digits long.
#   NOTE(review): the offsets are presumably precomputed for one specific
#   word.data file -- confirm; they are not derived from the loaded data.
#
# Hunk 2 (load_word): each line is now split on runs of whitespace, and all
#   fields after the first are appended to a list stored under the first
#   field (previously only the second space-separated field was stored, as
#   a string).
#   NOTE(review): re.compile('\s+') is executed once per input line and the
#   pattern is not a raw string.
#   NOTE(review): lines come from readlines(), so they keep their trailing
#   newline; splitting on '\s+' then yields a trailing '' that is appended
#   to the list as well.
#   NOTE(review): the unchanged bare `except:` fallback still stores a plain
#   string, so word_dict values can be a mix of lists and strings.
#
# Hunk 3 (hanzi2pinyin): the direct `self.word_dict.get(key, char)` lookup
#   is replaced by a linear scan over sorted(self.word_dict)[index:]; on a
#   match, the first stored reading, minus its last character and
#   lower-cased, is appended to the result.
#   NOTE(review): key is upper-case hex ('%X'), so int(key[0]) raises
#   ValueError for 4-digit keys whose first digit is A-F.
#   NOTE(review): sorted(self.word_dict) is rebuilt for every character, and
#   the scan can only ever find what self.word_dict.get(key) would return
#   directly.
#   NOTE(review): a character with no match now contributes nothing to the
#   result; the removed line appended an entry for every character.
#
diff --git a/pinyin.py b/pinyin.py index 938b727..f12435f 100644 --- a/pinyin.py +++ b/pinyin.py @@ -10,7 +10,9 @@ __all__ = ["PinYin"] import os.path +import re +INDEX =[0,0,0,0,2239,5543,9510,13471,17413,21406,25333] class PinYin(object): def __init__(self, dict_file='word.data'): @@ -25,8 +27,11 @@ def load_word(self): with file(self.dict_file) as f_obj: for f_line in f_obj.readlines(): try: - line = f_line.split(' ') - self.word_dict[line[0]] = line[1] + match = re.compile('\s+') + line = match.split(f_line) + self.word_dict.setdefault(line[0],[]) + for value in line[1:]: + self.word_dict[line[0]].append(value) except: line = f_line.split(' ') self.word_dict[line[0]] = line[1] @@ -39,8 +44,16 @@ def hanzi2pinyin(self, string=""): for char in string: key = '%X' % ord(char) - result.append(self.word_dict.get(key, char).split()[0][:-1].lower()) - + if len(key)==4: + index = INDEX[(int)(key[0])] + else: + index = INDEX[10] + + for ks in sorted(self.word_dict)[index:]: + if key == ks: + result.append(self.word_dict[key][0][:-1].lower()) + break + return result