From 257dabd300b29957a0be38e7a8049a54f2095ccc Mon Sep 17 00:00:00 2001
From: steve
Date: Tue, 4 Jun 2019 00:08:29 +0800
Subject: [PATCH] fix some bugs

---
 README.md | 3 ++-
 nlp_toolkit/chunk_segmentor/README.md | 3 +++
 nlp_toolkit/chunk_segmentor/__init__.py | 26 ++++++++++++-------------
 nlp_toolkit/chunk_segmentor/segment.py | 3 ++-
 nlp_toolkit/chunk_segmentor/utils.py | 19 ++++++++++++++++--
 nlp_toolkit/classifier.py | 4 ++--
 nlp_toolkit/data.py | 3 ++-
 setup.py | 2 +-
 8 files changed, 42 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index f88f8e5..e0cca9f 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ pip install git+https://www.github.com/keras-team/keras-contrib.git
 3. Trainer:定义模型的训练流程,支持bucket序列、自定义callbacks和N折交叉验证
 
-  * bucket序列:通过将相似长度的文本放入同一batch来减小padding的多余计算来实现模型训练的加速,在文本分类任务中,能够对RNN网络提速2倍以上(暂时不支持含有Flatten层的网络)
+  * bucket序列:通过将相似长度的文本放入同一batch来减小padding的多余计算来实现模型训练的加速,在文本分类任务中,能够对RNN网络提速2倍以上(**暂时不支持含有Flatten层的网络**)
 
   * callbacks:通过自定义回调器来控制训练流程,目前预设的回调器有提前终止训练,学习率自动变化,更丰富的评估函数等
 
@@ -98,6 +98,7 @@ y_pred = text_classifier.predict(dataset.texts)
 # chunk分词
 # 第一次import的时候,会自动下载模型和字典数据
 # 支持单句和多句文本的输入格式,建议以列表的形式传入分词器
+# 源代码中已略去相关数据的下载路径,有需要的请邮件联系
 from nlp_toolkit.chunk_segmentor import Chunk_Segmentor
 cutter = Chunk_Segmentor()
 s = '这是一个能够输出名词短语的分词器,欢迎试用!'

diff --git a/nlp_toolkit/chunk_segmentor/README.md b/nlp_toolkit/chunk_segmentor/README.md
index 7ee0e9b..b1424ba 100644
--- a/nlp_toolkit/chunk_segmentor/README.md
+++ b/nlp_toolkit/chunk_segmentor/README.md
@@ -2,6 +2,9 @@
 
 环境依赖:python 3.6.5 (暂时只支持python3)
 
+**不再维护更新**
+**源代码中已略去相关数据的下载路径,有需要的请邮件联系**
+
 ## 安装
 
 ```bash

diff --git a/nlp_toolkit/chunk_segmentor/__init__.py b/nlp_toolkit/chunk_segmentor/__init__.py
index f6752c7..c857514 100644
--- a/nlp_toolkit/chunk_segmentor/__init__.py
+++ b/nlp_toolkit/chunk_segmentor/__init__.py
@@ -12,12 +12,12 @@
 MD5_FILE_PATH = DATA_PATH / 'model_data.md5'
 UPDATE_TAG_PATH = DATA_PATH / 'last_update.pkl'
 UPDATE_INIT_PATH = DATA_PATH / 'init_update.txt'
-MD5_HDFS_PATH = '/user/kdd_wangyilei/chunk_segmentor/model_data.md5'
-MODEL_HDFS_PATH = '/user/kdd_wangyilei/chunk_segmentor/model_data.zip'
-USER_NAME = 'yilei.wang'
-PASSWORD = 'ifchange0829FWGR'
-FTP_PATH_1 = 'ftp://192.168.8.23:21/chunk_segmentor'
-FTP_PATH_2 = 'ftp://211.148.28.11:21/chunk_segmentor'
+MD5_HDFS_PATH = '/user/xxxx/chunk_segmentor/model_data.md5'
+MODEL_HDFS_PATH = '/user/xxxx/chunk_segmentor/model_data.zip'
+USER_NAME = 'xxxx'
+PASSWORD = 'xxxxx'
+FTP_PATH_1 = 'ftp://xxx.xxx.xx.xx:xx/chunk_segmentor'
+FTP_PATH_2 = 'ftp://xxx.xxx.xx.xx:xx/chunk_segmentor'
 
 IP = socket.gethostbyname(socket.gethostname())
 
@@ -43,7 +43,7 @@ def check_version():
             with open(UPDATE_INIT_PATH, 'w') as fout:
                 fout.write(init_update_time)
     else:
-        print('请寻找一台有hadoop或者能访问ftp://192.168.8.23:21或者ftp://211.148.28.11:21的机器')
+        print('请寻找一台有hadoop或者能访问ftp://xxx.xxx.xx.xx:xx或者ftp://xxx.xxx.xx.xx:xx的机器')
 
 
 def write_config(config_path, new_root_path):
@@ -67,7 +67,7 @@ def download():
         os.remove(fname)
 
     if not IP.startswith('127'):
-        print('尝试从ftp://192.168.8.23:21获取数据')
+        print('尝试从ftp://xxx.xxx.xx.xx:xx获取数据')
         ret2 = os.system('wget -q --timeout=2 --tries=1 --ftp-user=%s --ftp-password=%s %s/model_data.md5' %
                          (USER_NAME, PASSWORD, FTP_PATH_1))
         if ret2 == 0:
@@ -78,7 +78,7 @@
             ret1 = os.system('hadoop fs -get %s' % MODEL_HDFS_PATH)
             ret2 = os.system('hadoop fs -get %s' % MD5_HDFS_PATH)
         else:
-            print('尝试从ftp://211.148.28.11:21获取数据')
+            print('尝试从ftp://xxx.xxx.xx.xx:xx获取数据')
            ret2 = os.system('wget -q --timeout=2 --tries=1 --ftp-user=%s --ftp-password=%s %s/model_data.md5' %
                             (USER_NAME, PASSWORD, FTP_PATH_2))
             if ret2 == 0:
@@ -118,7 +118,7 @@ def get_data_md5():
         if ret == 0:
             src = 'ftp2'
     if ret != 0:
-        print('请寻找一台有hadoop或者能访问ftp://192.168.8.23:21或者ftp://211.148.28.11:21的机器')
+        print('请寻找一台有hadoop或者能访问ftp://xxx.xxx.xx.xx:xx或者ftp://xxx.xxx.xx.xx:xx的机器')
         return None
     else:
         return src
@@ -154,12 +154,12 @@
             os.remove(fname)
 
     if src == 'hdfs':
         print('尝试从hdfs上拉取数据,大约20-30s')
-        os.system('hadoop fs -get /user/kdd_wangyilei/chunk_segmentor/model_data.zip')
+        os.system('hadoop fs -get /user/xxxxx/chunk_segmentor/model_data.zip')
     elif src == 'ftp1':
-        print('尝试从ftp://192.168.8.23:21获取数据')
+        print('尝试从ftp://xxx.xxx.xx.xx:xx获取数据')
         os.system('wget --ftp-user=%s --ftp-password=%s %s/model_data.zip' % (USER_NAME, PASSWORD, FTP_PATH_1))
     elif src == 'ftp2':
-        print('尝试从ftp://211.148.28.11:21获取数据')
+        print('尝试从ftp://xxx.xxx.xx.xx:xx获取数据')
         os.system('wget --ftp-user=%s --ftp-password=%s %s/model_data.zip' % (USER_NAME, PASSWORD, FTP_PATH_2))
     os.system('unzip -q model_data.zip')

diff --git a/nlp_toolkit/chunk_segmentor/segment.py b/nlp_toolkit/chunk_segmentor/segment.py
index 308574f..827f28d 100644
--- a/nlp_toolkit/chunk_segmentor/segment.py
+++ b/nlp_toolkit/chunk_segmentor/segment.py
@@ -175,6 +175,7 @@ def extract_item(self, item):
         poss = list(flatten_gen(complete_poss))
         if self.cut_all:
             words, poss = zip(*[(x1, y1) for x, y in zip(words, poss) for x1, y1 in self.cut_qualifier(x, y)])
+            words = [' ' if word == 's_' else word for word in words]
         if self.pos:
             d = (words,  # C_CUT_WORD
                  poss,  # C_CUT_POS
@@ -186,7 +187,7 @@
         return d
 
     def cut_qualifier(self, x, y):
-        if y == 'np' and '_' in x:
+        if y == 'np' and '_' in x and x not in ['s_', 'ss_', 'lan_']:
             for sub_word in x.split('_'):
                 yield sub_word, y
         else:

diff --git a/nlp_toolkit/chunk_segmentor/utils.py b/nlp_toolkit/chunk_segmentor/utils.py
index eaa52ef..977c720 100644
--- a/nlp_toolkit/chunk_segmentor/utils.py
+++ b/nlp_toolkit/chunk_segmentor/utils.py
@@ -187,15 +187,30 @@ def jieba_cut(sent_list, segmentor, qualifier_word=None, mode='accurate', dict_l
     # URLs
     # r'(?:https?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
     # r'\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-] +\.[a-zA-Z0-9-.] +\b'  # E-MAIL
+    r'&#[\s\w\d]+;'
 ]
-START_PATTERN = r'(\d+、|\d+\.(?!\d+)|\d+\)|(?
-        if result.shape[1] > n_labels:
+        if result.shape[1] > n_labels and self.model_name == 'bi_lstm_att':
             attention = result[:, n_labels:]
             attention = [attention[idx][:l] for idx, l in enumerate(x_len)]
             return y_pred, attention
@@ -171,7 +171,7 @@ def evaluate(self, x: Dict[str, List[List[str]]], y: List[str],
     def load(self, weight_fname, para_fname):
         if self.model_name == 'bi_lstm_att':
             self.model = bi_lstm_attention.load(weight_fname, para_fname)
-        elif self.model_name == 'multi_head_self_att':
+        elif self.model_name == 'transformer':
             self.model = Transformer.load(weight_fname, para_fname)
         elif self.model_name == 'text_cnn':
             self.model = textCNN.load(weight_fname, para_fname)

diff --git a/nlp_toolkit/data.py b/nlp_toolkit/data.py
index bc400ba..c3430fa 100644
--- a/nlp_toolkit/data.py
+++ b/nlp_toolkit/data.py
@@ -75,7 +75,8 @@ def __init__(self, mode, fname='', tran_fname='',
         self.config = config
         self.data_config = config['data']
         self.embed_config = config['embed']
-        self.data_format = self.data_config['format']
+        if self.task_type == 'sequence':
+            self.data_format = self.data_config['format']
         if self.basic_token == 'word':
             self.max_tokens = self.data_config['max_words']
             self.inner_char = self.data_config['inner_char']

diff --git a/setup.py b/setup.py
index c4b60f4..7f2119c 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@
 setup(
     name='nlp_toolkit',
-    version='1.3.1',
+    version='1.3.2',
     description='NLP Toolkit with easy model training and applications',
     long_description=long_description,
     long_description_content_type='text/markdown',
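
The segment.py hunks above are the main bug fix: cut_qualifier no longer splits the internal placeholder tokens 's_', 'ss_' and 'lan_' on '_', and the 's_' placeholder is mapped back to a literal space in the final word list. Below is a minimal standalone sketch of that behaviour; cut_qualifier is reduced to a free function and the sample tokens are invented for illustration, so this is not an excerpt from the actual Chunk_Segmentor class.

```python
# Standalone sketch of the placeholder handling fixed in segment.py.
# In the real code cut_qualifier is a method on the segmenter; the
# sample tokens below are invented.

PLACEHOLDERS = {'s_', 'ss_', 'lan_'}  # internal markers that must not be split on '_'


def cut_qualifier(word, pos):
    """Split '_'-joined noun-phrase chunks into sub-words, leaving placeholders intact."""
    if pos == 'np' and '_' in word and word not in PLACEHOLDERS:
        for sub_word in word.split('_'):
            yield sub_word, pos
    else:
        yield word, pos


if __name__ == '__main__':
    tokens = [('机器_学习', 'np'), ('s_', 'np'), ('工程师', 'n')]
    pairs = [pair for w, p in tokens for pair in cut_qualifier(w, p)]
    words, poss = zip(*pairs)
    # restore the whitespace placeholder in the output, as the patch does
    words = [' ' if w == 's_' else w for w in words]
    print(words)  # ['机器', '学习', ' ', '工程师']
    print(poss)   # ('np', 'np', 'np', 'n')
```

Keeping the placeholder check inside the 'np' branch means ordinary noun-phrase chunks are still expanded into their sub-words; only the bookkeeping tokens pass through untouched.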