diff --git a/README.md b/README.md index e586100..95890b1 100644 --- a/README.md +++ b/README.md @@ -22,8 +22,9 @@ THIS PROJECT ONLY SUPPORT Python3. ## Download project and install You can install this project by: ``` -pip install bert-base==0.0.7 -i https://pypi.python.org/simple +pip install bert-base==0.0.8 -i https://pypi.python.org/simple ``` + OR ```angular2html git clone https://github.com/macanv/BERT-BiLSTM-CRF-NER @@ -148,6 +149,11 @@ with BertClient(show_server_config=False, check_version=False, check_length=Fals ``` you can see this after run the above code: ![](./pictures/server_ner_rst.png) +If you want to customize the word segmentation method, tokenize each sentence yourself and make the following simple change in the client-side code: + +```angular2html +rst = bc.encode([list(str), list(str)], is_tokenized=True) +``` #### 2. Text Classification Client ```angular2html diff --git a/bert_base/bert/extract_features.py b/bert_base/bert/extract_features.py index 5510f2e..b571b6c 100644 --- a/bert_base/bert/extract_features.py +++ b/bert_base/bert/extract_features.py @@ -415,7 +415,7 @@ def convert_lst_to_features(lst_str, seq_length, tokenizer, logger, is_tokenized examples = read_tokenized_examples(lst_str) if is_tokenized else read_line_examples(lst_str) - _tokenize = lambda x: tokenizer.mark_unk_tokens(x) if is_tokenized else tokenizer.tokenize(x) + _tokenize = lambda x: x if is_tokenized else tokenizer.tokenize(x) for (ex_index, example) in enumerate(examples): tokens_a = _tokenize(example.text_a) diff --git a/bert_base/bert/tokenization.py b/bert_base/bert/tokenization.py index f16efcb..2883c3e 100644 --- a/bert_base/bert/tokenization.py +++ b/bert_base/bert/tokenization.py @@ -348,3 +348,4 @@ def _is_punctuation(char): if cat.startswith("P"): return True return False + diff --git a/client_test.py b/client_test.py index 01c03a3..5abbb5a 100644 --- a/client_test.py +++ b/client_test.py @@ -13,14 +13,31 @@ def ner_test(): with BertClient(show_server_config=False, 
check_version=False, check_length=False, mode='NER') as bc: start_t = time.perf_counter() - str = '1月24日,新华社对外发布了中央对雄安新区的指导意见,洋洋洒洒1.2万多字,17次提到北京,4次提到天津,信息量很大,其实也回答了人们关心的很多问题。' - rst = bc.encode([str, str]) + str1 = '1月24日,新华社对外发布了中央对雄安新区的指导意见,洋洋洒洒1.2万多字,17次提到北京,4次提到天津,信息量很大,其实也回答了人们关心的很多问题。' + # rst = bc.encode([list(str1)], is_tokenized=True) + # str1 = list(str1) + rst = bc.encode([str1], is_tokenized=True) print('rst:', rst) + print(len(rst[0])) + print(time.perf_counter() - start_t) + + +def ner_cu_seg(): + """ + 自定义分字 + :return: + """ + with BertClient(show_server_config=False, check_version=False, check_length=False, mode='NER') as bc: + start_t = time.perf_counter() + str1 = '1月24日,新华社对外发布了中央对雄安新区的指导意见,洋洋洒洒1.2万多字,17次提到北京,4次提到天津,信息量很大,其实也回答了人们关心的很多问题。' + rst = bc.encode([list(str1)], is_tokenized=True) + print('rst:', rst) + print(len(rst[0])) print(time.perf_counter() - start_t) def class_test(): - with BertClient(port=5557, port_out=5558, show_server_config=False, check_version=False, check_length=False, mode='CLASS') as bc: + with BertClient(show_server_config=False, check_version=False, check_length=False, mode='CLASS') as bc: start_t = time.perf_counter() str = '北京时间2月17日凌晨,第69届柏林国际电影节公布主竞赛单元获奖名单,王景春、咏梅凭借王小帅执导的中国影片《地久天长》连夺最佳男女演员双银熊大奖,这是中国演员首次包揽柏林电影节最佳男女演员奖,为华语影片刷新纪录。与此同时,由青年导演王丽娜执导的影片《第一次的别离》也荣获了本届柏林电影节新生代单元国际评审团最佳影片,可以说,在经历数个获奖小年之后,中国电影在柏林影展再次迎来了高光时刻。' str2 = '受粤港澳大湾区规划纲要提振,港股周二高开,恒指开盘上涨近百点,涨幅0.33%,报28440.49点,相关概念股亦集体上涨,电子元件、新能源车、保险、基建概念多数上涨。粤泰股份、珠江实业、深天地A等10余股涨停;中兴通讯、丘钛科技、舜宇光学分别高开1.4%、4.3%、1.6%。比亚迪电子、比亚迪股份、光宇国际分别高开1.7%、1.2%、1%。越秀交通基建涨近2%,粤海投资、碧桂园等多股涨超1%。其他方面,日本软银集团股价上涨超0.4%,推动日经225和东证指数齐齐高开,但随后均回吐涨幅转跌东证指数跌0.2%,日经225指数跌0.11%,报21258.4点。受芯片制造商SK海力士股价下跌1.34%拖累,韩国综指下跌0.34%至2203.9点。澳大利亚ASX 200指数早盘上涨0.39%至6089.8点,大多数行业板块均现涨势。在保健品品牌澳佳宝下调下半财年的销售预期后,其股价暴跌超过23%。澳佳宝CEO亨弗里(Richard 
Henfrey)认为,公司下半年的利润可能会低于上半年,主要是受到销售额疲弱的影响。同时,亚市早盘澳洲联储公布了2月会议纪要,政策委员将继续谨慎评估经济增长前景,因前景充满不确定性的影响,稳定当前的利率水平比贸然调整利率更为合适,而且当前利率水平将有利于趋向通胀目标及改善就业,当前劳动力市场数据表现强势于其他经济数据。另一方面,经济增长前景亦令消费者消费意愿下滑,如果房价出现下滑,消费可能会进一步疲弱。在澳洲联储公布会议纪要后,澳元兑美元下跌近30点,报0.7120 。美元指数在昨日触及96.65附近的低点之后反弹至96.904。日元兑美元报110.56,接近上一交易日的低点。' @@ -31,5 +48,6 @@ def class_test(): if __name__ == '__main__': - class_test() - # ner_test() \ No newline at end of file + # class_test() + ner_test() + ner_cu_seg() \ No newline at end of file diff --git a/setup.py b/setup.py index 32daa65..7d45c1d 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ # print(__version__) setup( name='bert_base', - version='0.0.7', + version='0.0.8', description='Use Google\'s BERT for Chinese natural language processing tasks such as named entity recognition and provide server services', url='https://github.com/macanv/BERT-BiLSTM-CRF-NER', long_description=open('README.md', 'r', encoding='utf-8').read(),