# -*- coding: utf-8 -*-
"""
Created on Sat Jun 23 20:55:25 2018
提取招聘信息中的技能要求
以数据分析实习为例
@author: situ
"""
import pandas as pd
import numpy as np
import os
import re
import jieba
from collections import Counter,defaultdict
import operator
from nltk import ngrams
import csv
import matplotlib.pyplot as plt
"""
自定义词典选取词的方式:
1. 查看分词结果
2. 查看2-gram词频统计
"""
#os.chdir("E:/graduate/class/EDA/final")
os.chdir("/Users/situ/Documents/EDA/final")
jieba.load_userdict("dict.txt")
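# A minimal sketch of the assumed format of dict.txt (the entries below are
# illustrative, not the actual file contents): jieba's user dictionary takes
# one term per line, optionally followed by a frequency and a POS tag, e.g.
#   数据分析 10 n
#   数据挖掘 10 n
#   SQL 5 eng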
# English-language postings were removed by hand beforehand
data = pd.read_csv("数据分析_共47页.csv", encoding="gbk")
data.head()
text = data["contents"]
# drop duplicated postings
sum(data["contents"].duplicated())
data[data["contents"].duplicated()]
data = data.drop_duplicates(["contents"])
# check for missing values
sum(data["contents"].isnull())
# text preprocessing ---------------------------------------------------------
def get_text(data):
    """Return the non-null posting texts as a list of unicode strings."""
    text = data["contents"]
    text = text.dropna()
    text = [t.encode('utf-8').decode("utf-8") for t in text]
    return text
def get_stop_words(file='stopWord.txt'):
    """Read the comma-separated stop-word file and return it as a set."""
    words = open(file, 'rb').read().decode('utf8').split(',')
    words = [line.strip() for line in words]
    return set(words)
def rm_tokens(words):
    """Drop stop words and purely numeric tokens."""
    words_list = list(words)
    stop_words = get_stop_words()
    for i in reversed(range(len(words_list))):
        if words_list[i] in stop_words:    # remove stop words
            words_list.pop(i)
        elif words_list[i].isdigit():      # remove pure digits
            words_list.pop(i)
    return words_list
def rm_char(text):
    """Strip control characters, punctuation and digits from a sentence."""
    text = re.sub('\x01', '', text)
    text = re.sub('\u3000', '', text)                  # full-width (ideographic) space
    text = re.sub(r"[\)(↓%·▲ \s+】&【]", "", text)
    text = re.sub(r'[\d()《》><‘’“”".,-]', " ", text, flags=re.I)
    text = re.sub('\n+', " ", text)
    text = re.sub('[,、:。!??;——]', " ", text)
    text = re.sub(' +', " ", text)
    return text
def convert_doc_to_wordlist(paragraph, cut_all=False):
    """Split a posting into sentences, clean each one and segment it with jieba."""
    sent_list = re.split(r"[。!;:\n.;:?]", paragraph)
    sent_list = map(rm_char, sent_list)                # strip characters such as \u3000
    word_2dlist = [rm_tokens(jieba.cut(part, cut_all=cut_all))
                   for part in sent_list]              # word segmentation
    # word_list = sum(word_2dlist, [])
    def rm_space_null(alist):
        return [s for s in alist if s not in ["", " "]]
    rm_space = [rm_space_null(ws) for ws in word_2dlist if len(ws) > 0]
    return rm_space
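# Illustrative usage (the exact tokens depend on jieba's segmentation and on
# the stop-word list, so the output shown is only approximate):
# convert_doc_to_wordlist("熟悉Python。了解SQL")
# -> [["熟悉", "Python"], ["了解", "SQL"]]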
def rm_low_freq_word(texts, low_freq=1):
    """Drop tokens whose overall corpus frequency is at most low_freq."""
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > low_freq] for text in texts]
    return texts
def rm_short_len_word(texts, short_len=0):
    """Keep tokens longer than short_len and shorter than 15 characters."""
    texts = [[token for token in text if short_len < len(token) < 15] for text in texts]
    return texts
def rm_high_freq_word(texts, num=10, other_dele_file="delete_words.txt"):
    """Drop the num most frequent tokens plus any terms listed in other_dele_file."""
    whole_text = []
    for doc in texts:
        whole_text.extend(doc)
    word_count = np.array(Counter(whole_text).most_common())
    high_freq = []
    for i in range(num):
        high_freq.append(word_count[i][0])
    if other_dele_file is not None:
        other_dele_list = open(other_dele_file, 'rb').read().decode('gbk').split('\n')
        high_freq.extend(other_dele_list)
        dele_list = np.unique(high_freq)
    else:
        dele_list = high_freq
    # print(dele_list)
    texts = [[token.lower() for token in text if token not in dele_list] for text in texts]
    return texts
"""
可尝试词性标注,把动词去掉,没试,感觉不好
尝试2-Gram模型,已尝试,效果不好
"""
def word_seg():
    """Segment each job description and save the result alongside the original data."""
    clean_text = [convert_doc_to_wordlist(line) for line in get_text(data)]
    clean_text_for_wordseg = [" ".join([" ".join(sentlist) for sentlist in line]) for line in clean_text]
    clean_text_for_wordseg = [line.split() for line in clean_text_for_wordseg]
    clean_text_for_wordseg = rm_high_freq_word(rm_short_len_word(clean_text_for_wordseg))
    data["word_seg"] = [" ".join(line) for line in clean_text_for_wordseg]
    data.to_csv("data_with_wordseg.csv", index=False, encoding="gbk")
def main():
    clean_text = [convert_doc_to_wordlist(line) for line in get_text(data)]
    clean_text = [sent for para in clean_text for sent in para]    # flatten into individual sentences
    # length = np.array([len(sent) for sent in clean_text])        # inspect overly long sentences
    # plt.hist(length)
    # np.array(clean_text)[length > 50]
    clean_text2 = rm_low_freq_word(clean_text)
    clean_text3 = rm_short_len_word(clean_text2)
    clean_text4 = rm_high_freq_word(clean_text3)
    clean_text5 = [sent for sent in clean_text4 if len(sent) > 2]
    # len(clean_text4)
    with open("clean_text.txt", "w") as f2:
        for sent in clean_text5:
            f2.write(" ".join(sent) + "\n")

if __name__ == '__main__':
    main()
# clean_text with the original row index preserved
clean_text = [convert_doc_to_wordlist(line) for line in get_text(data)]
sent_index = [[i] * len(clean_text[i]) for i in range(len(clean_text))]
sent_index = [index for index_set in sent_index for index in index_set]
len(sent_index)
clean_text = [sent for para in clean_text for sent in para]    # flatten into individual sentences
clean_text2 = rm_low_freq_word(clean_text)
clean_text3 = rm_short_len_word(clean_text2)
clean_text4 = rm_high_freq_word(clean_text3)
clean_text5 = [sent for sent in clean_text4 if len(sent) > 2]
sent_index5 = [sent_index[i] for i in range(len(clean_text4)) if len(clean_text4[i]) > 2]
clean_text5 = [" ".join(sent_list) for sent_list in clean_text5]
clean_text_with_index = pd.DataFrame({"index": sent_index5, "text": clean_text5})
clean_text_with_index.to_csv("clean_text_with_index.csv", index=False, encoding="utf-8-sig")
csvfile = open("clean_text_with_index.csv", 'w', newline='', encoding='utf-8-sig')
writer = csv.writer(csvfile)
for i in range(len(clean_text5)):
    writer.writerow([sent_index5[i] + 2, clean_text5[i]])    # clean_text5[i] is already a joined string
csvfile.close()
# save a before/after comparison of the word segmentation to csv
sent_list = [sent for paragraph in get_text(data) for sent in re.split(r"[。!;:\n.;:]", paragraph)]
sent_list_2 = [sent_i for sent_i in sent_list if len(sent_i) > 5]
sent_list_2 = list(map(rm_char, sent_list_2))
sent_list_cut = ["/".join(rm_tokens(jieba.cut(part, cut_all=False))) for part in sent_list_2]
d = pd.DataFrame({"sentence": sent_list_2, "wordseg": sent_list_cut})
d.head()
d.to_csv("word_seg.csv", index=False, encoding="gbk")
# load the cleaned sentences back in
def loadDataset():
    """Load the cleaned text written by main()."""
    f = open('clean_text.txt', 'r')
    dataset = []
    for line in f.readlines():
        # print(line)
        dataset.append(line.strip())
    f.close()
    return dataset

text = loadDataset()
def word_count(texts):
    """Count token frequencies and write the 1000 most common to wordcount.csv."""
    texts_list = [w for text_i in texts for w in text_i.split()]
    word_count = np.array(Counter(texts_list).most_common())
    print(word_count[:10])
    csvfile = open("wordcount.csv", 'w', newline='', encoding='utf-8-sig')
    writer = csv.writer(csvfile)
    for row in word_count[0:1000, ]:
        writer.writerow([row[0], row[1]])
    csvfile.close()

word_count(text)
# count 2-gram frequencies and write them to csv
def CountNgram(text, n=2, print_n=20):
    ngram_list = []
    for text_i in text:
        analyzer2 = ngrams(text_i.split(), n)
        Ngram_dict_i = Counter(analyzer2)
        for k in Ngram_dict_i.keys():
            ngram_list.append("/".join(k))
    Ngram_dict = Counter(ngram_list)
    sortedNGrams = sorted(Ngram_dict.items(), key=operator.itemgetter(1), reverse=True)    # sort by count, descending
    print("the top %d wordcount of %d gram model of period_1 is:\n" % (print_n, n), sortedNGrams[:print_n], "\n")
    csvfile = open("2gram_wordcount.csv", 'w', newline='', encoding='utf-8-sig')
    writer = csv.writer(csvfile)
    for line in sortedNGrams:
        writer.writerow([line[0], line[1]])
    csvfile.close()

CountNgram(text)