'''
This script reads the corpus files from disk,
trains a logistic regression (LR) model,
and saves the dictionary, corpus, and trained model along the way.
'''
import os
import numpy as np
from sklearn.linear_model import LogisticRegression
from gensim import corpora, models
import jieba
from sklearn.model_selection import train_test_split
import pickle
# from learnTextClsf.textProcess import *
from scipy.sparse import csr_matrix
from sklearn.metrics import classification_report
def get_stop_words():
    # The stop-word file is assumed to hold one word per line, '\r\n'-separated
    path = r"C:\Users\刘衍程\nlp\stopwords.txt"
    file = open(path, 'rb').read().decode('utf-8').split('\r\n')
    return set(file)
def rm_stop_words(word_list):
    word_list = list(word_list)
    stop_words = get_stop_words()
    # Iterate in reverse: this matters because the list shrinks on every pop,
    # which would skip elements if we walked it front to back
    for i in range(len(word_list) - 1, -1, -1):
        # Remove stop words
        if word_list[i] in stop_words:
            word_list.pop(i)
        # Remove pure digits
        elif word_list[i].isdigit():
            word_list.pop(i)
    return word_list
def rm_word_freq_so_little(dictionary, freq_thred):
    small_freq_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq < freq_thred]
    dictionary.filter_tokens(small_freq_ids)
    dictionary.compactify()
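# Illustrative sketch (hypothetical tokens and ids): if 'rare' appears in fewer
# documents than freq_thred, filter_tokens removes it and compactify() then
# reassigns the surviving ids into a gap-free 0..N-1 range, e.g.
#   before: {0: 'cat', 1: 'rare', 2: 'dog'}  ->  after: {0: 'cat', 1: 'dog'}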
def listdir(path, list_name):
    # Recursively collect the full paths of all files under `path`
    for file in os.listdir(path):
        file_path = os.path.join(path, file)
        if os.path.isdir(file_path):
            listdir(file_path, list_name)
        else:
            list_name.append(file_path)
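# Usage sketch (hypothetical layout): given C:\corpus\{sports,finance}\*.txt,
#   names = []
#   listdir(r'C:\corpus', names)
# leaves names holding every file path from both subdirectories, depth-first.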
if __name__ == '__main__':
    freq_thred = 10  # A word seen fewer than this many times across the whole corpus should not be included in the dictionary
    # Dictionary
    dictionary = corpora.Dictionary()
    # Bag of words
    bow = []
    labels_count = []
    list_name = []
    listdir(r'C:\Users\刘衍程\nlp\一步一步来\中文新闻分类准备\资源\搜狗新闻语料\news_tensite_xml.smarty', list_name)
    count = 0
    for path in list_name[0:2]:
        print(path)
        file = open(path, 'rb').read().decode('utf-8').split('\n')
        class_count = 0
        for text in file:
            # Label bookkeeping: count the documents contributed by this file
            class_count = class_count + 1
            content = text
            # Word segmentation
            word_list = list(jieba.cut(content, cut_all=False))
            # Remove stop words
            word_list = rm_stop_words(word_list)
            dictionary.add_documents([word_list])
            '''
            Convert to a bag of words.
            gensim's Dictionary effectively acts as a map.
            The doc2bow step adds a word to the dictionary if it has not been seen before;
            if the dictionary already contains the word, its id is put into the current
            word_bow together with a count of how many times it appears in this text.
            '''
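            # Illustrative sketch (hypothetical ids): if 'cat' -> id 0 and 'dog' -> id 1,
            #   dictionary.doc2bow(['cat', 'dog', 'cat'])  ==  [(0, 2), (1, 1)]
            # i.e. a sparse list of (token_id, count) pairs for this document only.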
            word_bow = dictionary.doc2bow(word_list)
            bow.append(word_bow)
        # One entry per file: how many documents that file (i.e. that class) contributed
        labels_count.append(class_count - 1)
    for i in range(len(bow)):
        print(bow[i])
    # with open('dictionary.pkl', 'wb') as f1:
    #     pickle.dump(dictionary, f1)
    # Remove words that occur too rarely. NB: may cause a dimensionality mismatch
    rm_word_freq_so_little(dictionary, freq_thred)
    # dictionary.save('dicsave.dict')
    # corpora.MmCorpus.serialize('bowsave.mm', bow)
    '''Train the tf-idf model'''
    tfidf_model = models.TfidfModel(corpus=bow, dictionary=dictionary)
    # with open('tfidf_model.pkl', 'wb') as f2:
    #     pickle.dump(tfidf_model, f2)
    corpus_tfidf = [tfidf_model[doc] for doc in bow]
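    # Illustrative sketch (hypothetical values): each entry of corpus_tfidf is a
    # sparse document vector of (token_id, tfidf_weight) pairs, e.g.
    #   [(0, 0.71), (3, 0.70)]
    # so words frequent everywhere get low weight and distinctive words high weight.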
    '''Convert the gensim-format sparse matrix into a matrix scikit-learn models can consume'''
    data = []
    rows = []
    cols = []
    line_count = 0
    for line in corpus_tfidf:
        for elem in line:
            rows.append(line_count)
            cols.append(elem[0])
            data.append(elem[1])
        line_count += 1
    print(line_count)
    tfidf_matrix = csr_matrix((data, (rows, cols))).toarray()
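    # Note: csr_matrix((data, (rows, cols))) builds the matrix from COO-style
    # triplets, i.e. tfidf_matrix[rows[k], cols[k]] = data[k] for every k;
    # .toarray() then densifies it so LogisticRegression can take it directly.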
    count = 0
    for ele in tfidf_matrix:
        # print(ele)
        # print(count)
        count = count + 1
    # cut label 1 mil label 0
    '''Generate the labels'''
    # One label per row of tfidf_matrix; documents from the first file are class 1, the rest class 0
    labels = np.zeros(line_count)
    for i in range(labels_count[0]):
        labels[i] = 1
    '''Split into training and test sets'''
    # Roughly 80/20 random split, drawn per document
    rarray = np.random.random(size=line_count)
    x_train = []
    y_train = []
    x_test = []
    y_test = []
    for i in range(line_count):
        if rarray[i] < 0.8:
            x_train.append(tfidf_matrix[i, :])
            y_train.append(labels[i])
        else:
            x_test.append(tfidf_matrix[i, :])
            y_test.append(labels[i])
    # x_train, x_test, y_train, y_test = train_test_split(tfidf_matrix, labels, test_size=0.3, random_state=0)
    '''Train the LR classification model'''
    classifier = LogisticRegression()
    classifier.fit(x_train, y_train)
    # with open('LR_model.pkl', 'wb') as f:
    #     pickle.dump(classifier, f)
    print(classification_report(y_test, classifier.predict(x_test)))
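    # A minimal inference sketch (assuming the pickled artifacts above were
    # actually saved, i.e. the commented-out dumps were enabled): reload the
    # dictionary, tf-idf model, and classifier, then score a new text.
    #
    # with open('dictionary.pkl', 'rb') as f:
    #     dictionary = pickle.load(f)
    # with open('tfidf_model.pkl', 'rb') as f:
    #     tfidf_model = pickle.load(f)
    # with open('LR_model.pkl', 'rb') as f:
    #     classifier = pickle.load(f)
    # words = rm_stop_words(jieba.cut('要分类的新闻文本', cut_all=False))
    # vec = tfidf_model[dictionary.doc2bow(words)]
    # row = np.zeros(classifier.coef_.shape[1])
    # for token_id, weight in vec:
    #     row[token_id] = weight
    # print(classifier.predict([row]))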