# Cluster 36kr news articles: jieba tokenization -> TF-IDF features -> k-means.
import os
import pickle

import jieba
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer


def load_articles():
"""
将文本分词
:return:
"""
filename = 'data/36kr_articles.pkl'
if os.path.exists(filename):
with open(filename, 'rb') as in_data:
articles = pickle.load(in_data)
return articles
stop_words = [word.strip() for word in open('data/stop_words.txt', 'r', encoding='utf-8').readlines()]
print(stop_words)
data = pd.read_csv('data/36kr_articles.csv')
print("正在分词...")
# 分词,去除停用词
data['title'] = data['title'].apply(lambda x: " ".join([word for word in jieba.cut(x) if word not in stop_words]))
data['summary'] = data['summary'].apply(
lambda x: " ".join([word for word in jieba.cut(x) if word not in stop_words]))
data['content'] = data['content'].apply(
lambda x: " ".join([word for word in jieba.cut(str(x)) if word not in stop_words]))
articles = []
for title, summary, content in zip(data['title'].tolist(),
data['summary'].tolist(),
data['content'].tolist()):
article = title + summary + content
articles.append(article)
with open(filename, 'wb') as out_data:
pickle.dump(articles, out_data, pickle.HIGHEST_PROTOCOL)
return articles
# articles=load_articles()
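# A minimal illustration of the tokenization step above, assuming jieba's
# default cut mode (the exact segmentation may vary by jieba version):
#
#     " ".join(jieba.cut("人工智能创业公司"))  # e.g. "人工智能 创业 公司"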
def transform(articles, n_features=1000):
    """
    Extract TF-IDF features
    :param articles: list of space-joined token strings
    :param n_features: vocabulary size cap
    :return: (sparse document-term matrix, fitted vectorizer)
    """
    # max_df=0.5 drops terms appearing in more than half of the documents;
    # min_df=2 drops terms appearing in fewer than two documents
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features, min_df=2, use_idf=True)
    X = vectorizer.fit_transform(articles)
    return X, vectorizer
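# A toy sketch of what transform() produces (the corpus is hypothetical; four
# documents are used so every term satisfies both min_df=2 and max_df=0.5):
#
#     docs = ["apple banana", "apple cherry", "banana date", "cherry date"]
#     X, vec = transform(docs, n_features=10)
#     print(X.shape)  # (4, 4): four documents, four surviving terms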
def train(X, vectorizer, true_k=10, mini_batch=False, show_label=False):
    """
    Train k-means
    :param X: TF-IDF document-term matrix
    :param vectorizer: fitted vectorizer, used to look up feature names
    :param true_k: number of clusters
    :param mini_batch: use MiniBatchKMeans instead of KMeans
    :param show_label: print the top terms and size of each cluster
    :return: inertia of the fitted model on X
    """
    if mini_batch:
        k_means = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                                  init_size=1000, batch_size=1000, verbose=False)
    else:
        k_means = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1,
                         verbose=False)
    k_means.fit(X)
    if show_label:  # print the top terms per cluster
        print("Top terms per cluster:")
        order_centroids = k_means.cluster_centers_.argsort()[:, ::-1]
        # get_feature_names_out() for scikit-learn >= 1.0
        # (get_feature_names() was removed in 1.2)
        terms = vectorizer.get_feature_names_out()
        for i in range(true_k):
            print("Cluster %d:" % i, end='')
            for ind in order_centroids[i, :10]:
                print(' %s' % terms[ind], end='')
            print()
        result = list(k_means.predict(X))
        print('Cluster distribution:')
        print({i: result.count(i) for i in set(result)})
    return -k_means.score(X)
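# Note: KMeans.score(X) returns the *negative* inertia, so train() returns the
# total within-cluster sum of squared distances; plot_params() divides it by
# the corpus size to get a mean distortion that is comparable across runs.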
def plot_params():
    """
    Sweep the number of clusters to pick the best value
    :return:
    """
    articles = load_articles()
    print("%d documents" % len(articles))
    X, vectorizer = transform(articles, n_features=500)
    true_ks = []
    scores = []
    for i in range(3, 80, 1):
        score = train(X, vectorizer, true_k=i) / len(articles)
        print(i, score)
        true_ks.append(i)
        scores.append(score)
    plt.figure(figsize=(8, 4))
    plt.plot(true_ks, scores, label="error", color="red", linewidth=1)
    plt.xlabel("n_clusters")  # the sweep varies the cluster count, not n_features
    plt.ylabel("error")
    plt.legend()
    plt.show()


# plot_params()
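# Reading the sweep: mean distortion always falls as the cluster count grows,
# so look for the "elbow" where the curve flattens; true_k=10 in out() below
# is presumably the value chosen from such a run.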
def out():
    """
    Print the clustering results under the chosen parameters
    :return:
    """
    articles = load_articles()
    X, vectorizer = transform(articles, n_features=500)
    score = train(X, vectorizer, true_k=10, show_label=True) / len(articles)
    print(score)


out()
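# A possible follow-up (a sketch, not part of the original script): if train()
# also returned the fitted model, the predicted labels could be joined back to
# the raw CSV to inspect sample titles per cluster:
#
#     labels = k_means.predict(X)               # hypothetical: needs the model
#     raw = pd.read_csv('data/36kr_articles.csv')
#     raw['cluster'] = labels
#     print(raw.groupby('cluster')['title'].head(3))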