text_cnn_demo.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author : szu-hwj
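"""Train a TextCNN sentiment classifier on the IMDB (aclImdb) movie-review
dataset with Keras: strip HTML from the raw reviews, tokenize and pad them,
then fit a 1D-CNN with several kernel sizes and max-over-time pooling."""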
from keras.models import Model
from keras.layers import Dense, Flatten, Conv1D, MaxPooling1D, Dropout, Input, concatenate
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import os
import re
import tarfile
import numpy as np
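
# Note: these import paths target standalone Keras 2.x; on recent tf.keras
# builds, Embedding is imported directly from keras.layers and the
# keras.layers.embeddings module no longer exists.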

# Some reviews contain HTML markup; strip the tags before tokenizing.
def rm_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)
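
# Quick sanity check (illustrative input, not part of the original data):
#     rm_tags("Great movie!<br /><br />Loved it.") -> "Great movie!Loved it."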


def read_files(filetype):
    """
    filetype: 'train' or 'test'
    return:
        all_texts: texts of the given split
        all_labels: labels of the given split
    """
    # Label 1 marks a positive review, 0 a negative one
    all_labels = [1]*12500 + [0]*12500
    all_texts = []
    file_list = []
    path = r'./aclImdb/'
    # Collect the positive review file names
    pos_path = path + filetype + '/pos/'
    for file in os.listdir(pos_path):
        file_list.append(pos_path + file)
    # Collect the negative review file names
    neg_path = path + filetype + '/neg/'
    for file in os.listdir(neg_path):
        file_list.append(neg_path + file)
    # Append the tag-stripped content of every file to all_texts
    for file_name in file_list:
        with open(file_name, encoding='utf-8') as f:
            all_texts.append(rm_tags(" ".join(f.readlines())))
    return all_texts, all_labels
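
# The hard-coded labels above rely on the aclImdb layout: each split ships
# exactly 12500 files under pos/ and 12500 under neg/, and file_list is
# filled pos-first, matching [1]*12500 + [0]*12500.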


def preprocessing(train_texts, train_labels, test_texts, test_labels):
    tokenizer = Tokenizer(num_words=2000)  # keep only the 2000 most frequent words
    tokenizer.fit_on_texts(train_texts)
    # Convert each review into a list of word indices from the vocabulary
    x_train_seq = tokenizer.texts_to_sequences(train_texts)
    x_test_seq = tokenizer.texts_to_sequences(test_texts)
    # Pad or truncate every sequence to a fixed length of 150 tokens
    x_train = sequence.pad_sequences(x_train_seq, maxlen=150)
    x_test = sequence.pad_sequences(x_test_seq, maxlen=150)
    y_train = np.array(train_labels)
    y_test = np.array(test_labels)
    return x_train, y_train, x_test, y_test
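
# pad_sequences pads (and truncates) on the left by default; a toy example
# with made-up word indices:
#     sequence.pad_sequences([[5, 8, 2]], maxlen=6) -> [[0, 0, 0, 5, 8, 2]]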


def text_cnn(maxlen=150, max_features=2000, embed_size=32):
    # Input: a sequence of maxlen word indices
    comment_seq = Input(shape=[maxlen], name='x_seq')
    # Embedding layer: maps each word index to an embed_size-dim vector
    emb_comment = Embedding(max_features, embed_size)(comment_seq)
    # Convolution branches with different kernel sizes, each max-pooled
    # over the full sequence length
    convs = []
    filter_sizes = [2, 3, 4, 5]
    for fsz in filter_sizes:
        l_conv = Conv1D(filters=100, kernel_size=fsz, activation='relu')(emb_comment)
        l_pool = MaxPooling1D(maxlen - fsz + 1)(l_conv)
        l_pool = Flatten()(l_pool)
        convs.append(l_pool)
    merge = concatenate(convs, axis=1)
    out = Dropout(0.5)(merge)
    output = Dense(32, activation='relu')(out)
    output = Dense(units=1, activation='sigmoid')(output)
    model = Model([comment_seq], output)
    # adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])
    return model
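
# Shape walk-through for one branch (batch dimension omitted), using the
# defaults maxlen=150, embed_size=32 and kernel size fsz=3:
#     Embedding    -> (150, 32)
#     Conv1D       -> (148, 100)   # 150 - 3 + 1 valid window positions
#     MaxPooling1D -> (1, 100)     # one max over the whole sequence per filter
#     Flatten      -> (100,)
# Concatenating the four branches yields a 400-dim vector, i.e. the
# max-over-time pooling of the classic TextCNN (Kim, 2014).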


if __name__ == '__main__':
    # Extract the IMDB archive on first run ('r:gz' opens a gzip-compressed tar)
    if not os.path.exists('./aclImdb'):
        tfile = tarfile.open(r'./aclImdb_v1.tar.gz', 'r:gz')
        tfile.extractall('./')  # extract into the current directory
    train_texts, train_labels = read_files('train')
    test_texts, test_labels = read_files('test')
    x_train, y_train, x_test, y_test = preprocessing(train_texts, train_labels, test_texts, test_labels)
    model = text_cnn()
    batch_size = 128
    epochs = 20
    model.fit(x_train, y_train,
              validation_split=0.1,
              batch_size=batch_size,
              epochs=epochs,
              shuffle=True)
    scores = model.evaluate(x_test, y_test)
    print('test_loss: %f, accuracy: %f' % (scores[0], scores[1]))
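
# model.evaluate returns [loss, metric, ...] in compile order, so scores[0]
# is the test loss and scores[1] the test accuracy.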