-
Notifications
You must be signed in to change notification settings - Fork 3
/
prepare_data.py
58 lines (50 loc) · 1.46 KB
/
prepare_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import re
from Feature import Feature
def read_file(fileName):
    """Read a UTF-8 text file and return its non-blank lines.

    Args:
        fileName: Path of the file to read.

    Returns:
        A list of the file's lines in their original order. Each kept line is
        returned unmodified (including its trailing newline, if any). Lines
        that are empty or consist only of whitespace are dropped.
    """
    with open(fileName, 'r', encoding='utf-8') as f:
        # strip() is only a blankness test here; the line itself is kept
        # as read so callers still see the original text.
        return [line for line in f if line.strip()]
def write_file(filter_file, corpus):
    """Append every entry of ``corpus`` to a UTF-8 text file, one per line.

    Args:
        filter_file: Path of the output file. It is opened in append mode, so
            any existing content is kept and the new lines are added after it.
        corpus: Iterable of strings. Each one is written followed by a
            newline character.
    """
    with open(filter_file, 'a', encoding='utf-8') as f:
        # One buffered writelines call instead of a write() per entry.
        f.writelines(data + "\n" for data in corpus)
def prepare_data(data, filter_file):
    """Remove the annotation tags from corpus lines and write out the result.

    Every line is split on whitespace. For each token, anything following the
    first ``]`` is cut off and every ``/<ASCII letters>`` tag is removed. The
    first token of the line is then discarded and the remaining tokens are
    joined with single spaces. The cleaned lines are handed to ``write_file``
    and a completion message is logged and printed.

    Args:
        data: Iterable of corpus lines (strings).
        filter_file: Output path passed on to ``write_file``.
    """
    import logging

    # Compiled once up front instead of being looked up for every token.
    tag_pattern = re.compile(r"/[a-zA-Z]+")

    corpus = []
    for line in data:
        cleaned = []
        # split() with no arguments already discards the trailing newline
        # and any other surrounding whitespace.
        for token in line.split():
            bracket = token.find(']')
            # find() returns -1 when there is no ']'; a ']' at position 0 is
            # likewise left alone, so only later positions truncate the token.
            if bracket > 0:
                token = token[:bracket + 1]
            cleaned.append(tag_pattern.sub("", token))
        # The first token of every line is dropped.
        # NOTE(review): presumably a sentence/record id - confirm against the
        # corpus format.
        corpus.append(' '.join(cleaned[1:]))

    write_file(filter_file, corpus)
    logging.info("file filtering completed")
    print("file filtering completed")
if __name__ == "__main__":
    # Tagged input corpora and the files their tag-stripped versions go to.
    train_file = "./data/train.txt"
    test_file = "./data/test.txt"
    filter_train_file = "./data/filter_train.txt"
    filter_test_file = "./data/filter_test.txt"

    # Only the test split is processed by this run. To process the training
    # split instead, pass train_file / filter_train_file to the calls below.
    data = read_file(test_file)
    prepare_data(data, filter_test_file)