3.data_generate.py
import os
import struct
import collections
from tensorflow.core.example import example_pb2
from src import config
def chunk_file(finished_files_dir, chunks_dir, name, chunk_size):
    in_file = os.path.join(finished_files_dir, '%s.bin' % name)
    print(in_file)
    reader = open(in_file, "rb")
    chunk = 0
    finished = False
    while not finished:
        chunk_fname = os.path.join(chunks_dir, '%s_%03d.bin' % (name, chunk))  # next chunk file
        with open(chunk_fname, 'wb') as writer:
            for _ in range(chunk_size):
                len_bytes = reader.read(8)
                if not len_bytes:
                    finished = True
                    break
                str_len = struct.unpack('q', len_bytes)[0]
                example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
                writer.write(struct.pack('q', str_len))
                writer.write(struct.pack('%ds' % str_len, example_str))
        chunk += 1
def chunk_all():
    # Create a directory to hold the chunked files
    if not os.path.isdir(CHUNKS_DIR):
        os.mkdir(CHUNKS_DIR)
    # Split the data into chunks
    for name in ['train', 'val']:
        print("Splitting %s data into chunks..." % name)
        chunk_file(FINISHED_FILE_DIR, CHUNKS_DIR, name, CHUNK_SIZE)
    print("Saved chunked data in %s" % CHUNKS_DIR)
def read_text_file(text_file):
    """Load lines from the preprocessed text file."""
    lines = []
    with open(text_file, "r", encoding='utf-8') as f:
        for line in f:
            lines.append(line.strip())
    return lines
def write_to_bin(input_file, out_file, makevocab=False):
    # Serialize the data to a .bin file of length-prefixed tf.Examples
    if makevocab:
        vocab_counter = collections.Counter()
    with open(out_file, 'wb') as writer:
        # Read the input text file; even-numbered lines are articles and
        # odd-numbered lines are abstracts (line numbering starts at 0)
        lines = read_text_file(input_file)
        for i, new_line in enumerate(lines):
            if i % 2 == 0:
                article = lines[i]
            if i % 2 != 0:
                # Wrap the abstract as "<s> text </s>"
                abstract = "%s %s %s" % (SENTENCE_START, lines[i], SENTENCE_END)
                # Serialize the article/abstract pair as a tf.Example
                tf_example = example_pb2.Example()
                tf_example.features.feature['article'].bytes_list.value.extend([bytes(article, encoding='utf-8')])
                tf_example.features.feature['abstract'].bytes_list.value.extend([bytes(abstract, encoding='utf-8')])
                tf_example_str = tf_example.SerializeToString()
                str_len = len(tf_example_str)
                writer.write(struct.pack('q', str_len))
                writer.write(struct.pack('%ds' % str_len, tf_example_str))
                # Optionally collect tokens for the vocabulary
                if makevocab:
                    art_tokens = article.split(' ')
                    abs_tokens = abstract.split(' ')
                    abs_tokens = [t for t in abs_tokens if
                                  t not in [SENTENCE_START, SENTENCE_END]]  # keep the <s>/</s> markers out of the vocab
                    tokens = art_tokens + abs_tokens
                    tokens = [t.strip() for t in tokens]  # strip leading/trailing whitespace
                    tokens = [t for t in tokens if t != ""]  # drop empty tokens
                    vocab_counter.update(tokens)
    print("Finished writing file %s\n" % out_file)
    # Write the vocabulary to file as "word count" lines
    if makevocab:
        print("Writing vocab file...")
        with open(os.path.join(FINISHED_FILE_DIR, "vocab"), 'w', encoding='utf-8') as writer:
            for word, count in vocab_counter.most_common(VOCAB_SIZE):
                writer.write(word + ' ' + str(count) + '\n')
        print("Finished writing vocab file")
FINISHED_FILE_DIR = 'data/'
TRAIN_FILE = FINISHED_FILE_DIR + "train_art_summ_prep.txt"
VAL_FILE = FINISHED_FILE_DIR + "val_art_summ_prep.txt"
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'
# Vocabulary size
VOCAB_SIZE = config.vocab_size
# Number of examples per chunk
CHUNK_SIZE = 1000
# Directory for the chunked tf.Example data files
CHUNKS_DIR = os.path.join(FINISHED_FILE_DIR, 'chunked')

if not os.path.exists(FINISHED_FILE_DIR):
    os.makedirs(FINISHED_FILE_DIR)
write_to_bin(VAL_FILE, os.path.join(FINISHED_FILE_DIR, "val.bin"))
write_to_bin(TRAIN_FILE, os.path.join(FINISHED_FILE_DIR, "train.bin"), makevocab=True)
chunk_all()
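
The .bin and chunk files written above are sequences of records, each an 8-byte length prefix followed by a serialized tf.Example with 'article' and 'abstract' features. A minimal sketch for reading one of these files back and sanity-checking its contents is shown below; it is not part of 3.data_generate.py, and the path data/chunked/train_000.bin is only assumed from the defaults above.

# Sketch: iterate over length-prefixed tf.Examples in a generated .bin/chunk file.
import struct
from tensorflow.core.example import example_pb2

def read_examples(bin_path):
    with open(bin_path, 'rb') as reader:
        while True:
            len_bytes = reader.read(8)
            if not len_bytes:
                break  # end of file
            str_len = struct.unpack('q', len_bytes)[0]
            example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
            example = example_pb2.Example.FromString(example_str)
            article = example.features.feature['article'].bytes_list.value[0].decode('utf-8')
            abstract = example.features.feature['abstract'].bytes_list.value[0].decode('utf-8')
            yield article, abstract

# Print the first article/abstract pair from an assumed chunk file.
for article, abstract in read_examples('data/chunked/train_000.bin'):
    print(article[:80], '|', abstract[:80])
    break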