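"""prepro.py

Preprocess MS-COCO captions for image captioning: filter captions by length,
extract a VGG19 fc2 feature per image, encode captions as padded index
sequences, split off a validation set, and pickle the results.
"""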
import numpy as np
import json
import os
import random

from tqdm import tqdm
from keras.preprocessing import image as Image
from keras.preprocessing import text as Text
from keras.applications.vgg19 import VGG19, preprocess_input
from keras.models import Model

import utils


def feature_extractor(model_type=None):
    """Build a feature-extraction model; currently only VGG19 (fc2 output) is supported."""
    if model_type == 'VGG19':
        base_model = VGG19(weights='imagenet')
        model = Model(inputs=base_model.inputs, outputs=base_model.get_layer('fc2').output)
    else:
        raise ValueError('unsupported model type: %s' % model_type)
    return model


def feature_extraction(model, image_path=None):
    assert isinstance(image_path, str), 'image_path must be a string'
    img = Image.load_img(image_path, target_size=(224, 224))
    img = Image.img_to_array(img)
    img = np.expand_dims(img, axis=0)
    img = preprocess_input(img)  # apply VGG19's ImageNet preprocessing
    features = model.predict(img)
    return features
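

# A minimal usage sketch (the image path below is hypothetical):
#   model = feature_extractor('VGG19')
#   feats = feature_extraction(model, 'coco_image/train2017/example.jpg')
#   feats.shape  # -> (1, 4096): the output of VGG19's fc2 layer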


def _process_caption_data(caption_file, image_dir, feature_model, max_length=15):
    with open(caption_file) as f:
        caption_data = json.load(f)

    # id_to_filename is a dictionary such as {image_id: filename}
    id_to_filename = {image['id']: image['file_name'] for image in caption_data['images']}

    # keep only captions no longer than max_length words
    caption_data['annotations'] = [annotation for annotation in caption_data['annotations']
                                   if len(Text.text_to_word_sequence(annotation['caption'])) <= max_length]

    caption_list = [annotation['caption'] for annotation in caption_data['annotations']]
    tokenizer, vocab = _create_tokenizer_and_vocab(caption_list)

    # dataset maps filename -> info, where info is a dict with two keys:
    #   'caption': ndarray of shape [num_captions, max_length + 2]
    #   'feature': ndarray of shape [1, 4096]
    print('building dataset ....')
    train_dataset = build_dataset(caption_data['annotations'], image_dir, feature_model, id_to_filename,
                                  vocab, tokenizer, max_length)

    # hold out a random 20% of the images as a validation set
    total_data_len = len(train_dataset)
    val_len = int(total_data_len * 0.2)
    print('building validation set ....')
    val_dataset = dict()
    for key in tqdm(random.sample(list(train_dataset.keys()), val_len)):
        val_dataset[key] = train_dataset[key]
        del train_dataset[key]
    print('training set num = %d' % len(train_dataset))
    print('validation set num = %d' % len(val_dataset))
    return train_dataset, val_dataset, vocab


def build_dataset(data_annotations, image_dir, feature_model, id_to_filename, vocab, tokenizer, max_length):
    dataset = dict()
    for annotation in tqdm(data_annotations):
        image_id = annotation['image_id']
        filename = id_to_filename[image_id]
        if filename not in dataset:
            # first caption for this image: extract the feature once
            this_file = dict()
            this_file['feature'] = feature_extraction(feature_model, os.path.join(image_dir, filename))
            this_file['caption'] = text_to_vec(annotation['caption'], tokenizer, vocab, max_length)
            dataset[filename] = this_file
        else:
            # further captions for the same image are stacked row-wise
            text_vec = text_to_vec(annotation['caption'], tokenizer, vocab, max_length)
            dataset[filename]['caption'] = np.vstack([dataset[filename]['caption'], text_vec])
    return dataset
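

# Illustration of the resulting structure (hedged sketch; the filename is
# hypothetical, shapes follow the code above with max_length=15):
#   dataset = {
#       '000000000001.jpg': {
#           'feature': ndarray of shape (1, 4096),   # VGG19 fc2 output
#           'caption': ndarray of shape (num_captions, 17),
#       },
#       ...
#   }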


def text_to_vec(text, tokenizer, vocab, max_length):
    """Encode a caption as <START> + word indices + <END>, padded with <NULL> to max_length + 2."""
    caption_idx = tokenizer.texts_to_sequences([text])
    text_vec = list()
    text_vec.append(vocab['<START>'])
    text_vec.extend(caption_idx[0])
    text_vec.append(vocab['<END>'])
    while len(text_vec) < max_length + 2:
        text_vec.append(vocab['<NULL>'])
    return np.reshape(text_vec, [1, -1])
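

# Encoding sketch (assuming max_length=15 and a vocab where 'a'->3, 'dog'->4,
# 'runs'->5; actual indices depend on the fitted tokenizer):
#   text_to_vec('a dog runs', tokenizer, vocab, 15)
#   -> array([[1, 3, 4, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])  # shape (1, 17)
# where 1 = <START>, 2 = <END>, 0 = <NULL> padding.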


def _create_tokenizer_and_vocab(texts):
    """
    :param texts: list of caption strings
    :return: the fitted tokenizer and a vocabulary dict
    Word indices are shifted by 2 so that 0, 1, 2 are free for the special tokens.
    Use tokenizer.word_index to see the (shifted) vocabulary.
    """
    tokenizer = Text.Tokenizer()
    tokenizer.fit_on_texts(texts)
    # shift every word index up by 2 to reserve 0, 1, 2 for the special tokens
    for key in tokenizer.word_index.keys():
        tokenizer.word_index[key] += 2
    vocab = {'<NULL>': 0, '<START>': 1, '<END>': 2}
    vocab.update(tokenizer.word_index)
    return tokenizer, vocab
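

# Example (hedged): fitting on ['a dog runs'] gives word_index {'a': 1, 'dog': 2,
# 'runs': 3} before the shift, so the final vocab would be
#   {'<NULL>': 0, '<START>': 1, '<END>': 2, 'a': 3, 'dog': 4, 'runs': 5}
# (exact word order follows Keras' frequency-then-insertion ordering).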


def main():
    # caption_list = collect_coco_cations_list()
    caption_file = '/media/VSlab3/fionakuo/CV_FINAL/coco/captions_train2017.json'
    image_dir = '/media/VSlab3/fionakuo/CV_FINAL/coco_image/train2017'
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    # with tf.Session(config=config) as sess:
    feature_model = feature_extractor('VGG19')
    train_dataset, val_dataset, vocab = _process_caption_data(caption_file, image_dir, feature_model)
    utils.save_pickle(train_dataset, '/media/VSlab3/fionakuo/CV_FINAL/data/train_dict.pkl')
    utils.save_pickle(val_dataset, '/media/VSlab3/fionakuo/CV_FINAL/data/val_dict.pkl')
    utils.save_pickle(vocab, '/media/VSlab3/fionakuo/CV_FINAL/data/vocab.pkl')
"""
All data is ordered from annotation
from caption array to image file : annotations[row_idx][file_name]
feature array is the same
"""

if __name__ == '__main__':
    main()