-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_loader.py
107 lines (86 loc) · 3.68 KB
/
data_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import random
import utils
import numpy as np
import collections
import tensorflow as tf
from tqdm import tqdm
class Sampler:
    """Draws random minibatches of (noise, image feature, true caption,
    false caption) tuples for adversarial training.

    ``data`` is assumed to be a dict mapping image key ->
    ``{'feature': np.ndarray, 'caption': np.ndarray}`` where ``caption``
    is (n_captions, seq_len) — TODO confirm against the pickle producer.
    """
    def __init__(self, data, batch_size=128):
        self.data = data
        self.batch_size = batch_size
        print('Create Sampler...')

    def getbatch(self):
        """Return one minibatch.

        Returns:
            z: (batch_size, 1024) standard-normal noise.
            features: stacked image features for the sampled images.
            t_captions: one randomly chosen caption per image (true pair).
            f_captions: one caption from a DIFFERENT image (false pair).
        """
        random_data = random.sample(list(self.data), self.batch_size)
        z = np.random.normal(0, 1, [self.batch_size, 1024])
        features = list()
        t_captions = list()
        f_captions = list()
        for img in random_data:
            features.append(self.data[img]['feature'])
            cur_t_caption = self.data[img]['caption']
            # -1 for not using <END>
            sample_t_caption = cur_t_caption[np.random.choice(cur_t_caption.shape[0], 1, replace=False), :-1]
            t_captions.append(sample_t_caption)
            # Rejection-sample a different image for the false caption.
            # BUG FIX: the original compared a 1-element list against the key
            # (always unequal) and then read captions from `img` itself, so
            # the "false" caption came from the same image.
            while True:
                f_img = random.choice(list(self.data))
                if f_img != img:
                    cur_f_caption = self.data[f_img]['caption']
                    break
            sample_f_caption = cur_f_caption[np.random.choice(cur_f_caption.shape[0], 1, replace=False), :-1]
            f_captions.append(sample_f_caption)
        features = np.vstack(features)
        t_captions = np.vstack(t_captions)
        f_captions = np.vstack(f_captions)
        return z, features, t_captions, f_captions
class Pretrain_Loader:
    """Flattens the (image -> captions) dict into aligned feature/caption
    arrays and serves them through a tf.data pipeline (TF1-style).

    ``data`` maps key -> ``{'feature': np.ndarray, 'caption': np.ndarray}``
    where ``caption`` is (n_captions, seq_len) — presumably token ids;
    verify against the pickle producer.
    """
    def __init__(self, data, sess):
        self.data = data
        self.sess = sess
        self.features = list()
        self.captions = list()
        for key in tqdm(self.data.keys()):
            n_captions = self.data[key]['caption'].shape[0]
            # One feature row per caption row so the two arrays align 1:1.
            for _ in range(n_captions):
                self.features.append(self.data[key]['feature'])
            # BUG FIX: the original appended the FULL caption matrix once per
            # caption row (n^2 caption rows vs n feature rows), breaking the
            # feature/caption pairing. Append the matrix exactly once.
            self.captions.append(self.data[key]['caption'])
        self.features = np.vstack(self.features)
        self.captions = np.vstack(self.captions)
        n_example = self.captions.shape[0]
        index = np.arange(n_example)
        np.random.shuffle(index)
        # do random shuffle (same permutation keeps rows paired)
        self.features = self.features[index]
        self.captions = self.captions[index]
        # Use placeholder to prevent too large numpy array being baked
        # into the graph as a constant.
        self.features_tf = tf.placeholder(dtype=self.features.dtype, shape=self.features.shape)
        self.captions_tf = tf.placeholder(dtype=self.captions.dtype, shape=self.captions.shape)
        self.created_data = None

    def create(self, num_epochs=1, batch_size=128, shuffle=False):
        """Build the tf.data pipeline; results exposed via the properties below."""
        created_data = collections.namedtuple('Data', 'iterator, captions, features')
        dataset = tf.data.Dataset.from_tensor_slices((self.captions_tf, self.features_tf))
        if shuffle:
            dataset = dataset.shuffle(buffer_size=10000)
        dataset = dataset.repeat(num_epochs)
        dataset = dataset.batch(batch_size)
        iterator = dataset.make_initializable_iterator()
        next_captions, next_features = iterator.get_next()
        self.created_data = created_data(
            iterator=iterator,
            captions=next_captions,
            features=next_features
        )

    def initialize(self):
        """(Re)start iteration; must be called after create() and before
        evaluating batch_captions / batch_features."""
        if self.created_data is None:
            raise ValueError('Need create dataset first')
        self.sess.run(self.created_data.iterator.initializer, feed_dict={self.features_tf: self.features,
                                                                         self.captions_tf: self.captions})

    @property
    def batch_captions(self):
        # Tensor yielding the next caption batch.
        return self.created_data.captions

    @property
    def batch_features(self):
        # Tensor yielding the next feature batch.
        return self.created_data.features
def main():
    """Smoke-test: build a Pretrain_Loader from the pickled training dict."""
    train_data = utils.load_pickle('./data/data/train_dict.pkl')
    # BUG FIX: Pretrain_Loader.__init__ requires a session argument; the
    # original call raised TypeError immediately.
    sess = tf.Session()
    train_data_loader = Pretrain_Loader(train_data, sess)


if __name__ == '__main__':
    main()