create_data.py
"""
UPDATE:
This file is obsolete. Please do not use it.
The pre-processed datasets have already been provided in the dataset/ folder in the repo. Use them directly.
"""
import numpy as np, pandas as pd
from collections import defaultdict
import pickle
from sklearn import preprocessing

# Only referenced by the commented-out scaling lines below.
min_max_scaler = preprocessing.MinMaxScaler()
# Transcripts plus the train/test split files; the first column of each split
# file is a row index into transcripts.csv, whose first column is an utterance
# id of the form "<video>_<segment>".
pre_data = np.asarray(pd.read_csv("./data/transcripts.csv", header=None))
train = np.asarray(pd.read_csv("./data/text_train.csv", header=None))
test = np.asarray(pd.read_csv("./data/text_test.csv", header=None))
train_index = np.asarray(train[:, 0], dtype='int')
test_index = np.asarray(test[:, 0], dtype='int')
def main(name):
    path = "./data/" + name + "/" + name
    print path
    train_video_mapping = defaultdict(list)
    train_video_mapping_index = defaultdict(list)
    test_video_mapping = defaultdict(list)
    test_video_mapping_index = defaultdict(list)
    data_train = np.asarray(pd.read_csv(path + "_train0.csv", header=None))
    data_test = np.asarray(pd.read_csv(path + "_test0.csv", header=None))

    # Group utterances by video: "<video>_<segment>" splits into a video key
    # and the segment's position within that video.
    for i in xrange(train_index.shape[0]):
        train_video_mapping[pre_data[train_index[i]][0].rsplit("_", 1)[0]].append(train_index[i])
        train_video_mapping_index[pre_data[train_index[i]][0].rsplit("_", 1)[0]].append(int(pre_data[train_index[i]][0].rsplit("_", 1)[1]))
    for i in xrange(test_index.shape[0]):
        test_video_mapping[pre_data[test_index[i]][0].rsplit("_", 1)[0]].append(test_index[i])
        test_video_mapping_index[pre_data[test_index[i]][0].rsplit("_", 1)[0]].append(int(pre_data[test_index[i]][0].rsplit("_", 1)[1]))

    # Map a transcript row index to its row position in the feature csv.
    train_indices = dict((c, i) for i, c in enumerate(train_index))
    test_indices = dict((c, i) for i, c in enumerate(test_index))

    # Longest video (in utterances) across train and test; shorter videos are
    # padded up to this length.
    max_len = 0
    for key, value in train_video_mapping.iteritems():
        max_len = max(max_len, len(value))
    for key, value in test_video_mapping.iteritems():
        max_len = max(max_len, len(value))

    # Zero feature vector used for padding.
    pad = np.asarray([0 for i in xrange(data_train[0][:-1].shape[0])])

    print "Mapping train"
    train_data_X = []
    train_data_Y = []
    train_length = []
    for key, value in train_video_mapping.iteritems():
        # Order this video's utterances by segment position.
        lst = np.column_stack((train_video_mapping_index[key], value))
        ind = np.asarray(sorted(lst, key=lambda x: x[0]))
        lst_X, lst_Y = [], []
        ctr = 0
        for i in xrange(ind.shape[0]):
            ctr += 1
            #lst_X.append(preprocessing.scale(min_max_scaler.fit_transform(data_train[train_indices[ind[i, 1]]][:-1])))
            lst_X.append(data_train[train_indices[ind[i, 1]]][:-1])  # features
            lst_Y.append(data_train[train_indices[ind[i, 1]]][-1])   # label (last column)
        train_length.append(ctr)
        for i in xrange(ctr, max_len):
            lst_X.append(pad)
            lst_Y.append(0)  # dummy label
        train_data_X.append(lst_X)
        train_data_Y.append(lst_Y)

    test_data_X = []
    test_data_Y = []
    test_length = []
    print "Mapping test"
    for key, value in test_video_mapping.iteritems():
        lst = np.column_stack((test_video_mapping_index[key], value))
        ind = np.asarray(sorted(lst, key=lambda x: x[0]))
        lst_X, lst_Y = [], []
        ctr = 0
        for i in xrange(ind.shape[0]):
            ctr += 1
            #lst_X.append(preprocessing.scale(min_max_scaler.transform(data_test[test_indices[ind[i, 1]]][:-1])))
            lst_X.append(data_test[test_indices[ind[i, 1]]][:-1])
            lst_Y.append(data_test[test_indices[ind[i, 1]]][-1])
        test_length.append(ctr)
        for i in xrange(ctr, max_len):
            lst_X.append(pad)
            lst_Y.append(0)  # dummy label
        test_data_X.append(np.asarray(lst_X))
        test_data_Y.append(np.asarray(lst_Y))

    train_data_X = np.asarray(train_data_X)
    test_data_X = np.asarray(test_data_X)
    print train_data_X.shape, test_data_X.shape, len(train_length), len(test_length)

    print "Dumping data"
    with open('./input/' + name + '.pickle', 'wb') as handle:
        pickle.dump((train_data_X, np.asarray(train_data_Y), test_data_X, np.asarray(test_data_Y), max_len, train_length, test_length), handle, protocol=pickle.HIGHEST_PROTOCOL)


if __name__ == "__main__":
    names = ['text', 'audio', 'video']
    for nm in names:
        main(nm)
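
# A minimal usage sketch (not part of the original script): reads back the
# tuple that main() dumps above. It assumes main() has already been run so
# that ./input/<name>.pickle exists; the function is illustrative only and is
# never called here.
def _load_preprocessed(name):
    with open('./input/' + name + '.pickle', 'rb') as handle:
        (train_data_X, train_data_Y, test_data_X, test_data_Y,
         max_len, train_length, test_length) = pickle.load(handle)
    # train_data_X / test_data_X have shape (num_videos, max_len, feature_dim);
    # train_length / test_length hold the unpadded utterance count per video.
    return (train_data_X, train_data_Y, test_data_X, test_data_Y,
            max_len, train_length, test_length)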