# coding: utf-8
# init.py: triple loading and batch generation utilities for TransNet.
import numpy as np
import random

# Global dataset statistics, filled in by getTriples().
tripleTotal = 1
entityTotal = 1
tagTotal = 1

# Get the global values: tripleTotal, entityTotal, tagTotal.
def getGlobalValues():
    return tripleTotal, entityTotal, tagTotal
# Load triples from file. The first line holds the three totals; every
# following line is a head id, a tail id, and one or more tag ids.
def getTriples(path):
    headList = []
    tailList = []
    relationList = []
    headSet = []
    tailSet = []
    global tripleTotal, entityTotal, tagTotal
    with open(path, "r") as f:
        content = f.readline()
        tripleTotal, entityTotal, tagTotal = [int(i) for i in content.strip().split()]
        for i in range(entityTotal):
            headSet.append(set())
            tailSet.append(set())
        for content in f:
            values = [int(i) for i in content.strip().split()]
            headList.append(values[0])
            tailList.append(values[1])
            headSet[values[0]].add(values[1])
            tailSet[values[1]].add(values[0])
            relationList.append(values[2:])
    return headList, tailList, relationList, headSet, tailSet
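
# The expected file layout below is an assumption inferred from the parsing
# code above, not documented in the repository: the first line gives the
# three totals, and each remaining line is "head tail tag1 [tag2 ...]".
#
#   3 5 4          <- tripleTotal entityTotal tagTotal
#   0 1 2 3        <- head=0, tail=1, tags=[2, 3]
#   1 2 0
#   3 4 1 2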
# Generate TransNet training batches: for each positive triple, corrupt
# the head, the tail, and the tag set to build three negative samples.
def batch_iter(headList, tailList, relationList, headSet, tailSet, batch_size, aeBeta):
    data_size = len(headList)
    entity_size = entityTotal
    tag_size = tagTotal
    # Shuffle the data at each epoch.
    shuffle_indices = np.random.permutation(np.arange(data_size))
    start_index = 0
    end_index = min(start_index + batch_size, data_size)
    batch_id = 0
    while start_index < data_size:
        pos_h, pos_t, pos_r, pos_b = [], [], [], []
        neg_h, neg_t, neg_r, neg_b = [], [], [], []
        for i in range(start_index, end_index):
            cur_h = headList[shuffle_indices[i]]
            cur_t = tailList[shuffle_indices[i]]
            cur_r = relationList[shuffle_indices[i]]
            set_r = set(cur_r)
            # Multi-hot tag vector; b reweights the active tags by aeBeta.
            r_one_hot = np.zeros(tag_size, dtype=float)
            b = np.ones(tag_size, dtype=float)
            r_one_hot[cur_r] = 1.0
            b[cur_r] = aeBeta
            # Replace head.
            pos_h.append(cur_h)
            pos_t.append(cur_t)
            pos_r.append(r_one_hot)
            pos_b.append(b)
            rand_h = random.randint(0, entity_size - 1)
            while rand_h in tailSet[cur_t]:
                rand_h = random.randint(0, entity_size - 1)
            neg_h.append(rand_h)
            neg_t.append(cur_t)
            neg_r.append(r_one_hot)
            neg_b.append(b)
            # Replace tail.
            pos_h.append(cur_h)
            pos_t.append(cur_t)
            pos_r.append(r_one_hot)
            pos_b.append(b)
            rand_t = random.randint(0, entity_size - 1)
            while rand_t in headSet[cur_h]:
                rand_t = random.randint(0, entity_size - 1)
            neg_h.append(cur_h)
            neg_t.append(rand_t)
            neg_r.append(r_one_hot)
            neg_b.append(b)
            # Replace relation: sample a disjoint tag set of the same size.
            pos_h.append(cur_h)
            pos_t.append(cur_t)
            pos_r.append(r_one_hot)
            pos_b.append(b)
            rand_set_r = set()
            rand_r = random.randint(0, tag_size - 1)
            len_r = len(cur_r)
            while len(rand_set_r) < len_r and len(rand_set_r) + len_r < tag_size:
                if (rand_r not in set_r) and (rand_r not in rand_set_r):
                    rand_set_r.add(rand_r)
                rand_r = random.randint(0, tag_size - 1)
            rand_cur_r = list(rand_set_r)
            rand_r_one_hot = np.zeros(tag_size, dtype=float)
            rand_r_one_hot[rand_cur_r] = 1.0
            rand_b = np.ones(tag_size, dtype=float)
            rand_b[rand_cur_r] = aeBeta
            neg_h.append(cur_h)
            neg_t.append(cur_t)
            neg_r.append(rand_r_one_hot)
            neg_b.append(rand_b)
        batch_id += 1
        yield (np.array(pos_h), np.array(pos_t), np.array(pos_r), np.array(pos_b),
               np.array(neg_h), np.array(neg_t), np.array(neg_r), np.array(neg_b))
        start_index = end_index
        end_index = min(start_index + batch_size, data_size)
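
# A minimal training-loop sketch (hypothetical file path and hyperparameters;
# the consumer of these arrays lives in the model code, not in this module):
#
#   heads, tails, rels, hSet, tSet = getTriples("data/train.txt")
#   for pos_h, pos_t, pos_r, pos_b, neg_h, neg_t, neg_r, neg_b in batch_iter(
#           heads, tails, rels, hSet, tSet, batch_size=128, aeBeta=20.0):
#       pass  # feed one gradient step with the positive/negative batch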
# Generate evaluation batches (positives only, in file order).
def batch_test(headList, tailList, relationList, batch_size):
    data_size = len(headList)
    tag_size = tagTotal
    start_index = 0
    end_index = min(start_index + batch_size, data_size)
    batch_id = 0
    while start_index < data_size:
        pos_h = []
        pos_t = []
        pos_r = []
        for i in range(start_index, end_index):
            cur_h = headList[i]
            cur_t = tailList[i]
            cur_r = relationList[i]
            # Multi-hot tag vector for the positive triple.
            r_one_hot = np.zeros(tag_size, dtype=float)
            r_one_hot[cur_r] = 1.0
            pos_h.append(cur_h)
            pos_t.append(cur_t)
            pos_r.append(r_one_hot)
        batch_id += 1
        yield pos_h, pos_t, pos_r
        start_index = end_index
        end_index = min(start_index + batch_size, data_size)
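
# Evaluation sketch (hypothetical path; batch_test yields positives only,
# so ranking against corrupted candidates is done by the caller):
#
#   heads, tails, rels, _, _ = getTriples("data/test.txt")
#   for pos_h, pos_t, pos_r in batch_test(heads, tails, rels, batch_size=128):
#       pass  # score each triple and accumulate ranking metrics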
# Generate relation autoencoder warm-up batches.
def batch_autoencoder(vecList, vec_size, batch_size, aeBeta):
    data_size = len(vecList)
    shuffle_indices = np.random.permutation(np.arange(data_size))
    start_index = 0
    end_index = min(start_index + batch_size, data_size)
    batch_id = 0
    while start_index < data_size:
        vecs = []
        bs = []
        for i in range(start_index, end_index):
            vec_index = vecList[shuffle_indices[i]]
            vec = np.zeros(vec_size, dtype=float)
            b = np.ones(vec_size, dtype=float)
            vec[vec_index] = 1.0
            b[vec_index] = aeBeta
            vecs.append(vec)
            bs.append(b)
        batch_id += 1
        yield np.array(vecs), np.array(bs)
        start_index = end_index
        end_index = min(start_index + batch_size, data_size)
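
# Warm-up sketch (hypothetical values): one plausible input, judging by the
# shapes, is the relationList returned by getTriples, with vec_size set to
# the tag vocabulary size tagTotal.
#
#   _, _, rels, _, _ = getTriples("data/train.txt")
#   for vecs, bs in batch_autoencoder(rels, tagTotal, batch_size=128, aeBeta=20.0):
#       pass  # pre-train the autoencoder on multi-hot tag vectors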