# data_retriever.py
import torch
import json
from torch.utils.data import DataLoader, Dataset
import os
import numpy as np
import faiss
from utils import sample_range_excluding
import random
from preprocess_data import normalize_string
# for embedding entities during inference
class EntitySet(Dataset):
    def __init__(self, entities):
        self.entities = entities

    def __len__(self):
        return len(self.entities)

    def __getitem__(self, index):
        entity = self.entities[index]
        entity_token_ids = torch.tensor(entity['text_ids']).long()
        entity_masks = torch.tensor(entity['text_masks']).long()
        return entity_token_ids, entity_masks
# For embedding all the mentions during inference
class MentionSet(Dataset):
    def __init__(self, mentions, max_len, tokenizer,
                 add_topic=True, use_title=False):
        self.mentions = mentions
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.add_topic = add_topic
        self.use_title = use_title
        # [2] is token id of '[unused1]' for bert tokenizer
        self.TT = [2]

    def __len__(self):
        return len(self.mentions)

    def __getitem__(self, index):
        mention = self.mentions[index]
        if self.add_topic:
            title = mention['title'] if self.use_title else mention['topic']
            title_ids = self.TT + title
        else:
            title_ids = []
        # CLS + mention ids + TT + title ids
        mention_title_ids = mention['text'] + title_ids
        mention_ids = (mention_title_ids + [self.tokenizer.pad_token_id] * (
            self.max_len - len(mention_title_ids)))[:self.max_len]
        mention_masks = ([1] * len(mention_title_ids) + [0] * (
            self.max_len - len(mention_title_ids)))[:self.max_len]
        mention_token_ids = torch.tensor(mention_ids).long()
        mention_masks = torch.tensor(mention_masks).long()
        return mention_token_ids, mention_masks
def get_labels(samples, all_entity_map):
    # map each sample's gold entity titles to entity indices via all_entity_map
    labels = []
    for sample in samples:
        entities = sample['entities']
        label_list = [all_entity_map[normalize_string(e)] for e in entities
                      if e in all_entity_map]
        labels.append(label_list)
    labels = np.array(labels)
    return labels
def get_group_indices(samples):
    # group passage indices by the document they come from
    doc_ids = np.unique([s['doc_id'] for s in samples])
    group_indices = {k: [] for k in doc_ids}
    for i, s in enumerate(samples):
        doc_id = s['doc_id']
        group_indices[doc_id].append(i)
    return list(group_indices.values())
def get_entity_map(entities):
    # map from entity title to its index in the entity list
    entity_map = {}
    for i, e in enumerate(entities):
        entity_map[e['title']] = i
    assert len(entity_map) == len(entities)
    return entity_map
class RetrievalSet(Dataset):
    def __init__(self, mentions, entities, labels, max_len,
                 tokenizer, candidates,
                 num_cands, rands_ratio, type_loss,
                 add_topic=True, use_title=False):
        self.mentions = mentions
        self.candidates = candidates
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.labels = labels
        self.num_cands = num_cands
        self.rands_ratio = rands_ratio
        self.all_entity_token_ids = np.array([e['text_ids'] for e in entities])
        self.all_entity_masks = np.array([e['text_masks'] for e in entities])
        self.entities = entities
        self.type_loss = type_loss
        self.add_topic = add_topic
        self.use_title = use_title
        # [2] is the token id of '[unused1]' for the BERT tokenizer
        self.TT = [2]

    def __len__(self):
        return len(self.mentions)

    def __getitem__(self, index):
        """
        :param index: index of the mention
        :return: mention_token_ids, mention_masks: L
                 candidate_token_ids, candidate_masks: num_cands x L
                 passage_labels: num_cands (1 for gold entities, 0 for negatives)
        """
        # process mention
        mention = self.mentions[index]
        if self.add_topic:
            title = mention['title'] if self.use_title else mention['topic']
            title_ids = self.TT + title
        else:
            title_ids = []
        # CLS + mention ids + TT + title ids
        mention_title_ids = mention['text'] + title_ids
        mention_ids = mention_title_ids + [self.tokenizer.pad_token_id] * (
            self.max_len - len(mention_title_ids))
        mention_masks = [1] * len(mention_title_ids) + [0] * (
            self.max_len - len(mention_title_ids))
        mention_token_ids = torch.tensor(mention_ids[:self.max_len]).long()
        mention_masks = torch.tensor(mention_masks[:self.max_len]).long()
        # process entity candidates: gold labels + random negatives
        # + hard negatives mined from self.candidates
        cand_ids = []
        labels = self.labels[index]
        # dummy label if there is no gold entity for the given passage
        if len(labels) == 0:
            labels = [-1]
        else:
            labels = list(set(labels))
        cand_ids += labels
        num_pos = len(labels)
        num_neg = self.num_cands - num_pos
        assert num_neg >= 0
        num_rands = int(self.rands_ratio * num_neg)
        num_hards = num_neg - num_rands
        # random negatives: sampled outside the gold labels and hard candidates
        rand_cands = sample_range_excluding(len(self.entities), num_rands,
                                            set(labels).union(
                                                set(self.candidates[index])))
        cand_ids += rand_cands
        # hard negatives: mined candidates that are not gold labels
        if self.candidates is not None:
            hard_negs = random.sample(
                list(set(self.candidates[index]) - set(labels)), num_hards)
            cand_ids += hard_negs
        passage_labels = torch.tensor([1] * num_pos + [0] * num_neg).long()
        candidate_token_ids = self.all_entity_token_ids[cand_ids].tolist()
        candidate_masks = self.all_entity_masks[cand_ids].tolist()
        assert passage_labels.size(0) == self.num_cands
        candidate_token_ids = torch.tensor(candidate_token_ids).long()
        assert candidate_token_ids.size(0) == self.num_cands
        candidate_masks = torch.tensor(candidate_masks).long()
        return mention_token_ids, mention_masks, candidate_token_ids, \
               candidate_masks, passage_labels
def load_data(data_dir, kb_dir):
    """
    :param data_dir: directory with the tokenized AIDA mention files
    :param kb_dir: directory with the KILT entity file
    :return: samples_train, samples_val, samples_test, entities
    """
    print('begin loading data')

    def load_mentions(part):
        with open(os.path.join(data_dir, 'tokenized_aida_%s.json' % part)) as f:
            mentions = json.load(f)
        return mentions

    samples_train = load_mentions('train')
    samples_val = load_mentions('val')
    samples_test = load_mentions('test')

    def load_entities():
        entities = []
        with open(os.path.join(kb_dir, 'entities_kilt.json')) as f:
            for line in f:
                entities.append(json.loads(line))
        return entities

    entities = load_entities()
    return samples_train, samples_val, samples_test, entities
def get_embeddings(loader, model, is_mention, device):
    # embed all mentions or all entities with the given model
    model.eval()
    embeddings = []
    with torch.no_grad():
        for i, batch in enumerate(loader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_masks = batch
            k1, k2 = ('mention_token_ids', 'mention_masks') if is_mention \
                else ('entity_token_ids', 'entity_masks')
            kwargs = {k1: input_ids, k2: input_masks}
            # model outputs: index 0 holds mention embeddings,
            # index 2 holds entity embeddings
            j = 0 if is_mention else 2
            embed = model(**kwargs)[j].detach()
            embeddings.append(embed.cpu().numpy())
    embeddings = np.concatenate(embeddings, axis=0)
    model.train()
    return embeddings
def get_hard_negative(mention_embeddings, all_entity_embeds, k,
                      max_num_postives, use_gpu_index=False):
    # retrieve the top (k + max_num_postives) entities per mention by inner
    # product; FAISS flat indexes expect float32 inputs
    index = faiss.IndexFlatIP(all_entity_embeds.shape[1])
    if use_gpu_index:
        index = faiss.index_cpu_to_all_gpus(index)
    index.add(all_entity_embeds)
    scores, hard_indices = index.search(mention_embeddings,
                                        k + max_num_postives)
    del mention_embeddings
    del index
    return hard_indices, scores
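# A minimal sketch of how the two helpers above are typically chained (the
# variable names below are illustrative and not defined in this file):
#   men_embeds = get_embeddings(mention_loader, model, True, device)
#   ent_embeds = get_embeddings(entity_loader, model, False, device)
#   candidates, scores = get_hard_negative(men_embeds, ent_embeds, k, num_pos)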
def make_single_loader(data_set, bsz, shuffle):
    loader = DataLoader(data_set, bsz, shuffle=shuffle)
    return loader
def get_loader_from_candidates(samples, entities, labels, max_len,
                               tokenizer, candidates,
                               num_cands, rands_ratio, type_loss,
                               add_topic, use_title, shuffle, bsz):
    data_set = RetrievalSet(samples, entities, labels,
                            max_len, tokenizer, candidates,
                            num_cands, rands_ratio, type_loss, add_topic,
                            use_title)
    loader = make_single_loader(data_set, bsz, shuffle)
    return loader
def get_loaders(samples_train, samples_val, samples_test, entities, max_len,
                tokenizer, mention_bsz, entity_bsz, add_topic, use_title):
    # get all mention and entity dataloaders
    train_mention_set = MentionSet(samples_train, max_len, tokenizer,
                                   add_topic, use_title)
    val_mention_set = MentionSet(samples_val, max_len, tokenizer, add_topic,
                                 use_title)
    test_mention_set = MentionSet(samples_test, max_len, tokenizer, add_topic,
                                  use_title)
    entity_set = EntitySet(entities)
    entity_loader = make_single_loader(entity_set, entity_bsz, False)
    train_men_loader = make_single_loader(train_mention_set, mention_bsz,
                                          False)
    val_men_loader = make_single_loader(val_mention_set, mention_bsz, False)
    test_men_loader = make_single_loader(test_mention_set, mention_bsz, False)
    return train_men_loader, val_men_loader, test_men_loader, entity_loader
def save_candidates(mentions, candidates, entity_map, labels, out_dir, part):
    # save results for reader training
    assert len(mentions) == len(candidates)
    labels = labels.tolist()
    out_path = os.path.join(out_dir, '%s.json' % part)
    entity_titles = np.array(list(entity_map.keys()))
    fout = open(out_path, 'w')
    for i in range(len(mentions)):
        mention = mentions[i]
        m_candidates = candidates[i].tolist()
        m_spans = [[s[0], s[1] - 1] for s in mention['spans']]
        assert len(mention['entities']) == len(mention['spans'])
        ent_span_dict = {k: [] for k in mention['entities']}
        for j, l in enumerate(mention['entities']):
            ent_span_dict[l].append(m_spans[j])
        if part == 'train':
            positives = [c for c in m_candidates if c in labels[i]]
            negatives = [c for c in m_candidates if c not in labels[i]]
            pos_titles = entity_titles[positives].tolist()
            pos_spans = [ent_span_dict[p] for p in pos_titles]
            gold_ids = list(set(labels[i]))
            gold_titles = entity_titles[gold_ids].tolist()
            gold_spans = [ent_span_dict[g] for g in gold_titles]
            neg_spans = [[[0, 0]]] * len(negatives)
            item = {'doc_id': mention['doc_id'],
                    'mention_idx': i,
                    'mention_ids': mention['text'],
                    'positives': positives,
                    'negatives': negatives,
                    'labels': mention['entities'],
                    'label_spans': m_spans,
                    'gold_ids': gold_ids,
                    'gold_spans': gold_spans,
                    'pos_spans': pos_spans,
                    'neg_spans': neg_spans,
                    'offset': mention['offset'],
                    'title': mention['title'],
                    'topic': mention['topic'],
                    'passage_labels': [1] * len(positives) + [0] * len(
                        negatives)
                    }
        else:
            candidate_titles = entity_titles[m_candidates]
            candidate_spans = [ent_span_dict[s] if s in ent_span_dict else
                               [[0, 0]] for s in candidate_titles]
            passage_labels = [1 if c in mention['entities'] else 0 for c in
                              candidate_titles]
            item = {'doc_id': mention['doc_id'],
                    'mention_idx': i,
                    'candidates': m_candidates,
                    'title': mention['title'],
                    'topic': mention['topic'],
                    'mention_ids': mention['text'],
                    'labels': mention['entities'],
                    'label_spans': m_spans,
                    'label_ids': labels[i],
                    'offset': mention['offset'],
                    'candidate_spans': candidate_spans,
                    'passage_labels': passage_labels
                    }
        fout.write('%s\n' % json.dumps(item))
    fout.close()
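

# A minimal, self-contained smoke test of the pure helpers above. The toy
# dicts and random embeddings are made-up examples (not real AIDA/KILT data)
# and only illustrate the expected input/output shapes.
if __name__ == '__main__':
    toy_entities = [{'title': 'Paris'}, {'title': 'Rome'}]
    print(get_entity_map(toy_entities))  # {'Paris': 0, 'Rome': 1}

    toy_samples = [{'doc_id': 0}, {'doc_id': 1}, {'doc_id': 0}]
    print(get_group_indices(toy_samples))  # [[0, 2], [1]]

    # FAISS flat indexes operate on float32 matrices; shapes here are
    # (num_entities, dim) and (num_mentions, dim)
    rng = np.random.RandomState(0)
    ent_embeds = rng.rand(8, 4).astype('float32')
    men_embeds = rng.rand(2, 4).astype('float32')
    hard_indices, scores = get_hard_negative(men_embeds, ent_embeds, k=3,
                                             max_num_postives=1)
    print(hard_indices.shape)  # (2, 4): top (k + max_num_postives) per mention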