-
Notifications
You must be signed in to change notification settings - Fork 106
/
similarity_metrics.py
259 lines (216 loc) · 9.19 KB
/
similarity_metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
from octis.evaluation_metrics.diversity_metrics import WordEmbeddingsInvertedRBO, \
WordEmbeddingsInvertedRBOCentroid, InvertedRBO
import numpy as np
from itertools import combinations
from scipy.spatial.distance import cosine
from octis.evaluation_metrics.metrics import AbstractMetric
from gensim.models import KeyedVectors
import gensim.downloader as api
class WordEmbeddingsRBOMatch(WordEmbeddingsInvertedRBO):
    def __init__(self, word2vec_path=None, binary=True, normalize=True, weight=0.9, topk=10):
        """
        Initialize metric WERBO-Match
        Parameters
        ----------
        :param topk: top k words on which the topic diversity will be computed
        :param word2vec_path: word embedding space in gensim word2vec format
        :param weight: Weight of each agreement at depth d. When set to 1.0, there is no weight, the rbo returns to
        average overlap. (Default 0.9)
        :param binary: If True, indicates whether the data is in binary word2vec format.
        :param normalize: if true, normalize the cosine similarity
        """
        super().__init__(
            word2vec_path=word2vec_path,
            binary=binary,
            normalize=normalize,
            weight=weight,
            topk=topk)

    def score(self, model_output):
        """
        Retrieves the score of the metric
        :return WERBO-M
        """
        # The parent computes an inverted (diversity-oriented) RBO value;
        # subtracting it from 1 flips it back into a similarity score.
        inverted_rbo = super().score(model_output)
        return 1 - inverted_rbo
class WordEmbeddingsRBOCentroid(WordEmbeddingsInvertedRBOCentroid):
    def __init__(self, word2vec_path=None, binary=True, normalize=True, weight=0.9, topk=10):
        """
        Initialize metric WERBO-Centroid
        Parameters
        ----------
        :param topk: top k words on which the topic diversity will be computed
        :param word2vec_path: word embedding space in gensim word2vec format
        :param weight: Weight of each agreement at depth d. When set to 1.0, there is no weight, the rbo returns to
        average overlap. (Default 0.9)
        :param binary: If True, indicates whether the data is in binary word2vec format.
        :param normalize: if true, normalize the cosine similarity
        """
        super().__init__(
            word2vec_path=word2vec_path,
            binary=binary,
            normalize=normalize,
            weight=weight,
            topk=topk)

    def score(self, model_output):
        """
        Retrieves the score of the metric
        :return WERBO-C
        """
        # The parent computes an inverted (diversity-oriented) RBO value;
        # subtracting it from 1 flips it back into a similarity score.
        inverted_rbo = super().score(model_output)
        return 1 - inverted_rbo
class WordEmbeddingsPairwiseSimilarity(AbstractMetric):
    def __init__(self, word2vec_path=None, topk=10, binary=False):
        """
        Initialize metric WE pairwise similarity
        Parameters
        ----------
        :param topk: top k words on which the topic similarity will be computed
        :param word2vec_path: word embedding space in gensim word2vec format;
            if None, the pretrained 'word2vec-google-news-300' space is downloaded
        :param binary: If True, indicates whether the data is in binary word2vec format.
        """
        super().__init__()
        if word2vec_path is None:
            self.wv = api.load('word2vec-google-news-300')
        else:
            self.wv = KeyedVectors.load_word2vec_format(word2vec_path, binary=binary)
        self.topk = topk

    def score(self, model_output):
        """
        Retrieves the score of the metric

        :param model_output: dict with a 'topics' entry (list of topic word lists)
        :return WEPS: average pairwise word-embedding similarity over topic pairs
        :raises Exception: if the topics have fewer than topk words
        """
        topics = model_output['topics']
        if self.topk > len(topics[0]):
            raise Exception('Words in topics are less than topk')
        else:
            # Membership test on the mapping itself; no need to build .keys().
            vocab = self.wv.key_to_index
            count = 0
            sum_sim = 0
            for list1, list2 in combinations(topics, 2):
                word_counts = 0
                sim = 0
                for word1 in list1[:self.topk]:
                    for word2 in list2[:self.topk]:
                        if word1 in vocab and word2 in vocab:
                            sim = sim + self.wv.similarity(word1, word2)
                            word_counts = word_counts + 1
                # BUG FIX: the original divided by word_counts unconditionally,
                # raising ZeroDivisionError when a topic pair shares no
                # in-vocabulary words. Such pairs are skipped entirely so they
                # do not bias the average.
                if word_counts > 0:
                    sum_sim = sum_sim + sim / word_counts
                    count = count + 1
            return sum_sim / count
class WordEmbeddingsCentroidSimilarity(AbstractMetric):
    def __init__(self, word2vec_path=None, topk=10, binary=False):
        """
        Initialize metric WE centroid similarity
        Parameters
        ----------
        :param topk: top k words on which the topic similarity will be computed
        :param word2vec_path: word embedding space in gensim word2vec format;
            if None, the pretrained 'word2vec-google-news-300' space is downloaded
        :param binary: If True, indicates whether the data is in binary word2vec format.
        """
        super().__init__()
        if word2vec_path is None:
            self.wv = api.load('word2vec-google-news-300')
        else:
            self.wv = KeyedVectors.load_word2vec_format(word2vec_path, binary=binary)
        self.topk = topk

    def score(self, model_output):
        """
        Retrieves the score of the metric

        :param model_output: dict with a 'topics' entry (list of topic word lists)
        :return WECS: average cosine similarity between topic centroid embeddings
        :raises Exception: if the topics have fewer than topk words
        """
        topics = model_output['topics']
        if self.topk > len(topics[0]):
            raise Exception('Words in topics are less than topk')
        else:
            # Membership test on the mapping itself; no need to build .keys().
            vocab = self.wv.key_to_index
            sim = 0
            count = 0
            for list1, list2 in combinations(topics, 2):
                centroid1 = np.zeros(self.wv.vector_size)
                centroid2 = np.zeros(self.wv.vector_size)
                count1, count2 = 0, 0
                for word1 in list1[:self.topk]:
                    if word1 in vocab:
                        centroid1 = centroid1 + self.wv[word1]
                        count1 += 1
                for word2 in list2[:self.topk]:
                    if word2 in vocab:
                        centroid2 = centroid2 + self.wv[word2]
                        count2 += 1
                # BUG FIX: the original divided by count1/count2 unconditionally,
                # raising ZeroDivisionError when a topic has no in-vocabulary
                # words. Pairs involving such a topic are skipped so they do
                # not bias the average.
                if count1 > 0 and count2 > 0:
                    centroid1 = centroid1 / count1
                    centroid2 = centroid2 / count2
                    # scipy's cosine() is a distance; 1 - distance = similarity.
                    sim = sim + (1 - cosine(centroid1, centroid2))
                    count += 1
            return sim / count
def get_word2index(list1, list2):
    """Build a word -> index mapping over the union of two word lists."""
    vocabulary = set(list1) | set(list2)
    return {word: index for index, word in enumerate(vocabulary)}
class WordEmbeddingsWeightedSumSimilarity(AbstractMetric):
    def __init__(self, id2word, word2vec_path=None, topk=10, binary=False):
        """
        Initialize metric WE Weighted Sum similarity
        :param id2word: dictionary mapping each id to the word of the vocabulary
        :param topk: top k words on which the topic similarity will be computed
        :param word2vec_path: word embedding space in gensim word2vec format;
            if None, the pretrained 'word2vec-google-news-300' space is downloaded
        :param binary: If True, indicates whether the data is in binary word2vec format.
        """
        super().__init__()
        if word2vec_path is None:
            self.wv = api.load('word2vec-google-news-300')
        else:
            self.wv = KeyedVectors.load_word2vec_format(word2vec_path, binary=binary)
        self.topk = topk
        self.id2word = id2word

    def _weighted_centroid(self, topic_word_dist):
        """Weighted average of word vectors; weights are topic-word probabilities."""
        centroid = np.zeros(self.wv.vector_size)
        weights = 0
        for word_id, w in enumerate(topic_word_dist):
            centroid = centroid + self.wv[self.id2word[word_id]] * w
            weights += w
        return centroid / weights

    def score(self, model_output):
        """
        Retrieves the score of the metric

        :param model_output: dict with a 'topic-word-distribution' entry
            (one probability row per topic)
        :return WESS: average pairwise score between weighted topic centroids
        """
        beta = model_output['topic-word-distribution']
        wess = 0
        count = 0
        for i, j in combinations(range(len(beta)), 2):
            centroid1 = self._weighted_centroid(beta[i])
            # BUG FIX: the original computed centroid2 from beta[i] as well,
            # comparing every topic with itself.
            centroid2 = self._weighted_centroid(beta[j])
            # NOTE(review): scipy's cosine() is a *distance* (1 - similarity).
            # Kept as in the original, but the returned value is therefore an
            # average distance, not a similarity — confirm intended semantics
            # against the other metrics in this file, which use 1 - cosine.
            wess += cosine(centroid1, centroid2)
            # BUG FIX: count was never incremented, so the original always
            # raised ZeroDivisionError on return.
            count += 1
        return wess / count
class RBO(InvertedRBO):
    def __init__(self, weight=0.9, topk=10):
        """
        Initialize metric Ranked-biased Overlap
        Parameters
        ----------
        :param topk: top k words on which the topic diversity will be computed
        :param weight: Weight of each agreement at depth d. When set to 1.0, there is no weight, the rbo returns to
        average overlap. (Default 0.9)
        """
        super().__init__(weight=weight, topk=topk)

    def score(self, model_output):
        """
        Retrieves the score of the metric
        :return RBO
        """
        # The parent computes an inverted (diversity-oriented) RBO value;
        # subtracting it from 1 flips it back into a similarity score.
        inverted_rbo = super().score(model_output)
        return 1 - inverted_rbo
class PairwiseJaccardSimilarity(AbstractMetric):
    def __init__(self, topk=10):
        """
        Initialize metric Pairwise Jaccard Similarity
        Parameters
        ----------
        :param topk: top k words on which the topic diversity will be computed
        """
        super().__init__()
        self.topk = topk

    def score(self, model_output):
        """
        Retrieves the score of the metric
        :return PJS
        """
        topics = model_output['topics']
        total = 0.0
        pairs = 0
        for first, second in combinations(topics, 2):
            top_a = first[:self.topk]
            top_b = second[:self.topk]
            shared = len(set(top_a).intersection(top_b))
            # |A union B| = |A| + |B| - |A intersection B|
            # (list lengths are used, matching the original's handling of
            # any duplicate words within a topic list)
            union_size = len(top_a) + len(top_b) - shared
            total += shared / union_size
            pairs += 1
        return total / pairs