-
Notifications
You must be signed in to change notification settings - Fork 0
/
vis_compare_corpora.py
59 lines (44 loc) · 2.13 KB
/
vis_compare_corpora.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import numpy as np
import argparse
import utilities as utils
from dim_reduction import pca_dim_reduction, tsne_dim_reduction
from plotting import plot_tsne
if __name__ == '__main__':
    ## Compare several corpora by projecting their precomputed sentence
    ## embeddings into 2-D (PCA -> t-SNE) and plotting them in one figure.
    parser = argparse.ArgumentParser()
    parser.add_argument('--corpora_dirs', dest='corpora_dirs',
                        default='/path/to/corpus_embeddings1,/path/to/corpus_embeddings2',
                        help='provide a list of corpus embeddings directories separated by comma')
    parser.add_argument('--corpora_names', dest='corpora_names',
                        default='corpus1,corpus2',
                        help='provide a list of corpus names separated by comma')
    parser.add_argument('--colors', dest='colors',
                        default='purple,gold,cyan,black',
                        help='provide a list of colors separated by comma')
    # type=int replaces the manual int() conversion; help text fixed
    # (the original was a copy-paste of the --corpora_dirs help).
    parser.add_argument('--cap_size', dest='cap_size', type=int, default=1000,
                        help='maximum number of data points sampled from each corpus')
    args = parser.parse_args()

    ## dataset embedding dirs
    corpora_dirs = args.corpora_dirs.split(',')
    corpora_names = args.corpora_names.split(',')
    colors = args.colors.split(',')
    ## limit the number of data points from each corpus
    cap_size = args.cap_size

    sentences = []
    embeddings = []
    ## Cumulative boundaries: boundaries[i] is the index one past the last
    ## sentence contributed by corpus i in the concatenated lists below.
    boundaries = []
    for source in corpora_dirs:
        sentences_embeddings = utils.load_precomputed_embeddings(source)
        capped = sentences_embeddings[:cap_size]
        # each entry is (sentence, embedding_wrapper); e[1][0] unwraps the
        # embedding vector — assumes loader returns it nested one level deep
        sentences.extend(e[0] for e in capped)
        embeddings.extend(e[1][0] for e in capped)
        boundaries.append(len(sentences))

    embeddings = np.array(embeddings).squeeze()
    ## PCA down to 50 dims first, then t-SNE to 2-D (standard practice to
    ## denoise and speed up t-SNE on high-dimensional embeddings)
    embeddings = pca_dim_reduction(embeddings, pca_dims=50)
    embeddings = tsne_dim_reduction(embeddings)

    ## create sentence labels based on the dataset where the embeddings come from
    classes = [0] * len(sentences)
    for corpus_idx in range(len(boundaries) - 1):
        for j in range(boundaries[corpus_idx], boundaries[corpus_idx + 1]):
            classes[j] = corpus_idx + 1

    ## plot t-SNE embeddings; one (name, color) legend entry per corpus
    legend_info = list(zip(corpora_names, colors))
    plot = plot_tsne(embeddings, sentences, classes, legend_info)
    plot.savefig('corpora_comparison.png', dpi=100)