-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathparser.py
executable file
·62 lines (45 loc) · 1.47 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env python
"""
Passes extraction output into `word2vec`
and prints results as JSON.
"""
from __future__ import absolute_import, unicode_literals
import json
import click
from numpy import array as np_array
import gensim
class LineGenerator(object):
    """Iterable over a JSON-lines sentence file.

    Every non-blank line of the wrapped handle is parsed as a JSON object
    and its ``'sentence'`` value is yielded wrapped in a numpy array.

    The object may be iterated more than once when the handle supports
    ``tell``/``seek``: each pass rewinds to the position the handle had when
    the generator was constructed.  Handles that cannot be rewound (pipes,
    stdin) are read from wherever they currently are.
    """

    def __init__(self, fh):
        """Wrap *fh*, an open text-mode file-like object."""
        self.fh = fh
        # Remember the starting offset so later passes can rewind to it;
        # None marks a stream that cannot report/restore its position.
        try:
            self._start = fh.tell()
        except (AttributeError, IOError, OSError, ValueError):
            self._start = None

    def __iter__(self):
        """Yield one numpy array per non-blank line.

        Raises:
            ValueError: if a non-blank line is not valid JSON.
            KeyError: if a parsed object has no ``'sentence'`` key.
        """
        if self._start is not None:
            # Restart from the beginning so repeated passes see every line
            # instead of an already-exhausted handle.
            self.fh.seek(self._start)
        # Iterate the handle directly: streams line by line rather than
        # loading the whole file into memory with readlines().
        for line in self.fh:
            if not line.strip():
                # Tolerate blank / trailing empty lines.
                continue
            yield np_array(json.loads(line)['sentence'])
def serialize_rankings(rankings):
    """Encode word2vec similarity output as a JSON array string.

    ``rankings`` is an iterable of two-item pairs.  Each pair becomes an
    object whose ``artist`` key holds the first item and whose ``rel`` key
    holds the second; the objects keep the order of the input.
    """
    payload = []
    for name, score in rankings:
        payload.append({'artist': name, 'rel': score})
    return json.dumps(payload)
@click.command()
@click.option('-i', 'input_file', type=click.File('r', encoding='utf-8'),
              required=True)
@click.option('-t', 'term', required=True)
@click.option('--min-count', type=click.INT, default=5)
@click.option('-w', 'workers', type=click.INT, default=4)
def cli(input_file, term, min_count, workers):
    """Train word2vec on a sentence file and print the terms most similar
    to the given term as JSON.

    The input file holds one JSON object per line, each with a 'sentence'
    key.  Exits with a non-zero status and a message on stderr when the
    term is not in the model's vocabulary.
    """
    # word2vec walks the corpus more than once (vocabulary scan, then
    # training), so materialize the sentences rather than handing it a
    # stream over the file handle that may be exhausted after one pass.
    sentences = list(LineGenerator(input_file))

    # Passing the corpus to the constructor builds the vocabulary AND trains
    # the vectors; build_vocab() on its own leaves the weights at their
    # random initial values, which makes the similarity ranking meaningless.
    model = gensim.models.Word2Vec(sentences=sentences, min_count=min_count,
                                   workers=workers)

    # Newer gensim releases expose similarity queries on `model.wv` only;
    # fall back to the model itself for releases that predate `wv`.
    vectors = getattr(model, 'wv', model)

    try:
        similar = vectors.most_similar(term)
    except KeyError as exc:
        # Raised when `term` is not in the trained vocabulary.
        # really wish this was a more descriptive error
        raise SystemExit('Could not parse input: {}'.format(exc))
    else:
        click.echo(serialize_rankings(similar))
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    cli()