-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
/
Copy pathword2vec_standalone.py
143 lines (124 loc) · 5.86 KB
/
word2vec_standalone.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html
"""
USAGE: %(program)s -train CORPUS -output VECTORS -size SIZE -window WINDOW
-cbow CBOW -sample SAMPLE -hs HS -negative NEGATIVE -threads THREADS -iter ITER
-min_count MIN-COUNT -alpha ALPHA -binary BINARY -accuracy FILE
Trains a neural embedding model on text file CORPUS.
Parameters essentially reproduce those used by the original C tool
(see https://code.google.com/archive/p/word2vec/).
Parameters for training:
-train <file>
Use text data from <file> to train the model
-output <file>
Use <file> to save the resulting word vectors / word clusters
-size <int>
Set size of word vectors; default is 100
-window <int>
Set max skip length between words; default is 5
-sample <float>
Set threshold for occurrence of words. Those that appear with higher frequency in the training data
will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)
-hs <int>
Use Hierarchical Softmax; default is 0 (not used)
-negative <int>
Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)
-threads <int>
Use <int> threads (default 3)
-iter <int>
Run more training iterations (default 5)
-min_count <int>
This will discard words that appear less than <int> times; default is 5
-alpha <float>
Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW
-binary <int>
Save the resulting vectors in binary moded; default is 0 (off)
-cbow <int>
Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)
-accuracy <file>
Compute accuracy of the resulting model analogical inference power on questions file <file>
See an example of questions file
at https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt
Example: python -m gensim.scripts.word2vec_standalone -train data.txt \
-output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3
"""
import logging
import os.path
import sys
import argparse
from numpy import seterr
from gensim.models.word2vec import Word2Vec, LineSentence # avoid referencing __main__ in pickle
logger = logging.getLogger(__name__)
if __name__ == "__main__":
logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
logger.info("running %s", " ".join(sys.argv))
seterr(all='raise') # don't ignore numpy errors
parser = argparse.ArgumentParser()
parser.add_argument("-train", help="Use text data from file TRAIN to train the model", required=True)
parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors")
parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5)
parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100)
parser.add_argument(
"-sample",
help="Set threshold for occurrence of words. "
"Those that appear with higher frequency in the training data will be randomly down-sampled; "
"default is 1e-3, useful range is (0, 1e-5)",
type=float, default=1e-3)
parser.add_argument(
"-hs", help="Use Hierarchical Softmax; default is 0 (not used)",
type=int, default=0, choices=[0, 1]
)
parser.add_argument(
"-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)",
type=int, default=5
)
parser.add_argument("-threads", help="Use THREADS threads (default 3)", type=int, default=3)
parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5)
parser.add_argument(
"-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5",
type=int, default=5
)
parser.add_argument(
"-alpha", help="Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW",
type=float
)
parser.add_argument(
"-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)",
type=int, default=1, choices=[0, 1]
)
parser.add_argument(
"-binary", help="Save the resulting vectors in binary mode; default is 0 (off)",
type=int, default=0, choices=[0, 1]
)
parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model")
args = parser.parse_args()
if args.cbow == 0:
skipgram = 1
if not args.alpha:
args.alpha = 0.025
else:
skipgram = 0
if not args.alpha:
args.alpha = 0.05
corpus = LineSentence(args.train)
model = Word2Vec(
corpus, vector_size=args.size, min_count=args.min_count, workers=args.threads,
window=args.window, sample=args.sample, alpha=args.alpha, sg=skipgram,
hs=args.hs, negative=args.negative, cbow_mean=1, epochs=args.iter,
)
if args.output:
outfile = args.output
model.wv.save_word2vec_format(outfile, binary=args.binary)
else:
outfile = args.train.split('.')[0]
model.save(outfile + '.model')
if args.binary == 1:
model.wv.save_word2vec_format(outfile + '.model.bin', binary=True)
else:
model.wv.save_word2vec_format(outfile + '.model.txt', binary=False)
if args.accuracy:
questions_file = args.accuracy
model.accuracy(questions_file)
logger.info("finished running %s", os.path.basename(sys.argv[0]))