From 7b72046156210ab24d47dc3dfaf74cacca4f9c02 Mon Sep 17 00:00:00 2001
From: garrafao
Date: Sat, 28 Dec 2019 00:15:47 +0100
Subject: [PATCH] merge sgns_vi.py and sgns_vi2.py, initialize on full model, join vocabulary

---
 README.md              | 60 ++++++++-----------
 alignment/sgns_vi.py   | 19 +-----
 alignment/sgns_vi2.py  | 131 -----------------------------------------
 scripts/run_SGNS_VI.sh | 2 +-
 4 files changed, 27 insertions(+), 185 deletions(-)
 delete mode 100644 alignment/sgns_vi2.py

diff --git a/README.md b/README.md
index 0a33f81..5e29d61 100644
--- a/README.md
+++ b/README.md
@@ -59,53 +59,41 @@ The scripts assume a corpus format of one sentence per line in UTF-8 encoded (op
 
 #### Semantic Representations
 
-|Name | Code | Type |
-| --- | --- | --- |
-| Count | `representations/count.py` | VSM |
-| PPMI | `representations/ppmi.py` | VSM |
-| SVD | `representations/svd.py` | VSM |
-| RI | `representations/ri.py` | VSM |
-| SGNS | `representations/sgns.py` | VSM |
-| SCAN | [repository](https://github.com/ColiLea/scan) | TPM |
+|Name | Code | Type | Comment |
+| --- | --- | --- | --- |
+| Count | `representations/count.py` | VSM | |
+| PPMI | `representations/ppmi.py` | VSM | |
+| SVD | `representations/svd.py` | VSM | |
+| RI | `representations/ri.py` | VSM | - use `-a` for good performance |
+| SGNS | `representations/sgns.py` | VSM | |
+| SCAN | [repository](https://github.com/ColiLea/scan) | TPM | - different corpus input format |
 
 Table: VSM=Vector Space Model, TPM=Topic Model
 
-Note that SCAN takes a slightly different corpus input format than the other models.
-
 #### Alignment
 
-|Name | Code | Applicability |
-| --- | --- | --- |
-| CI | `alignment/ci_align.py` | Count, PPMI |
-| SRV | `alignment/srv_align.py` | RI |
-| OP | `alignment/map_embeddings.py` | SVD, RI, SGNS |
-| VI | `alignment/sgns_vi.py` | SGNS |
-| WI | `alignment/wi.py` | Count, PPMI, SVD, RI, SGNS |
-
-The script `alignment/map_embeddings.py` is drawn from [VecMap](https://github.com/artetxem/vecmap), where you can find instructions how to use it. Find examples of how to obtain OP, OP- and OP+ under `scripts/`.
-
-For SRV, consider using the efficient and more powerful [TRIPY](https://github.com/Garrafao/TRIPY). Instead of WI, consider using the more advanced [Temporal Referencing](https://github.com/Garrafao/TemporalReferencing).
+|Name | Code | Applicability | Comment |
+| --- | --- | --- | --- |
+| CI | `alignment/ci_align.py` | Count, PPMI | |
+| SRV | `alignment/srv_align.py` | RI | - use `-a` for good performance<br>- consider using the efficient and more powerful [TRIPY](https://github.com/Garrafao/TRIPY) |
+| OP | `alignment/map_embeddings.py` | SVD, RI, SGNS | - drawn from [VecMap](https://github.com/artetxem/vecmap)<br>- for OP- and OP+ see `scripts/` |
+| VI | `alignment/sgns_vi.py` | SGNS | - updated 27/12/19 (see script for details) |
+| WI | `alignment/wi.py` | Count, PPMI, SVD, RI, SGNS | - consider using the more advanced [Temporal Referencing](https://github.com/Garrafao/TemporalReferencing) |
 
 #### Measures
 
-|Name | Code | Applicability |
-| --- | --- | --- |
-| CD | `measures/cd.py` | Count, PPMI, SVD, RI, SGNS |
-| LND | `measures/lnd.py` | Count, PPMI, SVD, RI, SGNS |
-| JSD | - | SCAN |
-| FD | `measures/freq.py` | from corpus |
-| TD | `measures/typs.py` |Count|
-| HD | `measures/entropy.py` | Count |
-
-FD, TD and HD need additional applications of `measures/diff.py` and optionally `measures/trsf.py`.
+|Name | Code | Applicability | Comment |
+| --- | --- | --- | --- |
+| CD | `measures/cd.py` | Count, PPMI, SVD, RI, SGNS | |
+| LND | `measures/lnd.py` | Count, PPMI, SVD, RI, SGNS | |
+| JSD | - | SCAN | |
+| FD | `measures/freq.py` | from corpus | - log-transform with `measures/trsf.py`<br>- get difference with `measures/diff.py` |
+| TD | `measures/typs.py` | Count | as above |
+| HD | `measures/entropy.py` | Count | as above |
 
 ### Parameter Settings
 
-For better performance, RI and SRV should be run with `-a` option, instead of specifying the seed number manually.
-
-Consider the application of column mean centering after L2-normalization to RI and SGNS embeddings before applying a change measure.
-
-Find more detailed notes on model performances and optimal parameter settings in [these papers](#bibtex).
+Find detailed notes on model performances and optimal parameter settings in [these papers](#bibtex).
 
 ### Evaluation
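As a quick orientation for the measures table above: CD boils down to the cosine distance between a target word's vectors in the two aligned spaces. A minimal sketch of that computation follows; file names are hypothetical, the word2vec text format and the gensim/scipy stack used elsewhere in this repository are assumed, and this is not the actual code of `measures/cd.py`:

```python
# Sketch: CD as cosine distance between a target's vectors in two
# aligned spaces (not the actual measures/cd.py implementation).
from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine

# hypothetical paths to aligned vector spaces for the two time periods
wv1 = KeyedVectors.load_word2vec_format('vectors_t1.w2v', binary=False)
wv2 = KeyedVectors.load_word2vec_format('vectors_t2.w2v', binary=False)

word = 'example'
if word in wv1.vocab and word in wv2.vocab:
    # scipy's cosine() returns the distance, i.e. 1 - cosine similarity
    print(word, cosine(wv1[word], wv2[word]))
```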
diff --git a/alignment/sgns_vi.py b/alignment/sgns_vi.py
index b5892dd..8ee9eda 100644
--- a/alignment/sgns_vi.py
+++ b/alignment/sgns_vi.py
@@ -21,19 +21,13 @@ def main():
     args = docopt("""Make comparable embedding vector spaces with Skip-Gram with
     Negative Sampling and Vector Initialization from corpus.
 
     Usage:
-        sgns_vi.py [-l] <modelPath> <corpDir> <outPath> <windowSize> <dim> <k> <t> <minCount> <itera>
+        sgns_vi.py [-l] <modelPath> <corpDir> <outPath>
 
     Arguments:
 
         <modelPath> = model for initialization
         <corpDir> = path to corpus directory with zipped files, each sentence in form 'year\tword1 word2 word3...'
         <outPath> = output path for vectors
-        <windowSize> = the linear distance of context words to consider in each direction
-        <dim> = dimensionality of embeddings
-        <k> = number of negative samples parameter (equivalent to shifting parameter for PPMI)
-        <t> = threshold for subsampling
-        <minCount> = number of occurrences for a word to be included in the vocabulary
-        <itera> = number of iterations
 
     Options:
         -l, --len normalize final vectors to unit length
 
@@ -48,7 +42,7 @@ def main():
     Differences:
     In the original version, for training on the second corpus, only the previously
     created Embedding Matrix was loaded into the new model, so the Context matrix
     was newly initialized with random values.
     In the updated version the whole model is reused for training on the second corpus,
     which includes the Embedding Matrix as well as the Context matrix.
-    Additionally, vocabulary
+    Additionally, the vocabularies
     of the two corpora are now unified; previously they were intersected.
 
     """)
 
@@ -56,15 +50,6 @@ def main():
     is_len = args['--len']
     modelPath = args['<modelPath>']
     corpDir = args['<corpDir>']
     outPath = args['<outPath>']
-    windowSize = int(args['<windowSize>'])
-    dim = int(args['<dim>'])
-    k = int(args['<k>'])
-    if args['<t>']=='None':
-        t = None
-    else:
-        t = float(args['<t>'])
-    minCount = int(args['<minCount>'])
-    itera = int(args['<itera>'])
 
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
     logging.info(__file__.upper())
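For illustration, the updated procedure described in the docstring above can be sketched in a few lines of gensim 3.x: `build_vocab(..., update=True)` keeps the loaded vocabulary and adds unseen words, i.e. it yields the union of the two vocabularies. This is a simplified sketch with hypothetical paths, not the exact code of `alignment/sgns_vi.py`:

```python
# Sketch of VI on the full model: embedding AND context matrices are reused.
import gensim
from gensim.models.word2vec import PathLineSentences

model = gensim.models.Word2Vec.load('model_t1.model')  # full model trained on corpus 1
sentences_t2 = PathLineSentences('corpus2/')

# update=True extends the existing vocabulary with new words from corpus 2,
# yielding the union of both vocabularies instead of their intersection
model.build_vocab(sentences_t2, update=True)
model.train(sentences_t2, total_examples=model.corpus_count, epochs=model.epochs)
model.wv.save_word2vec_format('vectors_t2.w2v')
```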
-""" - - -import gensim -from gensim.models.word2vec import PathLineSentences -import time -import logging -from docopt import docopt -import sys -sys.path.append('./modules/') - - -def intersection_dic(t1, t2): - voc_t1 = [x for xs in t1 for x in xs] - voc_t2 = [x for xs in t2 for x in xs] - intersection = list(set(voc_t1) & set(voc_t2)) - # note: gensim wants list of iterables (i.e. list of lists) - return [[x] for x in intersection] - - -def main(): - """ - Make comparable embedding vector spaces with Skip-Gram with - Negative Sampling as described in: - - Yoon Kim, Yi-I. Chiu, Kentaro Hanaki, Darshan Hegde, and - Slav Petrov. 2014. Temporal analysis of language through - neural language models. arXiv preprint arXiv:1405.3515. - - """ - - # Get the arguments - args = docopt("""Make comparable embedding vector spaces with Skip-Gram with - Negative Sampling and Vector Initialization from corpus. - - Usage: - sgns_vi2.py [-l] - - Arguments: - - = path to corpus directory with zipped files for first time step - = path to corpus directory with zipped files for second time step - = output path for vectors for first time step - = output path for vectors for second time step - = the linear distance of context words to consider in each direction - = dimensionality of embeddings - = number of negative samples parameter (equivalent to shifting parameter for PPMI) - = threshold for subsampling - = number of occurrences for a word to be included in the vocabulary - = number of iterations - - Options: - -l, --len normalize final vectors to unit length - - """) - - is_len = args['--len'] - corpDir1 = args[''] - corpDir2 = args[''] - outPath1 = args[''] - outPath2 = args[''] - windowSize = int(args['']) - dim = int(args['']) - k = int(args['']) - if args[''] == 'None': - t = None - else: - t = float(args['']) - minCount = int(args['']) - itera = int(args['']) - - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) - logging.info(__file__.upper()) - start_time = time.time() - - # Initialize model - model = gensim.models.Word2Vec(sg=1, # skipgram - hs=0, # negative sampling - negative=k, # number of negative samples - sample=t, # threshold for subsampling, if None, no subsampling is performed - size=dim, - window=windowSize, - min_count=minCount, - iter=itera, - workers=20) - - # Initialize vocabulary - logging.getLogger('gensim').setLevel(logging.ERROR) - sentences_t1 = PathLineSentences(corpDir1) - sentences_t2 = PathLineSentences(corpDir2) - vocab_intersect = intersection_dic(sentences_t1, sentences_t2) - model.build_vocab(vocab_intersect) - - # Train on the first corpus - model.train(sentences_t1, total_examples=model.corpus_count, epochs=model.epochs) - if is_len: - # L2-normalize vectors - model.init_sims(replace=True) - # Save the vectors and the model - model.wv.save_word2vec_format(outPath1) - # model.save(outPath1 + '.model') - - # Train on the second corpus - model.train(sentences_t2, total_examples=model.corpus_count, epochs=model.epochs) - if is_len: - # L2-normalize vectors - model.init_sims(replace=True) - # Save the vectors and the model - model.wv.save_word2vec_format(outPath2) - # model.save(outPath2 + '.model') - logging.info("--- %s seconds ---" % (time.time() - start_time)) - - -if __name__ == '__main__': - main() diff --git a/scripts/run_SGNS_VI.sh b/scripts/run_SGNS_VI.sh index e382124..327d5ec 100644 --- a/scripts/run_SGNS_VI.sh +++ b/scripts/run_SGNS_VI.sh @@ -7,7 +7,7 @@ do do for iteration in "${iterations[@]}" do - python3 
alignment/sgns_vi.py $infolder/win$windowSize-k$k-t$t-iter$iteration.sgns.model $corpDir2 $outfolder2/win$windowSize-k$k-t$t-iter$iteration.sgns-VI $windowSize $dim $k $t 0 5 # construct word2vec skip-gram embeddings with vector initialization + python3 alignment/sgns_vi.py $infolder/win$windowSize-k$k-t$t-iter$iteration.sgns.model $corpDir2 $outfolder2/win$windowSize-k$k-t$t-iter$iteration.sgns-VI # construct word2vec skip-gram embeddings with vector initialization scp $infolder/win$windowSize-k$k-t$t-iter$iteration.sgns $outfolder1/win$windowSize-k$k-t$t-iter$iteration.sgns-VI # copy initialization vectors as matrix for first time period done done
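The shell change above reflects the new interface: the hyperparameters are read from the loaded model, so only the model path, corpus directory and output path are passed. The vocabulary change named in the commit subject ("join vocabulary") is the move from intersection to union of the two corpus vocabularies; a toy sketch with made-up words:

```python
# Toy contrast of the two vocabulary strategies (illustrative words only):
# the deleted sgns_vi2.py intersected the vocabularies, the merged
# sgns_vi.py unifies them.
voc_t1 = {'wall', 'fell', 'between', 'nations'}
voc_t2 = {'wall', 'paper', 'between', 'rooms'}

print(voc_t1 & voc_t2)  # intersection (old): only words attested in both corpora
print(voc_t1 | voc_t2)  # union (new): every word from either corpus is kept
```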