diff --git a/.gitignore b/.gitignore
index fd1ef88..86ec302 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ corpora/durel
 corpora/surel
 modules/__pycache__
 modules/*.pyc
+update-git.sh
\ No newline at end of file
diff --git a/alignment/sgns_vi.py b/alignment/sgns_vi.py
index 6c4954a..b5892dd 100644
--- a/alignment/sgns_vi.py
+++ b/alignment/sgns_vi.py
@@ -6,15 +6,9 @@ import time
 
 import gensim
 from gensim.models.word2vec import PathLineSentences
-from gensim.models import KeyedVectors
+from gensim.models import Word2Vec
 
-def intersection_dic(t1, t2):
-    voc_t1 = [x for xs in t1 for x in xs]
-    voc_t2 = [x for xs in t2 for x in xs]
-    intersection = list(set(voc_t1) & set(voc_t2))
-    return [[x] for x in intersection] # note: gensim wants list of iterables (i.e. list of lists)
-
 def main():
     """
     Make comparable embedding vector spaces with Skip-Gram with Negative Sampling as described in:
@@ -27,13 +21,13 @@ def main():
     args = docopt("""Make comparable embedding vector spaces with Skip-Gram with Negative Sampling and Vector Initialization from corpus.
 
     Usage:
-        sgns_vi.py [-l] <initVectorsPath> <corpDir> <outPath> <windowSize> <dim> <k> <t> <minCount> <itera>
+        sgns_vi.py [-l] <modelPath> <corpDir> <outPath> <windowSize> <dim> <k> <t> <minCount> <itera>
 
     Arguments:
+        <modelPath> = model for initialization
         <corpDir> = path to corpus directory with zipped files, each sentence in form 'year\tword1 word2 word3...'
         <outPath> = output path for vectors
-        <initVectorsPath> = vectors on which model should be initialized
        <windowSize> = the linear distance of context words to consider in each direction
        <dim> = dimensionality of embeddings
        <k> = number of negative samples parameter (equivalent to shifting parameter for PPMI)
@@ -45,12 +39,21 @@ def main():
        -l, --len           normalize final vectors to unit length
 
     Note:
-        Initialization vectors should be non-length-normalized.
+        This script has been updated considerably compared to the version used in
+
+        Dominik Schlechtweg, Anna Hätty, Marco del Tredici, and Sabine Schulte im Walde. 2019. A Wind of Change: Detecting and Evaluating Lexical Semantic Change across Times and Domains. In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pages 732-746, Florence, Italy. ACL.
+
+        The Skip-Gram model consists of three layers: an input layer, a hidden layer and an output layer. The weights between the input layer and the hidden layer are stored in the embedding matrix, which is later used to obtain the individual word embeddings by looking up a single row of the matrix. The context matrix stores the weights between the hidden layer and the output layer.
+
+        Differences:
+        In the original version, only the previously created embedding matrix was loaded into the new model for training on the second corpus, so the context matrix was newly initialized with random values. In the updated version the whole model is reused for training on the second corpus, which includes the context matrix as well as the embedding matrix.
+
+        Additionally, the vocabulary of the second corpus is added to the model's vocabulary, instead of restricting training to the intersection of both vocabularies.
    """)

    is_len = args['--len']
-    initVectorsPath = args['<initVectorsPath>']
+    modelPath = args['<modelPath>']
    corpDir = args['<corpDir>']
    outPath = args['<outPath>']
    windowSize = int(args['<windowSize>'])
@@ -67,28 +70,16 @@ def main():
    logging.info(__file__.upper())
    start_time = time.time()

-    # Initialize model
-    model = gensim.models.Word2Vec(sg=1, # skipgram
-                                   hs=0, # negative sampling
-                                   negative=k, # number of negative samples
-                                   sample=t, # threshold for subsampling, if None, no subsampling is performed
-                                   size=dim, window=windowSize, min_count=minCount, iter=itera, workers=20)
+    # Load model
+    model = Word2Vec.load(modelPath)

-    # Receive vectors for initialization
-    initVectors = KeyedVectors.load_word2vec_format(initVectorsPath, binary=False)
-
-    # Initialize vocabulary
-    vocab_initVectors = initVectors.vocab
-
    # Intersect vocabulary
-    vocab_sentences_t_2 = PathLineSentences(corpDir)
+    vocab_sentences = PathLineSentences(corpDir)
    logging.getLogger('gensim').setLevel(logging.ERROR)
-    vocab_intersect = intersection_dic([[token] for token in vocab_initVectors],vocab_sentences_t_2)
-    model.build_vocab(vocab_intersect)
+    model.build_vocab(vocab_sentences, update=True)

    # Train
    sentences = PathLineSentences(corpDir)
-    model.intersect_word2vec_format(initVectorsPath, lockf=1)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

    if is_len:
diff --git a/representations/sgns.py b/representations/sgns.py
index 54210b7..be4934f 100644
--- a/representations/sgns.py
+++ b/representations/sgns.py
@@ -74,7 +74,7 @@ def main():
 
     # Save the vectors and the model
     model.wv.save_word2vec_format(outPath)
-    #model.save(outPath + '.model')
+    model.save(outPath + '.model')
 
     logging.info("--- %s seconds ---" % (time.time() - start_time))
diff --git a/scripts/run_CD.sh b/scripts/run_CD.sh
index 5a876ab..a0d38d2 100644
--- a/scripts/run_CD.sh
+++ b/scripts/run_CD.sh
@@ -1,5 +1,5 @@
-matrices=($matrixfolder1/!(*@(_rows|_columns)))
+matrices=($matrixfolder1/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do
diff --git a/scripts/run_CI.sh b/scripts/run_CI.sh
index 5cf167f..ee2f796 100644
--- a/scripts/run_CI.sh
+++ b/scripts/run_CI.sh
@@ -1,5 +1,5 @@
-matrices=($matrixfolder1/!(*@(_rows|_columns)))
+matrices=($matrixfolder1/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do
diff --git a/scripts/run_ENTR.sh b/scripts/run_ENTR.sh
index 8f229f2..643a2f3 100644
--- a/scripts/run_ENTR.sh
+++ b/scripts/run_ENTR.sh
@@ -1,5 +1,5 @@
-matrices=($matrixfolder/!(*@(_rows|_columns)))
+matrices=($matrixfolder/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do
diff --git a/scripts/run_LND.sh b/scripts/run_LND.sh
index 6e81ffe..d2c5c33 100644
--- a/scripts/run_LND.sh
+++ b/scripts/run_LND.sh
@@ -1,5 +1,5 @@
-matrices=($matrixfolder1/!(*@(_rows|_columns)))
+matrices=($matrixfolder1/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do
diff --git a/scripts/run_NENTR.sh b/scripts/run_NENTR.sh
index e48ee0e..5b59c6f 100644
--- a/scripts/run_NENTR.sh
+++ b/scripts/run_NENTR.sh
@@ -1,5 +1,5 @@
-matrices=($matrixfolder/!(*@(_rows|_columns)))
+matrices=($matrixfolder/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do
diff --git a/scripts/run_NTYPE.sh b/scripts/run_NTYPE.sh
index 3a4d19d..e194928 100644
--- a/scripts/run_NTYPE.sh
+++ b/scripts/run_NTYPE.sh
@@ -1,5 +1,5 @@
-matrices=($matrixfolder/!(*@(_rows|_columns)))
+matrices=($matrixfolder/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do
diff --git a/scripts/run_OP+.sh b/scripts/run_OP+.sh
index a29f115..bc92455 100644
--- a/scripts/run_OP+.sh
+++ b/scripts/run_OP+.sh
@@ -1,5 +1,5 @@
-matrices=($matrixfolder1/!(*@(_rows|_columns)))
+matrices=($matrixfolder1/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do
diff --git a/scripts/run_OP-.sh b/scripts/run_OP-.sh
index 845925c..134f696 100644
--- a/scripts/run_OP-.sh
+++ b/scripts/run_OP-.sh
@@ -1,5 +1,5 @@
-matrices=($matrixfolder1/!(*@(_rows|_columns)))
+matrices=($matrixfolder1/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do
diff --git a/scripts/run_OP.sh b/scripts/run_OP.sh
index b494b9d..382e759 100644
--- a/scripts/run_OP.sh
+++ b/scripts/run_OP.sh
@@ -1,5 +1,5 @@
-matrices=($matrixfolder1/!(*@(_rows|_columns)))
+matrices=($matrixfolder1/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do
diff --git a/scripts/run_PPMI.sh b/scripts/run_PPMI.sh
index 52f427c..7705bea 100644
--- a/scripts/run_PPMI.sh
+++ b/scripts/run_PPMI.sh
@@ -1,5 +1,5 @@
-matrices=($matrixfolder/!(*@(_rows|_columns)))
+matrices=($matrixfolder/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do
diff --git a/scripts/run_RI.sh b/scripts/run_RI.sh
index d789956..c3e3d72 100644
--- a/scripts/run_RI.sh
+++ b/scripts/run_RI.sh
@@ -1,5 +1,5 @@
-matrices=($matrixfolder/!(*@(_rows|_columns)))
+matrices=($matrixfolder/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do
diff --git a/scripts/run_SGNS_VI.sh b/scripts/run_SGNS_VI.sh
index 09f5136..e382124 100644
--- a/scripts/run_SGNS_VI.sh
+++ b/scripts/run_SGNS_VI.sh
@@ -7,8 +7,8 @@ do
        do
            for iteration in "${iterations[@]}"
            do
-                python3 alignment/sgns_vi.py $infolder/win$windowSize-k$k-t$t-iter$iteration.sgns $corpDir2 $outfolder2/win$windowSize-k$k-t$t-iter$iteration\_vi.sgns $windowSize $dim $k $t 0 5 # construct word2vec skip-gram embeddings with vector initialization
-                scp $infolder/win$windowSize-k$k-t$t-iter$iteration.sgns $outfolder1/win$windowSize-k$k-t$t-iter$iteration\_vi.sgns # copy initialization vectors as matrix for first time period
+                python3 alignment/sgns_vi.py $infolder/win$windowSize-k$k-t$t-iter$iteration.sgns.model $corpDir2 $outfolder2/win$windowSize-k$k-t$t-iter$iteration.sgns-VI $windowSize $dim $k $t 0 5 # construct word2vec skip-gram embeddings with vector initialization
+                scp $infolder/win$windowSize-k$k-t$t-iter$iteration.sgns $outfolder1/win$windowSize-k$k-t$t-iter$iteration.sgns-VI # copy initialization vectors as matrix for first time period
            done
        done
    done
diff --git a/scripts/run_SGNS_VI2.sh b/scripts/run_SGNS_VI2.sh
deleted file mode 100644
index c1c21db..0000000
--- a/scripts/run_SGNS_VI2.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-for windowSize in "${windowSizes[@]}"
-do
-    for k in "${ks[@]}"
-    do
-        for t in "${ts[@]}"
-        do
-            for iteration in "${iterations[@]}"
-            do
-                python3 alignment/sgns_vi2.py $corpDir1 $corpDir2 $outfolder1/win$windowSize-k$k-t$t-iter$iteration\_vi.sgns2 $outfolder2/win$windowSize-k$k-t$t-iter$iteration\_vi.sgns2 $windowSize $dim $k $t 0 5 # construct word2vec skip-gram embeddings with vector Initialization
-            done
-        done
-    done
-done
diff --git a/scripts/run_SRV.sh b/scripts/run_SRV.sh
index a92d321..372a0fa 100644
--- a/scripts/run_SRV.sh
+++ b/scripts/run_SRV.sh
@@ -1,5 +1,5 @@
-matrices=($matrixfolder1/!(*@(_rows|_columns)))
+matrices=($matrixfolder1/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do
diff --git a/scripts/run_SVD.sh b/scripts/run_SVD.sh
index 695ff64..a40aeef 100644
--- a/scripts/run_SVD.sh
+++ b/scripts/run_SVD.sh
@@ -1,5 +1,5 @@
-matrices=($matrixfolder/!(*@(_rows|_columns)))
+matrices=($matrixfolder/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do
diff --git a/scripts/run_TYPE.sh b/scripts/run_TYPE.sh
index eb50f35..adb8cd0 100644
--- a/scripts/run_TYPE.sh
+++ b/scripts/run_TYPE.sh
@@ -1,5 +1,5 @@
-matrices=($matrixfolder/!(*@(_rows|_columns)))
+matrices=($matrixfolder/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do
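For reference, the two-step vector-initialization flow this patch implements can be sketched outside the scripts as follows. This is a minimal sketch against the gensim 3.x API that the patched code uses (`size`/`iter` as constructor names for dimensionality and epochs); the corpus paths, hyperparameter values, and output filenames below are illustrative placeholders, not the repository's defaults.

```python
from gensim.models import Word2Vec
from gensim.models.word2vec import PathLineSentences

# Step 1 (representations/sgns.py): train on the first corpus and save the
# full model, not only the word vectors. The .model file keeps the embedding
# matrix, the context matrix, and the vocabulary statistics.
model = Word2Vec(PathLineSentences('corpus1/'),  # placeholder path
                 sg=1,                           # skip-gram
                 hs=0, negative=5,               # negative sampling with k=5
                 sample=0.001, size=300, window=10,
                 min_count=0, iter=5, workers=20)
model.wv.save_word2vec_format('win10-k5-t0.001-iter5.sgns')
model.save('win10-k5-t0.001-iter5.sgns.model')

# Step 2 (alignment/sgns_vi.py): reload the whole model, extend its vocabulary
# with words unseen in the first corpus, and continue training on the second
# corpus. Both weight matrices start from their corpus-1 state instead of
# being re-initialized at random.
model = Word2Vec.load('win10-k5-t0.001-iter5.sgns.model')
sentences = PathLineSentences('corpus2/')        # placeholder path
model.build_vocab(sentences, update=True)
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
model.wv.save_word2vec_format('win10-k5-t0.001-iter5.sgns-VI')
```

Saving the full model in step 1 is what makes the `.model` exclusions in the run_*.sh glob patterns necessary: the downstream measure scripts should iterate only over the exported vector files, not over the binary model files now sitting in the same folder.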