merge sgns_vi.py and sgns_vi2.py, initialize on full model, join voca…

…bulary
Garrafao · Dec 27, 2019 · 73c694b · 73c694b
1 parent e2dcf99
commit 73c694b
Show file tree

Hide file tree

Showing 19 changed files with 36 additions and 57 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@ corpora/durel
 corpora/surel
 modules/__pycache__
 modules/*.pyc
+update-git.sh
diff --git a/alignment/sgns_vi.py b/alignment/sgns_vi.py
@@ -6,15 +6,9 @@
 import time
 import gensim
 from gensim.models.word2vec import PathLineSentences
-from gensim.models import KeyedVectors
+from gensim.models import Word2Vec
 
 
-def intersection_dic(t1, t2):
-    voc_t1 = [x for xs in t1 for x in xs]
-    voc_t2 = [x for xs in t2 for x in xs]
-    intersection = list(set(voc_t1) & set(voc_t2))
-    return [[x] for x in intersection] # note: gensim wants list of iterables (i.e. list of lists)
-
 def main():
     """
     Make comparable embedding vector spaces with Skip-Gram with Negative Sampling as described in:
@@ -27,13 +21,13 @@ def main():
     args = docopt("""Make comparable embedding vector spaces with Skip-Gram with Negative Sampling and Vector Initialization from corpus.
 
     Usage:
-        sgns_vi.py [-l] <vectorsPath> <corpDir> <outPath> <windowSize> <dim> <k> <t> <minCount> <itera>
+        sgns_vi.py [-l] <modelPath> <corpDir> <outPath> <windowSize> <dim> <k> <t> <minCount> <itera>
         
     Arguments:
        
+        <modelPath> = model for initialization
         <corpDir> = path to corpus directory with zipped files, each sentence in form 'year\tword1 word2 word3...'
         <outPath> = output path for vectors
-        <vectorsPath> = vectors on which model should be initialized
         <windowSize> = the linear distance of context words to consider in each direction
         <dim> = dimensionality of embeddings
         <k> = number of negative samples parameter (equivalent to shifting parameter for PPMI)
@@ -45,12 +39,21 @@ def main():
         -l, --len   normalize final vectors to unit length
 
     Note:
-        Initialization vectors should be non-length-normalized.
+        This script has been updated considerably compared to the version used in
+
+            Dominik Schlechtweg, Anna Hätty, Marco del Tredici, and Sabine Schulte im Walde. 2019. A Wind of Change: Detecting and Evaluating Lexical Semantic Change across Times and Domains. In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pages 732-746, Florence, Italy. ACL.
+
+        The Skip Gram Model consist of three layers, the input layer, hidden layer and output layer. Weights between the input layer and the hidden layer are stored in the Embedding Matrix, which is later used for getting the individual word embeddings by looking at only one column of the Matrix. The Context matrix stores the weights between the hidden layer and the output layer.
+
+        Differences:
+        In the original version for training on the second corpus only the previously created Embedding Matrix was loaded into the new model, so the Context matrix is newly initialized with random values. In the updated version the whole model is reused for training on the second corpus, that includes the Embedding Matrix as well as the Context matrix.
+
+        Additionally, vocabulary
 
     """)
 
     is_len = args['--len']
-    initVectorsPath = args['<vectorsPath>'] 
+    modelPath = args['<modelPath>'] 
     corpDir = args['<corpDir>']
     outPath = args['<outPath>']
     windowSize = int(args['<windowSize>'])    
@@ -67,28 +70,16 @@ def main():
     logging.info(__file__.upper())
     start_time = time.time()    
 
-    # Initialize model
-    model = gensim.models.Word2Vec(sg=1, # skipgram
-    							   hs=0, # negative sampling
-    							   negative=k, # number of negative samples
-    							   sample=t, # threshold for subsampling, if None, no subsampling is performed
-    							   size=dim, window=windowSize, min_count=minCount, iter=itera, workers=20)
+    # Load model
+    model = Word2Vec.load(modelPath)
 
-    # Receive vectors for initialization
-    initVectors = KeyedVectors.load_word2vec_format(initVectorsPath, binary=False)
-
-    # Initialize vocabulary
-    vocab_initVectors = initVectors.vocab
-
     # Intersect vocabulary
-    vocab_sentences_t_2 = PathLineSentences(corpDir)
+    vocab_sentences = PathLineSentences(corpDir)
     logging.getLogger('gensim').setLevel(logging.ERROR)    
-    vocab_intersect = intersection_dic([[token] for token in vocab_initVectors],vocab_sentences_t_2)
-    model.build_vocab(vocab_intersect)
+    model.build_vocab(vocab_sentences, update=True)
 
     # Train
     sentences = PathLineSentences(corpDir)
-    model.intersect_word2vec_format(initVectorsPath, lockf=1)
     model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
 
     if is_len:

diff --git a/representations/sgns.py b/representations/sgns.py
@@ -74,7 +74,7 @@ def main():
 
     # Save the vectors and the model
     model.wv.save_word2vec_format(outPath)
-    #model.save(outPath + '.model')
+    model.save(outPath + '.model')
 
     logging.info("--- %s seconds ---" % (time.time() - start_time))
 

diff --git a/scripts/run_CD.sh b/scripts/run_CD.sh
@@ -1,5 +1,5 @@
 
-matrices=($matrixfolder1/!(*@(_rows|_columns)))
+matrices=($matrixfolder1/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do

diff --git a/scripts/run_CI.sh b/scripts/run_CI.sh
@@ -1,5 +1,5 @@
 
-matrices=($matrixfolder1/!(*@(_rows|_columns)))
+matrices=($matrixfolder1/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}" 
 do

diff --git a/scripts/run_ENTR.sh b/scripts/run_ENTR.sh
@@ -1,5 +1,5 @@
 
-matrices=($matrixfolder/!(*@(_rows|_columns)))
+matrices=($matrixfolder/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do

diff --git a/scripts/run_LND.sh b/scripts/run_LND.sh
@@ -1,5 +1,5 @@
 
-matrices=($matrixfolder1/!(*@(_rows|_columns)))
+matrices=($matrixfolder1/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do

diff --git a/scripts/run_NENTR.sh b/scripts/run_NENTR.sh
@@ -1,5 +1,5 @@
 
-matrices=($matrixfolder/!(*@(_rows|_columns)))
+matrices=($matrixfolder/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do

diff --git a/scripts/run_NTYPE.sh b/scripts/run_NTYPE.sh
@@ -1,5 +1,5 @@
 
-matrices=($matrixfolder/!(*@(_rows|_columns)))
+matrices=($matrixfolder/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do

diff --git a/scripts/run_OP+.sh b/scripts/run_OP+.sh
@@ -1,5 +1,5 @@
 
-matrices=($matrixfolder1/!(*@(_rows|_columns)))
+matrices=($matrixfolder1/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}" 
 do

diff --git a/scripts/run_OP-.sh b/scripts/run_OP-.sh
@@ -1,5 +1,5 @@
 
-matrices=($matrixfolder1/!(*@(_rows|_columns)))
+matrices=($matrixfolder1/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}" 
 do

diff --git a/scripts/run_OP.sh b/scripts/run_OP.sh
@@ -1,5 +1,5 @@
 
-matrices=($matrixfolder1/!(*@(_rows|_columns)))
+matrices=($matrixfolder1/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}" 
 do

diff --git a/scripts/run_PPMI.sh b/scripts/run_PPMI.sh
@@ -1,5 +1,5 @@
 
-matrices=($matrixfolder/!(*@(_rows|_columns)))
+matrices=($matrixfolder/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do

diff --git a/scripts/run_RI.sh b/scripts/run_RI.sh
@@ -1,5 +1,5 @@
 
-matrices=($matrixfolder/!(*@(_rows|_columns)))
+matrices=($matrixfolder/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do

diff --git a/scripts/run_SGNS_VI.sh b/scripts/run_SGNS_VI.sh
@@ -7,8 +7,8 @@ do
 	do		    		
 	    for iteration in "${iterations[@]}"
 	    do
-		python3 alignment/sgns_vi.py $infolder/win$windowSize-k$k-t$t-iter$iteration.sgns $corpDir2 $outfolder2/win$windowSize-k$k-t$t-iter$iteration\_vi.sgns $windowSize $dim $k $t 0 5 # construct word2vec skip-gram embeddings with vector initialization
-		scp $infolder/win$windowSize-k$k-t$t-iter$iteration.sgns $outfolder1/win$windowSize-k$k-t$t-iter$iteration\_vi.sgns # copy initialization vectors as matrix for first time period
+		python3 alignment/sgns_vi.py $infolder/win$windowSize-k$k-t$t-iter$iteration.sgns.model $corpDir2 $outfolder2/win$windowSize-k$k-t$t-iter$iteration.sgns-VI $windowSize $dim $k $t 0 5 # construct word2vec skip-gram embeddings with vector initialization
+		scp $infolder/win$windowSize-k$k-t$t-iter$iteration.sgns $outfolder1/win$windowSize-k$k-t$t-iter$iteration.sgns-VI # copy initialization vectors as matrix for first time period
 	    done	    
 	done	    		
     done	

diff --git a/scripts/run_SGNS_VI2.sh b/scripts/run_SGNS_VI2.sh
diff --git a/scripts/run_SRV.sh b/scripts/run_SRV.sh
@@ -1,5 +1,5 @@
 
-matrices=($matrixfolder1/!(*@(_rows|_columns)))
+matrices=($matrixfolder1/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do

diff --git a/scripts/run_SVD.sh b/scripts/run_SVD.sh
@@ -1,5 +1,5 @@
 
-matrices=($matrixfolder/!(*@(_rows|_columns)))
+matrices=($matrixfolder/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do

diff --git a/scripts/run_TYPE.sh b/scripts/run_TYPE.sh
@@ -1,5 +1,5 @@
 
-matrices=($matrixfolder/!(*@(_rows|_columns)))
+matrices=($matrixfolder/!(*@(_rows|_columns|.model)))
 
 for matrix in "${matrices[@]}"
 do