diff --git a/doc2vec.py b/doc2vec.py
index 39a77e8b..9f105e84 100644
--- a/doc2vec.py
+++ b/doc2vec.py
@@ -3,20 +3,51 @@
 from random import shuffle
 import gensim
 import sys
+import spacy
+from gensim.parsing.preprocessing import remove_stopwords
+from gensim.models.doc2vec import Doc2Vec
+
+fname = "doc2vec.model"
 
 import logging
 logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 
+spacy_nlp = spacy.load('en_core_web_sm')
+spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
+
+customize_stop_words = [
+    "-LRB-", "-RRB-", "-LSB-", "-RSB-"
+]
+
+for w in customize_stop_words:
+    spacy_nlp.vocab[w].is_stop = True
+
 if len(sys.argv) - 1 == 1:
-    max_counter = 1000000  # 1 000 000
+    max_counter = int(sys.argv[1])
 else:
     max_counter = 10000  # 10 000
     print("Max Counter not defined!")
     print("Set Default Value: " + str(max_counter))
 
+
+def pre_process(doc):
+    # doc = spacy_nlp(doc)
+
+    # lemma_tokens = [token.lemma_ for token in doc]
+    # doc = ' '.join(map(str, lemma_tokens))
+    # doc = spacy_nlp(doc)
+
+    # tokens = [token.text for token in doc if not token.is_stop]
+
+    # text = ' '.join(map(str, tokens))
+    text = remove_stopwords(doc)
+    return text
+
+# TODO: remove all stop-words and lemmatize every token
+
 # full text and processed in ['text'] tag
-wiki_folder = "../wiki-pages-split"
+wiki_folder = "data/wiki-pages-split"
 files = os.listdir(wiki_folder)
 shuffle(files)
@@ -26,9 +57,24 @@ tokens = []
 for file in files:
     file_content = jsonlines.open(wiki_folder + "/" + file)
-    file_content = file_content.read()
-    text = file_content['text']
+    doc = file_content.read()['text']
+    text = pre_process(doc)
+
     if counter > max_counter:
+        # adding required docs by fever with the claim given
+        file_content = jsonlines.open(wiki_folder + "/" + "Telemundo.json")
+        doc = file_content.read()['text']
+        text = pre_process(doc)
+        tokens = gensim.utils.simple_preprocess(text)
+        print(tokens)
+        train_text.append(gensim.models.doc2vec.TaggedDocument(tokens, ["Telemundo.json"]))
+
+        file_content = jsonlines.open(wiki_folder + "/" + "Hispanic_and_Latino_Americans.json")
+        doc = file_content.read()['text']
+        text = pre_process(doc)
+        tokens = gensim.utils.simple_preprocess(text)
+        train_text.append(gensim.models.doc2vec.TaggedDocument(tokens, ["Hispanic_and_Latino_Americans.json"]))
+
         break
     else:
         tokens = gensim.utils.simple_preprocess(text)
@@ -37,23 +83,55 @@
         if counter % 1000 == 0:
             print(counter)
 
-model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=20, epochs=2)
-model.build_vocab(train_text)
+model = Doc2Vec(vector_size=50, min_count=2, epochs=40)
+#model = Doc2Vec.load(fname)
+model.build_vocab(train_text)#,keep_raw_vocab=True)#, update=True)
 model.train(train_text, total_examples=model.corpus_count, epochs=model.epochs)
 
-sentence = "Obama was president of United States of America similar to a Portuguese person called D. Afonso Henriques"
-test_sentence = gensim.utils.simple_preprocess(sentence)
-inferred_vector = model.infer_vector(test_sentence)
+sentence = "Telemundo is a English-language television network."
+text = pre_process(sentence)
+tokens = gensim.utils.simple_preprocess(text)
+print(tokens)
+for token in tokens:
+    print(token)
+    inferred_vector = model.infer_vector([token])
+    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
+
+    STOP = 3
+    for doc, sim in sims:
+        file_content = jsonlines.open(wiki_folder + "/" + doc)
+        file_content = file_content.read()
+        text = file_content['text']
+        print("\n" + doc + " -- " + str(sim) + ": \n")  # + text)
+        if STOP == 0:
+            break
+        else:
+            STOP -= 1
+
+    for doc, sim in sims:
+        if doc != "Hispanic_and_Latino_Americans.json" and doc != "Telemundo.json":
+            continue
+        print(doc + " -- " + str(sim))
+    print("\n")
+
+model.save(fname)
+
+inferred_vector = model.infer_vector(tokens)
 sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
 
-STOP = 5
+STOP = 3
 for doc, sim in sims:
     file_content = jsonlines.open(wiki_folder + "/" + doc)
     file_content = file_content.read()
     text = file_content['text']
-    print("\n" + doc + " -- " + str(sim) + ": \n" + text)
+    print("\n" + doc + " -- " + str(sim) + ": \n")  # + text)
     if STOP == 0:
         break
     else:
         STOP -= 1
+
+for doc, sim in sims:
+    if doc != "Hispanic_and_Latino_Americans.json" and doc != "Telemundo.json":
+        continue
+    print(doc + " -- " + str(sim))
diff --git a/metrics_old.py b/metrics_old.py
deleted file mode 100644
index 229aca28..00000000
--- a/metrics_old.py
+++ /dev/null
@@ -1,243 +0,0 @@
-import jsonlines
-import sys
-from scorer import fever_score
-
-train_file = "data/subsample_train.jsonl"
-train_relevant_file = "data/subsample_train_relevant_docs.jsonl"
-train_concatenate_file = "data/subsample_train_concatenation.jsonl"
-train_predictions_file = "predictions/predictions_train.jsonl"
-
-train_file = jsonlines.open(train_file)
-train_relevant_file = jsonlines.open(train_relevant_file)
-train_concatenate_file = jsonlines.open(train_concatenate_file)
-train_predictions_file = jsonlines.open(train_predictions_file)
-
-train_set = []
-train_relevant = []
-train_concatenate = []
-train_prediction = []
-
-for lines in train_file:
-    lines['claim'] = lines['claim'].replace("-LRB-", " ( ")
-    lines['claim'] = lines['claim'].replace("-RRB-", " ) ")
-    train_set.append(lines)
-
-for lines in train_relevant_file:
-    lines['claim'] = lines['claim'].replace("-LRB-", " ( ")
-    lines['claim'] = lines['claim'].replace("-RRB-", " ) ")
-    train_relevant.append(lines)
-
-for lines in train_concatenate_file:
-    lines['claim'] = lines['claim'].replace("-LRB-", " ( ")
-    lines['claim'] = lines['claim'].replace("-RRB-", " ) ")
-    train_concatenate.append(lines)
-
-# this evidence addition is irrelevant
-info_by_id = dict((d['id'], dict(d, index=index)) for (index, d) in enumerate(train_set))
-for lines in train_predictions_file:
-    lines['evidence'] = info_by_id.get(lines['id'])['evidence']
-    train_prediction.append(lines)
-
-# All claims
-stop = 0
-
-# List with dicts with all important data
-'''
-id : id of the claim
-verifiable : boolean of 1 and 0 with respective meaning
-docs : set of documents that verify the claim
-docs_sep : set of documents separated
-evidences: list of tuples of
-difficulties: list of the number of sentences needed to be evidence
-'''
-gold_data = []
-
-for claim in train_set:
-
-    # init gold dict
-    gold_dict = {'id': claim['id']}
-
-    if claim['verifiable'] == "VERIFIABLE":
-        gold_dict['verifiable'] = 1
-    else:
-        gold_dict['verifiable'] = 0
-
-    # get gold inputs
-    gold_documents = set()
-    gold_documents_separated = set()
-    sentences_pair = set()
-    evidences = claim['evidence']
-    difficulties = []
-    for evidence in evidences:
-        doc_name = ''
-        difficulty = 0
-        if len(evidence) > 1:  # needs more than 1 doc to be verifiable
-            for e in evidence:
-                doc_name += str(e[2])
-                doc_name += " "
-                sentences_pair.add((str(e[2]), str(e[3])))  # add gold sentences
-                gold_documents_separated.add(str(e[2]))  # add the document
-                difficulty += 1
-            doc_name = doc_name[:-1]  # erase the last blank space
-        else:
-            doc_name = str(evidence[0][2])
-            gold_documents_separated.add(str(evidence[0][2]))
-            sentences_pair.add((str(evidence[0][2]), str(evidence[0][3])))
-            difficulty = 1
-        difficulties.append(difficulty)
-        gold_documents.add(doc_name)
-    gold_dict['difficulties'] = difficulties
-    gold_dict['docs'] = gold_documents
-    gold_dict['evidences'] = sentences_pair
-    gold_dict['docs_sep'] = gold_documents_separated
-
-    gold_data.append(gold_dict)
-
-    # flag to stop if needed
-    stop += 1
-    if stop == -1:
-        break
-
-gold_data = dict((item['id'], item) for item in gold_data)
-
-stop = 0
-
-doc_found = 0
-doc_noise = 0
-gold_doc_found = 0
-gold_doc_not_found = 0
-
-precision_correct = 0
-precision_incorrect = 0
-recall_correct = 0
-recall_incorrect = 0
-specificity = 0
-
-precision_sent_correct = 0
-precision_sent_incorrect = 0
-recall_sent_correct = 0
-recall_sent_incorrect = 0
-sent_found = 0
-sent_found_if_doc_found = 0
-
-total_claim = 0
-for claim in train_relevant:
-    _id = claim['id']
-    gold_dict = gold_data.get(_id)
-
-    # no search is needed... no information on gold dict about retrieval
-    if not gold_dict['verifiable']:
-        continue
-
-    # document analysis
-    # TODO: Analyse NER and TF-IDF
-    doc_correct = 0
-    doc_incorrect = 0
-    gold_incorrect = 0
-    docs = set()
-    gold_docs = gold_dict['docs_sep']
-
-    for doc in claim['predicted_pages']:
-        if doc in gold_docs:
-            doc_correct += 1
-        else:
-            doc_incorrect += 1
-        docs.add(doc)
-
-    precision_correct += doc_correct / len(docs)
-    precision_incorrect += doc_incorrect / len(docs)
-    recall_correct += doc_correct / len(gold_docs)
-    recall_incorrect += doc_incorrect / len(gold_docs)
-
-    for gold_doc in gold_docs:
-        if gold_doc not in docs:
-            gold_incorrect += 1
-
-    specificity += gold_incorrect / len(gold_docs)
-
-    if doc_correct > 0:
-        doc_found += 1
-
-    # sentence analysis TODO: check sentences
-    sentences = set()
-    for sent in claim['predicted_sentences']:
-        sentences.add((str(sent[0]), str(sent[1])))
-
-    evidences = gold_dict['evidences']
-    sent_correct = 0
-    sent_incorrect = 0
-    flag = False
-    for sent in sentences:
-        if sent in evidences:
-            sent_correct += 1
-            flag = True
-        else:
-            sent_incorrect += 1
-
-    if flag:
-        sent_found += 1
-
-    if doc_correct and flag:
-        sent_found_if_doc_found += 1
-
-    precision_sent_correct += sent_correct / len(sentences)
-    precision_sent_incorrect += sent_incorrect / len(sentences)
-    recall_sent_correct += sent_correct / len(evidences)
-    recall_sent_incorrect += sent_incorrect / len(evidences)
-
-    # TODO: create all possible pair in order to see if it appears in gold_dict['docs']
-    # claim['predicted_sentences']
-
-    # flag to stop if needed
-    total_claim += 1
-    stop += 1
-    if stop == -1:
-        break
-
-precision_correct /= total_claim
-precision_incorrect /= total_claim
-recall_correct /= total_claim
-recall_incorrect /= total_claim
-specificity /= total_claim
-doc_found /= total_claim
-
-print("\n#############")
-print("# DOCUMENTS #")
-print("#############")
-print("Precision (Document Retrieved):\t\t\t\t\t\t " + str(precision_correct))  # precision
-print("Fall-out (incorrect documents):\t\t\t\t\t\t " + str(precision_incorrect))  # precision
-print("Recall (Relevant Documents):\t\t\t\t\t\t " + str(recall_correct))  # recall
-print("Percentage of gold documents NOT found:\t\t\t\t " + str(recall_incorrect))  # recall
-print("Fall-out: " + str(specificity))
-print("Percentage of at least one document found correctly: " + str(doc_found))  # recall
-
-precision_sent_correct /= total_claim
-precision_sent_incorrect /= total_claim
-recall_sent_correct /= total_claim
-recall_sent_incorrect /= total_claim
-sent_found /= total_claim
-sent_found_if_doc_found /= total_claim
-another_sent = sent_found_if_doc_found / doc_found
-
-print("\n#############")
-print("# SENTENCES #")
-print("#############")
-print("Precision (Sentences Retrieved):\t\t\t\t\t " + str(precision_sent_correct))  # precision
-print("Precision (incorrect Sentences):\t\t\t\t\t " + str(precision_sent_incorrect))  # precision
-print("Recall (Relevant Sentences):\t\t\t\t\t\t " + str(recall_sent_correct))  # recall
-print("Percentage of gold Sentences NOT found:\t\t\t\t " + str(recall_sent_incorrect))  # recall
-print("Percentage of at least one Sentence found correctly: " + str(sent_found))  # recall
-print("Percentage of at least one Sentence found correctly: " + str(sent_found_if_doc_found))  # recall
-print("Percentage of at least one Sentence found correctly: " + str(another_sent))  # recall
-
-# scores from fever
-results = fever_score(train_prediction, actual=train_set)
-
-print("\n#########")
-print("# FEVER #")
-print("#########")
-print("Strict_score: \t\t" + str(results[0]))
-print("Acc_score: \t\t\t" + str(results[1]))
-print("Precision: \t\t\t" + str(results[2]))
-print("Recall: \t\t\t" + str(results[3]))
-print("F1-Score: \t\t\t" + str(results[4]))
\ No newline at end of file
diff --git a/word2vec.py b/word2vec.py
new file mode 100644
index 00000000..a11b2605
--- /dev/null
+++ b/word2vec.py
@@ -0,0 +1,186 @@
+import os
+import jsonlines
+from random import shuffle
+import gensim
+import sys
+from scipy import spatial
+import spacy
+import numpy as np
+
+fname = "word2vec.model"
+
+import logging
+logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+
+spacy_nlp = spacy.load('en_core_web_sm')
+
+if len(sys.argv) - 1 == 1:
+    max_counter = int(sys.argv[1])
+    if max_counter == -1:
+        max_counter = 9999999
+else:
+    max_counter = 10000  # 10 000
+    print("Max Counter not defined!")
+    print("Set Default Value: " + str(max_counter))
+
+# full text and processed in ['text'] tag
+wiki_folder = "data/wiki-pages-split"
+files = os.listdir(wiki_folder)
+shuffle(files)
+
+counter = 0
+
+train_sentences = []
+tokens = []
+for file in files:
+    _name = file[:-5]
+    title = _name.replace("_", " ")
+    tokens = gensim.utils.simple_preprocess(title)
+    train_sentences.append(tokens)
+
+    file_content = jsonlines.open(wiki_folder + "/" + file)
+    lines = file_content.read()['lines']
+    for line in lines:
+        if len(line['content']) < 2:
+            continue
+        tokens = gensim.utils.simple_preprocess(line['content'])
+        train_sentences.append(tokens)
+    if counter > max_counter:
+        # adding required docs by fever with the claim given
+        file_content = jsonlines.open(wiki_folder + "/" + "Telemundo.json")
+        lines = file_content.read()['lines']
+        for line in lines:
+            print(line['content'])
+            tokens = gensim.utils.simple_preprocess(line['content'])
+            train_sentences.append(tokens)
+
+        file_content = jsonlines.open(wiki_folder + "/" + "Hispanic_and_Latino_Americans.json")
+        lines = file_content.read()['lines']
+        for line in lines:
+            print(line['content'])
+            tokens = gensim.utils.simple_preprocess(line['content'])
+            train_sentences.append(tokens)
+
+        break
+    else:
+        counter += 1
+        if counter % 1000 == 0:
+            print(counter)
+
+model = gensim.models.Word2Vec(iter=1, min_count=5, size=500, workers=4)  # an empty model, no training yet
+model.build_vocab(train_sentences)  # can be a non-repeatable, 1-pass generator
+
+print(model.epochs)
+model.train(train_sentences, total_examples=model.corpus_count, epochs=30)  # can be a non-repeatable, 1-pass generator
+index2word_set = set(model.wv.index2word)
+
+
+def avg_feature_vector(sentence, model, num_features, index2word_set):
+    words = sentence.split()
+    feature_vec = np.zeros((num_features,), dtype='float32')
+    n_words = 0
+    for word in words:
+        if word in index2word_set:
+            n_words += 1
+            feature_vec = np.add(feature_vec, model[word])
+    if n_words > 0:
+        feature_vec = np.divide(feature_vec, n_words)
+    return feature_vec
+
+
+def word2vec(text1, text2):
+    # print(text1)
+    # print(text2)
+    s1_afv = avg_feature_vector(text1, model=model, num_features=500, index2word_set=index2word_set)
+    s2_afv = avg_feature_vector(text2, model=model, num_features=500, index2word_set=index2word_set)
+    if np.sum(s1_afv) == 0 or np.sum(s2_afv) == 0:
+        return 0
+    # text1 = spacy_nlp(text1)
+    # text2 = spacy_nlp(text2)
+    # sim = text1.similarity(text2)
+    else:
+        sim = 1 - spatial.distance.cosine(s1_afv, s2_afv)
+
+    return sim
+
+
+sentence = "Telemundo is a English-language television network."
+document = "Telemundo"
+document_2 = "Hispanic_and_Latino_Americans"
+
+print(word2vec("cat", "man"))
+print(word2vec("cat", "dog"))
+#print(model.similarity("cat", "man"))
+#print(model.similarity("cat", "dog"))
+
+tokens_sentence = gensim.utils.simple_preprocess(sentence)
+sentence = ' '.join(map(str, tokens_sentence))
+print(sentence)
+
+best = [0, 0, 0, 0, 0]
+docs = ["", "", "", "", ""]
+for file in files:
+    _name = file[:-5]
+    title = _name.replace("_", " ")
+    tokens = gensim.utils.simple_preprocess(title)
+    text = ' '.join(map(str, tokens))
+    sim = word2vec(sentence, text)
+    if sim > best[0]:
+        best[0] = sim
+        docs[0] = _name
+
+    elif sim > best[1]:
+        best[1] = sim
+        docs[1] = _name
+
+    elif sim > best[2]:
+        best[2] = sim
+        docs[2] = _name
+
+    elif sim > best[3]:
+        best[3] = sim
+        docs[3] = _name
+
+    elif sim > best[4]:
+        best[4] = sim
+        docs[4] = _name
+
+print(best)
+print(docs)
+
+tokens_sentence = gensim.utils.simple_preprocess(sentence)
+
+for token_sentence in tokens_sentence:
+    best = [0, 0, 0, 0, 0]
+    docs = ["", "", "", "", ""]
+    for file in files:
+        _name = file[:-5]
+        title = _name.replace("_", " ")
+        tokens = gensim.utils.simple_preprocess(title)
+        text = ' '.join(map(str, tokens))
+        sim = word2vec(token_sentence, text)
+        if sim > best[0]:
+            best[0] = sim
+            docs[0] = _name
+
+        elif sim > best[1]:
+            best[1] = sim
+            docs[1] = _name
+
+        elif sim > best[2]:
+            best[2] = sim
+            docs[2] = _name
+
+        elif sim > best[3]:
+            best[3] = sim
+            docs[3] = _name
+
+        elif sim > best[4]:
+            best[4] = sim
+            docs[4] = _name
+    print(best)
+    print(docs)
+
+print(word2vec("telemundo", sentence))
+
+model.save('models/word2vec')
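
The doc2vec script above infers one vector per claim token and then saves the model as doc2vec.model. A minimal sketch of how the saved model could be reloaded and queried for the whole claim at once, assuming the same gensim 3.x API used in the diff (Doc2Vec.load, infer_vector, docvecs.most_similar) and that "doc2vec.model" sits next to the script:

# Sketch: reload the saved Doc2Vec model and rank wiki pages for one claim.
# Assumes gensim 3.x (docvecs API) and the "doc2vec.model" file written above;
# document tags are the wiki .json file names used during training.
import gensim
from gensim.models.doc2vec import Doc2Vec
from gensim.parsing.preprocessing import remove_stopwords

model = Doc2Vec.load("doc2vec.model")

claim = "Telemundo is a English-language television network."
tokens = gensim.utils.simple_preprocess(remove_stopwords(claim))

# Infer one vector for the whole claim instead of one per token.
vector = model.infer_vector(tokens)
for tag, sim in model.docvecs.most_similar([vector], topn=5):
    print(tag, sim)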
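
In word2vec.py, the if/elif cascade only updates the first slot whose score is beaten and never reorders, so best/docs do not necessarily end up holding the five most similar titles. A possible alternative, sketched here with heapq.nlargest and reusing the word2vec() helper and files list defined in that script (the top_k_titles name is hypothetical):

# Sketch: keep the true top-k (similarity, title) pairs with a heap instead of
# the if/elif cascade. word2vec() and files come from word2vec.py above;
# top_k_titles is a hypothetical helper name.
import heapq
import gensim

def top_k_titles(sentence, files, k=5):
    scored = []
    for file in files:
        # same title normalisation as in word2vec.py
        title = ' '.join(gensim.utils.simple_preprocess(file[:-5].replace("_", " ")))
        scored.append((word2vec(sentence, title), file[:-5]))
    # nlargest returns the k pairs with the highest similarity, sorted descending
    return heapq.nlargest(k, scored)

print(top_k_titles("telemundo is a english language television network", files))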