diff --git a/doc2vec.py b/doc2vec.py
index 39a77e8b..9f105e84 100644
--- a/doc2vec.py
+++ b/doc2vec.py
@@ -3,20 +3,51 @@
 from random import shuffle
 import gensim
 import sys
+import spacy
+from gensim.parsing.preprocessing import remove_stopwords
+from gensim.models.doc2vec import Doc2Vec
+
+fname = "doc2vec.model"
 
 import logging
 logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 
+spacy_nlp = spacy.load('en_core_web_sm')
+spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
+
+customize_stop_words = [
+    "-LRB-", "-RRB-", "-LSB-", "-RSB-"
+]
+
+for w in customize_stop_words:
+    spacy_nlp.vocab[w].is_stop = True
+
 if len(sys.argv) - 1 == 1:
-    max_counter = 1000000  # 1 000 000
+    max_counter = int(sys.argv[1])
 else:
     max_counter = 10000  # 10 000
     print("Max Counter not defined!")
     print("Set Default Value: " + str(max_counter))
 
+
+def pre_process(doc):
+    # doc = spacy_nlp(doc)
+
+    # lemma_tokens = [token.lemma_ for token in doc]
+    # doc = ' '.join(map(str, lemma_tokens))
+    # doc = spacy_nlp(doc)
+
+    # tokens = [token.text for token in doc if not token.is_stop]
+
+    # text = ' '.join(map(str, tokens))
+    text = remove_stopwords(doc)
+    return text
+
+# TODO: remove all stop-words and lemmatize every token
+
 # full text and processed in ['text'] tag
-wiki_folder = "../wiki-pages-split"
+wiki_folder = "data/wiki-pages-split"
 files = os.listdir(wiki_folder)
 shuffle(files)
@@ -26,9 +57,24 @@ tokens = []
 for file in files:
     file_content = jsonlines.open(wiki_folder + "/" + file)
-    file_content = file_content.read()
-    text = file_content['text']
+    doc = file_content.read()['text']
+    text = pre_process(doc)
+
     if counter > max_counter:
+        # adding required docs by fever with the claim given
+        file_content = jsonlines.open(wiki_folder + "/" + "Telemundo.json")
+        doc = file_content.read()['text']
+        text = pre_process(doc)
+        tokens = gensim.utils.simple_preprocess(text)
+        print(tokens)
+        train_text.append(gensim.models.doc2vec.TaggedDocument(tokens, ["Telemundo.json"]))
+
+        file_content = jsonlines.open(wiki_folder + "/" + "Hispanic_and_Latino_Americans.json")
+        doc = file_content.read()['text']
+        text = pre_process(doc)
+        tokens = gensim.utils.simple_preprocess(text)
+        train_text.append(gensim.models.doc2vec.TaggedDocument(tokens, ["Hispanic_and_Latino_Americans.json"]))
+
         break
     else:
         tokens = gensim.utils.simple_preprocess(text)
@@ -37,23 +83,55 @@
         if counter % 1000 == 0:
             print(counter)
 
-model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=20, epochs=2)
-model.build_vocab(train_text)
+model = Doc2Vec(vector_size=50, min_count=2, epochs=40)
+#model = Doc2Vec.load(fname)
+model.build_vocab(train_text)#,keep_raw_vocab=True)#, update=True)
 model.train(train_text, total_examples=model.corpus_count, epochs=model.epochs)
 
-sentence = "Obama was president of United States of America similar to a Portuguese person called D. Afonso Henriques"
-test_sentence = gensim.utils.simple_preprocess(sentence)
-inferred_vector = model.infer_vector(test_sentence)
+sentence = "Telemundo is a English-language television network."
+text = pre_process(sentence)
+tokens = gensim.utils.simple_preprocess(text)
+print(tokens)
+for token in tokens:
+    print(token)
+    inferred_vector = model.infer_vector([token])
+    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
+
+    STOP = 3
+    for doc, sim in sims:
+        file_content = jsonlines.open(wiki_folder + "/" + doc)
+        file_content = file_content.read()
+        text = file_content['text']
+        print("\n" + doc + " -- " + str(sim) + ": \n")  # + text)
+        if STOP == 0:
+            break
+        else:
+            STOP -= 1
+
+    for doc, sim in sims:
+        if doc != "Hispanic_and_Latino_Americans.json" and doc != "Telemundo.json":
+            continue
+        print(doc + " -- " + str(sim))
+    print("\n")
+
+model.save(fname)
+
+inferred_vector = model.infer_vector(tokens)
 sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
 
-STOP = 5
+STOP = 3
 for doc, sim in sims:
     file_content = jsonlines.open(wiki_folder + "/" + doc)
     file_content = file_content.read()
     text = file_content['text']
-    print("\n" + doc + " -- " + str(sim) + ": \n" + text)
+    print("\n" + doc + " -- " + str(sim) + ": \n")  # + text)
     if STOP == 0:
         break
     else:
         STOP -= 1
+
+for doc, sim in sims:
+    if doc != "Hispanic_and_Latino_Americans.json" and doc != "Telemundo.json":
+        continue
+    print(doc + " -- " + str(sim))
diff --git a/metrics_old.py b/metrics_old.py
deleted file mode 100644
index 229aca28..00000000
--- a/metrics_old.py
+++ /dev/null
@@ -1,243 +0,0 @@
-import jsonlines
-import sys
-from scorer import fever_score
-
-train_file = "data/subsample_train.jsonl"
-train_relevant_file = "data/subsample_train_relevant_docs.jsonl"
-train_concatenate_file = "data/subsample_train_concatenation.jsonl"
-train_predictions_file = "predictions/predictions_train.jsonl"
-
-train_file = jsonlines.open(train_file)
-train_relevant_file = jsonlines.open(train_relevant_file)
-train_concatenate_file = jsonlines.open(train_concatenate_file)
-train_predictions_file = jsonlines.open(train_predictions_file)
-
-train_set = []
-train_relevant = []
-train_concatenate = []
-train_prediction = []
-
-for lines in train_file:
-    lines['claim'] = lines['claim'].replace("-LRB-", " ( ")
-    lines['claim'] = lines['claim'].replace("-RRB-", " ) ")
-    train_set.append(lines)
-
-for lines in train_relevant_file:
-    lines['claim'] = lines['claim'].replace("-LRB-", " ( ")
-    lines['claim'] = lines['claim'].replace("-RRB-", " ) ")
-    train_relevant.append(lines)
-
-for lines in train_concatenate_file:
-    lines['claim'] = lines['claim'].replace("-LRB-", " ( ")
-    lines['claim'] = lines['claim'].replace("-RRB-", " ) ")
-    train_concatenate.append(lines)
-
-# this evidence addition is irrelevant
-info_by_id = dict((d['id'], dict(d, index=index)) for (index, d) in enumerate(train_set))
-for lines in train_predictions_file:
-    lines['evidence'] = info_by_id.get(lines['id'])['evidence']
-    train_prediction.append(lines)
-
-# All claims
-stop = 0
-
-# List with dicts with all important data
-'''
-id : id of the claim
-verifiable : boolean of 1 and 0 with respective meaning
-docs : set of documents that verify the claim
-docs_sep : set of documents separated
-evidences: list of tuples of
-difficulties: list of the number of sentences needed to be evidence
-'''
-gold_data = []
-
-for claim in train_set:
-
-    # init gold dict
-    gold_dict = {'id': claim['id']}
-
-    if claim['verifiable'] == "VERIFIABLE":
-        gold_dict['verifiable'] = 1
-    else:
-        gold_dict['verifiable'] = 0
-
-    # get gold inputs
-    gold_documents = set()
-    gold_documents_separated = set()
-    sentences_pair = set()
-    evidences = claim['evidence']
-    difficulties = []
-    for evidence in evidences:
-        doc_name = ''
-        difficulty = 0
-        if len(evidence) > 1:  # needs more than 1 doc to be verifiable
-            for e in evidence:
-                doc_name += str(e[2])
-                doc_name += " "
-                sentences_pair.add((str(e[2]), str(e[3])))  # add gold sentences
-                gold_documents_separated.add(str(e[2]))  # add the document
-                difficulty += 1
-            doc_name = doc_name[:-1]  # erase the last blank space
-        else:
-            doc_name = str(evidence[0][2])
-            gold_documents_separated.add(str(evidence[0][2]))
-            sentences_pair.add((str(evidence[0][2]), str(evidence[0][3])))
-            difficulty = 1
-        difficulties.append(difficulty)
-        gold_documents.add(doc_name)
-    gold_dict['difficulties'] = difficulties
-    gold_dict['docs'] = gold_documents
-    gold_dict['evidences'] = sentences_pair
-    gold_dict['docs_sep'] = gold_documents_separated
-
-    gold_data.append(gold_dict)
-
-    # flag to stop if needed
-    stop += 1
-    if stop == -1:
-        break
-
-gold_data = dict((item['id'], item) for item in gold_data)
-
-stop = 0
-
-doc_found = 0
-doc_noise = 0
-gold_doc_found = 0
-gold_doc_not_found = 0
-
-precision_correct = 0
-precision_incorrect = 0
-recall_correct = 0
-recall_incorrect = 0
-specificity = 0
-
-precision_sent_correct = 0
-precision_sent_incorrect = 0
-recall_sent_correct = 0
-recall_sent_incorrect = 0
-sent_found = 0
-sent_found_if_doc_found = 0
-
-total_claim = 0
-for claim in train_relevant:
-    _id = claim['id']
-    gold_dict = gold_data.get(_id)
-
-    # no search is needed... no information on gold dict about retrieval
-    if not gold_dict['verifiable']:
-        continue
-
-    # document analysis
-    # TODO: Analyse NER and TF-IDF
-    doc_correct = 0
-    doc_incorrect = 0
-    gold_incorrect = 0
-    docs = set()
-    gold_docs = gold_dict['docs_sep']
-
-    for doc in claim['predicted_pages']:
-        if doc in gold_docs:
-            doc_correct += 1
-        else:
-            doc_incorrect += 1
-        docs.add(doc)
-
-    precision_correct += doc_correct / len(docs)
-    precision_incorrect += doc_incorrect / len(docs)
-    recall_correct += doc_correct / len(gold_docs)
-    recall_incorrect += doc_incorrect / len(gold_docs)
-
-    for gold_doc in gold_docs:
-        if gold_doc not in docs:
-            gold_incorrect += 1
-
-    specificity += gold_incorrect / len(gold_docs)
-
-    if doc_correct > 0:
-        doc_found += 1
-
-    # sentence analysis TODO: check sentences
-    sentences = set()
-    for sent in claim['predicted_sentences']:
-        sentences.add((str(sent[0]), str(sent[1])))
-
-    evidences = gold_dict['evidences']
-    sent_correct = 0
-    sent_incorrect = 0
-    flag = False
-    for sent in sentences:
-        if sent in evidences:
-            sent_correct += 1
-            flag = True
-        else:
-            sent_incorrect += 1
-
-    if flag:
-        sent_found += 1
-
-    if doc_correct and flag:
-        sent_found_if_doc_found += 1
-
-    precision_sent_correct += sent_correct / len(sentences)
-    precision_sent_incorrect += sent_incorrect / len(sentences)
-    recall_sent_correct += sent_correct / len(evidences)
-    recall_sent_incorrect += sent_incorrect / len(evidences)
-
-    # TODO: create all possible pair in order to see if it appears in gold_dict['docs']
-    # claim['predicted_sentences']
-
-    # flag to stop if needed
-    total_claim += 1
-    stop += 1
-    if stop == -1:
-        break
-
-precision_correct /= total_claim
-precision_incorrect /= total_claim
-recall_correct /= total_claim
-recall_incorrect /= total_claim
-specificity /= total_claim
-doc_found /= total_claim
-
-print("\n#############")
-print("# DOCUMENTS #")
-print("#############")
-print("Precision (Document Retrieved):\t\t\t\t\t\t " + str(precision_correct))  # precision
-print("Fall-out (incorrect documents):\t\t\t\t\t\t " + str(precision_incorrect))  # precision
-print("Recall (Relevant Documents):\t\t\t\t\t\t " + str(recall_correct))  # recall
-print("Percentage of gold documents NOT found:\t\t\t\t " + str(recall_incorrect))  # recall
-print("Fall-out: " + str(specificity))
-print("Percentage of at least one document found correctly: " + str(doc_found))  # recall
-
-precision_sent_correct /= total_claim
-precision_sent_incorrect /= total_claim
-recall_sent_correct /= total_claim
-recall_sent_incorrect /= total_claim
-sent_found /= total_claim
-sent_found_if_doc_found /= total_claim
-another_sent = sent_found_if_doc_found / doc_found
-
-print("\n#############")
-print("# SENTENCES #")
-print("#############")
-print("Precision (Sentences Retrieved):\t\t\t\t\t " + str(precision_sent_correct))  # precision
-print("Precision (incorrect Sentences):\t\t\t\t\t " + str(precision_sent_incorrect))  # precision
-print("Recall (Relevant Sentences):\t\t\t\t\t\t " + str(recall_sent_correct))  # recall
-print("Percentage of gold Sentences NOT found:\t\t\t\t " + str(recall_sent_incorrect))  # recall
-print("Percentage of at least one Sentence found correctly: " + str(sent_found))  # recall
-print("Percentage of at least one Sentence found correctly: " + str(sent_found_if_doc_found))  # recall
-print("Percentage of at least one Sentence found correctly: " + str(another_sent))  # recall
-
-# scores from fever
-results = fever_score(train_prediction, actual=train_set)
-
-print("\n#########")
-print("# FEVER #")
-print("#########")
-print("Strict_score: \t\t" + str(results[0]))
-print("Acc_score: \t\t\t" + str(results[1]))
-print("Precision: \t\t\t" + str(results[2]))
-print("Recall: \t\t\t" + str(results[3]))
-print("F1-Score: \t\t\t" + str(results[4]))
\ No newline at end of file
diff --git a/word2vec.py b/word2vec.py
new file mode 100644
index 00000000..a11b2605
--- /dev/null
+++ b/word2vec.py
@@ -0,0 +1,186 @@
+import os
+import jsonlines
+from random import shuffle
+import gensim
+import sys
+from scipy import spatial
+import spacy
+import numpy as np
+
+fname = "word2vec.model"
+
+import logging
+logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+
+spacy_nlp = spacy.load('en_core_web_sm')
+
+if len(sys.argv) - 1 == 1:
+    max_counter = int(sys.argv[1])
+    if max_counter == -1:
+        max_counter = 9999999
+else:
+    max_counter = 10000  # 10 000
+    print("Max Counter not defined!")
+    print("Set Default Value: " + str(max_counter))
+
+# full text and processed in ['text'] tag
+wiki_folder = "data/wiki-pages-split"
+files = os.listdir(wiki_folder)
+shuffle(files)
+
+counter = 0
+
+train_sentences = []
+tokens = []
+for file in files:
+    _name = file[:-5]
+    title = _name.replace("_", " ")
+    tokens = gensim.utils.simple_preprocess(title)
+    train_sentences.append(tokens)
+
+    file_content = jsonlines.open(wiki_folder + "/" + file)
+    lines = file_content.read()['lines']
+    for line in lines:
+        if len(line['content']) < 2:
+            continue
+        tokens = gensim.utils.simple_preprocess(line['content'])
+        train_sentences.append(tokens)
+    if counter > max_counter:
+        # adding required docs by fever with the claim given
+        file_content = jsonlines.open(wiki_folder + "/" + "Telemundo.json")
+        lines = file_content.read()['lines']
+        for line in lines:
+            print(line['content'])
+            tokens = gensim.utils.simple_preprocess(line['content'])
+            train_sentences.append(tokens)
+
+        file_content = jsonlines.open(wiki_folder + "/" + "Hispanic_and_Latino_Americans.json")
+        lines = file_content.read()['lines']
+        for line in lines:
+            print(line['content'])
+            tokens = gensim.utils.simple_preprocess(line['content'])
+            train_sentences.append(tokens)
+
+        break
+    else:
+        counter += 1
+        if counter % 1000 == 0:
+            print(counter)
+
+model = gensim.models.Word2Vec(iter=1, min_count=5, size=500, workers=4)  # an empty model, no training yet
+model.build_vocab(train_sentences)  # can be a non-repeatable, 1-pass generator
+
+print(model.epochs)
+model.train(train_sentences, total_examples=model.corpus_count, epochs=30)  # can be a non-repeatable, 1-pass generator
+index2word_set = set(model.wv.index2word)
+
+
+def avg_feature_vector(sentence, model, num_features, index2word_set):
+    words = sentence.split()
+    feature_vec = np.zeros((num_features,), dtype='float32')
+    n_words = 0
+    for word in words:
+        if word in index2word_set:
+            n_words += 1
+            feature_vec = np.add(feature_vec, model[word])
+    if n_words > 0:
+        feature_vec = np.divide(feature_vec, n_words)
+    return feature_vec
+
+
+def word2vec(text1, text2):
+    # print(text1)
+    # print(text2)
+    s1_afv = avg_feature_vector(text1, model=model, num_features=500, index2word_set=index2word_set)
+    s2_afv = avg_feature_vector(text2, model=model, num_features=500, index2word_set=index2word_set)
+    if np.sum(s1_afv) == 0 or np.sum(s2_afv) == 0:
+        return 0
+    # text1 = spacy_nlp(text1)
+    # text2 = spacy_nlp(text2)
+    # sim = text1.similarity(text2)
+    else:
+        sim = 1 - spatial.distance.cosine(s1_afv, s2_afv)
+
+    return sim
+
+
+sentence = "Telemundo is a English-language television network."
+document = "Telemundo"
+document_2 = "Hispanic_and_Latino_Americans"
+
+print(word2vec("cat", "man"))
+print(word2vec("cat", "dog"))
+#print(model.similarity("cat", "man"))
+#print(model.similarity("cat", "dog"))
+
+tokens_sentence = gensim.utils.simple_preprocess(sentence)
+sentence = ' '.join(map(str, tokens_sentence))
+print(sentence)
+
+best = [0, 0, 0, 0, 0]
+docs = ["", "", "", "", ""]
+for file in files:
+    _name = file[:-5]
+    title = _name.replace("_", " ")
+    tokens = gensim.utils.simple_preprocess(title)
+    text = ' '.join(map(str, tokens))
+    sim = word2vec(sentence, text)
+    if sim > best[0]:
+        best[0] = sim
+        docs[0] = _name
+
+    elif sim > best[1]:
+        best[1] = sim
+        docs[1] = _name
+
+    elif sim > best[2]:
+        best[2] = sim
+        docs[2] = _name
+
+    elif sim > best[3]:
+        best[3] = sim
+        docs[3] = _name
+
+    elif sim > best[4]:
+        best[4] = sim
+        docs[4] = _name
+
+print(best)
+print(docs)
+
+tokens_sentence = gensim.utils.simple_preprocess(sentence)
+
+for token_sentence in tokens_sentence:
+    best = [0, 0, 0, 0, 0]
+    docs = ["", "", "", "", ""]
+    for file in files:
+        _name = file[:-5]
+        title = _name.replace("_", " ")
+        tokens = gensim.utils.simple_preprocess(title)
+        text = ' '.join(map(str, tokens))
+        sim = word2vec(token_sentence, text)
+        if sim > best[0]:
+            best[0] = sim
+            docs[0] = _name
+
+        elif sim > best[1]:
+            best[1] = sim
+            docs[1] = _name
+
+        elif sim > best[2]:
+            best[2] = sim
+            docs[2] = _name
+
+        elif sim > best[3]:
+            best[3] = sim
+            docs[3] = _name
+
+        elif sim > best[4]:
+            best[4] = sim
+            docs[4] = _name
+    print(best)
+    print(docs)
+
+print(word2vec("telemundo", sentence))
+
+model.save('models/word2vec')
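
The doc2vec script above infers one vector per claim token and then saves the model as doc2vec.model. A minimal sketch of how the saved model could be reloaded and queried for the whole claim at once, assuming the same gensim 3.x API used in the diff (Doc2Vec.load, infer_vector, docvecs.most_similar) and that "doc2vec.model" sits next to the script:

# Sketch: reload the saved Doc2Vec model and rank wiki pages for one claim.
# Assumes gensim 3.x (docvecs API) and the "doc2vec.model" file written above;
# document tags are the wiki .json file names used during training.
import gensim
from gensim.models.doc2vec import Doc2Vec
from gensim.parsing.preprocessing import remove_stopwords

model = Doc2Vec.load("doc2vec.model")

claim = "Telemundo is a English-language television network."
tokens = gensim.utils.simple_preprocess(remove_stopwords(claim))

# Infer one vector for the whole claim instead of one per token.
vector = model.infer_vector(tokens)
for tag, sim in model.docvecs.most_similar([vector], topn=5):
    print(tag, sim)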
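
In word2vec.py, the if/elif cascade only updates the first slot whose score is beaten and never reorders, so best/docs do not necessarily end up holding the five most similar titles. A possible alternative, sketched here with heapq.nlargest and reusing the word2vec() helper and files list defined in that script (the top_k_titles name is hypothetical):

# Sketch: keep the true top-k (similarity, title) pairs with a heap instead of
# the if/elif cascade. word2vec() and files come from word2vec.py above;
# top_k_titles is a hypothetical helper name.
import heapq
import gensim

def top_k_titles(sentence, files, k=5):
    scored = []
    for file in files:
        # same title normalisation as in word2vec.py
        title = ' '.join(gensim.utils.simple_preprocess(file[:-5].replace("_", " ")))
        scored.append((word2vec(sentence, title), file[:-5]))
    # nlargest returns the k pairs with the highest similarity, sorted descending
    return heapq.nlargest(k, scored)

print(top_k_titles("telemundo is a english language television network", files))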