diff --git a/doc_retrieval.py b/doc_retrieval.py
index 185d1277..824db7fb 100644
--- a/doc_retrieval.py
+++ b/doc_retrieval.py
@@ -7,6 +7,10 @@
 import unicodedata as ud
 import clausiepy.clausiepy as clausie
 from gensim.parsing.preprocessing import remove_stopwords
+import operator
+import datetime
+import multiprocessing
+from Levenshtein import distance
 
 
 def clean_entities(entities):
@@ -82,8 +86,9 @@ def getClosestDocs(wiki_entities, entities):
     entities = list(entities)
     for i in range(len(entities)):
         entities[i] = str(entities[i])
-    selected_docs = []
+    selected_docs = set()
     for ent in entities:
+        print(ent)
         ent = ud.normalize('NFC', ent)
 
         best_1 = 1.1
@@ -95,19 +100,19 @@ def getClosestDocs(wiki_entities, entities):
         best_3 = 1.1
         best_match_3 = ""
 
+        dists = []
+        a = datetime.datetime.now()
         for we in wiki_entities:
-            dist = stringdist.levenshtein_norm(we, ent)
-            if dist < best_1:
-                best_1 = dist
-                best_match_1 = we
+            dists.append((distance(we, ent), we))
+        b = datetime.datetime.now()
+        print(b-a)
 
-            elif dist < best_2:
-                best_2 = dist
-                best_match_2 = we
+        pair_1 = min(dists, key=operator.itemgetter(0))
+        dists.remove(pair_1)
+        pair_2 = min(dists, key=operator.itemgetter(0))
 
-            elif dist < best_3:
-                best_3 = dist
-                best_match_3 = we
+        best_match_1 = pair_1[1]
+        best_match_2 = pair_2[1]
 
         best_match_1 = best_match_1.replace(" ", "_")
         best_match_1 = best_match_1.replace("/", "-SLH-")
@@ -124,10 +129,11 @@ def getClosestDocs(wiki_entities, entities):
         best_match_3 = best_match_3.replace("(", "-LRB-")
         best_match_3 = best_match_3.replace(")", "-RRB-")
 
-        selected_docs.append(best_match_1)
-        selected_docs.append(best_match_2)
+        selected_docs.add(best_match_1)
+        selected_docs.add(best_match_2)
         # selected_docs.append(best_match_3)
-    return selected_docs, entities
+    print(selected_docs)
+    return list(selected_docs), entities
 
 
 def getRelevantDocs(claim, wiki_entities, ner_module="spaCy", nlp=None):  # ,matcher=None,nlp=None
diff --git a/generate_rte_preds.py b/generate_rte_preds.py
index cb680718..dc1ae465 100644
--- a/generate_rte_preds.py
+++ b/generate_rte_preds.py
@@ -3,28 +3,28 @@
 import doc_retrieval
 import sentence_retrieval
 import rte.rte as rte
-import utilities
 import spacy
 import os
 import codecs
 import unicodedata as ud
-import gensim
 from openie import StanfordOpenIE
-from allennlp.models.archival import load_archive
-from allennlp.predictors import Predictor
 
 relevant_sentences_file = "data/dev_concatenation.jsonl"
-concatenate_file = "data/dev_concatenation_oie.jsonl"
+concatenate_file = "data/dev_concatenation_oie_2.jsonl"
 instances = []
 zero_results = 0
 INCLUDE_NER = False
-INLCUDE_OIE = True
+INCLUDE_OIE = True
+RUN_RTE = False
 
 relevant_sentences_file = jsonlines.open(relevant_sentences_file)
 
-model = "rte/fever_output/model.tar.gz"
-model = load_archive(model)
-predictor = Predictor.from_archive(model)
+if RUN_RTE:
+    from allennlp.models.archival import load_archive
+    from allennlp.predictors import Predictor
+    model = "rte/fever_output/model.tar.gz"
+    model = load_archive(model)
+    predictor = Predictor.from_archive(model)
 
 wiki_dir = "data/wiki-pages/wiki-pages"
 wiki_split_docs_dir = "data/wiki-pages-split"
@@ -65,7 +65,7 @@ def run_rte(claim, evidence, claim_num):
 
 with StanfordOpenIE() as client:
     with jsonlines.open(concatenate_file, mode='w') as writer_c:
-        for i in range(len(instances)):
+        for i in range(0, len(instances)):
             claim = instances[i]['claim']
             print(claim)
             evidence = instances[i]['predicted_sentences']
@@ -116,25 +116,26 @@ def run_rte(claim, evidence, claim_num):
                 instances[i]['predicted_pages_ner'] = relevant_docs
                 instances[i]['predicted_sentences_ner'] = predicted_evidence
 
-            preds = run_rte(claim, potential_evidence_sentences, claim_num)
-
-            saveFile = codecs.open("rte/entailment_predictions/claim_" + str(claim_num) + ".json", mode="w+",
-                                   encoding="utf-8")
-            for j in range(len(preds)):
-                # print(preds)
-                # print(evidence)
-                preds[j]['claim'] = claim
-                preds[j]['premise_source_doc_id'] = evidence[j][0]
-                preds[j]['premise_source_doc_line_num'] = evidence[j][1]
-                preds[j]['premise_source_doc_sentence'] = potential_evidence_sentences[j]
-                saveFile.write(json.dumps(preds[j], ensure_ascii=False) + "\n")
-
-            saveFile.close()
+            if RUN_RTE:
+                preds = run_rte(claim, potential_evidence_sentences, claim_num)
+
+                saveFile = codecs.open("rte/entailment_predictions/claim_" + str(claim_num) + ".json", mode="w+",
+                                       encoding="utf-8")
+                for j in range(len(preds)):
+                    # print(preds)
+                    # print(evidence)
+                    preds[j]['claim'] = claim
+                    preds[j]['premise_source_doc_id'] = evidence[j][0]
+                    preds[j]['premise_source_doc_line_num'] = evidence[j][1]
+                    preds[j]['premise_source_doc_sentence'] = potential_evidence_sentences[j]
+                    saveFile.write(json.dumps(preds[j], ensure_ascii=False) + "\n")
+
+                saveFile.close()
 
             claim_num += 1
             # print(claim_num)
             # print(instances[i])
-            if INLCUDE_OIE:
+            if INCLUDE_OIE:
                 relevant_docs, entities = doc_retrieval.get_docs_with_oie(claim, wiki_entities, client)
                 print(entities)
                 instances[i]['predicted_pages_oie'] = relevant_docs
diff --git a/generate_rte_preds_1.py b/generate_rte_preds_1.py
index 6e16f1ca..fbd97bab 100644
--- a/generate_rte_preds_1.py
+++ b/generate_rte_preds_1.py
@@ -3,28 +3,28 @@
 import doc_retrieval
 import sentence_retrieval
 import rte.rte as rte
-import utilities
 import spacy
 import os
 import codecs
 import unicodedata as ud
-import gensim
 from openie import StanfordOpenIE
-from allennlp.models.archival import load_archive
-from allennlp.predictors import Predictor
 
 relevant_sentences_file = "data/dev_concatenation.jsonl"
 concatenate_file = "data/dev_concatenation_oie_1.jsonl"
 instances = []
 zero_results = 0
 INCLUDE_NER = False
-INLCUDE_OIE = True
+INCLUDE_OIE = True
+RUN_RTE = False
 
 relevant_sentences_file = jsonlines.open(relevant_sentences_file)
 
-model = "rte/fever_output/model.tar.gz"
-model = load_archive(model)
-predictor = Predictor.from_archive(model)
+if RUN_RTE:
+    from allennlp.models.archival import load_archive
+    from allennlp.predictors import Predictor
+    model = "rte/fever_output/model.tar.gz"
+    model = load_archive(model)
+    predictor = Predictor.from_archive(model)
 
 wiki_dir = "data/wiki-pages/wiki-pages"
 wiki_split_docs_dir = "data/wiki-pages-split"
@@ -116,25 +116,26 @@ def run_rte(claim, evidence, claim_num):
                 instances[i]['predicted_pages_ner'] = relevant_docs
                 instances[i]['predicted_sentences_ner'] = predicted_evidence
 
-            preds = run_rte(claim, potential_evidence_sentences, claim_num)
-
-            saveFile = codecs.open("rte/entailment_predictions/claim_" + str(claim_num) + ".json", mode="w+",
-                                   encoding="utf-8")
-            for j in range(len(preds)):
-                # print(preds)
-                # print(evidence)
-                preds[j]['claim'] = claim
-                preds[j]['premise_source_doc_id'] = evidence[j][0]
-                preds[j]['premise_source_doc_line_num'] = evidence[j][1]
-                preds[j]['premise_source_doc_sentence'] = potential_evidence_sentences[j]
-                saveFile.write(json.dumps(preds[j], ensure_ascii=False) + "\n")
-
-            saveFile.close()
+            if RUN_RTE:
+                preds = run_rte(claim, potential_evidence_sentences, claim_num)
+
+                saveFile = codecs.open("rte/entailment_predictions/claim_" + str(claim_num) + ".json", mode="w+",
+                                       encoding="utf-8")
+                for j in range(len(preds)):
+                    # print(preds)
+                    # print(evidence)
+                    preds[j]['claim'] = claim
+                    preds[j]['premise_source_doc_id'] = evidence[j][0]
+                    preds[j]['premise_source_doc_line_num'] = evidence[j][1]
+                    preds[j]['premise_source_doc_sentence'] = potential_evidence_sentences[j]
+                    saveFile.write(json.dumps(preds[j], ensure_ascii=False) + "\n")
+
+                saveFile.close()
 
             claim_num += 1
             # print(claim_num)
             # print(instances[i])
-            if INLCUDE_OIE:
+            if INCLUDE_OIE:
                 relevant_docs, entities = doc_retrieval.get_docs_with_oie(claim, wiki_entities, client)
                 print(entities)
                 instances[i]['predicted_pages_oie'] = relevant_docs
diff --git a/generate_rte_preds_2.py b/generate_rte_preds_2.py
index f25075aa..2f0bda5c 100644
--- a/generate_rte_preds_2.py
+++ b/generate_rte_preds_2.py
@@ -3,28 +3,28 @@
 import doc_retrieval
 import sentence_retrieval
 import rte.rte as rte
-import utilities
 import spacy
 import os
 import codecs
 import unicodedata as ud
-import gensim
 from openie import StanfordOpenIE
-from allennlp.models.archival import load_archive
-from allennlp.predictors import Predictor
 
 relevant_sentences_file = "data/dev_concatenation.jsonl"
 concatenate_file = "data/dev_concatenation_oie_2.jsonl"
 instances = []
 zero_results = 0
 INCLUDE_NER = False
-INLCUDE_OIE = True
+INCLUDE_OIE = True
+RUN_RTE = False
 
 relevant_sentences_file = jsonlines.open(relevant_sentences_file)
 
-model = "rte/fever_output/model.tar.gz"
-model = load_archive(model)
-predictor = Predictor.from_archive(model)
+if RUN_RTE:
+    from allennlp.models.archival import load_archive
+    from allennlp.predictors import Predictor
+    model = "rte/fever_output/model.tar.gz"
+    model = load_archive(model)
+    predictor = Predictor.from_archive(model)
 
 wiki_dir = "data/wiki-pages/wiki-pages"
 wiki_split_docs_dir = "data/wiki-pages-split"
@@ -116,25 +116,26 @@ def run_rte(claim, evidence, claim_num):
                 instances[i]['predicted_pages_ner'] = relevant_docs
                 instances[i]['predicted_sentences_ner'] = predicted_evidence
 
-            preds = run_rte(claim, potential_evidence_sentences, claim_num)
-
-            saveFile = codecs.open("rte/entailment_predictions/claim_" + str(claim_num) + ".json", mode="w+",
-                                   encoding="utf-8")
-            for j in range(len(preds)):
-                # print(preds)
-                # print(evidence)
-                preds[j]['claim'] = claim
-                preds[j]['premise_source_doc_id'] = evidence[j][0]
-                preds[j]['premise_source_doc_line_num'] = evidence[j][1]
-                preds[j]['premise_source_doc_sentence'] = potential_evidence_sentences[j]
-                saveFile.write(json.dumps(preds[j], ensure_ascii=False) + "\n")
-
-            saveFile.close()
+            if RUN_RTE:
+                preds = run_rte(claim, potential_evidence_sentences, claim_num)
+
+                saveFile = codecs.open("rte/entailment_predictions/claim_" + str(claim_num) + ".json", mode="w+",
+                                       encoding="utf-8")
+                for j in range(len(preds)):
+                    # print(preds)
+                    # print(evidence)
+                    preds[j]['claim'] = claim
+                    preds[j]['premise_source_doc_id'] = evidence[j][0]
+                    preds[j]['premise_source_doc_line_num'] = evidence[j][1]
+                    preds[j]['premise_source_doc_sentence'] = potential_evidence_sentences[j]
+                    saveFile.write(json.dumps(preds[j], ensure_ascii=False) + "\n")
+
+                saveFile.close()
 
             claim_num += 1
             # print(claim_num)
             # print(instances[i])
-            if INLCUDE_OIE:
+            if INCLUDE_OIE:
                 relevant_docs, entities = doc_retrieval.get_docs_with_oie(claim, wiki_entities, client)
                 print(entities)
                 instances[i]['predicted_pages_oie'] = relevant_docs
diff --git a/generate_rte_preds_3.py b/generate_rte_preds_3.py
index b1b66721..d5912943 100644
--- a/generate_rte_preds_3.py
+++ b/generate_rte_preds_3.py
@@ -3,28 +3,28 @@
 import doc_retrieval
 import sentence_retrieval
 import rte.rte as rte
-import utilities
 import spacy
 import os
 import codecs
 import unicodedata as ud
-import gensim
 from openie import StanfordOpenIE
-from allennlp.models.archival import load_archive
-from allennlp.predictors import Predictor
 
 relevant_sentences_file = "data/dev_concatenation.jsonl"
 concatenate_file = "data/dev_concatenation_oie_3.jsonl"
 instances = []
 zero_results = 0
 INCLUDE_NER = False
-INLCUDE_OIE = True
+INCLUDE_OIE = True
+RUN_RTE = False
 
 relevant_sentences_file = jsonlines.open(relevant_sentences_file)
 
-model = "rte/fever_output/model.tar.gz"
-model = load_archive(model)
-predictor = Predictor.from_archive(model)
+if RUN_RTE:
+    from allennlp.models.archival import load_archive
+    from allennlp.predictors import Predictor
+    model = "rte/fever_output/model.tar.gz"
+    model = load_archive(model)
+    predictor = Predictor.from_archive(model)
 
 wiki_dir = "data/wiki-pages/wiki-pages"
 wiki_split_docs_dir = "data/wiki-pages-split"
@@ -116,25 +116,26 @@ def run_rte(claim, evidence, claim_num):
                 instances[i]['predicted_pages_ner'] = relevant_docs
                 instances[i]['predicted_sentences_ner'] = predicted_evidence
 
-            preds = run_rte(claim, potential_evidence_sentences, claim_num)
-
-            saveFile = codecs.open("rte/entailment_predictions/claim_" + str(claim_num) + ".json", mode="w+",
-                                   encoding="utf-8")
-            for j in range(len(preds)):
-                # print(preds)
-                # print(evidence)
-                preds[j]['claim'] = claim
-                preds[j]['premise_source_doc_id'] = evidence[j][0]
-                preds[j]['premise_source_doc_line_num'] = evidence[j][1]
-                preds[j]['premise_source_doc_sentence'] = potential_evidence_sentences[j]
-                saveFile.write(json.dumps(preds[j], ensure_ascii=False) + "\n")
-
-            saveFile.close()
+            if RUN_RTE:
+                preds = run_rte(claim, potential_evidence_sentences, claim_num)
+
+                saveFile = codecs.open("rte/entailment_predictions/claim_" + str(claim_num) + ".json", mode="w+",
+                                       encoding="utf-8")
+                for j in range(len(preds)):
+                    # print(preds)
+                    # print(evidence)
+                    preds[j]['claim'] = claim
+                    preds[j]['premise_source_doc_id'] = evidence[j][0]
+                    preds[j]['premise_source_doc_line_num'] = evidence[j][1]
+                    preds[j]['premise_source_doc_sentence'] = potential_evidence_sentences[j]
+                    saveFile.write(json.dumps(preds[j], ensure_ascii=False) + "\n")
+
+                saveFile.close()
 
             claim_num += 1
             # print(claim_num)
             # print(instances[i])
-            if INLCUDE_OIE:
+            if INCLUDE_OIE:
                 relevant_docs, entities = doc_retrieval.get_docs_with_oie(claim, wiki_entities, client)
                 print(entities)
                 instances[i]['predicted_pages_oie'] = relevant_docs
diff --git a/generate_rte_preds_4.py b/generate_rte_preds_4.py
index 1aa61dff..5fead30b 100644
--- a/generate_rte_preds_4.py
+++ b/generate_rte_preds_4.py
@@ -3,28 +3,28 @@
 import doc_retrieval
 import sentence_retrieval
 import rte.rte as rte
-import utilities
 import spacy
 import os
 import codecs
 import unicodedata as ud
-import gensim
 from openie import StanfordOpenIE
-from allennlp.models.archival import load_archive
-from allennlp.predictors import Predictor
 
 relevant_sentences_file = "data/dev_concatenation.jsonl"
 concatenate_file = "data/dev_concatenation_oie_4.jsonl"
 instances = []
 zero_results = 0
 INCLUDE_NER = False
-INLCUDE_OIE = True
+INCLUDE_OIE = True
+RUN_RTE = False
 
 relevant_sentences_file = jsonlines.open(relevant_sentences_file)
 
-model = "rte/fever_output/model.tar.gz"
-model = load_archive(model)
-predictor = Predictor.from_archive(model)
+if RUN_RTE:
+    from allennlp.models.archival import load_archive
+    from allennlp.predictors import Predictor
+    model = "rte/fever_output/model.tar.gz"
+    model = load_archive(model)
+    predictor = Predictor.from_archive(model)
 
 wiki_dir = "data/wiki-pages/wiki-pages"
 wiki_split_docs_dir = "data/wiki-pages-split"
@@ -116,25 +116,26 @@ def run_rte(claim, evidence, claim_num):
                 instances[i]['predicted_pages_ner'] = relevant_docs
                 instances[i]['predicted_sentences_ner'] = predicted_evidence
 
-            preds = run_rte(claim, potential_evidence_sentences, claim_num)
-
-            saveFile = codecs.open("rte/entailment_predictions/claim_" + str(claim_num) + ".json", mode="w+",
-                                   encoding="utf-8")
-            for j in range(len(preds)):
-                # print(preds)
-                # print(evidence)
-                preds[j]['claim'] = claim
-                preds[j]['premise_source_doc_id'] = evidence[j][0]
-                preds[j]['premise_source_doc_line_num'] = evidence[j][1]
-                preds[j]['premise_source_doc_sentence'] = potential_evidence_sentences[j]
-                saveFile.write(json.dumps(preds[j], ensure_ascii=False) + "\n")
-
-            saveFile.close()
+            if RUN_RTE:
+                preds = run_rte(claim, potential_evidence_sentences, claim_num)
+
+                saveFile = codecs.open("rte/entailment_predictions/claim_" + str(claim_num) + ".json", mode="w+",
+                                       encoding="utf-8")
+                for j in range(len(preds)):
+                    # print(preds)
+                    # print(evidence)
+                    preds[j]['claim'] = claim
+                    preds[j]['premise_source_doc_id'] = evidence[j][0]
+                    preds[j]['premise_source_doc_line_num'] = evidence[j][1]
+                    preds[j]['premise_source_doc_sentence'] = potential_evidence_sentences[j]
+                    saveFile.write(json.dumps(preds[j], ensure_ascii=False) + "\n")
+
+                saveFile.close()
 
             claim_num += 1
             # print(claim_num)
             # print(instances[i])
-            if INLCUDE_OIE:
+            if INCLUDE_OIE:
                 relevant_docs, entities = doc_retrieval.get_docs_with_oie(claim, wiki_entities, client)
                 print(entities)
                 instances[i]['predicted_pages_oie'] = relevant_docs
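
Side note on the getClosestDocs change above: the rewrite computes a (distance, entity) pair for every wiki title with Levenshtein.distance and then picks the two smallest via min() followed by list.remove(). A minimal sketch of the same top-2 selection using heapq.nsmallest is shown below; the function name closest_two and the mention argument are illustrative and not part of the patch, and python-Levenshtein is assumed to be installed (as the new "from Levenshtein import distance" import implies).

import heapq

from Levenshtein import distance  # python-Levenshtein, same import as the patch


def closest_two(wiki_entities, mention):
    # Same top-2 nearest-title selection as the rewritten getClosestDocs,
    # but in a single pass and without mutating an intermediate list.
    pairs = ((distance(we, mention), we) for we in wiki_entities)
    return [we for _, we in heapq.nsmallest(2, pairs)]

One behavioural difference worth keeping in mind: Levenshtein.distance returns a raw edit count, whereas the removed stringdist.levenshtein_norm score is normalised to [0, 1] (hence the best_1 = 1.1 initialisation), so longer titles now incur proportionally larger distances; dividing by max(len(we), len(ent)) would restore the normalised behaviour if that matters.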