Optimized Document Search
pedrojlazevedo committed Mar 29, 2020
1 parent 78eac09 commit 5caae2b
Showing 6 changed files with 142 additions and 131 deletions.
34 changes: 20 additions & 14 deletions doc_retrieval.py
@@ -7,6 +7,10 @@
 import unicodedata as ud
 import clausiepy.clausiepy as clausie
 from gensim.parsing.preprocessing import remove_stopwords
+import operator
+import datetime
+import multiprocessing
+from Levenshtein import distance


 def clean_entities(entities):
@@ -82,8 +86,9 @@ def getClosestDocs(wiki_entities, entities):
     entities = list(entities)
     for i in range(len(entities)):
         entities[i] = str(entities[i])
-    selected_docs = []
+    selected_docs = set()
     for ent in entities:
+        print(ent)
         ent = ud.normalize('NFC', ent)

         best_1 = 1.1
@@ -95,19 +100,19 @@ def getClosestDocs(wiki_entities, entities):
         best_3 = 1.1
         best_match_3 = ""

+        dists = []
+        a = datetime.datetime.now()
         for we in wiki_entities:
-            dist = stringdist.levenshtein_norm(we, ent)
-            if dist < best_1:
-                best_1 = dist
-                best_match_1 = we
+            dists.append((distance(we, ent), we))
+        b = datetime.datetime.now()
+        print(b-a)

-            elif dist < best_2:
-                best_2 = dist
-                best_match_2 = we
+        pair_1 = min(dists, key=operator.itemgetter(0))
+        dists.remove(pair_1)
+        pair_2 = min(dists, key=operator.itemgetter(0))

-            elif dist < best_3:
-                best_3 = dist
-                best_match_3 = we
+        best_match_1 = pair_1[1]
+        best_match_2 = pair_2[1]

         best_match_1 = best_match_1.replace(" ", "_")
         best_match_1 = best_match_1.replace("/", "-SLH-")
@@ -124,10 +129,11 @@ def getClosestDocs(wiki_entities, entities):
         best_match_3 = best_match_3.replace("(", "-LRB-")
         best_match_3 = best_match_3.replace(")", "-RRB-")

-        selected_docs.append(best_match_1)
-        selected_docs.append(best_match_2)
+        selected_docs.add(best_match_1)
+        selected_docs.add(best_match_2)
         # selected_docs.append(best_match_3)
-    return selected_docs, entities
+    print(selected_docs)
+    return list(selected_docs), entities


 def getRelevantDocs(claim, wiki_entities, ner_module="spaCy", nlp=None): # ,matcher=None,nlp=None
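The hunks above replace the running best-of-three scan over stringdist.levenshtein_norm with a single pass that collects (distance, title) pairs using the C-backed python-Levenshtein package and then picks the two smallest, adds datetime timing prints around the loop, and builds selected_docs as a set so the returned list is deduplicated. Note that Levenshtein.distance is the raw edit distance, while stringdist.levenshtein_norm is length-normalized, so the ranking can shift when candidate titles differ greatly in length. A minimal standalone sketch of the new lookup, assuming python-Levenshtein is installed; the helper name and sample titles are illustrative, not part of the repository:

import operator
from Levenshtein import distance


def closest_two_titles(wiki_entities, entity):
    """Return the two candidate titles with the smallest raw edit distance to `entity`."""
    # One C-backed distance call per candidate, mirroring the loop in getClosestDocs.
    dists = [(distance(title, entity), title) for title in wiki_entities]
    pair_1 = min(dists, key=operator.itemgetter(0))  # best match
    dists.remove(pair_1)
    pair_2 = min(dists, key=operator.itemgetter(0))  # second-best match
    return pair_1[1], pair_2[1]


print(closest_two_titles(["Barack Obama", "Michelle Obama", "Obama (film)"], "Obama"))
# ('Barack Obama', 'Obama (film)')

heapq.nsmallest(2, dists, key=operator.itemgetter(0)) would yield the same two minima in one call and avoid the intermediate remove(); the diff keeps the explicit min/remove/min form.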
51 changes: 26 additions & 25 deletions generate_rte_preds.py
@@ -3,28 +3,28 @@
import doc_retrieval
import sentence_retrieval
import rte.rte as rte
import utilities
import spacy
import os
import codecs
import unicodedata as ud
import gensim
from openie import StanfordOpenIE

from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor

relevant_sentences_file = "data/dev_concatenation.jsonl"
concatenate_file = "data/dev_concatenation_oie.jsonl"
concatenate_file = "data/dev_concatenation_oie_2.jsonl"
instances = []
zero_results = 0
INCLUDE_NER = False
INLCUDE_OIE = True
INCLUDE_OIE = True
RUN_RTE = False

relevant_sentences_file = jsonlines.open(relevant_sentences_file)
model = "rte/fever_output/model.tar.gz"
model = load_archive(model)
predictor = Predictor.from_archive(model)
if RUN_RTE:
from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor
model = "rte/fever_output/model.tar.gz"
model = load_archive(model)
predictor = Predictor.from_archive(model)

wiki_dir = "data/wiki-pages/wiki-pages"
wiki_split_docs_dir = "data/wiki-pages-split"
@@ -65,7 +65,7 @@ def run_rte(claim, evidence, claim_num):

with StanfordOpenIE() as client:
with jsonlines.open(concatenate_file, mode='w') as writer_c:
for i in range(len(instances)):
for i in range(0, len(instances)):
claim = instances[i]['claim']
print(claim)
evidence = instances[i]['predicted_sentences']
@@ -116,25 +116,26 @@ def run_rte(claim, evidence, claim_num):
instances[i]['predicted_pages_ner'] = relevant_docs
instances[i]['predicted_sentences_ner'] = predicted_evidence

preds = run_rte(claim, potential_evidence_sentences, claim_num)

saveFile = codecs.open("rte/entailment_predictions/claim_" + str(claim_num) + ".json", mode="w+",
encoding="utf-8")
for j in range(len(preds)):
# print(preds)
# print(evidence)
preds[j]['claim'] = claim
preds[j]['premise_source_doc_id'] = evidence[j][0]
preds[j]['premise_source_doc_line_num'] = evidence[j][1]
preds[j]['premise_source_doc_sentence'] = potential_evidence_sentences[j]
saveFile.write(json.dumps(preds[j], ensure_ascii=False) + "\n")

saveFile.close()
if RUN_RTE:
preds = run_rte(claim, potential_evidence_sentences, claim_num)

saveFile = codecs.open("rte/entailment_predictions/claim_" + str(claim_num) + ".json", mode="w+",
encoding="utf-8")
for j in range(len(preds)):
# print(preds)
# print(evidence)
preds[j]['claim'] = claim
preds[j]['premise_source_doc_id'] = evidence[j][0]
preds[j]['premise_source_doc_line_num'] = evidence[j][1]
preds[j]['premise_source_doc_sentence'] = potential_evidence_sentences[j]
saveFile.write(json.dumps(preds[j], ensure_ascii=False) + "\n")

saveFile.close()
claim_num += 1
# print(claim_num)
# print(instances[i])

if INLCUDE_OIE:
if INCLUDE_OIE:
relevant_docs, entities = doc_retrieval.get_docs_with_oie(claim, wiki_entities, client)
print(entities)
instances[i]['predicted_pages_oie'] = relevant_docs
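In generate_rte_preds.py (and its three copies below), the entailment step is now gated behind the RUN_RTE flag: the AllenNLP imports and the model.tar.gz archive are only loaded when RUN_RTE is True, and the per-claim prediction files are only written inside the same guard, so retrieval-only runs no longer pay for loading the RTE model; the INLCUDE_OIE flag is also renamed to INCLUDE_OIE. A minimal sketch of that lazy-loading pattern, using the archive path from the diff; the claim loop is illustrative:

RUN_RTE = False

predictor = None
if RUN_RTE:
    # Import AllenNLP and load the FEVER RTE archive only when entailment is enabled.
    from allennlp.models.archival import load_archive
    from allennlp.predictors import Predictor

    archive = load_archive("rte/fever_output/model.tar.gz")
    predictor = Predictor.from_archive(archive)

for claim in ["Illustrative claim."]:
    print(claim)  # document and sentence retrieval would run here for each claim
    if RUN_RTE and predictor is not None:
        pass  # entailment prediction and the per-claim JSON dump only run when enabled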
47 changes: 24 additions & 23 deletions generate_rte_preds_1.py
@@ -3,28 +3,28 @@
import doc_retrieval
import sentence_retrieval
import rte.rte as rte
import utilities
import spacy
import os
import codecs
import unicodedata as ud
import gensim
from openie import StanfordOpenIE

from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor

relevant_sentences_file = "data/dev_concatenation.jsonl"
concatenate_file = "data/dev_concatenation_oie_1.jsonl"
instances = []
zero_results = 0
INCLUDE_NER = False
INLCUDE_OIE = True
INCLUDE_OIE = True
RUN_RTE = False

relevant_sentences_file = jsonlines.open(relevant_sentences_file)
model = "rte/fever_output/model.tar.gz"
model = load_archive(model)
predictor = Predictor.from_archive(model)
if RUN_RTE:
from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor
model = "rte/fever_output/model.tar.gz"
model = load_archive(model)
predictor = Predictor.from_archive(model)

wiki_dir = "data/wiki-pages/wiki-pages"
wiki_split_docs_dir = "data/wiki-pages-split"
@@ -116,25 +116,26 @@ def run_rte(claim, evidence, claim_num):
instances[i]['predicted_pages_ner'] = relevant_docs
instances[i]['predicted_sentences_ner'] = predicted_evidence

preds = run_rte(claim, potential_evidence_sentences, claim_num)

saveFile = codecs.open("rte/entailment_predictions/claim_" + str(claim_num) + ".json", mode="w+",
encoding="utf-8")
for j in range(len(preds)):
# print(preds)
# print(evidence)
preds[j]['claim'] = claim
preds[j]['premise_source_doc_id'] = evidence[j][0]
preds[j]['premise_source_doc_line_num'] = evidence[j][1]
preds[j]['premise_source_doc_sentence'] = potential_evidence_sentences[j]
saveFile.write(json.dumps(preds[j], ensure_ascii=False) + "\n")

saveFile.close()
if RUN_RTE:
preds = run_rte(claim, potential_evidence_sentences, claim_num)

saveFile = codecs.open("rte/entailment_predictions/claim_" + str(claim_num) + ".json", mode="w+",
encoding="utf-8")
for j in range(len(preds)):
# print(preds)
# print(evidence)
preds[j]['claim'] = claim
preds[j]['premise_source_doc_id'] = evidence[j][0]
preds[j]['premise_source_doc_line_num'] = evidence[j][1]
preds[j]['premise_source_doc_sentence'] = potential_evidence_sentences[j]
saveFile.write(json.dumps(preds[j], ensure_ascii=False) + "\n")

saveFile.close()
claim_num += 1
# print(claim_num)
# print(instances[i])

if INLCUDE_OIE:
if INCLUDE_OIE:
relevant_docs, entities = doc_retrieval.get_docs_with_oie(claim, wiki_entities, client)
print(entities)
instances[i]['predicted_pages_oie'] = relevant_docs
47 changes: 24 additions & 23 deletions generate_rte_preds_2.py
@@ -3,28 +3,28 @@
import doc_retrieval
import sentence_retrieval
import rte.rte as rte
import utilities
import spacy
import os
import codecs
import unicodedata as ud
import gensim
from openie import StanfordOpenIE

from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor

relevant_sentences_file = "data/dev_concatenation.jsonl"
concatenate_file = "data/dev_concatenation_oie_2.jsonl"
instances = []
zero_results = 0
INCLUDE_NER = False
INLCUDE_OIE = True
INCLUDE_OIE = True
RUN_RTE = False

relevant_sentences_file = jsonlines.open(relevant_sentences_file)
model = "rte/fever_output/model.tar.gz"
model = load_archive(model)
predictor = Predictor.from_archive(model)
if RUN_RTE:
from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor
model = "rte/fever_output/model.tar.gz"
model = load_archive(model)
predictor = Predictor.from_archive(model)

wiki_dir = "data/wiki-pages/wiki-pages"
wiki_split_docs_dir = "data/wiki-pages-split"
@@ -116,25 +116,26 @@ def run_rte(claim, evidence, claim_num):
instances[i]['predicted_pages_ner'] = relevant_docs
instances[i]['predicted_sentences_ner'] = predicted_evidence

preds = run_rte(claim, potential_evidence_sentences, claim_num)

saveFile = codecs.open("rte/entailment_predictions/claim_" + str(claim_num) + ".json", mode="w+",
encoding="utf-8")
for j in range(len(preds)):
# print(preds)
# print(evidence)
preds[j]['claim'] = claim
preds[j]['premise_source_doc_id'] = evidence[j][0]
preds[j]['premise_source_doc_line_num'] = evidence[j][1]
preds[j]['premise_source_doc_sentence'] = potential_evidence_sentences[j]
saveFile.write(json.dumps(preds[j], ensure_ascii=False) + "\n")

saveFile.close()
if RUN_RTE:
preds = run_rte(claim, potential_evidence_sentences, claim_num)

saveFile = codecs.open("rte/entailment_predictions/claim_" + str(claim_num) + ".json", mode="w+",
encoding="utf-8")
for j in range(len(preds)):
# print(preds)
# print(evidence)
preds[j]['claim'] = claim
preds[j]['premise_source_doc_id'] = evidence[j][0]
preds[j]['premise_source_doc_line_num'] = evidence[j][1]
preds[j]['premise_source_doc_sentence'] = potential_evidence_sentences[j]
saveFile.write(json.dumps(preds[j], ensure_ascii=False) + "\n")

saveFile.close()
claim_num += 1
# print(claim_num)
# print(instances[i])

if INLCUDE_OIE:
if INCLUDE_OIE:
relevant_docs, entities = doc_retrieval.get_docs_with_oie(claim, wiki_entities, client)
print(entities)
instances[i]['predicted_pages_oie'] = relevant_docs
47 changes: 24 additions & 23 deletions generate_rte_preds_3.py
@@ -3,28 +3,28 @@
import doc_retrieval
import sentence_retrieval
import rte.rte as rte
import utilities
import spacy
import os
import codecs
import unicodedata as ud
import gensim
from openie import StanfordOpenIE

from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor

relevant_sentences_file = "data/dev_concatenation.jsonl"
concatenate_file = "data/dev_concatenation_oie_3.jsonl"
instances = []
zero_results = 0
INCLUDE_NER = False
INLCUDE_OIE = True
INCLUDE_OIE = True
RUN_RTE = False

relevant_sentences_file = jsonlines.open(relevant_sentences_file)
model = "rte/fever_output/model.tar.gz"
model = load_archive(model)
predictor = Predictor.from_archive(model)
if RUN_RTE:
from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor
model = "rte/fever_output/model.tar.gz"
model = load_archive(model)
predictor = Predictor.from_archive(model)

wiki_dir = "data/wiki-pages/wiki-pages"
wiki_split_docs_dir = "data/wiki-pages-split"
@@ -116,25 +116,26 @@ def run_rte(claim, evidence, claim_num):
instances[i]['predicted_pages_ner'] = relevant_docs
instances[i]['predicted_sentences_ner'] = predicted_evidence

preds = run_rte(claim, potential_evidence_sentences, claim_num)

saveFile = codecs.open("rte/entailment_predictions/claim_" + str(claim_num) + ".json", mode="w+",
encoding="utf-8")
for j in range(len(preds)):
# print(preds)
# print(evidence)
preds[j]['claim'] = claim
preds[j]['premise_source_doc_id'] = evidence[j][0]
preds[j]['premise_source_doc_line_num'] = evidence[j][1]
preds[j]['premise_source_doc_sentence'] = potential_evidence_sentences[j]
saveFile.write(json.dumps(preds[j], ensure_ascii=False) + "\n")

saveFile.close()
if RUN_RTE:
preds = run_rte(claim, potential_evidence_sentences, claim_num)

saveFile = codecs.open("rte/entailment_predictions/claim_" + str(claim_num) + ".json", mode="w+",
encoding="utf-8")
for j in range(len(preds)):
# print(preds)
# print(evidence)
preds[j]['claim'] = claim
preds[j]['premise_source_doc_id'] = evidence[j][0]
preds[j]['premise_source_doc_line_num'] = evidence[j][1]
preds[j]['premise_source_doc_sentence'] = potential_evidence_sentences[j]
saveFile.write(json.dumps(preds[j], ensure_ascii=False) + "\n")

saveFile.close()
claim_num += 1
# print(claim_num)
# print(instances[i])

if INLCUDE_OIE:
if INCLUDE_OIE:
relevant_docs, entities = doc_retrieval.get_docs_with_oie(claim, wiki_entities, client)
print(entities)
instances[i]['predicted_pages_oie'] = relevant_docs