Retrieving Documents based on Triple Extraction
pedrojlazevedo committed Mar 24, 2020
1 parent 3434d4b commit 8ecab20
Showing 5 changed files with 165 additions and 86 deletions.
47 changes: 44 additions & 3 deletions doc_retrieval.py
@@ -1,13 +1,54 @@
import os
import jsonlines
import json
import nltk
import codecs
import utilities
import spacy
import stringdist
from spacy.matcher import PhraseMatcher
import unicodedata as ud
import clausiepy.clausiepy as clausie


def clean_entities(entities):
    entities = list(entities)
    ents_to_remove = set()
    for i in range(len(entities)):
        for j in range(len(entities)):
            if i == j:
                continue
            if entities[i] in entities[j]:
                # entities[i] is contained in entities[j]:
                # keep the smaller entity and drop the larger one...
                ents_to_remove.add(entities[j])
                # ...or, alternatively, keep the bigger one:
                # ents_to_remove.add(entities[i])
    for ent in ents_to_remove:
        entities.remove(ent)

    return entities


def get_docs_with_oie(claim, wiki_entities, client):
    ents = set()

    # triple extraction with Stanford OpenIE
    triples = client.annotate(claim)
    for triple in triples:
        ents.add(triple["subject"])
        ents.add(triple["object"])

    # triple extraction with ClausIE (fallback when OpenIE returns nothing)
    if len(triples) == 0:
        clauses = clausie.clausie(claim)
        for clause in clauses:
            for sub in clause['S']:
                ents.add(sub)
            for obj in clause['O']:
                ents.add(obj)

    if len(ents) > 4:
        ents = clean_entities(ents)
    docs, entities = getClosestDocs(wiki_entities, ents)

    return docs, entities


def getClosestDocs(wiki_entities, entities):
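For context, a minimal usage sketch of the new OpenIE-based entry point. This is not part of the commit: the way wiki_entities is built from the split wiki dump and the example claim are assumptions based on generate_rte_preds.py.

# Hypothetical usage sketch (not in this commit). Assumes the `openie` wrapper
# imported in generate_rte_preds.py and a split wiki dump under data/wiki-pages-split.
import os
from openie import StanfordOpenIE
import doc_retrieval

wiki_split_docs_dir = "data/wiki-pages-split"
wiki_entities = [name[:-5] for name in os.listdir(wiki_split_docs_dir)]  # strip ".json"

with StanfordOpenIE() as client:
    claim = "Barack Obama was born in Hawaii."  # example claim, not from the dataset
    docs, entities = doc_retrieval.get_docs_with_oie(claim, wiki_entities, client)
    print(docs, entities)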
163 changes: 88 additions & 75 deletions generate_rte_preds.py
@@ -8,23 +8,26 @@
import os
import codecs
import unicodedata as ud
import gensim
from openie import StanfordOpenIE

from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor

relevant_sentences_file = "data/dev_relevant_docs.jsonl"
concatenate_file = "data/dev_concatenation.jsonl"
relevant_sentences_file = "data/dev_concatenation.jsonl"
concatenate_file = "data/dev_concatenation_oie.jsonl"
instances = []
zero_results = 0
INCLUDE_NER = False
INCLUDE_OIE = True

relevant_sentences_file = jsonlines.open(relevant_sentences_file)
model = "rte/fever_output/model.tar.gz"
model = load_archive(model)
predictor = Predictor.from_archive(model)

wiki_dir = "data/wiki-pages/wiki-pages"
wiki_split_docs_dir = "../wiki-pages-split"
wiki_split_docs_dir = "data/wiki-pages-split"

claim_num = 1

@@ -35,6 +38,10 @@
    wiki_entities[i] = wiki_entities[i][:-5]
    wiki_entities[i] = wiki_entities[i].replace("-LRB-", "(")
    wiki_entities[i] = wiki_entities[i].replace("-RRB-", ")")
    # tokens_sentence = gensim.utils.simple_preprocess(wiki_entities[i])
    # wiki_entities[i] = ' '.join(map(str, tokens_sentence))

print("Wiki entities successfully parsed")

for line in relevant_sentences_file:
    instances.append(line)
@@ -56,77 +63,83 @@ def run_rte(claim, evidence, claim_num):
    return preds


with jsonlines.open(concatenate_file, mode='w') as writer_c:
    for i in range(len(instances)):
        claim = instances[i]['claim']
        print(claim)
        evidence = instances[i]['predicted_sentences']
        potential_evidence_sentences = []
        # TODO: implement NER to generate file in order to evaluate.
        for sentence in evidence:
            # print(sentence)
            # print(sentence[0])
            # load document from TF-IDF
            relevant_doc = ud.normalize('NFC', sentence[0])
            relevant_doc = relevant_doc.replace("/", "-SLH-")
            file = codecs.open(wiki_split_docs_dir + "/" + relevant_doc + ".json", "r", "utf-8")
            file = json.load(file)
            full_lines = file["lines"]

            lines = []
            for line in full_lines:
                lines.append(line['content'])

            lines[sentence[1]] = lines[sentence[1]].strip()
            lines[sentence[1]] = lines[sentence[1]].replace("-LRB-", " ( ")
            lines[sentence[1]] = lines[sentence[1]].replace("-RRB-", " ) ")

            potential_evidence_sentences.append(lines[sentence[1]])

        # Just adding a check
        # This is needed in case nothing was predicted
        if len(potential_evidence_sentences) == 0:
            zero_results += 1
            potential_evidence_sentences.append("Nothing")
            evidence.append(["Nothing", 0])

        if INCLUDE_NER:
            relevant_docs, entities = doc_retrieval.getRelevantDocs(claim, wiki_entities, "spaCy",
                                                                    nlp)  # "spaCy", nlp)#
            print(relevant_docs)
            # print(entities)
            relevant_sentences = sentence_retrieval.getRelevantSentences(relevant_docs, entities, wiki_split_docs_dir)
            # print(relevant_sentences)

            predicted_evidence = []
            for sent in relevant_sentences:
                predicted_evidence.append((sent['id'], sent['line_num']))
                potential_evidence_sentences.append(sent['sentence'])
                evidence.append((sent['id'], sent['line_num']))

            instances[i]['predicted_pages_ner'] = relevant_docs
            instances[i]['predicted_sentences_ner'] = predicted_evidence

        writer_c.write(instances[i])
        print("Claim number: " + str(i) + " of " + str(len(instances)))

        preds = run_rte(claim, potential_evidence_sentences, claim_num)

        saveFile = codecs.open("rte/entailment_predictions/claim_" + str(claim_num) + ".json", mode="w+",
                               encoding="utf-8")
        for j in range(len(preds)):
            # print(preds)
            # print(evidence)
            preds[j]['claim'] = claim
            preds[j]['premise_source_doc_id'] = evidence[j][0]
            preds[j]['premise_source_doc_line_num'] = evidence[j][1]
            preds[j]['premise_source_doc_sentence'] = potential_evidence_sentences[j]
            saveFile.write(json.dumps(preds[j], ensure_ascii=False) + "\n")

        saveFile.close()
        claim_num += 1
        # print(claim_num)
        # print(instances[i])
with StanfordOpenIE() as client:
    with jsonlines.open(concatenate_file, mode='w') as writer_c:
        for i in range(6, len(instances)):
            claim = instances[i]['claim']
            print(claim)
            evidence = instances[i]['predicted_sentences']
            potential_evidence_sentences = []

            for sentence in evidence:
                # print(sentence)
                # print(sentence[0])
                # load document from TF-IDF
                relevant_doc = ud.normalize('NFC', sentence[0])
                relevant_doc = relevant_doc.replace("/", "-SLH-")
                file = codecs.open(wiki_split_docs_dir + "/" + relevant_doc + ".json", "r", "utf-8")
                file = json.load(file)
                full_lines = file["lines"]

                lines = []
                for line in full_lines:
                    lines.append(line['content'])

                lines[sentence[1]] = lines[sentence[1]].strip()
                lines[sentence[1]] = lines[sentence[1]].replace("-LRB-", " ( ")
                lines[sentence[1]] = lines[sentence[1]].replace("-RRB-", " ) ")

                potential_evidence_sentences.append(lines[sentence[1]])

            # Just adding a check
            # This is needed in case nothing was predicted
            if len(potential_evidence_sentences) == 0:
                zero_results += 1
                potential_evidence_sentences.append("Nothing")
                evidence.append(["Nothing", 0])

            # this will create document retrieval and sentence retrieval based on NER
            if INCLUDE_NER:
                relevant_docs, entities = doc_retrieval.getRelevantDocs(claim, wiki_entities, "spaCy",
                                                                        nlp)  # "spaCy", nlp)#
                print(relevant_docs)
                # print(entities)
                relevant_sentences = sentence_retrieval.getRelevantSentences(relevant_docs, entities, wiki_split_docs_dir)
                # print(relevant_sentences)

                predicted_evidence = []
                for sent in relevant_sentences:
                    predicted_evidence.append((sent['id'], sent['line_num']))
                    potential_evidence_sentences.append(sent['sentence'])
                    evidence.append((sent['id'], sent['line_num']))

                instances[i]['predicted_pages_ner'] = relevant_docs
                instances[i]['predicted_sentences_ner'] = predicted_evidence

            preds = run_rte(claim, potential_evidence_sentences, claim_num)

            saveFile = codecs.open("rte/entailment_predictions/claim_" + str(claim_num) + ".json", mode="w+",
                                   encoding="utf-8")
            for j in range(len(preds)):
                # print(preds)
                # print(evidence)
                preds[j]['claim'] = claim
                preds[j]['premise_source_doc_id'] = evidence[j][0]
                preds[j]['premise_source_doc_line_num'] = evidence[j][1]
                preds[j]['premise_source_doc_sentence'] = potential_evidence_sentences[j]
                saveFile.write(json.dumps(preds[j], ensure_ascii=False) + "\n")

            saveFile.close()
            claim_num += 1
            # print(claim_num)
            # print(instances[i])

            if INCLUDE_OIE:
                relevant_docs, entities = doc_retrieval.get_docs_with_oie(claim, wiki_entities, client)
                print(entities)
                instances[i]['predicted_pages_oie'] = relevant_docs

            writer_c.write(instances[i])
            print("Claim number: " + str(i) + " of " + str(len(instances)))

print("Number of Zero Sentences Found: " + str(zero_results))
23 changes: 18 additions & 5 deletions metrics.py
@@ -18,9 +18,9 @@
        train_predictions_file = "predictions/predictions_train.jsonl"
    else:  # type_file == 'dev':
        train_file = "data/dev.jsonl"
        train_relevant_file = "data/dev_relevant_docs.jsonl"
        train_concatenate_file = "data/dev_concatenation.jsonl"
        train_predictions_file = "predictions/new_predictions_dev_ner.jsonl"
        train_relevant_file = "data/dev_concatenation.jsonl"
        train_concatenate_file = "data/dev_concatenation_oie.jsonl"
        train_predictions_file = "predictions/new_predictions_dev.jsonl"
else:
    print("Needs to have one argument. Choose:")
    print("train")
@@ -73,6 +73,9 @@

    _claim.add_predicted_docs(claim['predicted_pages'])
    _claim.add_predicted_sentences(claim['predicted_sentences'])
    if "predicted_pages_ner" in claim:
        _claim.add_predicted_docs_ner(claim['predicted_pages_ner'])
        _claim.add_predicted_sentences_ner(claim['predicted_sentences_ner'])

for claim in train_concatenate:
    _id = claim['id']
@@ -81,8 +84,10 @@
    if not _claim.verifiable:
        continue

    _claim.add_predicted_docs_ner(claim['predicted_pages_ner'])
    _claim.add_predicted_sentences_ner(claim['predicted_sentences_ner'])
    # _claim.add_predicted_docs_ner(claim['predicted_pages_ner'])
    # _claim.add_predicted_sentences_ner(claim['predicted_sentences_ner'])
    _claim.add_predicted_docs_oie(claim['predicted_pages_oie'])


results = Claim.document_retrieval_stats(claims, _type="tfidf")

@@ -100,6 +105,14 @@
print("Precision (Document Retrieved): \t" + str(results[0]))
print("Recall (Relevant Documents): \t\t" + str(results[1]))

results = Claim.document_retrieval_stats(claims, _type="oie")

print("\n######################")
print("# Documents Only OIE #")
print("########################")
print("Precision (Document Retrieved): \t" + str(results[0]))
print("Recall (Relevant Documents): \t\t" + str(results[1]))

results = Claim.document_retrieval_stats(claims, _type="all")

print("\n######################")
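Claim.document_retrieval_stats itself is not shown in this diff; a rough sketch of the per-type precision/recall it is expected to report, built only from the Claim accessors visible in metrics/claim.py below (the real aggregation may differ), could look like this:

def document_retrieval_stats_sketch(claims, _type="tfidf"):
    # Illustrative micro-averaged precision/recall over all claims.
    retrieved, relevant, correct = 0, 0, 0
    for c in claims:
        gold = c.get_gold_documents()                         # set of gold page ids
        predicted = set(c.get_predicted_documents(_type=_type))
        retrieved += len(predicted)
        relevant += len(gold)
        correct += len(predicted & gold)
    precision = correct / retrieved if retrieved else 0.0
    recall = correct / relevant if relevant else 0.0
    return precision, recall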
16 changes: 14 additions & 2 deletions metrics/claim.py
@@ -18,6 +18,7 @@ def __init__(self, _id, name, verifiable):
        self.predicted_evidence = []
        self.predicted_docs_ner = []
        self.predicted_evidence_ner = []
        self.predicted_docs_oie = []
        self.line = []
        self.predicted_line = []

@@ -53,6 +54,10 @@ def add_predicted_sentences_ner(self, pairs):
            e = str(pair[0]), str(pair[1])
            self.predicted_evidence_ner.append(e)

    def add_predicted_docs_oie(self, docs):
        for doc in docs:
            self.predicted_docs_oie.append(doc)

    def get_gold_documents(self):
        docs = set()
        for e in self.gold_evidence:
@@ -70,19 +75,25 @@ def get_predicted_documents(self, _type="tfidf"):
            return self.predicted_docs
        if _type == "ner":
            return self.predicted_docs_ner
        if _type == "oie":
            return self.predicted_docs_oie
        else:
            documents = set()
            for doc in self.predicted_docs:
                documents.add(doc)
            for doc in self.predicted_docs_ner:
                documents.add(doc)
            for doc in self.predicted_docs_oie:
                documents.add(doc)
            return documents

    def get_predicted_evidence(self, _type="tfidf"):
        if _type == "tfidf":
            return self.predicted_evidence
            evidences = set(self.predicted_evidence)
            return evidences
        elif _type == "ner":
            return self.predicted_evidence_ner
            evidences = set(self.predicted_evidence_ner)
            return evidences
        else:
            evidences = set()
            for e in self.predicted_evidence:
@@ -106,6 +117,7 @@ def calculate_correct_docs(self, difficulty="all", _type="tfidf"):
    def calculate_correct_sentences(self, difficulty="all", _type="tfidf"):
        num_corr_e = 0
        gold_pairs = self.get_gold_pairs()
        gold_pairs = set(gold_pairs)
        if difficulty == "all":
            for e in self.get_predicted_evidence(_type=_type):
                if e in gold_pairs:
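To illustrate the new OIE plumbing on the Claim class, a small hedged example; the import path and constructor arguments are guesses based on the signature shown above.

from metrics.claim import Claim  # assumed import path

c = Claim(_id=1, name="Some claim text", verifiable=True)
c.add_predicted_docs_oie(["Barack_Obama", "Hawaii"])
print(c.get_predicted_documents(_type="oie"))   # ['Barack_Obama', 'Hawaii']
print(c.get_predicted_documents(_type="all"))   # union of TF-IDF, NER and OIE pages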
2 changes: 1 addition & 1 deletion split_wiki_into_indv_docs.py
@@ -5,7 +5,7 @@
import datetime

wiki_folder = 'data/wiki-pages'
dest_dir = "../wiki-pages-split"
dest_dir = "data/wiki-pages-split"
files = os.listdir(wiki_folder)

count = 0
