Init Sentence Analyzer and OIE for Document Retrieval
pedrojlazevedo committed Mar 25, 2020
1 parent 8ecab20 commit 8d67c58
Showing 6 changed files with 2,363 additions and 22 deletions.
2,219 changes: 2,219 additions & 0 deletions data/dev_concatenation_oie.jsonl

Large diffs are not rendered by default.

89 changes: 70 additions & 19 deletions doc_retrieval.py
@@ -36,43 +36,94 @@ def get_docs_with_oie(claim, wiki_entities,client):
ents.add(triple["object"])

# triples extraction clausIE
clauses, ner_spacy = clausie.clausie(claim)
if len(triples) == 0:
clauses = clausie.clausie(claim)
for clause in clauses:
for sub in clause['S']:
ents.add(sub)
ents.add(sub.text)
for obj in clause['O']:
ents.add(obj)
ents.add(obj.text)
print(ner_spacy)
print(ents)
for ent in ner_spacy:
ents.add(ent.text)

if len(ents) > 4:
if len(ents) > 5:
ents = clean_entities(ents)

ents = list(ents)

if len(ents) != 0:
_str = ""
for ent in ents:
_str += ent
_str += " "
_str = _str[:-1]
ents.append(_str)

if "film" in claim:
_str += " ( film )"
ents.append(_str)
elif "(" in claim:
disambiguation = claim[claim.find("(") + 1:claim.find(")")]
_str += " " + disambiguation
ents.append(_str)
print(ents)
docs, entities = getClosestDocs(wiki_entities, ents)

return docs, entities
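To make the added query-string step easier to follow, here is a small self-contained illustration of the joining-and-suffixing logic shown above. build_query_strings is a hypothetical helper written for this note; only its joining, "( film )" and parenthetical-disambiguation behaviour mirrors the lines in the diff.

def build_query_strings(claim, ents):
    # hypothetical helper illustrating the added query-string logic
    queries = list(ents)
    if queries:
        joined = " ".join(queries)
        queries.append(joined)  # all extracted entities concatenated into one extra query
        if "film" in claim:
            # FEVER film pages are titled like "Title_-LRB-film-RRB-", so a film hint
            # helps the fuzzy title match in getClosestDocs below
            queries.append(joined + " ( film )")
        elif "(" in claim:
            # otherwise reuse whatever parenthetical the claim itself carries
            disambiguation = claim[claim.find("(") + 1:claim.find(")")]
            queries.append(joined + " " + disambiguation)
    return queries

# e.g. build_query_strings("Blade Runner is a 1982 film.", ["Blade Runner"])
# -> ["Blade Runner", "Blade Runner", "Blade Runner ( film )"]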


# getting the 3 closest docs!
def getClosestDocs(wiki_entities, entities):
entities = list(entities)
for i in range(len(entities)):
entities[i] = str(entities[i])
selected_docs = []
for ent in entities:
ent = ud.normalize('NFC', ent)
if ent in wiki_entities:
best_match = ent
else:
best = 1.1
best_match = ""
for we in wiki_entities:
dist = stringdist.levenshtein_norm(we, ent)
if dist < best:
best = dist
best_match = we
best_match = best_match.replace(" ", "_")
best_match = best_match.replace("/", "-SLH-")
best_match = best_match.replace("(", "-LRB-")
best_match = best_match.replace(")", "-RRB-")
selected_docs.append(best_match)

best_1 = 1.1
best_match_1 = ""

best_2 = 1.1
best_match_2 = ""

best_3 = 1.1
best_match_3 = ""

for we in wiki_entities:
dist = stringdist.levenshtein_norm(we, ent)
if dist < best_1:
best_1 = dist
best_match_1 = we

elif dist < best_2:
best_2 = dist
best_match_2 = we

elif dist < best_3:
best_3 = dist
best_match_3 = we

best_match_1 = best_match_1.replace(" ", "_")
best_match_1 = best_match_1.replace("/", "-SLH-")
best_match_1 = best_match_1.replace("(", "-LRB-")
best_match_1 = best_match_1.replace(")", "-RRB-")

best_match_2 = best_match_2.replace(" ", "_")
best_match_2 = best_match_2.replace("/", "-SLH-")
best_match_2 = best_match_2.replace("(", "-LRB-")
best_match_2 = best_match_2.replace(")", "-RRB-")

best_match_3 = best_match_3.replace(" ", "_")
best_match_3 = best_match_3.replace("/", "-SLH-")
best_match_3 = best_match_3.replace("(", "-LRB-")
best_match_3 = best_match_3.replace(")", "-RRB-")

selected_docs.append(best_match_1)
selected_docs.append(best_match_2)
selected_docs.append(best_match_3)
return selected_docs, entities
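The best_1/best_2/best_3 search above uses an if/elif chain, so an entry that beats best_1 never pushes the previous best_1 down into best_2; the three titles it appends are therefore not guaranteed to be the three closest. A compact sketch of picking the three nearest titles under the same normalized Levenshtein distance, reusing stringdist and the -LRB-/-RRB- escaping from the surrounding code (the helper name is invented for this note):

import heapq

import stringdist


def three_closest_titles(ent, wiki_entities):
    # keep the three smallest normalized Levenshtein distances in a single pass
    closest = heapq.nsmallest(
        3, wiki_entities, key=lambda we: stringdist.levenshtein_norm(we, ent))
    # escape the winners the way the split wiki dump names its pages
    return [we.replace(" ", "_").replace("/", "-SLH-")
              .replace("(", "-LRB-").replace(")", "-RRB-") for we in closest]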


2 changes: 1 addition & 1 deletion generate_rte_preds.py
@@ -65,7 +65,7 @@ def run_rte(claim, evidence, claim_num):

with StanfordOpenIE() as client:
with jsonlines.open(concatenate_file, mode='w') as writer_c:
for i in range(6, len(instances)):
for i in range(len(instances)):
claim = instances[i]['claim']
print(claim)
evidence = instances[i]['predicted_sentences']
8 changes: 7 additions & 1 deletion metrics.py
@@ -87,6 +87,8 @@
# _claim.add_predicted_docs_ner(claim['predicted_pages_ner'])
# _claim.add_predicted_sentences_ner(claim['predicted_sentences_ner'])
_claim.add_predicted_docs_oie(claim['predicted_pages_oie'])
if not _claim.check_evidence_found_doc(_type="all"):
print(str(_claim.get_gold_documents()) + " -- " + str(_claim.get_predicted_documents(_type="all")))


results = Claim.document_retrieval_stats(claims, _type="tfidf")
@@ -96,6 +98,7 @@
print("########################")
print("Precision (Document Retrieved): \t" + str(results[0]))
print("Recall (Relevant Documents): \t\t" + str(results[1]))
print("At least one Doc Found: \t\t" + str(results[2]))

results = Claim.document_retrieval_stats(claims, _type="ner")

@@ -104,6 +107,7 @@
print("########################")
print("Precision (Document Retrieved): \t" + str(results[0]))
print("Recall (Relevant Documents): \t\t" + str(results[1]))
print("At least one Doc Found: \t\t" + str(results[2]))

results = Claim.document_retrieval_stats(claims, _type="oie")

@@ -112,14 +116,16 @@
print("########################")
print("Precision (Document Retrieved): \t" + str(results[0]))
print("Recall (Relevant Documents): \t\t" + str(results[1]))
print("At least one Doc Found: \t\t" + str(results[2]))

results = Claim.document_retrieval_stats(claims, _type="all")

print("\n######################")
print("# Documents for BOTH #")
print("# Documents for All #")
print("######################")
print("Precision (Document Retrieved): \t" + str(results[0]))
print("Recall (Relevant Documents): \t\t" + str(results[1]))
print("At least one Doc Found: \t\t" + str(results[2]))

results = Claim.evidence_extraction_stats(claims, _type="tfidf")

7 changes: 6 additions & 1 deletion metrics/claim.py
@@ -146,6 +146,7 @@ def find_by_id(cls, _id):
def document_retrieval_stats(cls, claims, _type="tfidf"):
precision_correct = 0
recall_correct = 0
correct_doc = 0
total_claims = 0

for claim in claims:
@@ -157,10 +158,14 @@ def document_retrieval_stats(cls, claims, _type="tfidf"):
precision_correct += doc_correct / (len(claim.get_predicted_documents(_type=_type)) + 0.000001)
recall_correct += doc_correct / (len(claim.get_gold_documents()) + 0.000001)

if doc_correct > 0:
correct_doc += 1

precision_correct /= total_claims
recall_correct /= total_claims
correct_doc /= total_claims

return precision_correct, recall_correct
return precision_correct, recall_correct, correct_doc

@classmethod
def evidence_extraction_stats(cls, claims, _type="tfidf"):
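document_retrieval_stats now returns a third number alongside macro-averaged precision and recall: the fraction of claims for which at least one gold document was retrieved. A hedged, self-contained sketch of the same arithmetic on plain sets, independent of the Claim class:

def retrieval_stats(gold_sets, predicted_sets):
    # macro-averaged precision, recall, and "at least one gold doc found" over claims
    precision = recall = at_least_one = 0
    for gold, predicted in zip(gold_sets, predicted_sets):
        hits = len(gold & predicted)
        precision += hits / (len(predicted) + 1e-6)
        recall += hits / (len(gold) + 1e-6)
        if hits > 0:
            at_least_one += 1
    n = len(gold_sets)
    return precision / n, recall / n, at_least_one / n

# e.g. one claim with gold {"A", "B"} and predictions {"A", "C", "D"}
# contributes 1/3 to precision, 1/2 to recall, and 1 to the new counter.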
60 changes: 60 additions & 0 deletions run_sentence_selection.py
@@ -0,0 +1,60 @@
import jsonlines
import codecs
import json
from sentence_transformers import SentenceTransformer
import scipy.spatial

wiki_split_docs_dir = "data/wiki-pages-split"
relevant_docs_file = "data/dev_concatenation.jsonl"
relevant_sent_file = "data/dev_sentence_selection.jsonl"


def get_sentence(doc, line_num):
    # load one split wiki page from disk and return its line_num-th sentence
    file = codecs.open(wiki_split_docs_dir + "/" + doc + ".json", "r", "utf-8")
    file = json.load(file)
    full_lines = file["lines"]
    lines = []
    for line in full_lines:
        lines.append(line['content'])
    sentence = lines[line_num]
    return sentence
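get_sentence implies that every file under data/wiki-pages-split is a JSON document whose "lines" field is a list of objects carrying a "content" string. A minimal illustration of that assumed layout (invented values, written as a Python literal):

# assumed structure of data/wiki-pages-split/<doc>.json, inferred from get_sentence
example_doc = {
    "lines": [
        {"content": "First sentence of the wiki page."},
        {"content": "Second sentence of the wiki page."},
    ]
}
# with such a file on disk, get_sentence(doc, 1) would return the second sentence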


# model = SentenceTransformer('bert-base-nli-mean-tokens')
embedder = SentenceTransformer('bert-base-wikipedia-sections-mean-tokens')
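As a quick, illustrative sanity check of what the embedder produces (the model name is the one the script already loads; the 768-dimensional output is typical of BERT-base sentence transformers rather than something this commit states):

# each sentence is mapped to one fixed-size vector; cosine distance over these
# vectors drives the ranking further down in this script
example_vectors = embedder.encode(["An example sentence.", "Another example sentence."])
print(len(example_vectors), len(example_vectors[0]))  # e.g. 2 vectors of 768 dimensions each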

claims = []
# read the claims (with their predicted documents/sentences) from the jsonl file
with jsonlines.open(relevant_docs_file) as reader:
    for line in reader:
        claims.append(line)

# testing
claim_0 = claims[0]
for pair in claim_0['predicted_sentences_ner']:
    print(get_sentence(pair[0], pair[1]))

with jsonlines.open(relevant_sent_file, mode='w') as writer_c:
    for claim in claims:
        # get all possible sentences for this claim's predicted documents
        corpus = []
        for pair in claim['predicted_sentences_ner']:
            sentence = get_sentence(pair[0], pair[1])
            corpus.append(sentence)

        # create embeddings for the candidate sentences and for the claim itself
        corpus_embeddings = embedder.encode(corpus)
        queries = [claim['claim']]
        query_embeddings = embedder.encode(queries)

        # get the n most similar sentences
        closest_n = 5
        for query, query_embedding in zip(queries, query_embeddings):
            distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0]

            results = zip(range(len(distances)), distances)
            results = sorted(results, key=lambda x: x[1])

            print("\n\n======================\n\n")
            print("Query:", query)
            print("\nTop 5 most similar sentences in corpus:")

            for idx, distance in results[0:closest_n]:
                print(corpus[idx].strip(), "(Score: %.4f)" % (1 - distance))
