From 97956cabfe044ffb35fb3cb57a41f1fd75df75c2 Mon Sep 17 00:00:00 2001
From: pedrojlazevedo
Date: Thu, 5 Mar 2020 16:24:43 +0000
Subject: [PATCH] Added essential metrics for every step

---
 metrics.py                | 101 +++++++++++++++++++++++--
 rte/rte.py                |   4 +-
 scorer.py                 | 153 ++++++++++++++++++++++++++++++++++++++
 train_label_classifier.py |   6 +-
 4 files changed, 253 insertions(+), 11 deletions(-)
 create mode 100644 scorer.py

diff --git a/metrics.py b/metrics.py
index d5d62910..f3465166 100644
--- a/metrics.py
+++ b/metrics.py
@@ -1,16 +1,20 @@
 import jsonlines
-import json
+import sys
+from scorer import fever_score

 train_file = "data/subsample_train.jsonl"
 train_relevant_file = "data/subsample_train_relevant_docs.jsonl"
+train_concatenate_file = "data/subsample_train_concatenation.jsonl"
 train_predictions_file = "predictions/predictions_train.jsonl"

 train_file = jsonlines.open(train_file)
 train_relevant_file = jsonlines.open(train_relevant_file)
+train_concatenate_file = jsonlines.open(train_concatenate_file)
 train_predictions_file = jsonlines.open(train_predictions_file)

 train_set = []
 train_relevant = []
+train_concatenate = []
 train_prediction = []

 for lines in train_file:
@@ -23,6 +27,17 @@
     lines['claim'] = lines['claim'].replace("-RRB-", " ) ")
     train_relevant.append(lines)

+for lines in train_concatenate_file:
+    lines['claim'] = lines['claim'].replace("-LRB-", " ( ")
+    lines['claim'] = lines['claim'].replace("-RRB-", " ) ")
+    train_concatenate.append(lines)
+
+# copy the gold evidence into each prediction; fever_score would otherwise take it
+# from `actual`, so this does not change the scores
+info_by_id = dict((d['id'], dict(d, index=index)) for (index, d) in enumerate(train_set))
+for lines in train_predictions_file:
+    lines['evidence'] = info_by_id.get(lines['id'])['evidence']
+    train_prediction.append(lines)
+
 # All claims
 stop = 0

@@ -32,7 +47,8 @@
 verifiable : boolean of 1 and 0 with respective meaning
 docs : set of documents that verify the claim
 docs_sep : set of documents seperated
-sentences: list of tuples of
+evidences: list of (document, line) tuples of the gold sentences
+difficulties: list with the number of sentences needed to verify each evidence group
 '''
 gold_data = []
@@ -51,21 +67,26 @@
     gold_documents_seperated = set()
     sentences_pair = set()
     evidences = claim['evidence']
-
+    difficulties = []
     for evidence in evidences:
         doc_name = ''
+        difficulty = 0
         if len(evidence) > 1:  # needs more than 1 doc to be verifiable
             for e in evidence:
                 doc_name += str(e[2])
                 doc_name += " "
                 sentences_pair.add((str(e[2]), str(e[3])))  # add gold sentences
                 gold_documents_seperated.add(str(e[2]))  # add the document
+                difficulty += 1
             doc_name = doc_name[:-1]  # erase the last blank space
         else:
             doc_name = str(evidence[0][2])
             gold_documents_seperated.add(str(evidence[0][2]))
             sentences_pair.add((str(evidence[0][2]), str(evidence[0][3])))
+            difficulty = 1
+        difficulties.append(difficulty)
         gold_documents.add(doc_name)
+    gold_dict['difficulties'] = difficulties
     gold_dict['docs'] = gold_documents
     gold_dict['evidences'] = sentences_pair
     gold_dict['docs_sep'] = gold_documents_seperated
@@ -92,6 +113,13 @@
 recall_incorrect = 0
 specificity = 0

+precision_sent_correct = 0
+precision_sent_incorrect = 0
+recall_sent_correct = 0
+recall_sent_incorrect = 0
+sent_found = 0
+sent_found_if_doc_found = 0
+
 total_claim = 0
 for claim in train_relevant:
     _id = claim['id']
@@ -102,6 +130,7 @@
         continue

     # document analysis
+    # TODO: Analyse NER and TF-IDF
     doc_correct = 0
     doc_incorrect = 0
     gold_incorrect = 0
@@ -130,7 +159,31 @@
         doc_found += 1

     # sentence analysis TODO: check sentences
+    sentences = set()
+    for sent in claim['predicted_sentences']:
+        sentences.add((str(sent[0]), str(sent[1])))
+
+    evidences = gold_dict['evidences']
+    sent_correct = 0
+    sent_incorrect = 0
+    flag = False
+    for sent in sentences:
+        if sent in evidences:
+            sent_correct += 1
+            flag = True
+        else:
+            sent_incorrect += 1
+    if flag:
+        sent_found += 1
+
+    if doc_correct and flag:
+        sent_found_if_doc_found += 1
+
+    precision_sent_correct += sent_correct / len(sentences)
+    precision_sent_incorrect += sent_incorrect / len(sentences)
+    recall_sent_correct += sent_correct / len(evidences)
+    recall_sent_incorrect += sent_incorrect / len(evidences)

     # TODO: create all possible pair in order to see if it appears in gold_dict['docs']
     # claim['predicted_sentences']
@@ -140,6 +193,10 @@
     stop += 1
     if stop == -1:
         break
+
+# scores from the official FEVER scorer
+results = fever_score(train_prediction, actual=train_set)
+
 precision_correct /= total_claim
 precision_incorrect /= total_claim
 recall_correct /= total_claim
@@ -147,10 +204,42 @@
 specificity /= total_claim
 doc_found /= total_claim

-
+print("\n#############")
+print("# DOCUMENTS #")
+print("#############")
 print("Precision (Document Retrieved):\t\t\t\t\t\t " + str(precision_correct))  # precision
-print("Fall-out (incorrect documents):\t\t\t " + str(precision_incorrect))  # precision
+print("Fall-out (incorrect documents):\t\t\t\t\t\t " + str(precision_incorrect))  # precision
 print("Recall (Relevant Documents):\t\t\t\t\t\t " + str(recall_correct))  # recall
 print("Percentage of gold documents NOT found:\t\t\t\t " + str(recall_incorrect))  # recall
 print("Fall-out: " + str(specificity))
-print("Percentage of at least one document found correctly: " + str(doc_found))  # recall
\ No newline at end of file
+print("Percentage of at least one document found correctly: " + str(doc_found))  # recall
+
+
+precision_sent_correct /= total_claim
+precision_sent_incorrect /= total_claim
+recall_sent_correct /= total_claim
+recall_sent_incorrect /= total_claim
+sent_found /= total_claim
+sent_found_if_doc_found /= total_claim
+another_sent = sent_found_if_doc_found / doc_found
+
+print("\n#############")
+print("# SENTENCES #")
+print("#############")
+print("Precision (Sentences Retrieved):\t\t\t\t\t " + str(precision_sent_correct))  # precision
+print("Precision (incorrect Sentences):\t\t\t\t\t " + str(precision_sent_incorrect))  # precision
+print("Recall (Relevant Sentences):\t\t\t\t\t\t " + str(recall_sent_correct))  # recall
+print("Percentage of gold Sentences NOT found:\t\t\t\t " + str(recall_sent_incorrect))  # recall
+print("Percentage of claims with at least one correct sentence: " + str(sent_found))  # recall
+print("Percentage of claims with a correct sentence and a correct document: " + str(sent_found_if_doc_found))  # recall
+print("Percentage of claims with a correct sentence, given a correct document: " + str(another_sent))  # recall
+
+print("\n#########")
+print("# FEVER #")
+print("#########")
+print("Strict_score: \t\t" + str(results[0]))
+print("Acc_score: \t\t\t" + str(results[1]))
+print("Precision: \t\t\t" + str(results[2]))
+print("Recall: \t\t\t" + str(results[3]))
+print("F1-Score: \t\t\t" + str(results[4]))
+
diff --git a/rte/rte.py b/rte/rte.py
index f6895508..631e0247 100755
--- a/rte/rte.py
+++ b/rte/rte.py
@@ -65,9 +65,9 @@ def determinePredictedLabel(preds):
         [len(nonePredictions), len(supportPredictions), len(contradictionPredictions)])
     mostCommonPrediction = np.argmax(numberOfPredictionsPerLabel)

-    if mostCommonPrediction == 0:
+    if mostCommonPrediction == 1:
         return (0, supportPredictions)
-    elif mostCommonPrediction == 1:
+    elif mostCommonPrediction == 2:
         return (1, contradictionPredictions)
     else:
         return (2, [])

diff --git a/scorer.py b/scorer.py
new file mode 100644
index 00000000..ecae63dc
--- /dev/null
+++ b/scorer.py
@@ -0,0 +1,153 @@
+import six
+
+
+def check_predicted_evidence_format(instance):
+    if 'predicted_evidence' in instance.keys() and len(instance['predicted_evidence']):
+        assert all(isinstance(prediction, list)
+                   for prediction in instance["predicted_evidence"]), \
+            "Predicted evidence must be a list of (page,line) lists"
+
+        assert all(len(prediction) == 2
+                   for prediction in instance["predicted_evidence"]), \
+            "Predicted evidence must be a list of (page,line) lists"
+
+        assert all(isinstance(prediction[0], six.string_types)
+                   for prediction in instance["predicted_evidence"]), \
+            "Predicted evidence must be a list of (page,line) lists"
+
+        assert all(isinstance(prediction[1], int)
+                   for prediction in instance["predicted_evidence"]), \
+            "Predicted evidence must be a list of (page,line) lists"
+
+
+def is_correct_label(instance):
+    return instance["label"].upper() == instance["predicted_label"].upper()
+
+
+def is_strictly_correct(instance, max_evidence=None):
+    # Strict evidence matching is not applied to the NEI class
+    check_predicted_evidence_format(instance)
+
+    if instance["label"].upper() != "NOT ENOUGH INFO" and is_correct_label(instance):
+        assert 'predicted_evidence' in instance, "Predicted evidence must be provided for strict scoring"
+
+        if max_evidence is None:
+            max_evidence = len(instance["predicted_evidence"])
+
+        for evidence_group in instance["evidence"]:
+            # Filter out the annotation ids. We just want the evidence page and line number
+            actual_sentences = [[e[2], e[3]] for e in evidence_group]
+            # Only return true if an entire group of actual sentences is in the predicted sentences
+            if all([actual_sent in instance["predicted_evidence"][:max_evidence] for actual_sent in actual_sentences]):
+                return True
+
+    # If the class is NEI, we don't score the evidence retrieval component
+    elif instance["label"].upper() == "NOT ENOUGH INFO" and is_correct_label(instance):
+        return True
+
+    return False
+
+
+def evidence_macro_precision(instance, max_evidence=None):
+    this_precision = 0.0
+    this_precision_hits = 0.0
+
+    if instance["label"].upper() != "NOT ENOUGH INFO":
+        all_evi = [[e[2], e[3]] for eg in instance["evidence"] for e in eg if e[3] is not None]
+
+        predicted_evidence = instance["predicted_evidence"] if max_evidence is None else \
+            instance["predicted_evidence"][:max_evidence]
+
+        for prediction in predicted_evidence:
+            if prediction in all_evi:
+                this_precision += 1.0
+            this_precision_hits += 1.0
+
+        return (this_precision / this_precision_hits) if this_precision_hits > 0 else 1.0, 1.0
+
+    return 0.0, 0.0
+
+
+def evidence_macro_recall(instance, max_evidence=None):
+    # Evidence recall is only scored for claims that are not NOT ENOUGH INFO
+    if instance["label"].upper() != "NOT ENOUGH INFO":
+        # If there's no evidence to predict, return 1
+        if len(instance["evidence"]) == 0 or all([len(eg) == 0 for eg in instance["evidence"]]):
+            return 1.0, 1.0
+
+        predicted_evidence = instance["predicted_evidence"] if max_evidence is None else \
+            instance["predicted_evidence"][:max_evidence]
+
+        for evidence_group in instance["evidence"]:
+            evidence = [[e[2], e[3]] for e in evidence_group]
+            if all([item in predicted_evidence for item in evidence]):
+                # We only want to score complete groups of evidence. Incomplete groups are worthless.
+                return 1.0, 1.0
+        return 0.0, 1.0
+    return 0.0, 0.0
+
+
+# Micro is not used. This code is just included to demonstrate our model of macro/micro
+def evidence_micro_precision(instance):
+    this_precision = 0
+    this_precision_hits = 0
+
+    # Evidence precision is only scored for claims that are not NOT ENOUGH INFO
+    if instance["label"].upper() != "NOT ENOUGH INFO":
+        all_evi = [[e[2], e[3]] for eg in instance["evidence"] for e in eg if e[3] is not None]
+
+        for prediction in instance["predicted_evidence"]:
+            if prediction in all_evi:
+                this_precision += 1.0
+            this_precision_hits += 1.0
+
+    return this_precision, this_precision_hits
+
+
+def fever_score(predictions, actual=None, max_evidence=5):
+    correct = 0
+    strict = 0
+
+    macro_precision = 0
+    macro_precision_hits = 0
+
+    macro_recall = 0
+    macro_recall_hits = 0
+
+    for idx, instance in enumerate(predictions):
+        assert 'predicted_evidence' in instance.keys(), 'evidence must be provided for the prediction'
+
+        # If it's a blind test set, we need to copy in the values from the actual data
+        if 'evidence' not in instance or 'label' not in instance:
+            assert actual is not None, 'in blind evaluation mode, actual data must be provided'
+            assert len(actual) == len(predictions), 'actual data and predicted data length must match'
+            assert 'evidence' in actual[idx].keys(), 'evidence must be provided for the actual evidence'
+            instance['evidence'] = actual[idx]['evidence']
+            instance['label'] = actual[idx]['label']
+
+        assert 'evidence' in instance.keys(), 'gold evidence must be provided'
+
+        if is_correct_label(instance):
+            correct += 1.0
+
+            if is_strictly_correct(instance, max_evidence):
+                strict += 1.0
+
+        macro_prec = evidence_macro_precision(instance, max_evidence)
+        macro_precision += macro_prec[0]
+        macro_precision_hits += macro_prec[1]
+
+        macro_rec = evidence_macro_recall(instance, max_evidence)
+        macro_recall += macro_rec[0]
+        macro_recall_hits += macro_rec[1]
+
+    total = len(predictions)
+
+    strict_score = strict / total
+    acc_score = correct / total
+
+    pr = (macro_precision / macro_precision_hits) if macro_precision_hits > 0 else 1.0
+    rec = (macro_recall / macro_recall_hits) if macro_recall_hits > 0 else 0.0
+
+    f1 = 2.0 * pr * rec / (pr + rec)
+
+    return strict_score, acc_score, pr, rec, f1
\ No newline at end of file
diff --git a/train_label_classifier.py b/train_label_classifier.py
index 230407c1..89392ca1 100644
--- a/train_label_classifier.py
+++ b/train_label_classifier.py
@@ -189,8 +189,8 @@ def predict_test(predictions_test, entailment_predictions_test, new_predictions_
         i += 1


-predictions_train = "predictions_train.jsonl"
-predictions_test = "predictions.jsonl"
+predictions_train = "predictions/predictions_train.jsonl"
+predictions_test = "predictions/predictions.jsonl"
 new_predictions_file = "predictions/new_predictions.jsonl"

 gold_train = "data/subsample_train_relevant_docs.jsonl"
@@ -220,7 +220,7 @@ def predict_test(predictions_test, entailment_predictions_test, new_predictions_
 # clf= Pipeline([('scaler', MinMaxScaler()), ('clf', svm.SVC())])

 clf.fit(x_train, y_train)
-
+print("Fit Done")
 joblib.dump(clf, 'label_classifier.pkl')
 # clf = joblib.load('filename.pkl')
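
Usage note (not part of the patch): the sketch below is a minimal, illustrative example of driving the fever_score function added in scorer.py, mirroring the call made in metrics.py. The file paths are the ones used in metrics.py; it assumes each prediction record already carries 'predicted_label' and 'predicted_evidence' as a list of [page, line] pairs, which is what the scorer's assertions require.

import jsonlines
from scorer import fever_score

# Gold claims and the predictions produced by the pipeline (paths as in metrics.py).
actual = list(jsonlines.open("data/subsample_train.jsonl"))
predictions = list(jsonlines.open("predictions/predictions_train.jsonl"))

# fever_score copies the gold 'evidence' and 'label' from `actual` when a prediction
# lacks them, and only the top `max_evidence` predicted sentences count towards
# the evidence precision/recall.
strict_score, acc_score, precision, recall, f1 = fever_score(predictions, actual=actual, max_evidence=5)

print("Strict_score:", strict_score)
print("Acc_score:", acc_score)
print("Evidence precision/recall/F1:", precision, recall, f1)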