-
Notifications
You must be signed in to change notification settings - Fork 60
/
evals.py
134 lines (118 loc) · 4.91 KB
/
evals.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from utils import *
import numpy as np
SMALL_POSITIVE_CONST = 1e-4
def evaluate_ote(gold_ot, pred_ot):
    """
    Score the opinion-target-extraction (OTE) task.
    :param gold_ot: gold standard ote tags, one tag sequence per sentence
    :param pred_ot: predicted ote tags, aligned with gold_ot
    :return: tuple (precision, recall, f1) for the ote task
    """
    assert len(gold_ot) == len(pred_ot)
    # running totals: true positives, gold spans, predicted spans
    total_hits, total_gold, total_pred = 0, 0, 0
    for gold_tags, pred_tags in zip(gold_ot, pred_ot):
        # decode tag sequences into target spans before matching
        gold_spans = tag2ot(ote_tag_sequence=gold_tags)
        pred_spans = tag2ot(ote_tag_sequence=pred_tags)
        total_hits += match_ot(gold_ote_sequence=gold_spans, pred_ote_sequence=pred_spans)
        total_gold += len(gold_spans)
        total_pred += len(pred_spans)
    # SMALL_POSITIVE_CONST in each denominator avoids division by zero
    ot_precision = float(total_hits) / float(total_pred + SMALL_POSITIVE_CONST)
    ot_recall = float(total_hits) / float(total_gold + SMALL_POSITIVE_CONST)
    ot_f1 = 2 * ot_precision * ot_recall / (ot_precision + ot_recall + SMALL_POSITIVE_CONST)
    return (ot_precision, ot_recall, ot_f1)
def evaluate_ts(gold_ts, pred_ts):
    """
    Score the targeted-sentiment (TS) task.
    :param gold_ts: gold standard ts tags, one tag sequence per sentence
    :param pred_ts: predicted ts tags, aligned with gold_ts
    :return: tuple (macro_f1, micro_precision, micro_recall, micro_f1)
    """
    assert len(gold_ts) == len(pred_ts)
    # per-class running totals (POS / NEG / NEU): hits, gold spans, predicted spans
    hit_totals = np.zeros(3)
    gold_totals = np.zeros(3)
    pred_totals = np.zeros(3)
    for gold_tags, pred_tags in zip(gold_ts, pred_ts):
        # decode tag sequences into (begin, end, sentiment) triples before matching
        gold_seq = tag2ts(ts_tag_sequence=gold_tags)
        pred_seq = tag2ts(ts_tag_sequence=pred_tags)
        hits, golds, preds = match_ts(gold_ts_sequence=gold_seq,
                                      pred_ts_sequence=pred_seq)
        hit_totals += hits
        gold_totals += golds
        pred_totals += preds
    # macro average: per-class precision/recall/f1 computed elementwise,
    # then averaged; SMALL_POSITIVE_CONST keeps denominators non-zero
    per_class_p = hit_totals / (pred_totals + SMALL_POSITIVE_CONST)
    per_class_r = hit_totals / (gold_totals + SMALL_POSITIVE_CONST)
    per_class_f1 = 2 * per_class_p * per_class_r / (per_class_p + per_class_r + SMALL_POSITIVE_CONST)
    ts_macro_f1 = per_class_f1.mean()
    # micro average: pool counts across classes before computing P/R/F1
    n_tp_total = hit_totals.sum()
    n_g_total = gold_totals.sum()   # TP + FN
    n_p_total = pred_totals.sum()   # TP + FP
    ts_micro_p = float(n_tp_total) / (n_p_total + SMALL_POSITIVE_CONST)
    ts_micro_r = float(n_tp_total) / (n_g_total + SMALL_POSITIVE_CONST)
    ts_micro_f1 = 2 * ts_micro_p * ts_micro_r / (ts_micro_p + ts_micro_r + SMALL_POSITIVE_CONST)
    return (ts_macro_f1, ts_micro_p, ts_micro_r, ts_micro_f1)
def evaluate(gold_ot, gold_ts, pred_ot, pred_ts):
    """
    Run both evaluation tasks on the predictions.
    :param gold_ot: gold standard opinion target tags
    :param gold_ts: gold standard targeted sentiment tags
    :param pred_ot: predicted opinion target tags
    :param pred_ts: predicted targeted sentiment tags
    :return: pair (ote_scores, ts_scores) — metric scores of ner and sa
    """
    # all four tag collections must cover the same set of sentences
    assert len(gold_ot) == len(gold_ts) == len(pred_ot) == len(pred_ts)
    return (evaluate_ote(gold_ot=gold_ot, pred_ot=pred_ot),
            evaluate_ts(gold_ts=gold_ts, pred_ts=pred_ts))
def match_ot(gold_ote_sequence, pred_ote_sequence):
    """
    Count the predicted opinion-target spans that exactly match a gold span.
    :param gold_ote_sequence: gold standard opinion target sequence
    :param pred_ote_sequence: predicted opinion target sequence
    :return: number of matched predictions
    """
    # a prediction counts as a hit iff it appears verbatim in the gold list
    return sum(1 for span in pred_ote_sequence if span in gold_ote_sequence)
def match_ts(gold_ts_sequence, pred_ts_sequence):
    """
    Count, per sentiment class, the correctly predicted targeted-sentiment
    triples together with the gold and predicted totals.
    :param gold_ts_sequence: gold standard targeted sentiment sequence
    :param pred_ts_sequence: predicted targeted sentiment sequence
    :return: (hit_count, gold_count, pred_count), each a length-3 array
             indexed by sentiment class
    """
    # class index order: positive, negative, neutral
    tag2tagid = {'POS': 0, 'NEG': 1, 'NEU': 2}
    hit_count = np.zeros(3)
    gold_count = np.zeros(3)
    pred_count = np.zeros(3)
    # tally gold triples by their sentiment label (third element)
    for triple in gold_ts_sequence:
        gold_count[tag2tagid[triple[2]]] += 1
    # tally predictions, crediting a hit when the whole triple matches gold
    for triple in pred_ts_sequence:
        class_id = tag2tagid[triple[2]]
        pred_count[class_id] += 1
        if triple in gold_ts_sequence:
            hit_count[class_id] += 1
    return hit_count, gold_count, pred_count