quantitative-evaluation.py
import json

from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
from rouge_score import rouge_scorer

eval_qs_filename = "./../team-answers/evaluation-questions.json"
team_qs_filename = "./../team-answers/<TEAM-NAME>.json"

def bleu(ref, gen):
    '''
    Calculate the pairwise BLEU score. Uses the NLTK implementation.
    Args:
        ref : a list of reference sentences
        gen : a list of candidate (generated) sentences
    Returns:
        bleu score (float)
    '''
    ref_bleu = []
    gen_bleu = []
    for l in gen:
        gen_bleu.append(l.split())
    for l in ref:
        ref_bleu.append([l.split()])
    cc = SmoothingFunction()
    # All weight is on bigrams, so this is a smoothed 2-gram corpus BLEU.
    score_bleu = corpus_bleu(ref_bleu, gen_bleu, weights=(0, 1, 0, 0),
                             smoothing_function=cc.method4)
    return score_bleu
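
# Illustrative example (hypothetical sentences, not part of the evaluation
# data): the call below returns a float in [0, 1], higher when the candidate
# shares more bigrams with the reference.
#   bleu(["the cat sat on the mat"], ["a cat sat on a mat"])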

def rouge(ref, gen):
    '''
    Calculate the ROUGE-L score for a single reference/candidate pair.
    '''
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    score_rouge = scorer.score(ref, gen)
    return score_rouge
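
# Illustrative example (hypothetical strings): rouge("the cat sat", "a cat sat")
# returns a dict with a 'rougeL' entry whose precision, recall and fmeasure
# attributes all lie in [0, 1].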

def expand_range(old_value):
    '''
    Expand the given value from the range 0 ... 1 to the range 0 ... 5
    '''
    old_max = 1
    old_min = 0
    new_max = 5
    new_min = 0
    old_range = old_max - old_min
    new_range = new_max - new_min
    return (((old_value - old_min) * new_range) / old_range) + new_min
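
# Worked example: expand_range(0.4) = (((0.4 - 0) * 5) / 1) + 0 = 2.0; with
# these bounds the mapping is simply a multiplication by 5.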

# Load the file with the evaluation questions and answers
with open(eval_qs_filename, 'r') as eval_qs_file:
    eval_qs_data = json.load(eval_qs_file)

# Load the file with the team answers
with open(team_qs_filename, 'r') as team_qs_file:
    team_qs_data = json.load(team_qs_file)
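
# Both files are assumed to contain a list of objects with "Question" and
# "Answer" string fields, aligned by index, e.g. (hypothetical entry):
#   [{"Question": "What is ...?", "Answer": "..."}]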

###
#
# Collect the bleu and rouge scores for each of the questions
#
###
bleu_score = 0
rouge_score = 0
valid_answers = 0
for i in range(len(eval_qs_data)):
    # Skip questions with an empty entry in the team file; they do not count
    # towards the average
    if len(team_qs_data[i]["Question"]) == 0:
        continue
    valid_answers += 1
    b_score = bleu([eval_qs_data[i]["Answer"]], [team_qs_data[i]["Answer"]])
    r_score = rouge(eval_qs_data[i]["Answer"], team_qs_data[i]["Answer"])
    bleu_score += b_score
    rouge_score += r_score["rougeL"].fmeasure
###
#
# Expand the scores to a value between 0 ... 5 and print the results
#
###
final_bleu_score = expand_range(bleu_score / valid_answers)
final_rouge_score = expand_range(rouge_score / valid_answers)
final_overall_score = (final_bleu_score + final_rouge_score) / 2
print("FILENAME: " + team_qs_filename)
print("BLEU_SCORE: " + str(final_bleu_score))
print("ROUGE_SCORE: " + str(final_rouge_score))
print("FINAL_SCORE: " + str(final_overall_score))