"""
======================================================================
NLG_METRIC ---
Some commonly used NLG metrics for generation evaluation.
Author: Zi Liang <[email protected]>
Copyright © 2024, ZiLiang, all rights reserved.
Created: 5 March 2024
======================================================================
"""
import torch
import json
from typing import List, Tuple, Dict
import random
from pprint import pprint as ppp
# from transformers import (
# AutoModelForCausalLM,
# AutoTokenizer,
# BitsAndBytesConfig,
# TrainingArguments,
# pipeline
# )


def fuzzy_match(gens, ps):
    """Average fuzzy partial-ratio similarity between generations and references."""
    from thefuzz import fuzz
    res_ls = []
    for i, g in enumerate(gens):
        res = fuzz.partial_ratio(ps[i], g)
        res_ls.append(res)
    return sum(res_ls) / len(res_ls)


def jaccard_sim(gens, ps):
    """Character-level Jaccard similarity, averaged over all generation/reference pairs."""
    resls = []
    for i in range(len(gens)):
        g = gens[i]
        p = ps[i]
        # Word-level alternative:
        # gg = set(g.split())
        # pp = set(p.split())
        gg = set(g)  # character-level sets
        pp = set(p)
        A = gg.intersection(pp)
        B = gg.union(pp)
        res = float(len(A)) / float(len(B))
        resls.append(res)
    return sum(resls) / len(resls)
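
# Usage sketch: both fuzzy_match and jaccard_sim take parallel lists of
# generations and references and return a single averaged score, e.g.
#   fuzzy_match(["hello world"], ["hello there"])
#   jaccard_sim(["hello world"], ["hello there"])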


def bleu_1to4(hyp_ls, ref_ls):
    """Corpus-level BLEU-1 through BLEU-4, returned as a list of four scores."""
    from bleu4 import corpus_bleu
    gen_p_ls = hyp_ls
    # corpus_bleu expects, for each hypothesis, a *list* of references.
    p_ls = [[p] for p in ref_ls]
    res_ls = []
    for n in range(1, 5):
        res = corpus_bleu(gen_p_ls, p_ls, max_n=n)
        res_ls.append(res[0][0])
    return res_ls


def BERTscore(hyps, refs):
    """BERTScore precision, recall, and F1; requires `pip install bert_score`."""
    import bert_score as bs
    p, r, f1 = bs.score(hyps, refs, lang="en", verbose=True)
    # Average the per-sentence scores into single corpus-level values.
    p = torch.mean(p)
    r = torch.mean(r)
    f1 = torch.mean(f1)
    return p, r, f1


def get_rouge_L(hyps, refs):
    """Averaged ROUGE-L scores; requires `pip install rouge`."""
    from rouge import Rouge
    rouge = Rouge()
    scores = rouge.get_scores(hyps, refs, avg=True)
    return scores["rouge-l"]


def overall_metrics(hyps, refs):
    """Aggregate BLEU-1/2/3/4, BERTScore, and ROUGE-L into one nested dict."""
    # Prefix every text with "Text: " so that empty or whitespace-only
    # generations do not break the underlying metric implementations.
    hyps = ["Text: " + h for h in hyps]
    refs = ["Text: " + x for x in list(refs)]

    res_dict = {"bleu": {}, "bertscore": {}, "rouge-l": {}}

    bleures = bleu_1to4(hyps, refs)
    res_dict["bleu"]["1"] = bleures[0]
    res_dict["bleu"]["2"] = bleures[1]
    res_dict["bleu"]["3"] = bleures[2]
    res_dict["bleu"]["4"] = bleures[3]

    bertscore = BERTscore(hyps, refs)
    res_dict["bertscore"]["p"] = float(bertscore[0])
    res_dict["bertscore"]["r"] = float(bertscore[1])
    res_dict["bertscore"]["f1"] = float(bertscore[2])

    rouges = get_rouge_L(hyps, refs)
    res_dict["rouge-l"]["p"] = rouges["p"]
    res_dict["rouge-l"]["r"] = rouges["r"]
    res_dict["rouge-l"]["f1"] = rouges["f"]
    return res_dict


def main():
    gens = ["please do this! Can you do this? yes, I can!",
            "What day is it today?",
            "can you understand me?",
            "A C match B C."]
    ps = ["please do not do this! You cannot do that!",
          "What date is it today?",
          "It is difficult to follow you.",
          "A C match B C."]
    print(bleu_1to4(gens, ps))
    print(BERTscore(gens, ps))
    print(get_rouge_L(gens, ps))
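    # Added example: overall_metrics() aggregates BLEU, BERTScore, and
    # ROUGE-L into a single nested dict, reusing the same demo sentences.
    ppp(overall_metrics(gens, ps))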


# running entry
if __name__ == "__main__":
    main()
    print("EVERYTHING DONE.")