forked from THUDM/AutoWebGLM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
eval.py
137 lines (110 loc) · 3.68 KB
/
eval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import ast
import json
import re
import sys

import jieba  # you can use any other word cutting library
import numpy as np
from rouge_chinese import Rouge
def get_rouge_score(hypothesis, reference):
    """Return the ROUGE-1 F1 score between two texts.

    Both texts are segmented with ``jieba`` and re-joined with single spaces
    before being passed to ``Rouge.get_scores``.

    Args:
        hypothesis: Candidate text. Non-string values are converted with
            ``str()`` so that numeric arguments can still be compared.
        reference: Text to compare against; converted like ``hypothesis``.

    Returns:
        ``None`` when either argument is ``None``; ``0.0`` when either text
        is blank or cannot be scored; otherwise the ``'f'`` entry of the
        ``"rouge-1"`` result.
    """
    if hypothesis is None or reference is None:
        return None

    # jieba only accepts text; callers may pass ints or other literals.
    hypothesis = str(hypothesis)
    reference = str(reference)

    # Blank text has nothing to overlap with; answer before invoking Rouge,
    # which rejects empty input.
    if not hypothesis.strip() or not reference.strip():
        return 0.0

    hypothesis = ' '.join(jieba.cut(hypothesis))
    reference = ' '.join(jieba.cut(reference))

    rouge = Rouge()
    try:
        scores = rouge.get_scores(hypothesis, reference)
    except ValueError:
        # NOTE(review): presumably raised when sentence splitting leaves
        # nothing to score (e.g. punctuation-only text) -- treated as "no
        # overlap" rather than aborting the whole evaluation run.
        return 0.0
    return scores[0]["rouge-1"]['f']
def parse_function_call(function_call):
    """Split a textual call such as ``click('12')`` into name and arguments.

    The text must start with ``name(``; everything up to the last ``)`` on
    that line is taken as the argument list, and any text after it is ignored.

    SECURITY: the input is model-generated text, so the arguments are never
    executed. They are parsed with ``ast`` and each one must be a Python
    literal (string, number, tuple, list, dict, set, bool, ``None``).
    Arbitrary expressions, names, keyword arguments and ``*`` unpacking are
    rejected instead of being evaluated.

    Args:
        function_call: Text expected to begin with a function-call expression.

    Returns:
        ``(function_name, args)`` where ``args`` is a tuple of the literal
        positional arguments, or ``None`` when the text does not start with
        a call.

    Raises:
        SyntaxError: If the argument list is not valid Python syntax.
        ValueError: If the argument list contains anything other than
            positional literals.
    """
    pattern = r"(\w+)\((.*)\)"
    match = re.match(pattern, function_call)
    if not match:
        return None

    function_name = match.group(1)
    raw_args = match.group(2)

    # Parse the argument list as the arguments of a dummy call so that empty
    # lists, trailing commas and commas inside string literals are handled by
    # the Python parser itself.
    call_node = ast.parse(f'_call({raw_args})', mode='eval').body
    if not isinstance(call_node, ast.Call) or call_node.keywords:
        raise ValueError(f'unsupported argument list: {raw_args!r}')

    function_args = tuple(ast.literal_eval(arg) for arg in call_node.args)
    return function_name, function_args
# For each known action name, the ``ans`` fields that its positional
# arguments fill, in call order.
_ACTION_ARG_FIELDS = {
    'click': ('label',),
    'hover': ('label',),
    'select': ('label', 'param'),
    'type_string': ('label', 'param'),
    'scroll_page': ('param',),
    'go': ('param',),
    'jump_to': ('param',),
    'switch_tab': ('param',),
    'user_input': ('param',),
    'finish': ('param',),
}


def extract(text):
    """Parse an action string into its type, element label and parameter.

    Args:
        text: Action text such as ``click('12')``, handed to
            ``parse_function_call``.

    Returns:
        A dict with keys ``'type'``, ``'label'`` and ``'param'``. ``'type'``
        is the function name, or ``None`` when ``text`` is not a call.
        ``'label'`` and ``'param'`` are filled from the positional arguments
        according to the action type. They stay ``None`` for unknown action
        types and for arguments that are missing from the call, so a
        truncated call such as ``click()`` still reports its type instead of
        raising ``IndexError``.
    """
    ans = {
        'type': None,
        'label': None,
        'param': None
    }

    parsed = parse_function_call(text)
    if not parsed:
        return ans

    ans['type'], args = parsed
    # zip() stops at the shorter sequence: surplus arguments are ignored and
    # missing ones leave their field at None.
    for field, value in zip(_ACTION_ARG_FIELDS.get(ans['type'], ()), args):
        ans[field] = value
    return ans
def _load_labels(raw_labels):
    """Return the candidate label strings of one record as a list."""
    try:
        labels = json.loads(raw_labels)
    except (TypeError, ValueError):
        # Not JSON text (a bare action string) or not a string at all:
        # treat the raw value as a single label.
        return [raw_labels]
    # Anything that decodes to a non-list is likewise one raw label; iterating
    # a decoded string would otherwise walk it character by character.
    return labels if isinstance(labels, list) else [raw_labels]


def _score_record(ix, r):
    """Score one result record against its candidate labels.

    Returns a dict holding any of the keys ``'type'``, ``'label'``,
    ``'param'`` and ``'all'``. Candidates are tried in order, later ones
    overwrite earlier scores, and the loop stops at the first candidate whose
    type and label both match the prediction.
    """
    res = {}
    labels = _load_labels(r['labels'])
    # The prediction is the text after the last 'A: ' marker; it does not
    # depend on the label, so compute it once per record.
    pred = r['predict'].split('A: ')[-1].strip()

    for label in labels:
        try:
            label_ans = extract(label)
            pred_ans = extract(pred)
        except Exception as err:
            # Unparsable label or prediction: skip this candidate, but keep
            # Ctrl-C working and leave a trace of why it was skipped.
            print(f'{ix}. skipped unparsable action: {err!r}', file=sys.stderr)
            continue

        print(f'{ix}. label:', label_ans)
        print(f'{ix}. pred:', pred_ans)

        same_type = label_ans['type'] == pred_ans['type']
        same_label = label_ans['label'] == pred_ans['label']

        if label_ans['type'] is not None:
            res['type'] = 1 if same_type else 0

        if label_ans['label'] is not None:
            res['label'] = 1 if same_label else 0

        if label_ans['param'] is not None:
            rouge = get_rouge_score(label_ans['param'], pred_ans['param'])
            # NOTE(review): the truthiness test also drops genuine 0.0
            # scores, which raises the reported 'param' mean. Kept as-is to
            # stay comparable with earlier runs -- confirm whether
            # `rouge is not None` was intended.
            if rouge:
                res['param'] = rouge

        if label_ans['type'] is not None and label_ans['label'] is not None:
            if same_type and same_label:
                res['all'] = 1
                break
            res['all'] = 0

    return res


def main(result_path):
    """Evaluate a JSON-lines results file and print the mean of each metric.

    Each non-blank line must be a JSON object with a ``'labels'`` entry (a
    JSON list of action strings, or a single action string) and a
    ``'predict'`` entry (the model output).

    Args:
        result_path: Path of the JSON-lines results file.

    Returns:
        A dict mapping ``'type'``, ``'label'``, ``'param'`` and ``'all'`` to
        the mean of the collected scores, or ``0.0`` when none were collected.
    """
    res_list = {
        'type': [],
        'label': [],
        'param': [],
        'all': []
    }

    # Stream the file line by line and make sure the handle is closed.
    with open(result_path, encoding='utf-8') as result_file:
        for ix, r_str in enumerate(result_file):
            if not r_str.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            for k, v in _score_record(ix, json.loads(r_str)).items():
                res_list[k].append(v)

    summary = {
        k: float(np.mean(v)) if v else 0.0
        for k, v in res_list.items()
    }
    print(summary)
    return summary


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} <result_path>')
    main(sys.argv[1])