-
Notifications
You must be signed in to change notification settings - Fork 0
/
gpt_eval.py
126 lines (108 loc) · 6.04 KB
/
gpt_eval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import json
import time
import openai
from tqdm import tqdm
import os
import argparse
import gzip
from typing import Dict, Iterable
def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
    """
    Serialize each dictionary in ``data`` as one JSON line in ``filename``.

    ``~`` in ``filename`` is expanded. The file is opened in binary mode and
    every record is written as UTF-8 encoded JSON followed by a newline.
    When the (expanded) name ends with ``.gz`` the records are written through
    a gzip stream layered on top of the opened file. With ``append=True`` the
    file is opened for appending instead of being truncated.
    """
    target = os.path.expanduser(filename)
    file_mode = "ab" if append else "wb"

    with open(target, file_mode) as raw:
        if not target.endswith(".gz"):
            for record in data:
                raw.write((json.dumps(record) + "\n").encode("utf-8"))
            return

        # The gzip layer always uses "wb"; appending is governed by how the
        # underlying file object was opened.
        with gzip.GzipFile(fileobj=raw, mode="wb") as packed:
            for record in data:
                packed.write((json.dumps(record) + "\n").encode("utf-8"))
class Get:
    """Ask an Azure-configured OpenAI chat deployment to rate a summary's coherence."""

    # Tasks for which an evaluation prompt is defined.
    _SUPPORTED_TASKS = ("summ_screen", "DM")

    # Failed requests tolerated before giving up, and the pause between tries.
    _MAX_RETRIES = 3
    _RETRY_DELAY_SECONDS = 18

    # Placeholder credential; replace with a real token before running.
    _API_KEY_PLACEHOLDER = "please input your token here"

    # Shared prompt text for every supported task, up to and including the
    # opening brace that precedes the source document.
    _COHERENCE_PROMPT_HEAD = (
        "You will be given one summary written for a TV show episode.\n"
        "Your task is to rate the summary on one metric.\n"
        "Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.\n"
        "\n"
        "Evaluation Criteria:\n"
        "Coherence (1-5) - the collective quality of all sentences. We align this dimension with the DUC quality question of structure and coherence whereby \"the summary should be well-structured and well-organized. The summary should not just be a heap of related information, but should build from sentence to sentence to a coherent body of information about a topic.\"\n"
        "\n"
        "Evaluation Steps:\n"
        "1. Read the news article carefully and identify the main topic and key points.\n"
        "2. Read the summary and compare it to the news article. Check if the summary covers the main topic and key points of the news article, and if it presents them in a clear and logical order.\n"
        "3. Assign a score for coherence on a scale of 1 to 5, where 1 is the lowest and 5 is the highest based on the Evaluation Criteria.\n"
        "\n"
        "Example:\n"
        "Source Text:\n"
        "{"
    )

    def __init__(self):
        self.prompt = ""

    def _build_instruction(self, doc, input, task):
        """Return the evaluation prompt for ``task``; raise ValueError if unsupported."""
        if task not in self._SUPPORTED_TASKS:
            raise ValueError(
                "unsupported task {!r}; expected one of {}".format(
                    task, ", ".join(self._SUPPORTED_TASKS)
                )
            )
        # Plain concatenation (not str.format) because the prompt itself
        # contains literal braces and the inserted texts may as well.
        return (
            self._COHERENCE_PROMPT_HEAD
            + doc
            + "}"
            + "\nSummary:\n{"
            + input
            + "}\nEvaluation Form (scores ONLY):\n- Coherence:"
        )

    def calc(self, doc="", input="", temp=0, n=1, model=3.5, task="code"):
        """
        Request ``n`` coherence ratings for a summary and return the raw replies.

        Args:
            doc: Source text inserted into the prompt.
            input: Summary text inserted into the prompt.
            temp: Sampling temperature forwarded to the API.
            n: Number of completions requested in one call.
            model: ``3.5`` selects deployment "gpt-35-turbo", ``4`` selects
                "gpt4-PTU", anything else selects "gpt-4".
            task: Must be "summ_screen" or "DM".

        Returns:
            A list with the text content of every returned choice. If the
            request keeps failing, ``[" "]`` is returned after the retries
            are exhausted.

        Raises:
            ValueError: If ``task`` is not supported. This is checked before
                any API configuration or network call happens.
        """
        # Validate first so an unsupported task fails fast with a clear error
        # instead of an unbound-variable error after the API setup.
        instruction = self._build_instruction(doc, input, task)

        openai.api_type = "azure"  # 'azure', 'azure_ad', 'open_ai'
        openai.api_base = "https://runway.devops.xiaohongshu.com"
        openai.api_version = "2023-05-15"  # "2023-03-15-preview"
        openai.api_key = self._API_KEY_PLACEHOLDER

        if model == 3.5:
            deployment = "gpt-35-turbo"
        elif model == 4:
            deployment = "gpt4-PTU"
        else:
            deployment = "gpt-4"

        messages = [{"role": "system", "content": instruction}]
        failures = 0
        while True:
            try:
                response = openai.ChatCompletion.create(
                    deployment_id=deployment,
                    messages=messages,
                    top_p=1,
                    n=n,
                    temperature=temp,
                )
                return [choice["message"]["content"] for choice in response["choices"]]
            except Exception as e:
                # Best-effort boundary around the remote call: any failure
                # (request or response parsing) is retried a few times.
                if failures >= self._MAX_RETRIES:
                    return [" "]
                print("An error occurred:", e)
                failures += 1
                time.sleep(self._RETRY_DELAY_SECONDS)
def str_to_float(input_list):
    """
    Return the mean of the entries of ``input_list`` that parse as floats.

    Entries that ``float()`` rejects (non-numeric text, ``None``, ...) are
    skipped silently.

    Raises:
        ZeroDivisionError: If no entry could be parsed, including when
            ``input_list`` is empty.
    """
    values = []
    for item in input_list:
        try:
            values.append(float(item))
        except (TypeError, ValueError, OverflowError):
            # Unparseable reply: leave it out of the average.
            continue
    if not values:
        # Same exception type the bare division produced, but with a message.
        raise ZeroDivisionError("no numeric scores could be parsed")
    return sum(values) / len(values)
if __name__ == "__main__":
    # Script entry point: read a JSONL file of records, ask the model to rate
    # the first completion of each record, and write the averaged scores to a
    # sibling "...gpt<model>eval.jsonl" file.
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_path', help="input jsonl path", required=True)
    args = parser.parse_args()
    input_path = args.input_path

    input_data = []
    with open(input_path, "r", encoding="utf-8") as f:
        for line in tqdm(f):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            input_data.append(json.loads(line))

    Gen = Get()
    model = 4
    temp = 1
    n = 20
    samples = []
    datasets = ["summ_screen", "DM"]
    dataset = datasets[0]
    output_path = input_path.split(".json")[0] + "gpt{}eval".format(model) + ".jsonl"

    for item in tqdm(input_data):
        scores = []
        # Only the first completion of each record is evaluated.
        for candidate in item["completion"][:1]:
            completion = Gen.calc(
                doc=item["input"],
                input=candidate,
                temp=temp,
                n=n,
                model=model,
                task=dataset,
            )[:n]
            try:
                scores.append(str_to_float(completion))
            except (ZeroDivisionError, ValueError):
                # None of the model replies could be parsed as a number.
                print("bad")
        try:
            samples.append(dict(input=item["input"], answer=item["answer"], score=sum(scores) / len(scores)))
        except (KeyError, ZeroDivisionError):
            # Record has no "answer" field or produced no usable score.
            print("N/A")
        # Rewrite the output after every record so partial results survive
        # an interrupted run.
        write_jsonl(output_path, samples)
    # Final write; also creates the output file when there were no records.
    write_jsonl(output_path, samples)