-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_qc.py
151 lines (132 loc) · 4.59 KB
/
run_qc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import time
import os
import json
from pathlib import Path
from typing import Optional
from random import random, choice
from openai import OpenAI
import requests
# OpenAI API key read from the environment; None if OPENAI_LITCOIN_KEY is unset
# (requests made with a None key will be rejected by the API).
api_key = os.environ.get("OPENAI_LITCOIN_KEY")
def write_query(text):
    """Build the QC prompt for one UMLS-term / UniProtKB-candidates record.

    Parameters
    ----------
    text : str
        A JSON blob holding the UMLS term, its lexical synonyms, and one or
        more candidate UniProtKB entries with labels; appended verbatim to
        the prompt.

    Returns
    -------
    str
        The full prompt, including the JSON output format the model must
        follow.
    """
    # Fix: the example JSON previously omitted the comma after the
    # "UMLS_id" value and the colon after "match_score", showing the model
    # malformed JSON to imitate.  Both are corrected below.
    prompt = f"""
This JSON structure contains a UMLS term for a protein, along with its lexical synonyms.
It also lists one or more UniProtKB terms for that protein with their label.
You must decide whether the UMLS term refers to the same protein as each of the UniProtKB terms.
Return your results in only the following JSON format with no extraneous text:
{{
"UMLS_id": <input umls id>,
"UniProtKBs": [
{{ "UniProtKB_id": <uniprotkb_id>,
"explanation": <text explaining the match score>,
"match_score": <0-5, 5 being the best>
}} ]}}
{text}
"""
    return prompt
def run_qc(text, out_file_path: Optional[str | Path] = None):
    """Run one QC query against the chat API and return the parsed results.

    Parameters
    ----------
    text : str
        JSON blob for one UMLS/UniProtKB record (passed to write_query).
    out_file_path : str | Path | None
        If given, the results are also written to this path as JSON.

    Returns
    -------
    The parsed JSON results from query().
    """
    prompt = write_query(text)
    results = query(prompt)
    # Fix: results were previously computed and silently discarded, and
    # out_file_path was accepted but never used.
    if out_file_path is not None:
        Path(out_file_path).write_text(json.dumps(results, indent=2))
    return results
def query(prompt):
    """Send *prompt* to the OpenAI chat-completions API and parse the reply.

    Only the span from the first '[' through the last ']' of the model's
    message content is parsed — i.e. the "UniProtKBs" list, not the whole
    response object.

    Raises
    ------
    requests.HTTPError
        On a non-2xx API response.
    ValueError
        If the reply contains no '['...']' span.
    json.JSONDecodeError
        If that span is not valid JSON.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    payload = {
        "model": "gpt-4o-mini",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt,
                    }
                ],
            }
        ],
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=600,  # requests has no default timeout; don't hang forever
    )
    # Fix: surface API errors explicitly instead of failing with a
    # confusing KeyError on the error body below.
    response.raise_for_status()
    content = response.json()["choices"][0]["message"]["content"]
    chunk = content[content.index("["):content.rindex("]") + 1]
    return json.loads(chunk)
def go():
    """Smoke-test run_qc on only the first record of the UMLS/UniProtKB file."""
    with open("outputs/UMLS_UniProtKB") as handle:
        first_record = next(iter(handle), None)
        if first_record is not None:
            run_qc(first_record.strip())
def create_good_batch_line(line, count):
    """Serialize one Batch-API request (a correct UMLS/UniProtKB pairing) as a JSONL line."""
    request = {
        "custom_id": f"request-{count}-good",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "messages": [{"role": "user", "content": write_query(line)}],
            "max_tokens": 1000,
        },
    }
    return json.dumps(request)
def create_bad_batch_line(line1, line2, count):
    """Serialize a deliberately mismatched request: line1's UMLS term paired
    with line2's UniProtKB candidates (a known-bad control for QC)."""
    blob = json.loads(line1)
    blob["UniProtKBs"] = json.loads(line2)["UniProtKBs"]
    request = {
        "custom_id": f"request-{count}-bad",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "messages": [{"role": "user", "content": write_query(json.dumps(blob))}],
            "max_tokens": 1000,
        },
    }
    return json.dumps(request)
def build_batch_file(n=0, error_rate=0.1):
    """Write outputs/batch_file.jsonl for the OpenAI Batch API.

    Emits one "good" request per input record, and with probability
    *error_rate* after each one, an extra "bad" request built from two
    randomly chosen records.  When n > 0, stop once the running request
    count reaches n (checked after each input record).
    """
    with open("outputs/UMLS_UniProtKB_labels.jsonl") as source:
        records = source.readlines()
    written = 0
    with open("outputs/batch_file.jsonl", "w") as sink:
        for record in records:
            written += 1
            sink.write(create_good_batch_line(record, written) + "\n")
            if random() < error_rate:
                written += 1
                sink.write(create_bad_batch_line(choice(records), choice(records), written) + "\n")
            if n > 0 and written >= n:
                break
def upload_batch_file():
    """Upload outputs/batch_file.jsonl and create a 24h Batch-API job.

    Returns
    -------
    The created Batch object (its .id is used for polling later).
    """
    client = OpenAI(api_key=api_key)
    # Fix: open the file in a context manager so the handle is closed
    # (it was previously leaked).
    with open("outputs/batch_file.jsonl", "rb") as batch_file:
        batch_input_file = client.files.create(
            file=batch_file,
            purpose="batch",
        )
    batch = client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": "QC"
        },
    )
    return batch
def retrieve_query(batch):
    """Poll *batch* every five minutes until it leaves an active state, then
    download its output file to outputs/batch_qc.jsonl."""
    client = OpenAI(api_key=api_key)
    current = client.batches.retrieve(batch.id)
    print(json.dumps(current.to_dict(), indent=4))
    while current.status in ("validating", "in_progress", "finalizing"):
        time.sleep(300)  # five-minute polling interval
        current = client.batches.retrieve(current.id)
        print(json.dumps(current.to_dict(), indent=4))
    file_response = client.files.content(current.output_file_id)
    with open("outputs/batch_qc.jsonl", "w") as out:
        out.write(file_response.text)
def get_batch():
    """Return the most recent batch on the account (first of the last 10 listed)."""
    client = OpenAI(api_key=api_key)
    return client.batches.list(limit=10).data[0]
def go_batch():
    """End-to-end driver: build the batch file, upload it, and poll for results."""
    build_batch_file(0, 0.1)
    submitted = upload_batch_file()
    # To resume polling an already-submitted batch, swap in: submitted = get_batch()
    retrieve_query(submitted)


if __name__ == "__main__":
    go_batch()