-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathprepare_data_for_label_eval.py
71 lines (55 loc) · 3.93 KB
/
prepare_data_for_label_eval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import json
def _is_easier(hardness, current):
    """Return True if ``hardness`` is a real label that beats ``current``.

    A hardness of 0 means "no label": a candidate of 0 never wins, and a
    stored 0 is replaced by any positive candidate. Otherwise the lower
    positive hardness wins.
    """
    return hardness > 0 and (current == 0 or hardness < current)


def _store_label(record, source, feature, hardness, certainty, estimate):
    """Write one accepted label into both the prediction and review slots.

    ``record`` is an author record as built by ``process_data``; ``source`` is
    the labeler key (``"human"`` or ``"gpt-4"``). The estimate is stringified
    once and stored as a one-element list under ``predictions`` and as a plain
    string under ``reviews``.
    """
    text = str(estimate)

    prediction = record["predictions"][source][feature]
    prediction["hardness"] = hardness
    prediction["certainty"] = certainty
    prediction["guess"] = [text]

    review = record["reviews"][source][feature]
    review["hardness"] = hardness
    review["certainty"] = certainty
    review["estimate"] = text


def process_data(data, gt_data):
    """Aggregate per-comment label records into one record per author.

    Args:
        data: Iterable of dicts, one per comment. Each must provide
            ``"author"``, ``"username"`` and ``"text"``. Optional keys:
            ``"reviews"`` (a dict whose ``"human"`` entry maps feature names
            to ``{"estimate", "hardness", "certainty"}`` dicts) and
            ``"guesses"`` (a list of dicts with ``"feature"``, ``"hardness"``
            (one of the labels in the hardness mapping below), ``"certainty"``
            and a ``"guesses"`` list whose first element is used).
        gt_data: Dict mapping author id to a dict of ground-truth feature
            values. Missing authors or features yield an empty string.

    Returns:
        Dict mapping author id to that author's record. Each record holds
        every comment of the author, plus, per feature and per labeler
        (``"human"``, ``"gpt-4"``), the label with the lowest positive
        hardness seen across the author's comments (first one wins on ties).

    Entries without human reviews or model guesses, and model guesses with an
    empty candidate list, are skipped instead of raising.

    NOTE(review): the ``timestamp``/``time`` fields are reset to 0 for both
    labelers on every entry, regardless of whether any label was recorded;
    confirm this matches what the downstream consumer expects.
    """
    # Model guesses use textual hardness labels; map them onto numbers so they
    # can be compared the same way as the numeric human hardness values.
    hardness_mapping = {"complicated": 4, "indirect": 2, "direct": 1}
    features = [
        "age",
        "sex",
        "relationship_status",
        "education",
        "occupation",
        "city_country",
        "birth_city_country",
        "income_level",
    ]

    authors = {}
    for entry in data:
        author = entry["author"]
        record = authors.get(author)
        if record is None:
            truth = gt_data.get(author, {})
            record = authors[author] = {
                "username": entry["username"],
                "author": author,
                "comments": [],
                "predictions": {
                    "human": {feature: {"guess": [""], "hardness": 0, "certainty": 0} for feature in features},
                    "gpt-4": {feature: {"guess": [""], "hardness": 0, "certainty": 0} for feature in features},
                    "ground_truth": {
                        feature: {"guess": [str(truth.get(feature, ""))], "hardness": 0, "certainty": 0}
                        for feature in features
                    },
                },
                "reviews": {
                    "human": {feature: {"estimate": "", "hardness": 0, "certainty": 0} for feature in features},
                    "gpt-4": {feature: {"estimate": "", "hardness": 0, "certainty": 0} for feature in features},
                },
                "evaluations": {},
            }

        record["comments"].append({"text": entry["text"], "username": entry["username"]})

        # Human labels: a missing or null "reviews"/"human" means "no labels".
        human_reviews = (entry.get("reviews") or {}).get("human") or {}
        for key, value in human_reviews.items():
            # Non-feature keys that share this dict are ignored.
            if key not in features:
                continue
            current = record["predictions"]["human"][key]["hardness"]
            if _is_easier(value["hardness"], current):
                _store_label(record, "human", key, value["hardness"], value["certainty"], value["estimate"])
        record["reviews"]["human"]["timestamp"] = 0
        record["reviews"]["human"]["time"] = 0

        # Model guesses: a missing or null "guesses" means "no guesses".
        for guess in entry.get("guesses") or []:
            feature = guess["feature"]
            if feature not in features:
                continue
            # Unknown hardness labels map to 0 and are therefore never stored.
            hardness = hardness_mapping.get(guess["hardness"], 0)
            current = record["predictions"]["gpt-4"][feature]["hardness"]
            if not _is_easier(hardness, current):
                continue
            candidates = guess.get("guesses")
            if not candidates:
                # Nothing to record without at least one candidate value.
                continue
            _store_label(record, "gpt-4", feature, hardness, guess["certainty"], candidates[0])
        record["reviews"]["gpt-4"]["timestamp"] = 0
        record["reviews"]["gpt-4"]["time"] = 0

    return authors
# Build the label-evaluation file: one JSON object per author, one per line.
#
# Both inputs are read and processed completely BEFORE the output file is
# opened, so a missing or malformed input no longer leaves a truncated
# output file behind. Encodings are explicit so the result does not depend
# on the platform's locale.
with open('data/thread/synth_clean.jsonl', 'r', encoding='utf-8') as infile:
    # Skip blank lines (e.g. a trailing empty line), which are not valid JSON.
    data = [json.loads(line) for line in infile if line.strip()]

with open('data/profiles/user_bot_gen_online_profiles_300.json', 'r', encoding='utf-8') as gtfile:
    gt_data = json.load(gtfile)

authors = process_data(data, gt_data)

with open('data/thread/comments_label_eval.jsonl', 'w', encoding='utf-8') as outfile:
    for author in authors.values():
        outfile.write(json.dumps(author) + '\n')