Add SimPO and DPO results on Gemma 2

allenai · Jul 19, 2024 · 1c8615b · 1c8615b
1 parent 20d410d
commit 1c8615b
Show file tree

Hide file tree

Showing 21 changed files with 133,331 additions and 2,611 deletions.
diff --git a/eval_results/v2.0522/score.v2/eval=claude-3-5-sonnet-20240620/gpt-4o-2024-05-13.json b/eval_results/v2.0522/score.v2/eval=claude-3-5-sonnet-20240620/gpt-4o-2024-05-13.json
diff --git a/..._results/v2.0625/score.v2/eval=claude-3-5-sonnet-20240620/claude-3-5-sonnet-20240620.json b/..._results/v2.0625/score.v2/eval=claude-3-5-sonnet-20240620/claude-3-5-sonnet-20240620.json
diff --git a/eval_results/v2.0625/score.v2/eval=claude-3-5-sonnet-20240620/gemini-1.5-pro.json b/eval_results/v2.0625/score.v2/eval=claude-3-5-sonnet-20240620/gemini-1.5-pro.json
diff --git a/eval_results/v2.0625/score.v2/eval=claude-3-5-sonnet-20240620/gpt-4o-2024-05-13.json b/eval_results/v2.0625/score.v2/eval=claude-3-5-sonnet-20240620/gpt-4o-2024-05-13.json
diff --git a/eval_results/v2.0625/score.v2/eval=claude-3-5-sonnet-20240620/gpt-4o-mini-2024-07-18.json b/eval_results/v2.0625/score.v2/eval=claude-3-5-sonnet-20240620/gpt-4o-mini-2024-07-18.json
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/gemma-2-9b-it-DPO.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/gemma-2-9b-it-DPO.json
diff --git a/evaluation/run_eval_v2_instant.score.sh b/evaluation/run_eval_v2_instant.score.sh
@@ -7,7 +7,7 @@ total_ex=1024
 
 
 eval_template="evaluation/eval_template.score.v2.md"
-eval_folder="eval_results/v2.0522/score.v2/eval=${gpt_eval_name}/"
+eval_folder="eval_results/v2.0625/score.v2/eval=${gpt_eval_name}/"
 echo "Evaluating $model_name using $gpt_eval_name with $eval_template"
 mkdir -p $eval_folder 
 

diff --git a/leaderboard/data_dir/_create_tables.py b/leaderboard/data_dir/_create_tables.py
@@ -67,6 +67,9 @@
     # folder = FOLDER+"/score.v2/eval=gpt-4-turbo-2024-04-09/"
     folder = FOLDER+"/score.v2/eval=gpt-4o-2024-05-13/"
     MODE = "score"
+elif ACTION == "score-sonnet":
+    folder = FOLDER+"/score.v2/eval=claude-3-5-sonnet-20240620/"
+    MODE = "score"
 else:
     print("Please provide either 'pairwise' or 'score' as the argument")
     sys.exit()
@@ -80,8 +83,8 @@
 files = os.listdir(folder)
 table = []
 for file in tqdm(files, desc=f"Processing {folder.replace(FOLDER, '')}"):
-    if file.endswith(".json"):
-        # print(f"Processing {file}")
+    if file.endswith(".json") and not any([x in file for x in ["128", "256", "384", "512", "640", "768", "896", "1024"]]):
+        print(f"Processing {file}")
         eval_result = []
         with open(f"{folder}/{file}", "r") as f:
             eval_result = json.load(f)
@@ -205,6 +208,14 @@
         elif MODE == "score":
             task_cat_results = {}
             for item in eval_result:
+                # print(item.keys())
+                if ACTION == "score-sonnet" and "parsed_result" in item:
+                    item["score"] = item["parsed_result"]["score"]
+                    if type(item["model_output"]) == list:
+                        item["model_output"] = item["model_output"][0]
+                    item["model_test"] = item["generator"]
+                if 'score' not in item:
+                    print(item)
                 scores.append(float(item["score"]))
                 model_output = item["model_output"]
                 if model_output.endswith("... (truncated)"):
@@ -270,6 +281,7 @@
     json.dump(result, f, indent=2)
 
 """
+python data_dir/_create_tables.py score-sonnet
 python data_dir/_create_tables.py score
 python data_dir/_create_tables.py pairwise-gpt4t -1
 python data_dir/_create_tables.py pairwise-llama -1