add support for follow-up eval
(Bill) Yuchen Lin committed Oct 28, 2024
1 parent 9657e36 commit c7071d8
Showing 10 changed files with 71,165 additions and 26 deletions.


23,002 changes: 23,002 additions & 0 deletions result_dirs_follow_up/zebra-grid/gpt-4o-2024-08-06.self_verification.T=1.json

Large diffs are not rendered by default.

23,002 changes: 23,002 additions & 0 deletions result_dirs_follow_up/zebra-grid/gpt-4o-mini-2024-07-18.self_verification.T=1.json

Large diffs are not rendered by default.

20 changes: 11 additions & 9 deletions src/evaluation/zebra_grid_eval.py
@@ -279,7 +279,7 @@ def eval_model(model, filepath, mode="best_of_n", max_N=None):
    return result, parsed_results  # Return parsed_results along with the result


-def gen_results(run_name_folders, bon=False):
+def gen_results(run_name_folders, bon=False, save_results=True):
    model_results = load_model_results(run_name_folders)

    def save_parsed_results(filepath, parsed_results, bon=bon):
@@ -351,13 +351,14 @@ def save_parsed_results(filepath, parsed_results, bon=bon):
    print(tabulate(table_data, headers=columns, tablefmt="fancy_outline", stralign="center", numalign="center"))
    # print(tabulate(rows, headers=columns, tablefmt="github"))

-    # write to json file
-    with open("result_dirs/zebra-grid.summary.json", "w") as f:
-        json.dump(rows, f, indent=2)
+    if save_results:
+        # write to json file
+        with open("result_dirs/zebra-grid.summary.json", "w") as f:
+            json.dump(rows, f, indent=2)

-    # write to markdown file
-    with open(f"result_dirs/zebra-grid.summary.md", "w") as f:
-        f.write(tabulate(table_data, headers=columns, tablefmt="github", stralign="center", numalign="center"))
+        # write to markdown file
+        with open(f"result_dirs/zebra-grid.summary.md", "w") as f:
+            f.write(tabulate(table_data, headers=columns, tablefmt="github", stralign="center", numalign="center"))


if __name__ == "__main__":
@@ -366,8 +367,9 @@ def save_parsed_results(filepath, parsed_results, bon=bon):
        # "greedy": "result_dirs/zebra-grid",
        # "sampling": "result_dirs/zebra-grid/sampling",
        # "bon_all": "result_dirs/zebra-grid/bon_all",
-        "rm": "result_dirs/zebra-grid/rm_32",
+        # "rm": "result_dirs/zebra-grid/rm_32",
+        "self_verification": "result_dirs/zebra-grid/self_verification",
    }
    load_private_solutions()
-    gen_results(run_name_folders, bon=True)
+    gen_results(run_name_folders, bon=False, save_results=False)

41 changes: 41 additions & 0 deletions src/follow_up_process.py
@@ -0,0 +1,41 @@
"""
This script is to add a follow-up instruction to the existing result file,
where there is already a chat-history of the previous conversation, and an existing output (at least one) from the model.
"""
import json, os
from templates import FOLLOW_UP

def add_follow_up_instruction(file_path, output_path, follow_up_mode="self_verification"):
# Load the existing data
with open(file_path, 'r') as file:
data = json.load(file)

for item in data:
# Add the follow-up instruction to each item
chat_history = item["chat_history"]
current_output = item["output"][0]
if follow_up_mode == "self_verification":
follow_up_prompt = FOLLOW_UP.SELF_VERIFICATION
else:
raise ValueError(f"Unknown follow_up_mode: {follow_up_mode}")
new_chat_history = chat_history + [current_output] + [follow_up_prompt]
item["chat_history"] = new_chat_history
item["output"] = []

# save the modified data back to the file
# create the output directory (and the parents) if it doesn't exist
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as file:
json.dump(data, file, indent=2)

if __name__ == "__main__":
# Example usage
file_path = "result_dirs/zebra-grid/gpt-4o-mini-2024-07-18.json"
follow_up_mode = "self_verification"
output_file = "result_dirs_follow_up/zebra-grid/gpt-4o-mini-2024-07-18.self_verification.T=1.json"
add_follow_up_instruction(file_path, output_file, follow_up_mode) # Call the function to add follow-up instruction

file_path = "result_dirs/zebra-grid/gpt-4o-2024-08-06.json"
output_file = "result_dirs_follow_up/zebra-grid/gpt-4o-2024-08-06.self_verification.T=1.json"
add_follow_up_instruction(file_path, output_file, follow_up_mode) # Call the function to add follow-up instruction
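To make the transformation above concrete, here is a minimal before/after sketch of one result item. The values are placeholders rather than real outputs; only the field names (session_id, chat_history, output) come from this commit, since they are what load_eval_data in src/unified_utils.py reads back.

# Hypothetical result item BEFORE add_follow_up_instruction (values are illustrative only):
item_before = {
    "session_id": "example-session-0",
    "chat_history": ["<original puzzle prompt>"],
    "output": ["<model's first answer in the required JSON format>"],
}

# AFTER: the first answer becomes an assistant turn, the self-verification
# instruction becomes the newest user turn, and "output" is emptied so the
# follow-up inference run generates a fresh (revised) answer.
item_after = {
    "session_id": "example-session-0",
    "chat_history": [
        "<original puzzle prompt>",
        "<model's first answer in the required JSON format>",
        FOLLOW_UP.SELF_VERIFICATION,
    ],
    "output": [],
}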

3 changes: 2 additions & 1 deletion src/task_configs.py
@@ -56,7 +56,8 @@ def result_format(output_item, args):
    if args.data_name in ["alpaca_eval"]:
        output_item["output"] = output_item["output"][0]  # use str instead of list
    elif args.data_name in ["zebra-grid"]:
-        del output_item["solution"]
+        if "solution" in output_item:
+            del output_item["solution"]
    else:
        pass
    return output_item
7 changes: 7 additions & 0 deletions src/templates/FOLLOW_UP.py
@@ -0,0 +1,7 @@
SELF_VERIFICATION = """
Please review the initial prompt, including the question and the constraints or requirements provided.
Reassess your reasoning and the answer you provided to ensure they align with the given information.
If any adjustments are needed, modify your reasoning and answer accordingly.
Finally, present your response in the same JSON format mentioned in the initial prompt.
If the original answer was already correct, you can simply repeat it in the same JSON format.
"""
8 changes: 7 additions & 1 deletion src/unified_infer.py
@@ -39,9 +39,12 @@ def parse_args():
    parser.add_argument('--start_index', default=0, type=int)  # 0 means from the beginning of the list
    parser.add_argument('--end_index', default=-1, type=int)  # -1 means to the end of the list
    parser.add_argument('--filepath', default="auto", type=str)

    parser.add_argument('--cache_filepath', default=None, type=str)

+    parser.add_argument('--follow_up_mode', default="N/A", type=str)  # N/A means not a follow up
+    parser.add_argument('--follow_up_file', default=None, type=str)  # if you have an existing file
+
    parser.add_argument('--overwrite', action='store_true')
    parser.add_argument('--no_repeat_ngram_size', default=0, type=int)
    parser.add_argument('--hf_bf16', action='store_true')
@@ -137,6 +140,9 @@ def sanitize_args(args):
    if args.use_imend_stop:
        IM_END_MODELS.append(args.model_name)

+    # TODO: we need to support the case when you have an existing file
+
+
    # Data loading
    id_strs, chat_history, model_inputs, metadata = load_eval_data(args)
51 changes: 36 additions & 15 deletions src/unified_utils.py
@@ -56,26 +56,47 @@ def apply_template(chat_history, model_name, args):


def load_eval_data(args, data_name=None, model_name=None):
    if data_name is None:
        data_name = args.data_name
    if model_name is None:
        model_name = args.model_name
-    chat_history = []
-    id_strs = []
-    metadata = {}
-    dataset, id_name = mapping_task_names(data_name)
-    print(f"Loaded {len(dataset)} examples from {data_name}")
-    for ind, item in enumerate(dataset):
-        id_strs.append(item.get(id_name, f"{data_name}#{ind}"))
-        prompt = prompt_generation(data_name, item, args)
-        chat_history.append([prompt])
-        for key in item:
-            if key not in metadata:
-                metadata[key] = []
-            metadata[key].append(item[key])
+    if args.follow_up_mode == "N/A":
+        chat_history = []
+        id_strs = []
+        metadata = {}
+        dataset, id_name = mapping_task_names(data_name)
+        print(f"Loaded {len(dataset)} examples from {data_name}")
+        for ind, item in enumerate(dataset):
+            id_strs.append(item.get(id_name, f"{data_name}#{ind}"))
+            prompt = prompt_generation(data_name, item, args)
+            chat_history.append([prompt])
+            for key in item:
+                if key not in metadata:
+                    metadata[key] = []
+                metadata[key].append(item[key])
+    elif args.follow_up_mode != "N/A" and os.path.exists(args.follow_up_file):
+        # load the file and use the content there to load the chat history, id_strs, and metadata, etc.
+        with open(args.follow_up_file, "r") as f:
+            follow_up_data = json.load(f)
+        print(f"Loaded {len(follow_up_data)} examples from {args.follow_up_file}")
+        id_strs = []
+        chat_history = []
+        metadata = {}
+        for item in follow_up_data:
+            id_strs.append(item.get("session_id", "N/A"))
+            chat_history.append(item.get("chat_history", []))
+            for key in item:
+                if key in ["configs", "model_input", "generator", "output", "session_id", "chat_history"]:
+                    continue
+                if key not in metadata:
+                    metadata[key] = []
+                metadata[key].append(item[key])
    print("Start applying template")
    model_inputs = apply_template(chat_history, model_name, args)
    return id_strs, chat_history, model_inputs, metadata
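A note on the follow-up branch above: each chat_history entry is expected to be a list of alternating user/assistant turns ending with the follow-up instruction as the newest user turn, matching what src/follow_up_process.py writes. A minimal sketch with placeholder contents:

# One chat_history entry as loaded from args.follow_up_file (placeholders only):
chat_history_entry = [
    "<original puzzle prompt>",             # user turn
    "<model's first JSON answer>",          # assistant turn
    "<the SELF_VERIFICATION instruction>",  # new user turn added by follow_up_process.py
]
# apply_template() presumably renders this alternating history into a single
# model input, and the next round of generation fills "output" with the revised answer.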
55 changes: 55 additions & 0 deletions zebra_logic_analysis/scripts/follow_up.sh
@@ -0,0 +1,55 @@
# Initialize default values
DATA_NAME="zebra-grid"
# model_name="openai/gpt-4o-mini-2024-07-18"
# model_pretty_name="gpt-4o-mini-2024-07-18.self_verification.T=1"
model_name="openai/gpt-4o-2024-08-06"
model_pretty_name="gpt-4o-2024-08-06.self_verification.T=1"
n_shards=8
run_name="self_verification"
TEMP=0
TOP_P=1.0
rp=1.0
engine_name="openai"
MAX_TOKENS=4096;
num_outputs=1 # New default value
batch_size=4;
CACHE_DIR=${HF_HOME:-"default"}


# Check if required arguments are provided
if [ -z "$DATA_NAME" ] || [ -z "$model_name" ] || [ -z "$model_pretty_name" ] || [ -z "$n_shards" ]; then
echo "Usage: $0 -d DATA_NAME -m model_name -p model_pretty_name -s n_shards [-r run_name] [-t TEMP] [-o TOP_P] [-e rp] [-f engine_name] [-n num_outputs]"
exit 1
fi

# output_dir="result_dirs/${DATA_NAME}/cot=${cot}/"
if [ "$run_name" = "default" ]; then
output_dir="result_dirs/${DATA_NAME}/"
else
output_dir="result_dirs/${DATA_NAME}/${run_name}/"
fi


echo "Using Data-parallelism"
shards_dir="${output_dir}/tmp_${model_pretty_name}"
for ((shard_id = 0; shard_id < $n_shards; shard_id++)); do
    python src/unified_infer.py \
        --follow_up_mode "self_verification" \
        --follow_up_file "result_dirs_follow_up/zebra-grid/${model_pretty_name}.json" \
        --num_shards $n_shards \
        --shard_id $shard_id \
        --data_name $DATA_NAME \
        --engine $engine_name \
        --model_name $model_name \
        --run_name $run_name \
        --model_pretty_name $model_pretty_name \
        --top_p $TOP_P --temperature $TEMP --repetition_penalty $rp \
        --batch_size $batch_size --max_tokens $MAX_TOKENS \
        --num_outputs $num_outputs \
        --output_folder $shards_dir/ \
        &
done
wait
python src/merge_results.py $shards_dir/ $model_pretty_name
cp $shards_dir/${model_pretty_name}.json $output_dir/${model_pretty_name}.json
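After the shards are merged, the revised outputs under result_dirs/zebra-grid/self_verification/ can be scored like any other run. A minimal sketch that mirrors the updated __main__ block of src/evaluation/zebra_grid_eval.py; the import path is an assumption, since that file is normally run directly as a script:

# Score the self-verification outputs without overwriting the main summary files.
from evaluation.zebra_grid_eval import gen_results, load_private_solutions  # assumed import path

run_name_folders = {"self_verification": "result_dirs/zebra-grid/self_verification"}
load_private_solutions()
gen_results(run_name_folders, bon=False, save_results=False)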
