add support for follow-up eval
(Bill) Yuchen Lin committed Oct 28, 2024
1 parent 9657e36 commit c7071d8
Showing 10 changed files with 71,165 additions and 26 deletions.


23,002 changes: 23,002 additions & 0 deletions result_dirs_follow_up/zebra-grid/gpt-4o-2024-08-06.self_verification.T=1.json

Large diffs are not rendered by default.

23,002 changes: 23,002 additions & 0 deletions result_dirs_follow_up/zebra-grid/gpt-4o-mini-2024-07-18.self_verification.T=1.json

Large diffs are not rendered by default.

20 changes: 11 additions & 9 deletions src/evaluation/zebra_grid_eval.py
@@ -279,7 +279,7 @@ def eval_model(model, filepath, mode="best_of_n", max_N=None):
    return result, parsed_results  # Return parsed_results along with the result


-def gen_results(run_name_folders, bon=False):
+def gen_results(run_name_folders, bon=False, save_results=True):
    model_results = load_model_results(run_name_folders)

    def save_parsed_results(filepath, parsed_results, bon=bon):
@@ -351,13 +351,14 @@ def save_parsed_results(filepath, parsed_results, bon=bon):
    print(tabulate(table_data, headers=columns, tablefmt="fancy_outline", stralign="center", numalign="center"))
    # print(tabulate(rows, headers=columns, tablefmt="github"))

-    # write to json file
-    with open("result_dirs/zebra-grid.summary.json", "w") as f:
-        json.dump(rows, f, indent=2)
+    if save_results:
+        # write to json file
+        with open("result_dirs/zebra-grid.summary.json", "w") as f:
+            json.dump(rows, f, indent=2)

-    # write to markdown file
-    with open(f"result_dirs/zebra-grid.summary.md", "w") as f:
-        f.write(tabulate(table_data, headers=columns, tablefmt="github", stralign="center", numalign="center"))
+        # write to markdown file
+        with open(f"result_dirs/zebra-grid.summary.md", "w") as f:
+            f.write(tabulate(table_data, headers=columns, tablefmt="github", stralign="center", numalign="center"))


if __name__ == "__main__":
@@ -366,8 +367,9 @@ def save_parsed_results(filepath, parsed_results, bon=bon):
        # "greedy": "result_dirs/zebra-grid",
        # "sampling": "result_dirs/zebra-grid/sampling",
        # "bon_all": "result_dirs/zebra-grid/bon_all",
-        "rm": "result_dirs/zebra-grid/rm_32",
+        # "rm": "result_dirs/zebra-grid/rm_32",
+        "self_verification": "result_dirs/zebra-grid/self_verification",
    }
    load_private_solutions()
-    gen_results(run_name_folders, bon=True)
+    gen_results(run_name_folders, bon=False, save_results=False)

41 changes: 41 additions & 0 deletions src/follow_up_process.py
@@ -0,0 +1,41 @@
"""
This script is to add a follow-up instruction to the existing result file,
where there is already a chat-history of the previous conversation, and an existing output (at least one) from the model.
"""
import json, os
from templates import FOLLOW_UP

def add_follow_up_instruction(file_path, output_path, follow_up_mode="self_verification"):
# Load the existing data
with open(file_path, 'r') as file:
data = json.load(file)

for item in data:
# Add the follow-up instruction to each item
chat_history = item["chat_history"]
current_output = item["output"][0]
if follow_up_mode == "self_verification":
follow_up_prompt = FOLLOW_UP.SELF_VERIFICATION
else:
raise ValueError(f"Unknown follow_up_mode: {follow_up_mode}")
new_chat_history = chat_history + [current_output] + [follow_up_prompt]
item["chat_history"] = new_chat_history
item["output"] = []

# save the modified data back to the file
# create the output directory (and the parents) if it doesn't exist
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as file:
json.dump(data, file, indent=2)

if __name__ == "__main__":
# Example usage
file_path = "result_dirs/zebra-grid/gpt-4o-mini-2024-07-18.json"
follow_up_mode = "self_verification"
output_file = "result_dirs_follow_up/zebra-grid/gpt-4o-mini-2024-07-18.self_verification.T=1.json"
add_follow_up_instruction(file_path, output_file, follow_up_mode) # Call the function to add follow-up instruction

file_path = "result_dirs/zebra-grid/gpt-4o-2024-08-06.json"
output_file = "result_dirs_follow_up/zebra-grid/gpt-4o-2024-08-06.self_verification.T=1.json"
add_follow_up_instruction(file_path, output_file, follow_up_mode) # Call the function to add follow-up instruction
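To make the transformation above concrete, here is a minimal before/after sketch of one result item. The values are placeholders rather than real outputs; only the field names (session_id, chat_history, output) come from this commit, since they are what load_eval_data in src/unified_utils.py reads back.

# Hypothetical result item BEFORE add_follow_up_instruction (values are illustrative only):
item_before = {
    "session_id": "example-session-0",
    "chat_history": ["<original puzzle prompt>"],
    "output": ["<model's first answer in the required JSON format>"],
}

# AFTER: the first answer becomes an assistant turn, the self-verification
# instruction becomes the newest user turn, and "output" is emptied so the
# follow-up inference run generates a fresh (revised) answer.
item_after = {
    "session_id": "example-session-0",
    "chat_history": [
        "<original puzzle prompt>",
        "<model's first answer in the required JSON format>",
        FOLLOW_UP.SELF_VERIFICATION,
    ],
    "output": [],
}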

3 changes: 2 additions & 1 deletion src/task_configs.py
@@ -56,7 +56,8 @@ def result_format(output_item, args):
    if args.data_name in ["alpaca_eval"]:
        output_item["output"] = output_item["output"][0]  # use str instead of list
    elif args.data_name in ["zebra-grid"]:
-        del output_item["solution"]
+        if "solution" in output_item:
+            del output_item["solution"]
    else:
        pass
    return output_item
7 changes: 7 additions & 0 deletions src/templates/FOLLOW_UP.py
@@ -0,0 +1,7 @@
SELF_VERIFICATION = """
Please review the initial prompt, including the question and the constraints or requirements provided.
Reassess your reasoning and the answer you provided to ensure they align with the given information.
If any adjustments are needed, modify your reasoning and answer accordingly.
Finally, present your response in the same JSON format mentioned in the initial prompt.
If the original answer was already correct, you can simply repeat it in the same JSON format.
"""
8 changes: 7 additions & 1 deletion src/unified_infer.py
@@ -39,9 +39,12 @@ def parse_args():
    parser.add_argument('--start_index', default=0, type=int)  # 0 means from the beginning of the list
    parser.add_argument('--end_index', default=-1, type=int)  # -1 means to the end of the list
    parser.add_argument('--filepath', default="auto", type=str)

    parser.add_argument('--cache_filepath', default=None, type=str)

+    parser.add_argument('--follow_up_mode', default="N/A", type=str)  # N/A means not a follow up
+    parser.add_argument('--follow_up_file', default=None, type=str)  # if you have an existing file
+
    parser.add_argument('--overwrite', action='store_true')
    parser.add_argument('--no_repeat_ngram_size', default=0, type=int)
    parser.add_argument('--hf_bf16', action='store_true')
@@ -137,6 +140,9 @@ def sanitize_args(args):
    if args.use_imend_stop:
        IM_END_MODELS.append(args.model_name)

+    # TODO: we need to support the case when you have an existing file
+
+
    # Data loading
    id_strs, chat_history, model_inputs, metadata = load_eval_data(args)
51 changes: 36 additions & 15 deletions src/unified_utils.py
@@ -56,26 +56,47 @@ def apply_template(chat_history, model_name, args):


def load_eval_data(args, data_name=None, model_name=None):
    if data_name is None:
        data_name = args.data_name
    if model_name is None:
        model_name = args.model_name
-    chat_history = []
-    id_strs = []
-    metadata = {}
-    dataset, id_name = mapping_task_names(data_name)
-    print(f"Loaded {len(dataset)} examples from {data_name}")
-    for ind, item in enumerate(dataset):
-        id_strs.append(item.get(id_name, f"{data_name}#{ind}"))
-        prompt = prompt_generation(data_name, item, args)
-        chat_history.append([prompt])
-        for key in item:
-            if key not in metadata:
-                metadata[key] = []
-            metadata[key].append(item[key])
+    if args.follow_up_mode == "N/A":
+        chat_history = []
+        id_strs = []
+        metadata = {}
+        dataset, id_name = mapping_task_names(data_name)
+        print(f"Loaded {len(dataset)} examples from {data_name}")
+        for ind, item in enumerate(dataset):
+            id_strs.append(item.get(id_name, f"{data_name}#{ind}"))
+            prompt = prompt_generation(data_name, item, args)
+            chat_history.append([prompt])
+            for key in item:
+                if key not in metadata:
+                    metadata[key] = []
+                metadata[key].append(item[key])
+    elif args.follow_up_mode != "N/A" and os.path.exists(args.follow_up_file):
+        # load the file and use the content there to load the chat history, id_strs, and metadata, etc.
+        with open(args.follow_up_file, "r") as f:
+            follow_up_data = json.load(f)
+        print(f"Loaded {len(follow_up_data)} examples from {args.follow_up_file}")
+        id_strs = []
+        chat_history = []
+        metadata = {}
+        for item in follow_up_data:
+            id_strs.append(item.get("session_id", "N/A"))
+            chat_history.append(item.get("chat_history", []))
+            for key in item:
+                if key in ["configs", "model_input", "generator", "output", "session_id", "chat_history"]:
+                    continue
+                if key not in metadata:
+                    metadata[key] = []
+                metadata[key].append(item[key])
    print("Start applying template")
    model_inputs = apply_template(chat_history, model_name, args)
    return id_strs, chat_history, model_inputs, metadata
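A note on the follow-up branch above: each chat_history entry is expected to be a list of alternating user/assistant turns ending with the follow-up instruction as the newest user turn, matching what src/follow_up_process.py writes. A minimal sketch with placeholder contents:

# One chat_history entry as loaded from args.follow_up_file (placeholders only):
chat_history_entry = [
    "<original puzzle prompt>",             # user turn
    "<model's first JSON answer>",          # assistant turn
    "<the SELF_VERIFICATION instruction>",  # new user turn added by follow_up_process.py
]
# apply_template() presumably renders this alternating history into a single
# model input, and the next round of generation fills "output" with the revised answer.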
55 changes: 55 additions & 0 deletions zebra_logic_analysis/scripts/follow_up.sh
@@ -0,0 +1,55 @@
# Initialize default values
DATA_NAME="zebra-grid"
# model_name="openai/gpt-4o-mini-2024-07-18"
# model_pretty_name="gpt-4o-mini-2024-07-18.self_verification.T=1"
model_name="openai/gpt-4o-2024-08-06"
model_pretty_name="gpt-4o-2024-08-06.self_verification.T=1"
n_shards=8
run_name="self_verification"
TEMP=0
TOP_P=1.0
rp=1.0
engine_name="openai"
MAX_TOKENS=4096;
num_outputs=1 # New default value
batch_size=4;
CACHE_DIR=${HF_HOME:-"default"}


# Check if required arguments are provided
if [ -z "$DATA_NAME" ] || [ -z "$model_name" ] || [ -z "$model_pretty_name" ] || [ -z "$n_shards" ]; then
echo "Usage: $0 -d DATA_NAME -m model_name -p model_pretty_name -s n_shards [-r run_name] [-t TEMP] [-o TOP_P] [-e rp] [-f engine_name] [-n num_outputs]"
exit 1
fi

# output_dir="result_dirs/${DATA_NAME}/cot=${cot}/"
if [ "$run_name" = "default" ]; then
output_dir="result_dirs/${DATA_NAME}/"
else
output_dir="result_dirs/${DATA_NAME}/${run_name}/"
fi


echo "Using Data-parallelism"
shards_dir="${output_dir}/tmp_${model_pretty_name}"
for ((shard_id = 0; shard_id < $n_shards; shard_id++)); do
    python src/unified_infer.py \
        --follow_up_mode "self_verification" \
        --follow_up_file "result_dirs_follow_up/zebra-grid/${model_pretty_name}.json" \
        --num_shards $n_shards \
        --shard_id $shard_id \
        --data_name $DATA_NAME \
        --engine $engine_name \
        --model_name $model_name \
        --run_name $run_name \
        --model_pretty_name $model_pretty_name \
        --top_p $TOP_P --temperature $TEMP --repetition_penalty $rp \
        --batch_size $batch_size --max_tokens $MAX_TOKENS \
        --num_outputs $num_outputs \
        --output_folder $shards_dir/ \
        &
done
wait
python src/merge_results.py $shards_dir/ $model_pretty_name
cp $shards_dir/${model_pretty_name}.json $output_dir/${model_pretty_name}.json
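After the shards are merged, the revised outputs under result_dirs/zebra-grid/self_verification/ can be scored like any other run. A minimal sketch that mirrors the updated __main__ block of src/evaluation/zebra_grid_eval.py; the import path is an assumption, since that file is normally run directly as a script:

# Score the self-verification outputs without overwriting the main summary files.
from evaluation.zebra_grid_eval import gen_results, load_private_solutions  # assumed import path

run_name_folders = {"self_verification": "result_dirs/zebra-grid/self_verification"}
load_private_solutions()
gen_results(run_name_folders, bon=False, save_results=False)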
