'''
Script used to convert Devin's output into the desired JSON format for
evaluation on SWE-bench.

Usage:
    python prepare_devin_outputs_for_evaluation.py

Outputs:
    two JSON files under evaluation/SWE-bench/data/
'''

import json
import os

import requests
from tqdm import tqdm


def get_devin_eval_output():
    """Fetch Devin's pass/fail output diffs from GitHub and write them to
    ``pass_output.json`` / ``fail_output.json`` in the SWE-bench prediction
    format (``instance_id`` / ``model_patch`` / ``model_name_or_path``).

    Network I/O only; returns nothing.
    """
    repo_url = "CognitionAI/devin-swebench-results"
    folder_path = "output_diffs"

    base_url = "https://api.github.com/repos/"
    pass_api_url = f"{base_url}{repo_url}/contents/{folder_path}/pass"
    failed_api_url = f"{base_url}{repo_url}/contents/{folder_path}/fail"

    pass_files_info = []
    failed_files_info = []

    def get_files(api_url, subfolder_name, files_info):
        # List the folder via the GitHub contents API, then download each
        # raw diff file and append one prediction entry per file.
        response = requests.get(api_url)
        if response.status_code != 200:
            # Fail loudly instead of silently producing empty output files.
            print(f"Failed to fetch {api_url}: HTTP {response.status_code}")
            return
        contents = response.json()
        for item in tqdm(contents):
            if item["type"] == "file":
                file_url = (
                    f"https://raw.githubusercontent.com/{repo_url}/main/"
                    f"{folder_path}/{subfolder_name}/{item['name']}"
                )
                file_content = requests.get(file_url).text
                # File names look like "<instance_id>-diff.txt"; strip the
                # fixed suffix (the original "[:-9]" slice) to get the id.
                instance_id = item['name'].removesuffix("-diff.txt")
                files_info.append({
                    "instance_id": instance_id,
                    "model_patch": file_content,
                    "model_name_or_path": "Devin",
                })

    get_files(pass_api_url, "pass", pass_files_info)
    get_files(failed_api_url, "fail", failed_files_info)

    # Write next to this script under ../data/devin/, creating it if needed.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "../data/devin/")
    os.makedirs(output_dir, exist_ok=True)

    with open(os.path.join(output_dir, "pass_output.json"), "w") as pass_file:
        json.dump(pass_files_info, pass_file, indent=4)

    with open(os.path.join(output_dir, "fail_output.json"), "w") as fail_file:
        json.dump(failed_files_info, fail_file, indent=4)


if __name__ == '__main__':
    get_devin_eval_output()
- notebooks - `devin_eval_analysis.ipynb`: notebook analyzing devin's outputs - src - - `prepare_devin_outputs_for_evaluation.py`: script fetching and converting devin's output into the desired json file for evaluation + - `prepare_devin_outputs_for_evaluation.py`: script fetching and converting devin's output into the desired json file for evaluation. + - outputs: two json files under `evaluation/SWE-bench/data/` that can be directly used for evaluation From b55541a442ead9861bed46aaa8736074b9b02696 Mon Sep 17 00:00:00 2001 From: Jiaxin-Pei Date: Thu, 21 Mar 2024 13:11:23 -0400 Subject: [PATCH 3/4] update code for fetching and processing devin's outputs --- evaluation/README.md | 10 ++- .../prepare_devin_outputs_for_evaluation.py | 79 +++++++++++++++++++ 2 files changed, 86 insertions(+), 3 deletions(-) create mode 100644 evaluation/SWE-bench/scripts/prepare_devin_outputs_for_evaluation.py diff --git a/evaluation/README.md b/evaluation/README.md index 69e5584601f9..06b0fc1532f1 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -13,6 +13,10 @@ all the preprocessing/evaluation/analysis scripts. ### SWE-bench - notebooks - `devin_eval_analysis.ipynb`: notebook analyzing devin's outputs -- src - - `prepare_devin_outputs_for_evaluation.py`: script fetching and converting devin's output into the desired json file for evaluation. - - outputs: two json files under `evaluation/SWE-bench/data/` that can be directly used for evaluation +- scripts + - `prepare_devin_outputs_for_evaluation.py`: script fetching and converting [devin's output](https://github.com/CognitionAI/devin-swebench-results/tree/main) into the desired json file for evaluation. 
'''
Script used to convert Devin's output into the desired JSON format for
evaluation on SWE-bench.

Usage:
    python prepare_devin_outputs_for_evaluation.py <setting>
    <setting> can be "passed", "failed" or "all"

Outputs:
    JSON prediction files under evaluation/SWE-bench/data/
'''

import json
import os
import sys

import requests
from tqdm import tqdm

# Accepted CLI settings; anything else is rejected up front.
VALID_SETTINGS = ("passed", "failed", "all")


def get_devin_eval_output(setting):
    """Fetch Devin's SWE-bench output diffs from GitHub and write them as
    JSON prediction files.

    :param setting: which subset to fetch — "passed", "failed" or "all".
    :raises ValueError: if *setting* is not one of the accepted values
        (previously an invalid value silently wrote nothing).
    """
    if setting not in VALID_SETTINGS:
        raise ValueError(
            f"setting must be one of {VALID_SETTINGS}, got {setting!r}"
        )

    repo_url = "CognitionAI/devin-swebench-results"
    folder_path = "output_diffs"

    base_url = "https://api.github.com/repos/"
    pass_api_url = f"{base_url}{repo_url}/contents/{folder_path}/pass"
    failed_api_url = f"{base_url}{repo_url}/contents/{folder_path}/fail"

    pass_files_info = []
    failed_files_info = []

    def get_files(api_url, subfolder_name, files_info):
        # List the folder via the GitHub contents API, then download each
        # raw diff file and append one prediction entry per file.
        response = requests.get(api_url)
        if response.status_code != 200:
            # Fail loudly instead of silently producing empty output files.
            print(f"Failed to fetch {api_url}: HTTP {response.status_code}")
            return
        contents = response.json()
        for item in tqdm(contents):
            if item["type"] == "file":
                file_url = (
                    f"https://raw.githubusercontent.com/{repo_url}/main/"
                    f"{folder_path}/{subfolder_name}/{item['name']}"
                )
                file_content = requests.get(file_url).text
                # File names look like "<instance_id>-diff.txt"; strip the
                # fixed suffix (the original "[:-9]" slice) to get the id.
                instance_id = item['name'].removesuffix("-diff.txt")
                files_info.append({
                    "instance_id": instance_id,
                    "model_patch": file_content,
                    "model_name_or_path": "Devin",
                    "pass_or_fail": subfolder_name,
                })

    if setting in ("passed", "all"):
        get_files(pass_api_url, "pass", pass_files_info)
    if setting in ("failed", "all"):
        get_files(failed_api_url, "fail", failed_files_info)

    # Write next to this script under ../data/devin/, creating it if needed.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "../data/devin/")
    os.makedirs(output_dir, exist_ok=True)

    if setting in ("passed", "all"):
        with open(os.path.join(output_dir, "devin_swe_passed.json"), "w") as pass_file:
            json.dump(pass_files_info, pass_file, indent=4)

    if setting in ("failed", "all"):
        with open(os.path.join(output_dir, "devin_swe_failed.json"), "w") as fail_file:
            json.dump(failed_files_info, fail_file, indent=4)

    if setting == "all":
        merged_output = pass_files_info + failed_files_info
        with open(os.path.join(output_dir, "devin_swe_outputs.json"), "w") as merge_file:
            json.dump(merged_output, merge_file, indent=4)


if __name__ == '__main__':
    if len(sys.argv) != 2:
        # Name the actual script and the expected placeholder (the original
        # message said "script_name.py" with the placeholder missing).
        print("Usage: python prepare_devin_outputs_for_evaluation.py <setting>")
        sys.exit(1)

    get_devin_eval_output(sys.argv[1])
a/evaluation/SWE-bench/src/prepare_devin_outputs_for_evaluation.py +++ /dev/null @@ -1,62 +0,0 @@ -''' -Script used to convert devin's output into the desired json format for evaluation on SWE-bench - -Usage: - python prepare_devin_outputs_for_evaluation.py - -Outputs: - two json files under evaluation/SWE-bench/data/ - -''' - -import requests -import os -from tqdm import tqdm -import json - -#fetch devin's outputs into a json file for evaluation -def get_devin_eval_output(): - repo_url = "CognitionAI/devin-swebench-results" - folder_path = "output_diffs" - - base_url = "https://api.github.com/repos/" - pass_api_url = f"{base_url}{repo_url}/contents/{folder_path}/pass" - failed_api_url = f"{base_url}{repo_url}/contents/{folder_path}/fail" - - pass_files_info = [] - failed_files_info = [] - - def get_files(api_url, subfolder_name, files_info): - response = requests.get(api_url) - if response.status_code == 200: - contents = response.json() - for item in tqdm(contents): - if item["type"] == "file": - file_url = f"https://raw.githubusercontent.com/{repo_url}/main/{folder_path}/{subfolder_name}/{item['name']}" - file_content = requests.get(file_url).text - instance_id = item['name'][:-9] - model_name = "Devin" # Update with actual model name - files_info.append({ - "instance_id": instance_id, - "model_patch": file_content, - "model_name_or_path": model_name - }) - - get_files(pass_api_url, "pass", pass_files_info) - get_files(failed_api_url, "fail", failed_files_info) - - script_dir = os.path.dirname(os.path.abspath(__file__)) - output_dir = os.path.join(script_dir, "../data/devin/") - - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - with open(os.path.join(output_dir, "pass_output.json"), "w") as pass_file: - json.dump(pass_files_info, pass_file, indent=4) - - with open(os.path.join(output_dir, "fail_output.json"), "w") as fail_file: - json.dump(failed_files_info, fail_file, indent=4) - - -if __name__ == '__main__': - get_devin_eval_output() \ No 
newline at end of file