diff --git a/.gitignore b/.gitignore index 679b9013d1ab..89a41024347f 100644 --- a/.gitignore +++ b/.gitignore @@ -161,7 +161,7 @@ cython_debug/ .vscode/ # evaluation -evaluation/swe-bench/data +evaluation/SWE-bench/data # frontend diff --git a/evaluation/SWE-bench/README.md b/evaluation/SWE-bench/README.md index 7185eabc7198..00301742aad5 100644 --- a/evaluation/SWE-bench/README.md +++ b/evaluation/SWE-bench/README.md @@ -15,14 +15,8 @@ Currently, the docker container should be able to for running SWE-Bench. It was ### Setup example data ```bash -cd evaluation/swe-bench -mkdir -p data/processed -python3 scripts/download_test_data.py - -# Download an example output file (FROM claude-2) -# https://gist.github.com/sorendunn/9f1f1fade59f986b4925b6633f9ff165 -mkdir -p data/predictions -curl -o data/predictions/matplotlib__matplotlib-24362.jsonl "https://gist.githubusercontent.com/sorendunn/3218ac73166c6ae0caf2eafd23918971/raw/de02f3ea30a43dbab3245eb7b1b23510fad96847/matplotlib__matplotlib-24362.jsonl" +cd evaluation/SWE-bench +./scripts/prepare_devin_swe_bench_data.sh # Clone the repo # This is a fork that fixes some issues that stops matplotlib from running (see https://github.com/princeton-nlp/SWE-bench/pull/56) @@ -36,22 +30,20 @@ git clone https://github.com/xingyaoww/SWE-bench.git ```bash #!/bin/bash -LOG_DIR=data/logs -TESTBED_DIR=data/testbeds -mkdir -p $LOG_DIR -mkdir -p $TESTBED_DIR +mkdir -p data/logs +mkdir -p data/testbeds python harness/run_evaluation.py \ - --predictions_path data/predictions/matplotlib__matplotlib-24362.jsonl \ + --predictions_path evaluation/SWE-bench/data/predictions/devin_swe_outputs.json \ --swe_bench_tasks data/processed/swe-bench-test.json \ - --log_dir $LOG_DIR \ - --testbed $TESTBED_DIR \ + --log_dir data/logs \ + --testbed data/testbeds \ --skip_existing \ --timeout 900 \ --verbose ``` -You will see the following command line outputs: +You will see command line output similar to this (if successful): ```log 
swe-bench@2f3a6b9fcab2:/swe-bench$ ./harness/run_evaluation.sh diff --git a/evaluation/SWE-bench/scripts/prepare_devin_swe_bench_data.sh b/evaluation/SWE-bench/scripts/prepare_devin_swe_bench_data.sh new file mode 100755 index 000000000000..ef2c76f46285 --- /dev/null +++ b/evaluation/SWE-bench/scripts/prepare_devin_swe_bench_data.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -xeo pipefail +mkdir -p data/processed +python3 scripts/download_test_data.py + +# Download the Devin SWE-bench outputs (devin_swe_outputs.json) into data/predictions +# Dataset: https://huggingface.co/datasets/OpenDevin/Devin-SWE-bench-output +mkdir -p data/predictions +wget https://huggingface.co/datasets/OpenDevin/Devin-SWE-bench-output/raw/main/devin_swe_outputs.json -O data/predictions/devin_swe_outputs.json