Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support Instance Level Images for SWE-Bench Evaluation #2874

Merged
merged 10 commits into from
Jul 16, 2024
16 changes: 15 additions & 1 deletion evaluation/swe_bench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ Make sure your Docker daemon is running, and you have pulled the `eval-swe-bench
docker image. Then run this python script:

```bash
# export USE_INSTANCE_IMAGE=true # if you want to test support for instance-level docker images
poetry run python evaluation/swe_bench/swe_env_box.py
```

Expand All @@ -85,7 +86,7 @@ If you see an error, please make sure your `config.toml` contains all
## Run Inference on SWE-Bench Instances

```bash
./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers]
# e.g., ./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview_llm HEAD CodeActAgent 300
```

Expand All @@ -104,7 +105,20 @@ to `CodeActAgent`.
default, the script evaluates the entire SWE-bench_Lite test set (300 issues). Note:
in order to use `eval_limit`, you must also set `agent`.

`max_iter`, e.g. `20`, is the maximum number of iterations for the agent to run. By
default, it is set to 30.

`num_workers`, e.g. `3`, is the number of parallel workers to run the evaluation. By
default, it is set to 1.

There are also two optional environment variables you can set.
```
export USE_HINT_TEXT=true # if you want to use hint text in the evaluation. Ignore this if you are not sure.
export USE_INSTANCE_IMAGE=true # if you want to use instance-level docker images
```

Let's say you'd like to run 10 instances using `eval_gpt4_1106_preview_llm` and `CodeActAgent`; then your command would be:

```bash
Expand Down
69 changes: 40 additions & 29 deletions evaluation/swe_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from opendevin.llm.llm import LLM

USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false') == 'true'
USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false') == 'true'


def cleanup():
Expand Down Expand Up @@ -133,37 +134,45 @@ def get_test_result(instance, sandbox, workspace_dir_name):
else:
test_result['metadata']['5_reformat_instance_json_success'] = True

# Get the instance report
err_code, output = sandbox.execute(
(
'cd /swe_util/OD-SWE-bench '
'&& export PYTHONPATH=$(pwd):$PYTHONPATH '
'&& conda run -n swe-bench-eval python swebench/metrics/get_instance_report.py --swe_bench_task /workspace/instance.json --log_path /workspace/$SWE_INSTANCE_ID.log'
)
)
if err_code != 0:
logger.error(f'Error getting instance report: {output}')
if USE_INSTANCE_IMAGE:
# instance report is not supported in instance image mode
test_result['metadata']['6_get_instance_report_success'] = False
test_result['metadata']['6_get_instance_report_error'] = output
test_result['metadata']['6_get_instance_report_error'] = (
'Instance report is not supported in instance image mode.'
)

else:
test_result['metadata']['6_get_instance_report_success'] = True
test_result['result_raw'] = output

# try to parse output
for line in output.strip().split('\n'):
line = line.strip('-')
try:
key, value = line.split(':')
except ValueError:
# skip this line
print(f'Error parsing result line: {line}')
continue
value = value.strip()
try:
value = int(value)
except ValueError:
pass
test_result['result'][key.strip()] = value
# Get the instance report
err_code, output = sandbox.execute(
(
'cd /swe_util/OD-SWE-bench '
'&& export PYTHONPATH=$(pwd):$PYTHONPATH '
'&& conda run -n swe-bench-eval python swebench/metrics/get_instance_report.py --swe_bench_task /workspace/instance.json --log_path /workspace/$SWE_INSTANCE_ID.log'
)
)
if err_code != 0:
logger.error(f'Error getting instance report: {output}')
test_result['metadata']['6_get_instance_report_success'] = False
test_result['metadata']['6_get_instance_report_error'] = output
else:
test_result['metadata']['6_get_instance_report_success'] = True
test_result['result_raw'] = output

# try to parse output
for line in output.strip().split('\n'):
line = line.strip('-')
try:
key, value = line.split(':')
except ValueError:
# skip this line
print(f'Error parsing result line: {line}')
continue
value = value.strip()
try:
value = int(value)
except ValueError:
pass
test_result['result'][key.strip()] = value
return test_result


Expand Down Expand Up @@ -199,6 +208,7 @@ def process_instance(
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
os.makedirs(os.path.dirname(log_file), exist_ok=True)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
Expand All @@ -215,6 +225,7 @@ def process_instance(
workspace_dir_name,
workspace_mount_path=workspace_mount_path,
sandbox_plugins=agenthub.Agent.get_cls(metadata.agent_class).sandbox_plugins,
use_instance_image=USE_INSTANCE_IMAGE,
)

# Prepare instruction
Expand Down
4 changes: 3 additions & 1 deletion evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,7 @@ echo "Image file: $IMAGE_FILE"
grep "$PATTERN" "$IMAGE_FILE" | while IFS= read -r image; do
echo "Pulling $NAMESPACE/$image into $image"
docker pull $NAMESPACE/$image
docker tag $NAMESPACE/$image $image
    # replace _s_ with __ in the image name
renamed_image=$(echo "$image" | sed 's/_s_/__/g')
docker tag $NAMESPACE/$image $renamed_image
Jiayi-Pan marked this conversation as resolved.
Show resolved Hide resolved
done
8 changes: 8 additions & 0 deletions evaluation/swe_bench/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@ if [ -z "$MAX_ITER" ]; then
MAX_ITER=30
fi

# Fall back to instance images being disabled when the caller did not
# set USE_INSTANCE_IMAGE in the environment.
if [ -z "$USE_INSTANCE_IMAGE" ]; then
  echo "USE_INSTANCE_IMAGE not specified, use default false"
  USE_INSTANCE_IMAGE=false
fi

# Make the (possibly defaulted) value visible to the python process.
export USE_INSTANCE_IMAGE
echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"

get_agent_version

echo "AGENT: $AGENT"
Expand Down
77 changes: 77 additions & 0 deletions evaluation/swe_bench/scripts/setup/instance_swe_entry.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/bin/bash
# Prepare an instance-level SWE-Bench container for evaluation:
#  1. look up this instance's record in the injected dataset dump,
#  2. materialize the gold/test patches and instance metadata,
#  3. copy the pre-built repo from /testbed into /workspace,
#  4. activate the per-instance conda environment.

# set -e

# This script must be run as root inside the evaluation container.
if [ "$USER" != "root" ]; then
    echo "Error: This script is intended to be run by the 'root' user only." >&2
    exit 1
fi

source ~/.bashrc

SWEUTIL_DIR=/swe_util

# Create logs directory
LOG_DIR=/opendevin/logs
mkdir -p $LOG_DIR && chmod 777 $LOG_DIR

# FIXME: Cannot read SWE_INSTANCE_ID from the environment variable
# SWE_INSTANCE_ID=django__django-11099
if [ -z "$SWE_INSTANCE_ID" ]; then
    echo "Error: SWE_INSTANCE_ID is not set." >&2
    exit 1
fi

# Read the swe-bench instance dump and extract the required item based on instance_id
item=$(jq --arg INSTANCE_ID "$SWE_INSTANCE_ID" '.[] | select(.instance_id == $INSTANCE_ID)' $SWEUTIL_DIR/eval_data/instances/swe-bench-instance.json)

if [[ -z "$item" ]]; then
    echo "No item found for the provided instance ID."
    exit 1
fi

# e.g. repo "django/django" + version "3.0" -> "django__django__3.0"
WORKSPACE_NAME=$(echo "$item" | jq -r '.repo + "__" + .version | gsub("/"; "__")')

echo "WORKSPACE_NAME: $WORKSPACE_NAME"

SWE_TASK_DIR=/opendevin/swe_tasks
mkdir -p $SWE_TASK_DIR
# Dump test_patch to $SWE_TASK_DIR/test.patch
echo "$item" | jq -r '.test_patch' > $SWE_TASK_DIR/test.patch
# Dump patch to $SWE_TASK_DIR/gold.patch
echo "$item" | jq -r '.patch' > $SWE_TASK_DIR/gold.patch
# Dump the item to $SWE_TASK_DIR/instance.json except for the "test_patch" and "patch" fields
echo "$item" | jq 'del(.test_patch, .patch)' > $SWE_TASK_DIR/instance.json

# Clear the workspace, then copy the pre-built repo into it
rm -rf /workspace/*
if [ -d /workspace/$WORKSPACE_NAME ]; then
    rm -rf /workspace/$WORKSPACE_NAME
fi
cp -r /testbed/ /workspace/$WORKSPACE_NAME/

mkdir -p $SWE_TASK_DIR/reset_testbed_temp
mkdir -p $SWE_TASK_DIR/reset_testbed_log_dir

REPO_PATH=/workspace/$WORKSPACE_NAME
echo "Repo Path: $REPO_PATH"
echo "Test Command: $TEST_CMD"

# Validate REPO_PATH BEFORE persisting it into ~/.bashrc — the original
# script exported it first and only then checked it, leaving a bad value
# behind for later shells.
if [[ "$REPO_PATH" == "None" ]]; then
    echo "Error: Failed to retrieve repository path. Tests may not have passed or output was not as expected." >&2
    exit 1
fi
echo "export REPO_PATH=\"$REPO_PATH\"" >> ~/.bashrc
# echo "export TEST_CMD=\"$TEST_CMD\"" >> ~/.bashrc

# Activate the instance-specific conda environment (once — the original
# script activated `testbed` twice).
. /opt/miniconda3/etc/profile.d/conda.sh
conda activate testbed

# set +e
84 changes: 78 additions & 6 deletions evaluation/swe_bench/swe_env_box.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import json
import os
import sys
import tempfile
import uuid

from datasets import load_dataset
from swebench.harness.constants import MAP_REPO_TO_TEST_FRAMEWORK
from swebench.harness.utils import get_test_directives

from opendevin.core.config import config
from opendevin.core.logger import opendevin_logger as logger
Expand All @@ -15,6 +20,10 @@
SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.2.1'


def get_image_name_from_instance_id(instance_id: str) -> str:
    """Map a SWE-Bench instance id to its instance-level docker image name."""
    return f'sweb.eval.x86_64.{instance_id}'


class SWEBenchSSHBox(DockerSSHBox):
def __init__(
self,
Expand All @@ -26,6 +35,7 @@ def __init__(
skip_workspace_mount: bool = True,
sandbox_plugins: list[PluginRequirement] = [], # noqa: B006
workspace_dir_name: str | None = None,
use_instance_image: bool = False,
):
if swe_instance_id is None:
raise ValueError('swe_instance_id must be provided!')
Expand All @@ -39,6 +49,7 @@ def __init__(
), 'container_image is required for SWEBenchSSHBox!'
# Need to run as root to use SWEBench container
sid = f'swe_bench_{swe_instance_id}_' + str(uuid.uuid4())
logger.info(f'===Using container image: {container_image}')
super().__init__(container_image, timeout, sid)
self.init_plugins(sandbox_plugins)

Expand All @@ -54,11 +65,61 @@ def __init__(
logger.info(
'Initialization of SWEBench may take approximately 10 minutes due to long-running installations, such as those requiring compilation.'
)
exit_code, output = self.execute('source /swe_util/swe_entry.sh', timeout=600)
logger.info('exit code: %d', exit_code)
logger.info(output)
assert exit_code == 0, f'Failed to source swe_entry.sh: {output}'
logger.info('Sourced swe_entry.sh successfully')
logger.info(f'Use instance image: {use_instance_image}')
if use_instance_image:
# we directly inject the instance info into the container and the init script
script_dir = os.path.dirname(__file__)

# inject test command
test_type = MAP_REPO_TO_TEST_FRAMEWORK[swe_instance['repo']][
swe_instance['version']
]
swe_instance['test_directives'] = get_test_directives(swe_instance)
swe_instance['test_cmd'] = (
f"{test_type} {' '.join(swe_instance['test_directives'])}"
)
exit_code, output = self.execute(
f"""echo "export TEST_CMD='{swe_instance["test_cmd"]}'" >> ~/.bashrc"""
)
# assert exit_code == 0, f'Failed to set TEST_CMD in ~/.bashrc: {output}'

# inject the instance info
self.execute('mkdir -p /swe_util/eval_data/instances')
swe_instance_json_name = 'swe-bench-instance.json'
with tempfile.TemporaryDirectory() as temp_dir:
# Construct the full path for the desired file name within the temporary directory
temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
# Write to the file with the desired name within the temporary directory
with open(temp_file_path, 'w') as f:
if not isinstance(swe_instance, dict):
json.dump([swe_instance.to_dict()], f)
else:
json.dump([swe_instance], f)

# Copy the file to the desired location
self.copy_to(temp_file_path, '/swe_util/eval_data/instances/')

# inject the init script
self.copy_to(
str(os.path.join(script_dir, 'scripts/setup/instance_swe_entry.sh')),
'/swe_util/',
)
self.execute('cat ~/.bashrc')
self.execute('source ~/.bashrc')

self.execute('source /swe_util/instance_swe_entry.sh', timeout=600)
logger.info('exit code: %d', exit_code)
logger.info(output)
assert exit_code == 0, f'Failed to source swe_entry.sh: {output}'
logger.info('Sourced swe_entry.sh successfully')
else:
exit_code, output = self.execute(
'source /swe_util/swe_entry.sh', timeout=600
)
logger.info('exit code: %d', exit_code)
logger.info(output)
assert exit_code == 0, f'Failed to source swe_entry.sh: {output}'
logger.info('Sourced swe_entry.sh successfully')

@property
def volumes(self):
Expand All @@ -78,6 +139,7 @@ def get_box_for_instance(
skip_workspace_mount: bool = True,
workspace_mount_path: str | None = None,
sandbox_plugins: list[PluginRequirement] = [], # noqa: B006
use_instance_image: bool = False,
) -> 'SWEBenchSSHBox':
if workspace_dir_name is None:
workspace_dir_name = f"{instance['repo']}__{instance['version']}".replace(
Expand All @@ -94,13 +156,20 @@ def get_box_for_instance(
config.enable_auto_lint = True
# Need to run as root to use SWEBench container
config.run_as_devin = False
if use_instance_image:
container_image = get_image_name_from_instance_id(
instance['instance_id']
)
else:
container_image = SWE_BENCH_CONTAINER_IMAGE
sandbox = cls(
container_image=SWE_BENCH_CONTAINER_IMAGE,
container_image=container_image,
swe_instance_id=instance['instance_id'],
swe_instance=instance,
skip_workspace_mount=skip_workspace_mount,
sandbox_plugins=sandbox_plugins,
workspace_dir_name=workspace_dir_name,
use_instance_image=use_instance_image,
)
logger.info(f"SSH box started for instance {instance['instance_id']}.")

Expand Down Expand Up @@ -163,6 +232,8 @@ def get_diff_patch(self):
# so we don't need to manage file uploading to OpenDevin's repo
dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
swe_bench_tests = dataset['test'].to_pandas()
USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false') == 'true'
logger.info(f'USE_INSTANCE_IMAGE: {USE_INSTANCE_IMAGE}')

# INSTANCE_ID = 'django__django-11099'
INSTANCE_ID = 'astropy__astropy-12907'
Expand All @@ -172,6 +243,7 @@ def get_diff_patch(self):
sandbox = SWEBenchSSHBox.get_box_for_instance(
instance=EXAMPLE_INSTANCE,
sandbox_plugins=[AgentSkillsRequirement(), JupyterRequirement()],
use_instance_image=USE_INSTANCE_IMAGE,
)

# PRE TEST
Expand Down