# Run Integration Tests #20
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: Run Integration Tests

# Triggers:
#   - pull_request (labeled): fires whenever any label is added to a PR; the
#     job-level `if` then filters on the 'integration-test' label.
#   - workflow_dispatch: manual run; the caller must supply a reason.
#   - schedule: nightly run.
on:
  pull_request:
    types: [labeled]
  workflow_dispatch:
    inputs:
      reason:
        description: 'Reason for manual trigger'
        required: true
        default: ''
  schedule:
    - cron: '30 22 * * *'  # Runs at 10:30pm UTC every day
# Workflow-wide environment, visible to every step's shell.
env:
  # Global configuration for number of parallel processes for evaluation.
  # Quoted so the value is an explicit string rather than an implicitly typed
  # integer; it is passed to run_infer.sh as a positional argument.
  N_PROCESSES: '10'
jobs:
  # Runs the integration-test evaluation twice (Haiku, then DeepSeek), archives
  # the outputs as a workflow artifact, and posts both reports as a comment.
  run-integration-tests:
    # A label event only counts when the label is 'integration-test'; manual
    # and scheduled runs always proceed.
    if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
    runs-on: ubuntu-latest
    permissions:
      contents: "read"
      id-token: "write"
      pull-requests: "write"
      issues: "write"
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Poetry is installed before setup-python so that `cache: "poetry"` can
      # locate it.
      - name: Install poetry via pipx
        run: pipx install poetry

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"

      - name: Comment on PR if 'integration-test' label is present
        if: github.event_name == 'pull_request' && github.event.label.name == 'integration-test'
        uses: KeisukeYamashita/create-comment@v1
        with:
          unique: false
          comment: |
            Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.

      - name: Install Python dependencies using Poetry
        run: poetry install --without evaluation,llama-index

      # Secrets are handed to the shell through `env:` and written into the
      # [llm.eval] section that run_infer.sh is told to use.
      - name: Configure config.toml for testing with Haiku
        env:
          LLM_MODEL: ${{ secrets.HAIKU_LLM_MODEL }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"$LLM_MODEL\"" >> config.toml
          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
          echo "temperature = 0.0" >> config.toml

      - name: Build environment
        run: make build

      - name: Run integration test evaluation for Haiku
        env:
          # ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
          # RUNTIME: remote
          # SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
          # EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
          # Quoted so the value stays the literal string instead of being
          # parsed as a YAML boolean.
          SANDBOX_FORCE_REBUILD_RUNTIME: "True"
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' "$N_PROCESSES" '' 'haiku_run'

          # get integration tests report
          REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE: $REPORT_FILE_HAIKU"

          # Export the report as a multi-line environment variable. The
          # delimiter is random so that no line of the report can end the
          # value early or inject further variables.
          DELIMITER="EOF_$(dd if=/dev/urandom bs=15 count=1 status=none | base64)"
          {
            echo "INTEGRATION_TEST_REPORT_HAIKU<<$DELIMITER"
            if [ -n "$REPORT_FILE_HAIKU" ]; then
              cat "$REPORT_FILE_HAIKU"
            else
              echo "No report.md was found for the Haiku run."
            fi
            echo
            echo "$DELIMITER"
          } >> "$GITHUB_ENV"

      # - name: Cleanup Haiku runtimes
      #   if: always()
      #   env:
      #     ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
      #     SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
      #   run: |
      #     poetry run ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh
      #     # Add a small delay to ensure cleanup is complete
      #     sleep 10

      - name: Wait a little bit
        run: sleep 10

      # Overwrites config.toml, so the DeepSeek run reuses the same
      # [llm.eval] section name with different credentials.
      - name: Configure config.toml for testing with DeepSeek
        env:
          LLM_MODEL: "deepseek/deepseek-chat"
          LLM_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"$LLM_MODEL\"" >> config.toml
          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
          echo "temperature = 0.0" >> config.toml

      - name: Run integration test evaluation for DeepSeek
        env:
          # ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
          # RUNTIME: remote
          # SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
          # EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
          # Quoted so the value stays the literal string instead of being
          # parsed as a YAML boolean.
          SANDBOX_FORCE_REBUILD_RUNTIME: "True"
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' "$N_PROCESSES" '' 'deepseek_run'

          # get integration tests report
          REPORT_FILE_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE: $REPORT_FILE_DEEPSEEK"

          # Export the report as a multi-line environment variable. The
          # delimiter is random so that no line of the report can end the
          # value early or inject further variables.
          DELIMITER="EOF_$(dd if=/dev/urandom bs=15 count=1 status=none | base64)"
          {
            echo "INTEGRATION_TEST_REPORT_DEEPSEEK<<$DELIMITER"
            if [ -n "$REPORT_FILE_DEEPSEEK" ]; then
              cat "$REPORT_FILE_DEEPSEEK"
            else
              echo "No report.md was found for the DeepSeek run."
            fi
            echo
            echo "$DELIMITER"
          } >> "$GITHUB_ENV"

      - name: Create archive of evaluation outputs
        run: |
          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
          cd evaluation/evaluation_outputs/outputs  # Change to the outputs directory
          tar -czvf "../../../integration_tests_${TIMESTAMP}.tar.gz" integration_tests/CodeActAgent/*  # Only include the actual result directories

      - name: Upload evaluation results as artifact
        uses: actions/upload-artifact@v4
        id: upload_results_artifact
        with:
          name: integration-test-outputs-${{ github.run_id }}-${{ github.run_attempt }}
          path: integration_tests_*.tar.gz

      # The step output is passed through `env:` rather than being expanded
      # inside the script text.
      - name: Get artifact URLs
        env:
          UPLOADED_ARTIFACT_URL: ${{ steps.upload_results_artifact.outputs.artifact-url }}
        run: |
          echo "ARTIFACT_URL=$UPLOADED_ARTIFACT_URL" >> "$GITHUB_ENV"

      # - name: Authenticate to Google Cloud
      #   uses: 'google-github-actions/auth@v2'
      #   with:
      #     credentials_json: ${{ secrets.GCP_RESEARCH_OBJECT_CREATOR_SA_KEY }}

      # Event data (including the free-text dispatch reason) reaches the shell
      # only as environment variables, so it is never parsed as script code.
      - name: Set timestamp and trigger reason
        env:
          EVENT_NAME: ${{ github.event_name }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          DISPATCH_REASON: ${{ github.event.inputs.reason }}
        run: |
          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> "$GITHUB_ENV"
          if [[ "$EVENT_NAME" == "pull_request" ]]; then
            echo "TRIGGER_REASON=pr-$PR_NUMBER" >> "$GITHUB_ENV"
          elif [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
            # Flatten line breaks so the reason cannot add extra entries to
            # the environment file.
            REASON_ONE_LINE=$(printf '%s' "$DISPATCH_REASON" | tr '\r\n' '  ')
            echo "TRIGGER_REASON=manual-$REASON_ONE_LINE" >> "$GITHUB_ENV"
          else
            echo "TRIGGER_REASON=nightly-scheduled" >> "$GITHUB_ENV"
          fi

      # - name: Upload evaluation results to Google Cloud Storage
      #   uses: 'google-github-actions/upload-cloud-storage@v2'
      #   with:
      #     path: 'evaluation/evaluation_outputs/outputs'
      #     destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}'

      # - name: Cleanup remote runtimes
      #   if: always()  # run this step even if previous steps failed
      #   env:
      #     ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
      #     SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
      #   run: |
      #     poetry run ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh

      - name: Comment with results and artifact link
        id: create_comment
        uses: KeisukeYamashita/create-comment@v1
        with:
          # if triggered by PR, use PR number, otherwise use 9 as fallback issue number for manual triggers
          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 9 }}
          unique: false
          comment: |
            Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }}
            Commit: ${{ github.sha }}
            **Integration Tests Report (Haiku)**
            Haiku LLM Test Results:
            ${{ env.INTEGRATION_TEST_REPORT_HAIKU }}
            ---
            **Integration Tests Report (DeepSeek)**
            DeepSeek LLM Test Results:
            ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
            ---
            Download evaluation outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})