Run Integration Tests #20

Workflow file for this run

.github/workflows/integration-runner.yml at 91135d7

	name: Run Integration Tests

	# Three ways to start this workflow:
	#   * a label being added to a pull request (the job-level `if` narrows this
	#     to the 'integration-test' label),
	#   * a manual dispatch, which requires a free-text reason,
	#   * a nightly schedule.
	on:
	  pull_request:
	    types: [labeled]
	  workflow_dispatch:
	    inputs:
	      reason:
	        description: 'Reason for manual trigger'
	        required: true
	        default: ''
	  schedule:
	    # Runs at 10:30pm UTC every day
	    - cron: '30 22 * * *'

	env:
	  # Global configuration for number of parallel processes for evaluation.
	  # Read by the shell steps as $N_PROCESSES.
	  N_PROCESSES: 10

	jobs:
	  # Runs the integration-test evaluation twice (once with a Haiku model, once
	  # with DeepSeek), archives the outputs as an artifact and posts both
	  # reports as a comment.
	  run-integration-tests:
	    # Run for the 'integration-test' PR label, manual dispatches and the
	    # nightly schedule; any other label event is skipped.
	    if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
	    runs-on: ubuntu-latest
	    permissions:
	      contents: "read"
	      # NOTE(review): presumably only needed by the (currently commented-out)
	      # Google Cloud authentication step - confirm before removing.
	      id-token: "write"
	      # Needed by the create-comment steps below.
	      pull-requests: "write"
	      issues: "write"
	    strategy:
	      matrix:
	        python-version: ["3.12"]
	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v4

	      # Poetry is installed before setup-python because that step is asked
	      # to use the "poetry" cache.
	      - name: Install poetry via pipx
	        run: pipx install poetry

	      - name: Set up Python
	        uses: actions/setup-python@v5
	        with:
	          python-version: ${{ matrix.python-version }}
	          cache: "poetry"

	      - name: Comment on PR if 'integration-test' label is present
	        if: github.event_name == 'pull_request' && github.event.label.name == 'integration-test'
	        uses: KeisukeYamashita/create-comment@v1
	        with:
	          unique: false
	          comment: |
	            Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.

	      - name: Install Python dependencies using Poetry
	        run: poetry install --without evaluation,llama-index

	      # Writes the [llm.eval] section that run_infer.sh is pointed at via the
	      # `llm.eval` argument. Secrets reach the script through env vars only.
	      - name: Configure config.toml for testing with Haiku
	        env:
	          LLM_MODEL: ${{ secrets.HAIKU_LLM_MODEL }}
	          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
	          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
	        run: |
	          echo "[llm.eval]" > config.toml
	          echo "model = \"$LLM_MODEL\"" >> config.toml
	          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
	          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
	          echo "temperature = 0.0" >> config.toml

	      - name: Build environment
	        run: make build

	      - name: Run integration test evaluation for Haiku
	        env:
	          # Remote-runtime settings, currently disabled:
	          # ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
	          # RUNTIME: remote
	          # SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
	          # EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
	          # NOTE(review): boolean-looking env value kept exactly as written;
	          # confirm what string the consumer expects before quoting it.
	          SANDBOX_FORCE_REBUILD_RUNTIME: True
	        run: |
	          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run'

	          # get integration tests report
	          REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/haiku_maxiter_10_N* -name "report.md" -type f | head -n 1)
	          echo "REPORT_FILE: $REPORT_FILE_HAIKU"
	          # Export the report as a multi-line env var (name<<EOF ... EOF).
	          echo "INTEGRATION_TEST_REPORT_HAIKU<<EOF" >> "$GITHUB_ENV"
	          # Guard against a missing report: an empty, unquoted variable would
	          # turn this into a bare `cat` reading stdin.
	          if [[ -n "$REPORT_FILE_HAIKU" && -f "$REPORT_FILE_HAIKU" ]]; then
	            cat "$REPORT_FILE_HAIKU" >> "$GITHUB_ENV"
	          else
	            echo "No report.md was found for the Haiku run." >> "$GITHUB_ENV"
	          fi
	          echo >> "$GITHUB_ENV"
	          echo "EOF" >> "$GITHUB_ENV"

	      # - name: Cleanup Haiku runtimes
	      #   if: always()
	      #   env:
	      #     ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
	      #     SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
	      #   run: |
	      #     poetry run ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh
	      #     # Add a small delay to ensure cleanup is complete
	      #     sleep 10

	      - name: Wait a little bit
	        run: sleep 10

	      # Overwrites config.toml (note the single `>` on the first echo), so the
	      # Haiku settings, including base_url, do not carry over.
	      - name: Configure config.toml for testing with DeepSeek
	        env:
	          LLM_MODEL: "deepseek/deepseek-chat"
	          LLM_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
	        run: |
	          echo "[llm.eval]" > config.toml
	          echo "model = \"$LLM_MODEL\"" >> config.toml
	          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
	          echo "temperature = 0.0" >> config.toml

	      - name: Run integration test evaluation for DeepSeek
	        env:
	          # Remote-runtime settings, currently disabled:
	          # ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
	          # RUNTIME: remote
	          # SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
	          # EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
	          # NOTE(review): boolean-looking env value kept exactly as written;
	          # confirm what string the consumer expects before quoting it.
	          SANDBOX_FORCE_REBUILD_RUNTIME: True
	        run: |
	          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run'

	          # get integration tests report
	          # Wildcards mirror the Haiku lookup; the previous pattern had none and
	          # only matched a directory named exactly "deepseek_maxiter_10_N".
	          # NOTE(review): assumes the output directory is named after the model
	          # plus a run suffix - confirm against run_infer.sh's output layout.
	          REPORT_FILE_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1)
	          echo "REPORT_FILE: $REPORT_FILE_DEEPSEEK"
	          # Export the report as a multi-line env var (name<<EOF ... EOF).
	          echo "INTEGRATION_TEST_REPORT_DEEPSEEK<<EOF" >> "$GITHUB_ENV"
	          # Guard against a missing report: an empty, unquoted variable would
	          # turn this into a bare `cat` reading stdin.
	          if [[ -n "$REPORT_FILE_DEEPSEEK" && -f "$REPORT_FILE_DEEPSEEK" ]]; then
	            cat "$REPORT_FILE_DEEPSEEK" >> "$GITHUB_ENV"
	          else
	            echo "No report.md was found for the DeepSeek run." >> "$GITHUB_ENV"
	          fi
	          echo >> "$GITHUB_ENV"
	          echo "EOF" >> "$GITHUB_ENV"

	      - name: Create archive of evaluation outputs
	        run: |
	          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
	          cd evaluation/evaluation_outputs/outputs # Change to the outputs directory
	          tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* # Only include the actual result directories

	      - name: Upload evaluation results as artifact
	        uses: actions/upload-artifact@v4
	        id: upload_results_artifact
	        with:
	          name: integration-test-outputs-${{ github.run_id }}-${{ github.run_attempt }}
	          path: integration_tests_*.tar.gz

	      - name: Get artifact URLs
	        run: |
	          echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> "$GITHUB_ENV"

	      # - name: Authenticate to Google Cloud
	      #   uses: 'google-github-actions/auth@v2'
	      #   with:
	      #     credentials_json: ${{ secrets.GCP_RESEARCH_OBJECT_CREATOR_SA_KEY }}

	      # Event data is handed to the script through env vars instead of being
	      # interpolated into the shell source: the dispatch reason is free text
	      # typed by whoever triggers the run.
	      - name: Set timestamp and trigger reason
	        env:
	          EVENT_NAME: ${{ github.event_name }}
	          PR_NUMBER: ${{ github.event.pull_request.number }}
	          DISPATCH_REASON: ${{ github.event.inputs.reason }}
	        run: |
	          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> "$GITHUB_ENV"
	          if [[ "$EVENT_NAME" == "pull_request" ]]; then
	            echo "TRIGGER_REASON=pr-$PR_NUMBER" >> "$GITHUB_ENV"
	          elif [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
	            # Flatten line breaks so the reason cannot add extra lines
	            # (i.e. extra variables) to $GITHUB_ENV.
	            SAFE_REASON=$(printf '%s' "$DISPATCH_REASON" | tr '\r\n' '  ')
	            echo "TRIGGER_REASON=manual-$SAFE_REASON" >> "$GITHUB_ENV"
	          else
	            echo "TRIGGER_REASON=nightly-scheduled" >> "$GITHUB_ENV"
	          fi

	      # - name: Upload evaluation results to Google Cloud Storage
	      #   uses: 'google-github-actions/upload-cloud-storage@v2'
	      #   with:
	      #     path: 'evaluation/evaluation_outputs/outputs'
	      #     destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}'

	      # - name: Cleanup remote runtimes
	      #   if: always() # run this step even if previous steps failed
	      #   env:
	      #     ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
	      #     SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
	      #   run: |
	      #     poetry run ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh

	      - name: Comment with results and artifact link
	        id: create_comment
	        uses: KeisukeYamashita/create-comment@v1
	        with:
	          # if triggered by PR, use PR number, otherwise use 9 as fallback issue number for manual triggers
	          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 9 }}
	          unique: false
	          comment: |
	            Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }}
	            Commit: ${{ github.sha }}
	            Integration Tests Report (Haiku)
	            Haiku LLM Test Results:
	            ${{ env.INTEGRATION_TEST_REPORT_HAIKU }}
	            ---
	            Integration Tests Report (DeepSeek)
	            DeepSeek LLM Test Results:
	            ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
	            ---
	            Download evaluation outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Run Integration Tests #20

Workflow file

Run Integration Tests #20

Jobs

Run details

Workflow file for this run