# More integration tests info #33
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Runs the integration test evaluation against two LLM configurations
# (Haiku first, then DeepSeek), archives the evaluation outputs as a build
# artifact, and posts both reports in a single comment.
name: Run Integration Tests

on:
  # Fires on every "labeled" event; the job-level `if` below narrows this to
  # the 'integration-test' label.
  pull_request:
    types: [labeled]
  workflow_dispatch:
    inputs:
      reason:
        description: 'Reason for manual trigger'
        required: true
        default: ''
  schedule:
    - cron: '30 22 * * *'  # Runs at 10:30pm UTC every day

env:
  N_PROCESSES: 10  # Global configuration for number of parallel processes for evaluation

jobs:
  run-integration-tests:
    # Run for the 'integration-test' label, for manual dispatches, and for the
    # nightly schedule.
    if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
    runs-on: ubuntu-latest
    permissions:
      contents: "read"
      id-token: "write"
      pull-requests: "write"
      issues: "write"
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Poetry is installed before setup-python so that `cache: "poetry"`
      # can find it.
      - name: Install poetry via pipx
        run: pipx install poetry

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"

      # Early acknowledgement on the PR; only applies to label-triggered runs.
      - name: Comment on PR if 'integration-test' label is present
        if: github.event_name == 'pull_request' && github.event.label.name == 'integration-test'
        uses: KeisukeYamashita/create-comment@v1
        with:
          unique: false
          comment: |
            Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.

      - name: Install Python dependencies using Poetry
        run: poetry install --without evaluation,llama-index

      # Writes a fresh config.toml whose [llm.eval] section points at the
      # Haiku model; secrets reach the script through env, not interpolation.
      - name: Configure config.toml for testing with Haiku
        env:
          LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"$LLM_MODEL\"" >> config.toml
          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
          echo "temperature = 0.0" >> config.toml

      - name: Build environment
        run: make build

      # Runs the evaluation, then exports the first report.md found under the
      # Haiku output directory as a multi-line INTEGRATION_TEST_REPORT_HAIKU
      # environment variable (heredoc-style `NAME<<EOF ... EOF` in GITHUB_ENV).
      - name: Run integration test evaluation for Haiku
        env:
          SANDBOX_FORCE_REBUILD_RUNTIME: True
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run'
          # get integration tests report
          REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE: $REPORT_FILE_HAIKU"
          echo "INTEGRATION_TEST_REPORT_HAIKU<<EOF" >> $GITHUB_ENV
          cat $REPORT_FILE_HAIKU >> $GITHUB_ENV
          echo >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Wait a little bit
        run: sleep 10

      # Overwrites config.toml (first echo uses `>`) with the DeepSeek model;
      # no base_url is written for this configuration.
      - name: Configure config.toml for testing with DeepSeek
        env:
          LLM_MODEL: "deepseek/deepseek-chat"
          LLM_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
          #LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"$LLM_MODEL\"" >> config.toml
          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
          #echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
          echo "temperature = 0.0" >> config.toml

      # Same flow as the Haiku run, exporting INTEGRATION_TEST_REPORT_DEEPSEEK.
      - name: Run integration test evaluation for DeepSeek
        env:
          SANDBOX_FORCE_REBUILD_RUNTIME: True
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run'
          # get integration tests report
          REPORT_FILE_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE: $REPORT_FILE_DEEPSEEK"
          echo "INTEGRATION_TEST_REPORT_DEEPSEEK<<EOF" >> $GITHUB_ENV
          cat $REPORT_FILE_DEEPSEEK >> $GITHUB_ENV
          echo >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      # The tarball is written to the repository root (three levels up from
      # the outputs directory) so the upload step's glob can pick it up.
      - name: Create archive of evaluation outputs
        run: |
          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
          cd evaluation/evaluation_outputs/outputs  # Change to the outputs directory
          tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/*  # Only include the actual result directories

      - name: Upload evaluation results as artifact
        uses: actions/upload-artifact@v4
        id: upload_results_artifact
        with:
          name: integration-test-outputs-${{ github.run_id }}-${{ github.run_attempt }}
          path: integration_tests_*.tar.gz

      - name: Get artifact URLs
        run: |
          echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV

      # Event context is handed to the script through env instead of being
      # interpolated into the script text: the manual-trigger reason is
      # free-form user input, and inline `${{ }}` expansion would let quotes
      # or `$(...)` inside it be parsed as shell syntax.
      - name: Set timestamp and trigger reason
        env:
          EVENT_NAME: ${{ github.event_name }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          DISPATCH_REASON: ${{ github.event.inputs.reason }}
        run: |
          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV
          if [[ "$EVENT_NAME" == "pull_request" ]]; then
            echo "TRIGGER_REASON=pr-$PR_NUMBER" >> $GITHUB_ENV
          elif [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
            echo "TRIGGER_REASON=manual-$DISPATCH_REASON" >> $GITHUB_ENV
          else
            echo "TRIGGER_REASON=nightly-scheduled" >> $GITHUB_ENV
          fi

      - name: Comment with results and artifact link
        id: create_comment
        uses: KeisukeYamashita/create-comment@v1
        with:
          # If triggered by a PR, comment on that PR; otherwise (manual or
          # scheduled runs) fall back to issue number 9.
          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 9 }}
          unique: false
          comment: |
            Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }}
            Commit: ${{ github.sha }}
            **Integration Tests Report (Haiku)**
            Haiku LLM Test Results:
            ${{ env.INTEGRATION_TEST_REPORT_HAIKU }}
            ---
            **Integration Tests Report (DeepSeek)**
            DeepSeek LLM Test Results:
            ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
            ---
            Download testing outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})