# run-cluster #245
# Workflow file for this run
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Manually triggered workflow; every parameter of a run is supplied through
# the workflow_dispatch inputs declared below.
name: run-cluster

on:
  workflow_dispatch:
    inputs:
      # Per the description of daft_version, daft_wheel_url and daft_version
      # must not both be provided.
      daft_wheel_url:
        description: Daft python-wheel URL
        type: string
        required: false
      daft_version:
        description: Daft version (errors if both this and "Daft python-wheel URL" are provided)
        type: string
        required: false
      python_version:
        description: Python version
        type: string
        required: false
        # Quoted so YAML keeps it a string instead of parsing it as a float.
        default: "3.9"
      cluster_profile:
        description: Cluster profile
        type: choice
        options:
          - medium-x86
          - debug_xs-x86
        required: false
        default: medium-x86
      working_dir:
        description: Working directory
        type: string
        required: false
        default: .github/working-dir
      # The only mandatory input.
      entrypoint_script:
        description: Entry-point python script (must be inside of the working directory)
        type: string
        required: true
      entrypoint_args:
        description: Entry-point arguments
        type: string
        required: false
        default: ""
      env_vars:
        description: Environment variables
        type: string
        required: false
        default: ""
# Single job: render a ray cluster config from the workflow inputs, bring the
# cluster up, submit the entry-point script to it, collect the logs, and tear
# the cluster down again.
jobs:
  run-command:
    runs-on: [self-hosted, linux, x64, ci-dev]
    timeout-minutes: 15  # Remove for ssh debugging
    permissions:
      id-token: write
      contents: read
    steps:
      - name: Log workflow inputs
        # The JSON is handed over through the environment instead of being
        # pasted into the script text: its double quotes would otherwise
        # terminate the shell string, and input values could inject commands.
        env:
          WORKFLOW_INPUTS: ${{ toJson(github.event.inputs) }}
        run: echo "$WORKFLOW_INPUTS"

      - name: Checkout repo
        uses: actions/checkout@v4
        with:
          fetch-depth: 1

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: us-west-2
          role-session-name: run-command-workflow

      - name: Install uv, rust, python
        uses: ./.github/actions/install
        with:
          python_version: ${{ inputs.python_version }}

      - name: Setup uv environment
        # 'ray[default]' is quoted so the shell never treats the brackets as
        # a glob pattern.
        run: |
          uv venv
          source .venv/bin/activate
          uv pip install 'ray[default]' boto3

      - name: Dynamically update ray config file
        # User-controlled inputs reach the script as environment variables so
        # that quotes or shell metacharacters in them cannot alter the command.
        env:
          INPUT_DAFT_WHEEL_URL: ${{ inputs.daft_wheel_url }}
          INPUT_DAFT_VERSION: ${{ inputs.daft_version }}
          INPUT_PYTHON_VERSION: ${{ inputs.python_version }}
          INPUT_CLUSTER_PROFILE: ${{ inputs.cluster_profile }}
          INPUT_WORKING_DIR: ${{ inputs.working_dir }}
          INPUT_ENTRYPOINT_SCRIPT: ${{ inputs.entrypoint_script }}
        # The template is read on stdin and the rendered config is written
        # with '>' (not '>>') so a leftover ray.yaml is replaced, never
        # appended to.
        run: |
          source .venv/bin/activate
          uv run \
            --python 3.12 \
            .github/ci-scripts/templatize_ray_config.py \
            --cluster-name="ray-ci-run-${{ github.run_id }}_${{ github.run_attempt }}" \
            --daft-wheel-url="$INPUT_DAFT_WHEEL_URL" \
            --daft-version="$INPUT_DAFT_VERSION" \
            --python-version="$INPUT_PYTHON_VERSION" \
            --cluster-profile="$INPUT_CLUSTER_PROFILE" \
            --working-dir="$INPUT_WORKING_DIR" \
            --entrypoint-script="$INPUT_ENTRYPOINT_SCRIPT" \
            < .github/assets/template.yaml \
            > .github/assets/ray.yaml
          cat .github/assets/ray.yaml

      - name: Download private ssh key
        # The key file lives in the runner's home directory, which outlives a
        # single run on a self-hosted runner. It is therefore overwritten
        # ('>') rather than appended to, so it always holds exactly one key.
        run: |
          KEY=$(aws secretsmanager get-secret-value --secret-id ci-github-actions-ray-cluster-key-3 --query SecretString --output text)
          mkdir -p ~/.ssh
          # Restrict permissions before the file is created, not only after.
          umask 077
          echo "$KEY" > ~/.ssh/ci-github-actions-ray-cluster-key.pem
          chmod 600 ~/.ssh/ci-github-actions-ray-cluster-key.pem

      - name: Spin up ray cluster
        run: |
          source .venv/bin/activate
          ray up .github/assets/ray.yaml -y

      - name: Setup connection to ray cluster
        # Left running in the background; the "Kill connection" step below
        # cleans up whatever is still bound to port 8265.
        run: |
          source .venv/bin/activate
          ray dashboard .github/assets/ray.yaml &

      - name: Submit job to ray cluster
        env:
          INPUT_WORKING_DIR: ${{ inputs.working_dir }}
          INPUT_ENTRYPOINT_SCRIPT: ${{ inputs.entrypoint_script }}
          INPUT_ENTRYPOINT_ARGS: ${{ inputs.entrypoint_args }}
          INPUT_ENV_VARS: ${{ inputs.env_vars }}
        run: |
          source .venv/bin/activate
          if [[ -z "$INPUT_ENTRYPOINT_SCRIPT" ]]; then
            echo 'Invalid command submitted; command cannot be empty'
            exit 1
          fi
          python .github/ci-scripts/job_runner.py \
            --working-dir="$INPUT_WORKING_DIR" \
            --entrypoint-script="$INPUT_ENTRYPOINT_SCRIPT" \
            --entrypoint-args="$INPUT_ENTRYPOINT_ARGS" \
            --env-vars="$INPUT_ENV_VARS" \
            --enable-ray-tracing

      - name: Download log files from ray cluster
        # Runs even when an earlier step failed: a failed job is exactly when
        # the cluster logs are needed.
        if: always()
        # The remote path is quoted so the glob is not expanded against the
        # runner's own /tmp. The find/mv pass then replaces ':' with '_' in
        # every downloaded file or directory name (deepest entries first).
        run: |
          source .venv/bin/activate
          ray rsync-down .github/assets/ray.yaml '/tmp/ray/session_*/logs' ray-daft-logs
          find ray-daft-logs -depth -name '*:*' -exec bash -c '
            for filepath; do
              dir=$(dirname "$filepath")
              base=$(basename "$filepath")
              new_base=${base//:/_}
              mv "$filepath" "$dir/$new_base"
            done
          ' _ {} +

      - name: Kill connection to ray cluster
        if: always()
        # 'lsof' exits non-zero when nothing uses the port; '|| true' keeps
        # that from aborting the step under the default 'bash -e' shell.
        # Each PID is handled separately because lsof may report several.
        run: |
          for pid in $(lsof -t -i:8265 || true); do
            echo "Process $pid is listening on port 8265; killing it..."
            if kill -9 "$pid"; then
              echo "Process $pid killed successfully"
            else
              echo "Failed to kill process $pid"
            fi
          done

      - name: Spin down ray cluster
        # Must always run so a failed or cancelled job never leaves a cluster
        # behind.
        if: always()
        run: |
          source .venv/bin/activate
          ray down .github/assets/ray.yaml -y

      - name: Upload log files
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: ray-daft-logs
          path: ray-daft-logs