-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
6b3039a
commit a7cd7e2
Showing
16 changed files
with
559 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
# Builds the repository's Docker image and publishes it to the GitHub
# Container Registry (ghcr.io).
#
# Pushes to main build AND publish the image. Pull requests only build it:
# publishing from a PR would overwrite the shared `latest` tag with unmerged
# code, and PRs from forks get a read-only token that cannot push at all.
name: CI

on:
  push:
    branches:
      - main

  pull_request:
    branches:
      - main

jobs:

  docker-image:
    runs-on: ubuntu-latest

    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # See explanation: https://github.com/orgs/community/discussions/25678
      - name: Clean disk
        run: |
          rm -rf /opt/hostedtoolcache

      # Registry credentials are only needed when the image is published.
      - name: Log in to the Container registry
        if: github.event_name != 'pull_request'
        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
        with:
          images: ghcr.io/kyryl-opens-ml/gpu-jobs-comparison

      - name: Build and push Docker image
        uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
        with:
          context: .
          # Build on every event, publish only for non-PR events.
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
            ${{ steps.meta.outputs.tags }}
            ghcr.io/kyryl-opens-ml/gpu-jobs-comparison:latest
          labels: ${{ steps.meta.outputs.labels }}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Training image: Hugging Face GPU base image plus the project's pinned
# dependencies and sources, rooted at /app.
FROM huggingface/transformers-pytorch-gpu:4.35.2

WORKDIR /app

ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8

# Dependencies are installed before the sources are copied, so editing code
# alone does not invalidate these layers.
COPY requirements.txt requirements.txt
RUN pip install -r requirements.txt
# Build isolation is disabled so flash-attn builds against the packages
# installed above. MAX_JOBS presumably caps parallel compile jobs (see the
# flash-attn install notes).
RUN MAX_JOBS=4 pip install flash-attn==2.5.7 --no-build-isolation

# Expose `python` as an alias for python3; -f keeps the step from failing
# when /usr/bin/python already exists in the base image.
RUN ln -sf /usr/bin/python3 /usr/bin/python

# key=value form; the whitespace-separated `ENV key value` form is legacy.
ENV PYTHONPATH=/app
COPY . .

CMD [ "bash" ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,84 @@ | ||
# gpu-jobs-comparison | ||
# gpu-jobs-comparison | ||
|
||
## TLDR | ||
|
||
![alt text](./docs/result.png) | ||
|
||
|
||
| Approach | Setup effort | Dev UX | Scalability | Price USD | | ||
|-----------|--------------|--------|-------------|-----------| | ||
| SSH | 🛋️-⚖️ | 4/5 | 1/5 | ~2.43 | | ||
| Modal | 🛋️ | 5/5 | 5/5 | ~1.65 | | ||
| K8S | ⚖️-🏋️♂️ | 3/5 | 5/5 | ~2.43 | | ||
| SageMaker | ⚖️ | 3/5 | 5/5 | ~2.24 | | ||
|
||
|
||
## Run SSH in VM | ||
|
||
```bash | ||
export HF_TOKEN=**** | ||
export WANDB_PROJECT=gpu-jobs-comparison | ||
export WANDB_API_KEY=**** | ||
export RUN_NAME=phi-3-text2sql-ssh | ||
|
||
pip install -r requirements.txt | ||
python text2sql_training.py | ||
``` | ||
|
||
## Kubernetes | ||
|
||
```bash | ||
minikube start --driver docker --container-runtime docker --gpus all | ||
|
||
export HF_TOKEN=**** | ||
export WANDB_API_KEY=**** | ||
|
||
kubectl create secret generic gpu-job-secrets --from-literal=HF_TOKEN=$HF_TOKEN --from-literal=WANDB_API_KEY=$WANDB_API_KEY | ||
kubectl create -f gpu-job/kubernetes/job-app-ml.yaml | ||
``` | ||
|
||
## Modal | ||
|
||
```bash | ||
export HF_TOKEN=**** | ||
export WANDB_PROJECT=gpu-jobs-comparison | ||
export WANDB_API_KEY=**** | ||
export RUN_NAME=phi-3-text2sql-modal | ||
|
||
pip install modal | ||
|
||
modal setup | ||
modal deploy ./gpu-job/modal/run_training_job.py | ||
|
||
python ./gpu-job/modal/run_training_job.py | ||
``` | ||
|
||
## AWS SageMaker | ||
|
||
```bash | ||
export AWS_ACCESS_KEY_ID=**** | ||
export AWS_SECRET_ACCESS_KEY=**** | ||
export AWS_DEFAULT_REGION=us-east-1 | ||
export AWS_ACCOUNT_ID=**** | ||
|
||
pip install boto3 sagemaker awscli | ||
|
||
aws iam create-role --role-name sagemaker-execution-role --assume-role-policy-document file://gpu-job/aws-sagemaker/trust-policy.json | ||
aws iam attach-role-policy --role-name sagemaker-execution-role --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess | ||
aws iam attach-role-policy --role-name sagemaker-execution-role --policy-arn arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryFullAccess | ||
|
||
|
||
aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com | ||
aws ecr create-repository --repository-name gpu-jobs-comparison | ||
docker pull ghcr.io/kyryl-opens-ml/gpu-jobs-comparison:latest | ||
docker tag ghcr.io/kyryl-opens-ml/gpu-jobs-comparison:latest $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/gpu-jobs-comparison:latest | ||
docker push $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/gpu-jobs-comparison:latest | ||
|
||
|
||
|
||
export HF_TOKEN=**** | ||
export WANDB_PROJECT=gpu-jobs-comparison | ||
export WANDB_API_KEY=**** | ||
export RUN_NAME=phi-3-text2sql-sagemaker | ||
python ./gpu-job/aws-sagemaker/run_processing.py | ||
``` |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
"""Launch the text2sql training run as a SageMaker Processing job.

All settings are read from environment variables. The job runs on a single
``ml.g5.2xlarge`` instance using the ``gpu-jobs-comparison:latest`` image
from the account's ECR registry.
"""
import os

from sagemaker.processing import Processor


def _require_env(name):
    """Return environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset. Without this check an unset
            variable would be interpolated as the text "None" into the role
            ARN / image URI, or passed as ``None`` in the job environment.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(f"Environment variable {name} must be set")
    return value


# Variables forwarded to the processing container.
env = {
    name: _require_env(name)
    for name in ("WANDB_PROJECT", "WANDB_API_KEY", "HF_TOKEN", "RUN_NAME")
}

account_id = _require_env("AWS_ACCOUNT_ID")
region = _require_env("AWS_DEFAULT_REGION")

sagemaker_role_arn = f"arn:aws:iam::{account_id}:role/sagemaker-execution-role"
image_uri = f"{account_id}.dkr.ecr.{region}.amazonaws.com/gpu-jobs-comparison:latest"

processor = Processor(
    role=sagemaker_role_arn,
    image_uri=image_uri,
    instance_count=1,
    instance_type="ml.g5.2xlarge",
    env=env,
)

# No processing inputs or outputs are declared for this job.
processing_inputs = []
processing_outputs = []

# The arguments are handed to the container as the command to execute.
processor.run(
    inputs=processing_inputs,
    outputs=processing_outputs,
    arguments=["python", "text2sql_training.py"],
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
{ | ||
"Version": "2012-10-17", | ||
"Statement": [ | ||
{ | ||
"Effect": "Allow", | ||
"Principal": { | ||
"Service": "sagemaker.amazonaws.com" | ||
}, | ||
"Action": "sts:AssumeRole" | ||
} | ||
] | ||
} |
Empty file.
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
"""Submit the text2sql training run to Vertex AI as a one-replica CustomJob."""
import os

from google.cloud import aiplatform

# Project and region come from the environment, with hard-coded fallbacks.
PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT", "gothic-doodad-323015")
LOCATION = os.getenv("GOOGLE_CLOUD_LOCATION", "us-central1")
STAGING_BUCKET = "gs://gpu-jobs-comparison"  # Replace with your staging bucket

aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=STAGING_BUCKET,
)

# Settings forwarded to the training container; an unset variable maps to None.
env = {
    name: os.getenv(name)
    for name in ("WANDB_PROJECT", "WANDB_API_KEY", "HF_TOKEN", "RUN_NAME")
}

# Container image in Artifact Registry.
# NOTE(review): the image name ends in "gpu-jobs-comparis", which looks
# truncated -- confirm against the registry before relying on it.
image_uri = "us-central1-docker.pkg.dev/gothic-doodad-323015/gpu-jobs-comparison/gpu-jobs-comparis"

# One n1-standard-4 machine with a single T4 accelerator.
machine_spec = {
    "machine_type": "n1-standard-4",
    "accelerator_type": "NVIDIA_TESLA_T4",
    "accelerator_count": 1,
}

# What the replica runs: the training script, with `env` as name/value pairs.
container_spec = {
    "image_uri": image_uri,
    "command": ["python", "text2sql_training.py"],
    "args": [],
    "env": [{"name": key, "value": value} for key, value in env.items()],
}

worker_pool_specs = [
    {
        "machine_spec": machine_spec,
        "replica_count": 1,
        "container_spec": container_spec,
    },
]

job = aiplatform.CustomJob(
    display_name="text2sql-training",
    worker_pool_specs=worker_pool_specs,
    labels={"env": "production"},  # Example label
)

# Submit the job; sync=True requests a synchronous run.
job.run(sync=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# Kubernetes Job that runs the text2sql training script in one container
# with a single NVIDIA GPU.
apiVersion: batch/v1
kind: Job
metadata:
  name: phi-3-text2sql-k8s-job
spec:
  parallelism: 1
  template:
    spec:
      restartPolicy: Never
      containers:
        - image: ghcr.io/kyryl-opens-ml/gpu-jobs-comparison:latest
          name: training
          env:
            # Credentials are read from the `gpu-job-secrets` Secret.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: gpu-job-secrets
                  key: HF_TOKEN
            - name: WANDB_PROJECT
              value: gpu-jobs-comparison
            - name: WANDB_API_KEY
              valueFrom:
                secretKeyRef:
                  name: gpu-job-secrets
                  key: WANDB_API_KEY
            # RUN_NAME is filled from the pod's own metadata.name.
            - name: RUN_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
          command:
            - "python"
          args:
            - "text2sql_training.py"
          resources:
            limits:
              nvidia.com/gpu: 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
import os | ||
|
||
import modal | ||
from modal import Image | ||
|
||
# Modal application under which the training function is registered.
app = modal.App("gpu-jobs")

# Settings read from the current process environment; unset ones map to None.
env = {
    name: os.getenv(name)
    for name in ("WANDB_PROJECT", "WANDB_API_KEY", "HF_TOKEN", "RUN_NAME")
}

# Prebuilt training image from GHCR, with the settings above applied to it.
registry_image = Image.from_registry("ghcr.io/kyryl-opens-ml/gpu-jobs-comparison:latest")
custom_image = registry_image.env(env)
|
||
|
||
@app.function(
    image=custom_image,
    gpu="a10g",
    timeout=10 * 60 * 60,  # presumably seconds, i.e. ten hours
)
def run_training():
    """Import and run the text2sql training entry point."""
    # Imported lazily, at call time, rather than when this module is loaded.
    from text2sql_training import main as train_main

    train_main()
|
||
|
||
def run_from_python():
    """Look up ``run_training`` in the "gpu-jobs" app and start it via ``spawn()``."""
    training_fn = modal.Function.lookup("gpu-jobs", "run_training")
    spawned_call = training_fn.spawn()
    print(f"Run training object: {spawned_call}")


if __name__ == "__main__":
    run_from_python()
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
"""Walk through the RunPod pod lifecycle: create, fetch, stop, resume, terminate."""
import os

import runpod

# The API key is read from the environment so it never lands in version
# control. A missing RUNPOD_API_KEY raises KeyError here.
# NOTE(review): a literal key was previously hard-coded on this line; treat
# that key as leaked and revoke it.
runpod.api_key = os.environ["RUNPOD_API_KEY"]

# Get all my pods
pods = runpod.get_pods()

# Create a pod
pod = runpod.create_pod("test", "runpod/stack", "NVIDIA GeForce RTX 3070")

# Get a specific pod. This has to follow creation: `pod` does not exist
# before it, so fetching first raised NameError.
# NOTE(review): confirm the object returned by create_pod exposes `.id`;
# if the client returns a dict, this needs to be pod["id"] throughout.
fetched_pod = runpod.get_pod(pod.id)

# Stop the pod
runpod.stop_pod(pod.id)

# Resume the pod
runpod.resume_pod(pod.id)

# Terminate the pod
runpod.terminate_pod(pod.id)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
transformers==4.42.3 | ||
datasets==2.15.0 | ||
trl==0.9.6 | ||
accelerate==0.32.1 | ||
typer==0.6.1 | ||
wandb==0.17.4 | ||
ruff==0.5.0 | ||
great-expectations==0.15.25 | ||
pytest-cov==3.0.0 | ||
peft==0.11.1 | ||
evaluate==0.4.2 | ||
packaging==23.2 | ||
ninja==1.11.1.1 |
Oops, something went wrong.