# LoRA finetuning Meta Llama-3.1 on any of your own infra.
#
# Usage:
#
# HF_TOKEN=xxx sky launch lora.yaml -c llama31 --env HF_TOKEN
#
# To finetune a 70B model:
#
# HF_TOKEN=xxx sky launch lora.yaml -c llama31-70 --env HF_TOKEN --env MODEL_SIZE=70B
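#
# After the run finishes, the outputs are written to the checkpoint bucket
# (see file_mounts below); the cluster itself can then be torn down with
# `sky down llama31` (or `sky down llama31-70`).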

envs:
  MODEL_SIZE: 8B
  HF_TOKEN:
  DATASET: "yahma/alpaca-cleaned"
  # Change this to your own checkpoint bucket.
  CHECKPOINT_BUCKET_NAME: sky-llama-31-checkpoints
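  # Note: HF_TOKEN is intentionally left empty here; pass it at launch time
  # with `--env HF_TOKEN` (see the usage above) rather than hard-coding it.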

resources:
  accelerators: A100:8
  disk_tier: best
  use_spot: true
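  # Spot instances keep GPU cost down but can be preempted; set use_spot: false
  # if the run must not be interrupted.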

file_mounts:
  /configs: ./configs
  /output:
    name: $CHECKPOINT_BUCKET_NAME
    mode: MOUNT
    # Optionally, pin the bucket to one of the following stores:
    #   r2/azure/gcs/s3/cos
    # store: r2
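    # MOUNT mode mounts the bucket at /output, so everything the `run` section
    # writes there (the adapter files below) ends up in the bucket.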

setup: |
  pip install torch torchvision

  # Install torchtune from source for the latest Llama-3.1 model
  pip install git+https://github.com/pytorch/torchtune.git@58255001bd0b1e3a81a6302201024e472af05379
  # pip install torchtune
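
  # Download the HF-format checkpoint; --ignore-patterns skips the original
  # consolidated Meta-format weights, which torchtune does not need here.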
  tune download meta-llama/Meta-Llama-3.1-${MODEL_SIZE}-Instruct \
    --hf-token $HF_TOKEN \
    --output-dir /tmp/Meta-Llama-3.1-${MODEL_SIZE}-Instruct \
    --ignore-patterns "original/consolidated*"

run: |
  tune run --nproc_per_node $SKYPILOT_NUM_GPUS_PER_NODE \
    lora_finetune_distributed \
    --config /configs/${MODEL_SIZE}-lora.yaml \
    dataset.source=$DATASET
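
  # Note: arguments after --config use torchtune's key=value override syntax;
  # dataset.source=$DATASET points training at the dataset set in `envs` above.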

  # Remove the full checkpoint files to save space; LoRA serving only needs
  # the adapter files.
  rm /tmp/Meta-Llama-3.1-${MODEL_SIZE}-Instruct/*.pt
  rm /tmp/Meta-Llama-3.1-${MODEL_SIZE}-Instruct/*.safetensors

  mkdir -p /output/$MODEL_SIZE-lora
  rsync -Pavz /tmp/Meta-Llama-3.1-${MODEL_SIZE}-Instruct /output/$MODEL_SIZE-lora
  cp -r /tmp/lora_finetune_output /output/$MODEL_SIZE-lora/
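
  # The adapter files and torchtune outputs now live in the bucket under
  # ${MODEL_SIZE}-lora/ and can be pulled from there for serving.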