Skip to content

Commit

Permalink
Merging Develop -> Main for sample_workloads changes (#366)
Browse files Browse the repository at this point in the history
  • Loading branch information
Chris113113 authored Mar 5, 2024
2 parents b0b15e0 + cee6df3 commit 6ef47c4
Show file tree
Hide file tree
Showing 16 changed files with 1,099 additions and 296 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
!**/samples/**/terraform.tfvars
**/tfplan
**/backend.tf
sample_workloads/lit_gpt_demo/lit-gpt/**
7 changes: 3 additions & 4 deletions a3/terraform/modules/cluster/gke/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,9 @@ resource "google_container_cluster" "cluster" {
# pool defined. So we create the smallest possible default node pool and
# immediately delete it. This is a best-practice suggested in the Terraform
# documentation for the container_cluster resource.
remove_default_node_pool = true
initial_node_count = 1
min_master_version = local.gke_master_version
deletion_protection = false
initial_node_count = 1
min_master_version = local.gke_master_version
deletion_protection = false

network = module.network.network_self_links[0]
subnetwork = module.network.subnetwork_self_links[0]
Expand Down
13 changes: 6 additions & 7 deletions sample_workloads/lit-gpt-demo/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# syntax=docker/dockerfile:experimental

FROM nvcr.io/nvidia/pytorch:23.09-py3
FROM nvcr.io/nvidia/pytorch:24.01-py3


# Ensure apt-get won't prompt for selecting options
Expand All @@ -18,10 +18,11 @@ RUN apt-get update && \

WORKDIR /workspace/

COPY requirements.txt requirements.txt
COPY requirements-all.txt .
COPY requirements.txt .

RUN MAX_JOBS=4 pip install 'flash-attn==2.0.4' --no-build-isolation \
&& pip install -r requirements.txt tokenizers sentencepiece ujson
RUN pip install -r requirements-all.txt tokenizers sentencepiece ujson
RUN pip install --upgrade torchvision

RUN pip install nvidia-dlprof-pytorch-nvtx nvidia-pyindex nvidia-dlprof

Expand All @@ -30,6 +31,4 @@ COPY . .
# Check install
RUN python -c "from lit_gpt.model import GPT, Block, Config" && \
python -c "import lightning as L" && \
python -c "from lightning.fabric.strategies import FSDPStrategy"


python -c "from lightning.fabric.strategies import FSDPStrategy"
3 changes: 2 additions & 1 deletion sample_workloads/lit-gpt-demo/LitGPT.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.c
&& apt-get update -y && apt-get install google-cloud-cli -y

COPY scripts /workspace/scripts
COPY openwebtext_trainer.py /workspace/pretrain/
COPY utilities /workspace/pretrain/utilities
COPY openwebtext.py /workspace/pretrain/

ENTRYPOINT ["/bin/bash", "/workspace/scripts/litgpt_container_entrypoint.sh"]

6 changes: 3 additions & 3 deletions sample_workloads/lit-gpt-demo/build_and_push_litgpt.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@ FULL_IMAGE=${FULL_IMAGE:="$ARTIFACT_REGISTRY/litgpt-full"}
# Clone LitGPT and check out a flash-attn-enabled commit
if [ ! -d $LITGPT_PATH ]; then
git clone https://github.com/Lightning-AI/lit-gpt.git
cd lit-gpt
git checkout d5d371417ecb3d3b6c4f30837d8bb7cf2b5310ae
cd ..
LITGPT_PATH=lit-gpt
fi
cd lit-gpt
git checkout 44c3c58b759fa0903ab31ed8863a66c157d5ccd9
cd ..

cp Dockerfile $LITGPT_PATH/Dockerfile

Expand Down
32 changes: 23 additions & 9 deletions sample_workloads/lit-gpt-demo/helm/templates/litgpt.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
{{- $requiredVar := .Values.cluster.nNodes | required ".Values.cluster.nNodes is required" -}}
{{- $requiredVar := .Values.cluster.nodePool | required ".Values.cluster.nodePool is required" -}}
{{- $requiredVar := .Values.network.ncclIfnames | required ".Values.ncclIfnames is required" -}}
{{- $requiredVar := .Values.workload.jobTimestamp | required ".Values.jobTimestamp is required" -}}
{{- $requiredVar := .Values.workload.gcsExperimentBucket | required ".Values.gcsExperimentBucket is required" -}}
{{- $requiredVar := .Values.workload.experimentDir | required ".Values.experimentDir is required" -}}
{{- $requiredVar := .Values.logging.jobTimestamp | required ".Values.jobTimestamp is required" -}}
{{- $requiredVar := .Values.logging.experimentDir | required ".Values.experimentDir is required" -}}
{{- $requiredVar := .Values.workload.gcsDataBucket | required ".Values.gcsDataBucket is required" -}}
{{- $requiredVar := .Values.workload.dataDir| required ".Values.dataDir is required" -}}
{{- $requiredVar := .Values.workload.image | required ".Values.image is required" -}}
Expand Down Expand Up @@ -51,6 +50,8 @@ spec:
tolerations:
- operator: "Exists"
key: nvidia.com/gpu
- operator: "Exists"
key: cloud.google.com/impending-node-termination
volumes:
- name: nvidia-install-dir-host
hostPath:
Expand All @@ -66,6 +67,9 @@ spec:
emptyDir: {}
- name: tcpx-nccl-plugin-volume
emptyDir: {}
- name: data-volume
hostPath:
path: /home/data
{{if eq $root.Values.network.useTcpx "yes"}}
initContainers:
- name: tcpx-nccl-plugin-installer
Expand Down Expand Up @@ -127,9 +131,9 @@ spec:
fieldRef:
fieldPath: status.hostIP
- name: LD_LIBRARY_PATH
value: "/usr/local/nvidia/lib64"
value: "/usr/lib/x86_64-linux-gnu:/usr/local/nvidia/lib64"
- name: JOB_TIMESTAMP
value: "{{$root.Values.workload.jobTimestamp}}"
value: "{{$root.Values.logging.jobTimestamp}}"
- name: MASTER_ADDR
value: "pytorch-leader-{{$.Release.Name}}"
- name: NCCL_SOCKET_IFNAME
Expand All @@ -147,9 +151,9 @@ spec:
- name: CPU_PINNING_MODE
value: "{{$root.Values.network.cpuPinningMode}}"
- name: GCS_EXPERIMENT_BUCKET
value: "{{$root.Values.workload.gcsExperimentBucket}}"
value: "{{$root.Values.logging.gcsExperimentBucket}}"
- name: EXPERIMENT_ROOT_DIR
value: "{{$root.Values.workload.experimentDir}}"
value: "{{$root.Values.logging.experimentDir}}"
- name: GCS_DATA_BUCKET
value: "{{$root.Values.workload.gcsDataBucket}}"
- name: DATA_DIR
Expand All @@ -162,10 +166,18 @@ spec:
value: "{{$root.Values.workload.modelName}}"
- name: WARMUP_ITERS
value: "{{$root.Values.workload.warmupIters}}"
- name: MAX_ITERS
value: "{{$root.Values.workload.maxIters}}"
- name: COLLECT_NSYS_PROFILE
value: "{{$root.Values.logging.collectNsysProfile}}"
- name: CLUSTER_TYPE
value: GKE
- name: NCCL_NVLS_ENABLE
value: '0'
- name: NCCL_DEBUG
value: "{{$root.Values.logging.ncclDebugLevel}}"
- name: NUMBER_OF_EPOCHS
value: "{{$root.Values.workload.numberOfEpochs}}"
- name: STEPS_PER_EPOCH
value: "{{$root.Values.workload.stepsPerEpoch}}"
volumeMounts:
- name: nvidia-install-dir-host
mountPath: /usr/local/nvidia/lib64
Expand All @@ -177,6 +189,8 @@ spec:
mountPath: /dev/shm
- name: workload-terminated-volume
mountPath: /usr/share/litgpt
- name: data-volume
mountPath: /data
resources:
limits:
nvidia.com/gpu: !!int 8
Expand Down
21 changes: 12 additions & 9 deletions sample_workloads/lit-gpt-demo/helm/values.yaml
Original file line number Diff line number Diff line change
@@ -1,22 +1,25 @@
cluster:
nNodes: 8
nNodes: 4
nodePool: np-1
network:
useTcpx: "yes"
ncclIfnames: 'eth0'
ncclPlugin: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx-dev:v3.1.6_2023_10_06
rxdmContainer: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.9
ncclPlugin: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx-dev:v3.1.7
rxdmContainer: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.11
disablePmtu: "yes"
workload:
jobTimestamp: # Must be defined
gcsExperimentBucket: # Must be defined
logging:
collectNsysProfile: 'no' # Set to 'yes' to collect Nsys profiles
ncclDebugLevel: WARN
gcsExperimentBucket: '' # Set to a writable GCS bucket to upload logs and Nsys Profiles
jobTimestamp: 1
experimentDir: llama2-70b
workload:
gcsDataBucket: litgpt-public-bucket
dataDir: openwebtext_dataset
image: us-docker.pkg.dev/gce-ai-infra/litgpt-full/litgpt
image: us-docker.pkg.dev/gce-ai-infra/litgpt-full/litgpt/litgpt-full:latest
modelName: Llama-2-70b-hf
batchSize: 6
microBatchSize: 6
warmupIters: 10
maxIters: 1000

numberOfEpochs: 1
stepsPerEpoch: 30
Loading

0 comments on commit 6ef47c4

Please sign in to comment.