Release v1.4.2 (#349)
* Adds sample_workloads for LitGPT, nccl-test, basic TCPX validation
Chris113113 authored Dec 15, 2023
2 parents d02347a + 9796582 commit b0b15e0
Showing 30 changed files with 1,831 additions and 63 deletions.
2 changes: 1 addition & 1 deletion cloudbuild-continuous.yaml
@@ -46,4 +46,4 @@ images:
timeout: 5400s

substitutions:
-  _VERSION: 'v1.4.1'
+  _VERSION: 'v1.4.2'
2 changes: 1 addition & 1 deletion sample_workloads/lit-gpt-demo/LitGPT.Dockerfile
@@ -8,7 +8,7 @@ RUN apt-get update && \
iproute2 && \
rm -rf /var/lib/apt/lists/*

- RUN pip install ujson
+ RUN pip install ujson csvkit

# Prerequisite for removing GCSFuse dependency
RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | \
385 changes: 385 additions & 0 deletions sample_workloads/lit-gpt-demo/README.md

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions sample_workloads/lit-gpt-demo/build_and_push_litgpt.sh
@@ -7,12 +7,15 @@ SOME_UUID=$(uuidgen)
LITGPT_PATH=${LITGPT_PATH:="lit-gpt"}
echo $LITGPT_PATH

BASE_IMAGE=${BASE_IMAGE:="EITHER ADD HERE OR VIA ENV VAR"}
FULL_IMAGE=${FULL_IMAGE:="EITHER ADD HERE OR VIA ENV VAR"}
BASE_IMAGE=${BASE_IMAGE:="$ARTIFACT_REGISTRY/litgpt-base"}
FULL_IMAGE=${FULL_IMAGE:="$ARTIFACT_REGISTRY/litgpt-full"}

# Clone LitGPT and checkout a flash-attn enabled commit
if [ ! -d $LITGPT_PATH ]; then
git clone https://github.com/Lightning-AI/lit-gpt.git
cd lit-gpt
git checkout d5d371417ecb3d3b6c4f30837d8bb7cf2b5310ae
cd ..
LITGPT_PATH=lit-gpt
fi

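The new image defaults assume an ARTIFACT_REGISTRY environment variable pointing at your registry. A minimal sketch of an invocation (the registry path and project are placeholders, not part of the commit):

```bash
# Hypothetical usage; substitute your own Artifact Registry path.
export ARTIFACT_REGISTRY=us-docker.pkg.dev/my-project/my-repo
bash sample_workloads/lit-gpt-demo/build_and_push_litgpt.sh
```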
23 changes: 19 additions & 4 deletions sample_workloads/lit-gpt-demo/helm/templates/litgpt.yaml
@@ -1,9 +1,10 @@
{{- $requiredVar := .Values.cluster.nNodes | required ".Values.cluster.nNodes is required" -}}
{{- $requiredVar := .Values.cluster.nodePool | required ".Values.cluster.nodePool is required" -}}
{{- $requiredVar := .Values.network.ncclIfnames | required ".Values.ncclIfnames is required" -}}
- {{- $requiredVar := .Values.workload.gcsBucket | required ".Values.gcsBucket is required" -}}
  {{- $requiredVar := .Values.workload.jobTimestamp | required ".Values.jobTimestamp is required" -}}
+ {{- $requiredVar := .Values.workload.gcsExperimentBucket | required ".Values.gcsExperimentBucket is required" -}}
+ {{- $requiredVar := .Values.workload.experimentDir | required ".Values.experimentDir is required" -}}
+ {{- $requiredVar := .Values.workload.gcsDataBucket | required ".Values.gcsDataBucket is required" -}}
+ {{- $requiredVar := .Values.workload.dataDir | required ".Values.dataDir is required" -}}
{{- $requiredVar := .Values.workload.image | required ".Values.image is required" -}}
apiVersion: v1
@@ -91,7 +92,7 @@ spec:
- "bash"
- "-c"
- |
- /tcpgpudmarxd/build/app/tcpgpudmarxd --gpu_nic_preset a3vm --gpu_shmem_type fd --setup_param "--verbose 128 5 0" &
+ /tcpgpudmarxd/build/app/tcpgpudmarxd --gpu_nic_preset a3vm --gpu_shmem_type fd --setup_param "--verbose 128 2 0" &
while [ ! -e "/usr/share/litgpt/workload_terminated" ]; do sleep 10; done
securityContext:
privileged: true
@@ -145,12 +146,26 @@ spec:
value: "{{$root.Values.network.disablePmtu}}"
- name: CPU_PINNING_MODE
value: "{{$root.Values.network.cpuPinningMode}}"
-           - name: GCS_BUCKET
-             value: "{{$root.Values.workload.gcsBucket}}"
+           - name: GCS_EXPERIMENT_BUCKET
+             value: "{{$root.Values.workload.gcsExperimentBucket}}"
+           - name: EXPERIMENT_ROOT_DIR
+             value: "{{$root.Values.workload.experimentDir}}"
+           - name: GCS_DATA_BUCKET
+             value: "{{$root.Values.workload.gcsDataBucket}}"
+           - name: DATA_DIR
+             value: "{{$root.Values.workload.dataDir}}"
            - name: BATCH_SIZE
              value: "{{$root.Values.workload.batchSize}}"
            - name: MICRO_BATCH_SIZE
              value: "{{$root.Values.workload.microBatchSize}}"
+           - name: MODEL_NAME
+             value: "{{$root.Values.workload.modelName}}"
+           - name: WARMUP_ITERS
+             value: "{{$root.Values.workload.warmupIters}}"
+           - name: MAX_ITERS
+             value: "{{$root.Values.workload.maxIters}}"
+           - name: CLUSTER_TYPE
+             value: GKE
volumeMounts:
- name: nvidia-install-dir-host
mountPath: /usr/local/nvidia/lib64
Expand Down
22 changes: 22 additions & 0 deletions sample_workloads/lit-gpt-demo/helm/values.yaml
@@ -0,0 +1,22 @@
cluster:
  nNodes: 8
  nodePool: np-1
network:
  useTcpx: "yes"
  ncclIfnames: 'eth0'
  ncclPlugin: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx-dev:v3.1.6_2023_10_06
  rxdmContainer: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.9
  disablePmtu: "yes"
workload:
  jobTimestamp: # Must be defined
  gcsExperimentBucket: # Must be defined
  experimentDir: llama2-70b
  gcsDataBucket: litgpt-public-bucket
  dataDir: openwebtext_dataset
  image: us-docker.pkg.dev/gce-ai-infra/litgpt-full/litgpt
  modelName: Llama-2-70b-hf
  batchSize: 6
  microBatchSize: 6
  warmupIters: 10
  maxIters: 1000
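The two values marked "Must be defined" have no defaults, so they must be supplied at install time. A plausible launch from the repo root (the release name and bucket name are illustrative only):

```bash
# Hypothetical helm invocation; only the timestamp and experiment bucket lack defaults.
helm install litgpt sample_workloads/lit-gpt-demo/helm \
  --set workload.jobTimestamp="$(date +%s)" \
  --set workload.gcsExperimentBucket=my-experiment-bucket
```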

58 changes: 32 additions & 26 deletions sample_workloads/lit-gpt-demo/openwebtext_trainer.py
@@ -1,18 +1,17 @@
- # Vendored from https://github.com/Lightning-AI/lit-gpt/blob/main/pretrain/openwebtext_trainer.py
+ # Modified from https://github.com/Lightning-AI/lit-gpt/blob/d5d371417ecb3d3b6c4f30837d8bb7cf2b5310ae/pretrain/openwebtext_trainer.py
import math
import sys
import time
import os
from pathlib import Path
from typing import Any, Optional

import lightning as L
import numpy as np
import torch
import os
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger
- from lightning.pytorch.strategies import FSDPStrategy
+ from lightning.pytorch.strategies import FSDPStrategy, XLAStrategy
from torch.utils.data import DataLoader, IterableDataset

# support running without installing as a package
@@ -22,11 +21,11 @@
from lit_gpt import Config
from lit_gpt.model import GPT, Block
from lit_gpt.speed_monitor import SpeedMonitorCallback, estimate_flops, measure_flops
- from lit_gpt.utils import chunked_cross_entropy, get_default_supported_precision
+ from lit_gpt.utils import chunked_cross_entropy, get_default_supported_precision, step_csv_logger

model_name = "pythia-70m"
model_name = os.getenv("MODEL_NAME", "Llama-2-70b-hf")
name = "openwebtext"
out_dir = Path("out") / name
out_dir = Path(os.getenv("EXPERIMENT_LOCAL_DIR", "")) / "out"
data_dir = Path("/data")
save_interval = 1000
eval_interval = 1000
@@ -36,16 +35,16 @@

# Hyperparameters
learning_rate = 6e-4
- batch_size = int(os.getenv("BATCH_SIZE", "125"))
- micro_batch_size = int(os.getenv("MICRO_BATCH_SIZE", "5"))
+ batch_size = int(os.getenv("BATCH_SIZE", "6"))
+ micro_batch_size = int(os.getenv("MICRO_BATCH_SIZE", "6"))
gradient_accumulation_steps = batch_size // micro_batch_size
assert gradient_accumulation_steps > 0
- max_iters = 600000  # num_epochs * (epoch_size // micro_batch_size) // devices
+ max_iters = int(os.getenv("MAX_ITERS", "1000"))  # num_epochs * (epoch_size // micro_batch_size) // devices
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
decay_lr = True
- warmup_iters = 2000
+ warmup_iters = int(os.getenv("WARMUP_ITERS", "10"))
lr_decay_iters = max_iters
min_lr = 6e-5
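With these defaults, batch_size equals micro_batch_size, so gradient_accumulation_steps collapses to 1. Assuming the chart's default 8 nodes and the run script's 8 GPUs per node, the effective global batch per optimizer step is

\[
\text{global\_batch} = \text{micro\_batch\_size} \times \text{grad\_accum\_steps} \times \text{world\_size} = 6 \times 1 \times 64 = 384 \ \text{sequences.}
\]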

@@ -77,7 +76,7 @@ def on_fit_start(self) -> None:
# consider setting `self.measured_flops = estimated_flops` instead
estimated_flops = estimate_flops(meta_model) * micro_batch_size
self.print(f"Estimated TFLOPs: {estimated_flops * trainer.world_size / 1e12:.2f}")
-       x = torch.randint(0, 1, (micro_batch_size, meta_model.max_seq_length))
+       x = torch.randint(0, 1, (micro_batch_size, meta_model.config.block_size))
self.measured_flops = measure_flops(meta_model, x)
self.print(f"Measured TFLOPs: {self.measured_flops * trainer.world_size / 1e12:.2f}")

@@ -104,24 +103,29 @@ def validation_step(self, batch: Any, batch_idx: int) -> None:
self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)


- def main(devices: int = 1, precision: Optional[str] = None) -> None:
-     precision = precision or get_default_supported_precision(training=True)
+ def main(devices: int = 1, precision: Optional[str] = None, tpu: bool = False) -> None:
+     precision = precision or get_default_supported_precision(training=True, tpu=tpu)

    if devices > 1:
-       strategy = FSDPStrategy(
-           auto_wrap_policy={Block},
-           activation_checkpointing_policy={Block},
-           # the argument is not available in the Trainer strategy, but it's the default anyways
-           # state_dict_type="full",
-           limit_all_gathers=True,
-           cpu_offload=False,
-       )
+       if tpu:
+           # For multi-host TPU training, the device count for Fabric is limited to the count on a single host.
+           devices = "auto"
+           strategy = XLAStrategy(sync_module_states=False)
+       else:
+           strategy = FSDPStrategy(
+               auto_wrap_policy={Block},
+               activation_checkpointing_policy={Block},
+               # the argument is not available in the Trainer strategy, but it's the default anyways
+               # state_dict_type="full",
+               limit_all_gathers=True,
+               cpu_offload=False,
+           )
    else:
        strategy = "auto"

-   logger = CSVLogger("out", name, flush_logs_every_n_steps=log_interval)
+   logger = step_csv_logger(out_dir, name, cls=CSVLogger, flush_logs_every_n_steps=log_interval)
    speed_monitor = SpeedMonitorCallback(
-       length_fn=lambda batch: batch[0].size(1), batch_size=micro_batch_size, window_size=50, time_unit="seconds"
+       length_fn=lambda batch: batch[0].size(1), batch_size=micro_batch_size, window_size=10, time_unit="seconds"
)
model_checkpoint = ModelCheckpoint(dirpath=out_dir, every_n_train_steps=save_interval, save_last=True, verbose=True)
trainer = L.Trainer(
@@ -180,7 +184,7 @@ def __iter__(self):


# learning rate decay scheduler (cosine with warmup)
- def get_lr(it: int) -> float:
+ def get_lr(it):
# 1) linear warmup for warmup_iters steps
if it < warmup_iters:
return learning_rate * it / warmup_iters
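The decay branch is elided by the diff viewer; for reference, the cosine schedule this function implements upstream (my transcription, assuming the elided lines are unchanged) is

\[
\mathrm{lr}(it) = \text{min\_lr} + \tfrac{1}{2}\,(\text{learning\_rate} - \text{min\_lr})\left(1 + \cos\!\left(\pi\,\frac{it - \text{warmup\_iters}}{\text{lr\_decay\_iters} - \text{warmup\_iters}}\right)\right)
\]

for warmup_iters <= it <= lr_decay_iters, returning min_lr once it exceeds lr_decay_iters.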
@@ -195,8 +199,10 @@ def get_lr(it: int) -> float:


if __name__ == "__main__":
+   # Uncomment this line if you see an error: "Expected is_sm80 to be true, but got false"
+   # torch.backends.cuda.enable_flash_sdp(False)
torch.set_float32_matmul_precision("high")

from jsonargparse import CLI

- CLI(main)
+ CLI(main)
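jsonargparse's CLI() turns main()'s keyword arguments into command-line flags, which is how the run script below drives this trainer. A minimal sketch for a standalone single-node run (the path is illustrative):

```bash
# Hypothetical manual run on one 8-GPU host; flags mirror main()'s signature.
python pretrain/openwebtext_trainer.py --devices=8 --precision=bf16-true
```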
@@ -5,31 +5,31 @@ set -o pipefail

: "${MASTER_ADDR:?Must set MASTER_ADDR}"
: "${NODE_RANK:?Must set NODE_RANK}"
: "${GCS_BUCKET:?Must set GCS_BUCKET}"
: "${JOB_TIMESTAMP:?Must set JOB_TIMESTAMP}"
: "${EXPERIMENT_ROOT_DIR:?Must set EXPERIMENT_ROOT_DIR}"
: "${NNODES:?Must set NNODES}"
: "${GCS_EXPERIMENT_BUCKET:?Must set GCS_EXPERIMENT_BUCKET}"
: "${EXPERIMENT_ROOT_DIR:?Must set EXPERIMENT_ROOT_DIR}"
: "${GCS_DATA_BUCKET:?Must set GCS_DATA_BUCKET}"
: "${DATA_DIR:?Must set DATA_DIR}"
: "${CLUSTER_TYPE:='GKE'}"

+ export EXPERIMENT_LOCAL_DIR=/experiment/${EXPERIMENT_ROOT_DIR}

- EXPERIMENT_LOCAL_DIR=/experiment/${EXPERIMENT_ROOT_DIR}
mkdir -p $EXPERIMENT_LOCAL_DIR

echo $EXPERIMENT_ROOT_DIR
echo $EXPERIMENT_LOCAL_DIR

- gsutil rsync -r gs://${GCS_BUCKET}/${EXPERIMENT_ROOT_DIR}/ ${EXPERIMENT_LOCAL_DIR}/
+ gsutil rsync -r gs://${GCS_EXPERIMENT_BUCKET}/${EXPERIMENT_ROOT_DIR}/ ${EXPERIMENT_LOCAL_DIR}/

LOCAL_DATA_DIR=/data
mkdir -p $LOCAL_DATA_DIR
- gsutil -m rsync gs://${GCS_BUCKET}/${DATA_DIR} /data
+ gsutil -m rsync gs://${GCS_DATA_BUCKET}/${DATA_DIR} /data

export MASTER_PORT=6002
export GPUS_PER_NODE=8
export WORLD_SIZE=$((NNODES * GPUS_PER_NODE))

PROFILING_DIR=$EXPERIMENT_LOCAL_DIR/nsys_profiles
mkdir -p $PROFILING_DIR

LOG_DIR=$EXPERIMENT_LOCAL_DIR/training_logs
mkdir -p $LOG_DIR

@@ -57,20 +57,28 @@ set_nccl_specific_configuration() {
  export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/tcpx/lib64"
  export NCCL_GPUDIRECTTCPX_FORCE_ACK=1
  export NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000
- export NCCL_SOCKET_IFNAME=eth0
  export NCCL_DYNAMIC_CHUNK_SIZE=524288
  export NCCL_P2P_NET_CHUNKSIZE=524288
  export NCCL_P2P_PCI_CHUNKSIZE=524288
  export NCCL_P2P_NVL_CHUNKSIZE=1048576
- export NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"
- export NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,124-139;eth2:22-35,124-139;eth3:74-87,178-191;eth4:74-87,178-191"
  export NCCL_NSOCKS_PERTHREAD=4
  export NCCL_SOCKET_NTHREADS=1
  export NCCL_MAX_NCHANNELS=8
  export NCCL_MIN_NCHANNELS=8
- export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=1000000
+ export NCCL_SOCKET_IFNAME=eth0
+ export NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"
+ export NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191"
  export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4
  export NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0
+ export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=1000000
+ if [[ "$CLUSTER_TYPE" == "SLURM" ]]; then
+   echo "Overriding with SLURM Specific Envvar"
+   export NCCL_SOCKET_IFNAME=enp0s12
+   export NCCL_GPUDIRECTTCPX_TX_BINDINGS="enp6s0:8-21,112-125;enp12s0:8-21,112-125;enp134s0:60-73,164-177;enp140s0:60-73,164-177"
+   export NCCL_GPUDIRECTTCPX_RX_BINDINGS="enp6s0:22-35,126-139;enp12s0:22-35,126-139;enp134s0:74-87,178-191;enp140s0:74-87,178-191"
+   export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=enp6s0,enp12s0,enp134s0,enp140s0  # the commit reads "enp12s", apparently a typo for enp12s0
+   export NCCL_GPUDIRECTTCPX_CTRL_DEV=enp0s12
+ fi
else
  echo "NOT using TCPX"
fi
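The binding strings map each data NIC to CPU core ranges in "nic:range[,range];..." form; the RX edit above (124-139 to 126-139) keeps the RX cores disjoint from the TX range, which ends at 125. A hypothetical one-liner to eyeball a binding string:

```bash
# Illustrative only: print each NIC's core ranges from a TCPX binding string.
IFS=';' read -ra bindings <<< "$NCCL_GPUDIRECTTCPX_RX_BINDINGS"
for b in "${bindings[@]}"; do
  printf 'NIC %s -> cores %s\n' "${b%%:*}" "${b#*:}"
done
```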
@@ -115,17 +123,17 @@ non_blocking_wait() {

function on_script_completion {
# semaphore to cleanly exit hardware utilization monitor
-   touch /tmp/workload_terminated
+   touch /usr/share/litgpt/workload_terminated

echo "Uploading ${EXPERIMENT_LOCAL_DIR} to gs://${GCS_BUCKET}/${EXPERIMENT_ROOT_DIR}/"
gsutil rsync -r ${EXPERIMENT_LOCAL_DIR}/ gs://${GCS_BUCKET}/${EXPERIMENT_ROOT_DIR}/
echo "Uploading ${EXPERIMENT_LOCAL_DIR} to gs://${GCS_EXPERIMENT_BUCKET}/${EXPERIMENT_ROOT_DIR}/"
gsutil rsync -r ${EXPERIMENT_LOCAL_DIR}/ gs://${GCS_EXPERIMENT_BUCKET}/${EXPERIMENT_ROOT_DIR}/
}


trap on_script_completion EXIT

# Launch background process that samples hardware utilization
- rm -f /tmp/workload_terminated
+ rm -f /usr/share/litgpt/workload_terminated

if [[ "${DISABLE_PMTU:="yes"}" == "yes" ]]; then
echo "Disabling PMTU"
@@ -157,11 +165,11 @@ for ((LOCAL_RANK=0; LOCAL_RANK <= $((GPUS_PER_NODE - 1)); LOCAL_RANK++)); do
RANK=$RANK LOCAL_RANK=$LOCAL_RANK \
$CMD_PREFIX \
python /workspace/pretrain/openwebtext_trainer.py \
-     --devices=$GPUS_PER_NODE > >(tee "$LOG_DIR/pretrain_gpt_rank$RANK.log") 2>&1 &
+     --devices=$GPUS_PER_NODE --precision="bf16-true" > >(tee "$LOG_DIR/pretrain_gpt_rank$RANK.log") 2>&1 &
PID=$!
PIDS+=($PID)

echo "Launched pretrain_gpt.py for rank $RANK with PID $PID"
echo "Launched openwebtext_trainer.py for rank $RANK with PID $PID"
done

- wait_all_success_or_exit "${PIDS[@]}"
+ wait_all_success_or_exit "${PIDS[@]}"
26 changes: 26 additions & 0 deletions sample_workloads/lit-gpt-demo/slurm/litgpt_container.sh
@@ -0,0 +1,26 @@
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)

# Start the Lit-GPT training container
docker run \
--privileged \
--gpus all --net="host" \
-v /var/lib/tcpx/lib64:/var/lib/tcpx/lib64 \
-v ${UDS_PATH}:${UDS_PATH} \
-u 0 \
-e LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/var/lib/tcpx/lib64:/usr/lib/lib32:/usr/lib/x86_64-linux-gnu/ \
-e JOB_TIMESTAMP=$(date +%s) \
-e NNODES=$SLURM_NNODES \
-e NODE_RANK=$SLURM_NODEID \
-e MODEL_NAME='Llama-2-70b-hf' \
-e GCS_EXPERIMENT_BUCKET=litgpt-public-bucket \
-e GCS_DATA_BUCKET=litgpt-public-bucket \
-e USE_TCPX=yes \
-e CLUSTER_TYPE=SLURM \
-e EXPERIMENT_ROOT_DIR=llama-70b/training_logs \
-e DATA_DIR=openwebtext_dataset \
-e MASTER_ADDR=$MASTER_ADDR \
-e MASTER_PORT=20120 \
-e NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX=${UDS_PATH} \
-e WARMUP_ITERS=10 \
-e MAX_ITERS=1000 \
us-docker.pkg.dev/gce-ai-infra/litgpt-full/litgpt:slurm
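The wrapper derives MASTER_ADDR from the Slurm node list, so it is presumably executed once per node inside an allocation. One plausible launch pattern (node count, UDS path, and script path are placeholders):

```bash
# Hypothetical Slurm launch: one wrapper instance per node.
export UDS_PATH=/run/tcpx
srun --nodes=8 --ntasks-per-node=1 \
  bash sample_workloads/lit-gpt-demo/slurm/litgpt_container.sh
```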