Skip to content

Commit

Permalink
Replace hardcoded parameters with environment variables in litgpt_container.sh (#359)
Browse files Browse the repository at this point in the history
  • Loading branch information
samcmho authored Jan 24, 2024
2 parents 7e1dfc6 + e397f1d commit 465aa7b
Showing 1 changed file with 20 additions and 4 deletions.
24 changes: 20 additions & 4 deletions sample_workloads/lit-gpt-demo/slurm/litgpt_container.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
# Check for required environment variables.
#
# MODEL_NAME, GCS_EXPERIMENT_BUCKET and EXPERIMENT_ROOT_DIR must all be set
# and non-empty. Every missing variable is reported on stderr before the
# script exits with status 1, so all of them can be fixed in a single pass.

#######################################
# Report a required environment variable that is unset or empty.
# Arguments: $1 - variable name
#            $2 - the variable's current value (may be empty)
# Outputs:   an error message on stderr when $2 is empty
# Returns:   0 if $2 is non-empty, 1 otherwise
#######################################
require_env() {
  if [ -z "$2" ]; then
    echo "Error: $1 environment variable is not set. Please set it before running the script." >&2
    return 1
  fi
}

# "${VAR:-}" keeps the checks safe even if the script is run with `set -u`.
missing_env=0
require_env MODEL_NAME "${MODEL_NAME:-}" || missing_env=1
require_env GCS_EXPERIMENT_BUCKET "${GCS_EXPERIMENT_BUCKET:-}" || missing_env=1
require_env EXPERIMENT_ROOT_DIR "${EXPERIMENT_ROOT_DIR:-}" || missing_env=1

if [ "$missing_env" -ne 0 ]; then
  exit 1
fi

# The first hostname in the Slurm job's node list is used as MASTER_ADDR.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)

# Start the Lit-GPT training container
Expand All @@ -11,16 +27,16 @@ docker run \
-e JOB_TIMESTAMP=$(date +%s) \
-e NNODES=$SLURM_NNODES \
-e NODE_RANK=$SLURM_NODEID \
-e MODEL_NAME='Llama-2-70b-hf' \
-e GCS_EXPERIMENT_BUCKET=litgpt-public-bucket \
-e MODEL_NAME=${MODEL_NAME} \
-e GCS_EXPERIMENT_BUCKET=${GCS_EXPERIMENT_BUCKET} \
-e GCS_DATA_BUCKET=litgpt-public-bucket \
-e USE_TCPX=yes \
-e CLUSTER_TYPE=SLURM \
-e EXPERIMENT_ROOT_DIR=llama-70b/training_logs \
-e EXPERIMENT_ROOT_DIR=${EXPERIMENT_ROOT_DIR} \
-e DATA_DIR=openwebtext_dataset \
-e MASTER_ADDR=$MASTER_ADDR \
-e MASTER_PORT=20120 \
-e NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX=${UDS_PATH} \
-e WARMUP_ITERS=10 \
-e MAX_ITERS=1000 \
us-docker.pkg.dev/gce-ai-infra/litgpt-full/litgpt:slurm
us-docker.pkg.dev/gce-ai-infra/litgpt-full/litgpt:slurm

0 comments on commit 465aa7b

Please sign in to comment.