Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NCCL only multi-gpu multi-node training without MPI #426

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 8 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -188,23 +188,21 @@ else
endif
endif

# Check if OpenMPI and NCCL are available, include them if so, for multi-GPU training
# Check if NCCL is available for multi-GPU training
ifeq ($(NO_MULTI_GPU), 1)
$(info → Multi-GPU (OpenMPI + NCCL) is manually disabled)
$(info → Multi-GPU is manually disabled)
else
ifneq ($(OS), Windows_NT)
# Detect if running on macOS or Linux
ifeq ($(SHELL_UNAME), Darwin)
$(info ✗ Multi-GPU on CUDA on Darwin is not supported, skipping OpenMPI + NCCL support)
else ifeq ($(shell [ -d /usr/lib/x86_64-linux-gnu/openmpi/lib/ ] && [ -d /usr/lib/x86_64-linux-gnu/openmpi/include/ ] && echo "exists"), exists)
$(info ✓ OpenMPI found, OK to train with multiple GPUs)
NVCC_INCLUDES += -I/usr/lib/x86_64-linux-gnu/openmpi/include
NVCC_LDFLAGS += -L/usr/lib/x86_64-linux-gnu/openmpi/lib/
NVCC_LDLIBS += -lmpi -lnccl
$(info ✗ Multi-GPU on CUDA on Darwin is not supported, skipping NCCL support)
else ifeq ($(shell dpkg -l | grep -q nccl && echo "exists"), exists)
$(info ✓ NCCL found, OK to train with multiple GPUs)
NVCC_LDLIBS += -lnccl
NVCC_FLAGS += -DMULTI_GPU
else
$(info ✗ OpenMPI is not found, disabling multi-GPU support)
$(info ---> On Linux you can try install OpenMPI with `sudo apt install openmpi-bin openmpi-doc libopenmpi-dev`)
$(info ✗ NCCL is not found, disabling multi-GPU support)
$(info ---> On Linux you can try install NCCL with `sudo apt install libnccl2 libnccl-dev`)
endif
endif
endif
Expand Down
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,19 +128,21 @@ sudo apt-get -y install libcudnn9-dev-cuda-12

On top of this you need the [cuDNN frontend](https://github.com/NVIDIA/cudnn-frontend/tree/main), but this is just header files. Simply clone the repo to your disk. The Makefile currently looks for it in either your home directory or the current directory. If you have put it elsewhere, add `CUDNN_FRONTEND_PATH=/path/to/your/cudnn-frontend/include` to the `make` command-line.

**multi-GPU training**. As of April 26, 2024 there is now also support for multi-GPU training using MPI and NCCL. Make sure you install MPI, e.g. on Linux:
**multi-GPU training**. Support for multi-GPU training is availabel using NCCL. Make sure you download and install [NCCL](https://docs.nvidia.com/deeplearning/nccl/install-guide/index.html), e.g. on Linux:

```bash
sudo apt install openmpi-bin openmpi-doc libopenmpi-dev
sudo sudo apt install libnccl2 libnccl-dev
```

and then:

```bash
make train_gpt2cu
mpirun -np <number of GPUs> ./train_gpt2cu
mpirun -np <number of GPUs> bach -c './train_gpt2cu -pn <number of GPUs> -pr $OMPI_COMM_WORLD_RANK'
```

**multi-node training**. For SLURM enabled cluster, use the sample script in [scripts/run_gpt2_124M.sbatch](scripts/run_gpt2_124M.sbatch)

## experiments / sweeps

Just as an example process to sweep learning rates on a machine with 4 GPUs on TinyStories. Run a shell script `sweep.sh` (after you of course `chmod u+x sweep.sh`):
Expand Down
3 changes: 1 addition & 2 deletions dev/cuda/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ endif
# Compiler flags
CFLAGS = -O3 --use_fast_math
NVCCFLAGS = -lcublas -lcublasLt -std=c++17
MPI_PATHS = -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-gnu/openmpi/lib/

# Default rule for our CUDA files
%: %.cu
Expand Down Expand Up @@ -52,7 +51,7 @@ global_norm: global_norm.cu

# NCCL communication kernels
nccl_all_reduce: nccl_all_reduce.cu
$(NVCC) -lmpi -lnccl $(NVCCFLAGS) $(MPI_PATHS) nccl_all_reduce.cu -o nccl_all_reduce
$(NVCC) -lnccl $(NVCCFLAGS) nccl_all_reduce.cu -o nccl_all_reduce

# Run all targets
run_all: all
Expand Down
120 changes: 40 additions & 80 deletions dev/cuda/nccl_all_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,16 @@ Fills a vector with 1s on the first GPU, 2s on the second, etc.
Then aggregates the values in the resulting vectors.

Compile example:
nvcc -lmpi -lnccl -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-gnu/openmpi/lib/ -lcublas -lcublasLt nccl_all_reduce.cu -o nccl_all_reduce
nvcc -lnccl -lcublas -lcublasLt nccl_all_reduce.cu -o nccl_all_reduce

Run on 2 local GPUs (set -np to a different value to change GPU count):
mpirun -np 2 ./nccl_all_reduce
mpirun -np 2 bash -c './nccl_all_reduce $OMPI_COMM_WORLD_RANK'

*/

#include "common.h"
#include <assert.h>
#include <cuda_runtime.h>
#include <mpi.h>
#include <nccl.h>
#include <stdint.h>
#include <stdio.h>
Expand All @@ -31,99 +30,52 @@ void nccl_check(ncclResult_t status, const char *file, int line) {
}
#define ncclCheck(err) (nccl_check(err, __FILE__, __LINE__))

void mpi_check(int status, const char *file, int line) {
if (status != MPI_SUCCESS) {
char mpi_error[4096];
int mpi_error_len = 0;
assert(MPI_Error_string(status, &mpi_error[0], &mpi_error_len) ==
MPI_SUCCESS);
printf("[MPI ERROR] at file %s:%d:\n%.*s\n", file, line, mpi_error_len,
mpi_error);
exit(EXIT_FAILURE);
}
}
#define mpiCheck(err) (mpi_check(err, __FILE__, __LINE__))

// Sets a vector to a predefined value
__global__ void set_vector(float *data, int N, float value) {
int i = blockIdx.x * blockDim.x + threadIdx.x;

// Check for out-of-bounds access
if (i < N) {
data[i] = value;
}
}

size_t cdiv(size_t a, size_t b) { return (a + b - 1) / b; }

// Parameters specific to training on multiple GPUs.
typedef struct {
int process_rank; // Rank of this process among all MPI processes on all hosts. 0 if no multi-GPU.
int process_rank; // Rank of this process among all processes on all hosts. 0 if no multi-GPU.
int num_processes; // Total number of processes on all hosts. 1 if no multi-GPU.
int local_device_idx; // This process GPU index on current machine. 0 if no multi-GPU.
int device_idx; // This process GPU index on current machine. 0 if no multi-GPU.
ncclComm_t nccl_comm; // NCCL communication primitive, used for collective mutli-GPU work.
} MultiGpuConfig;

// Determine which GPU this process should use.
// Processes on the same machines use different GPU indicies. Processes on other machines don't.
// Copied from NCCL examples: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html#example-2-one-device-per-process-or-thread
int multi_gpu_get_local_device_idx(int process_rank, int num_processes) {
char hostname[1024];
hostname[1023] = '\0';
// All processes on the same machine will share the same hostname.
gethostname(hostname, 1023);
for (int i=0; i < 1024; i++) {
if (hostname[i] == '.') {
hostname[i] = '\0';
break;
}
}
uint64_t hostname_hash = 5381;
for (int c = 0; hostname[c] != '\0'; c++){ hostname_hash = ((hostname_hash << 5) + hostname_hash) ^ hostname[c]; }

// Distribute all hostname hashes to all processes.
uint64_t* all_hostsname_hashes = (uint64_t*)malloc(num_processes * sizeof(uint64_t));
all_hostsname_hashes[process_rank] = hostname_hash;
mpiCheck(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, all_hostsname_hashes, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD));

// Identify which GPU we need to use.
int local_device_idx = 0;
for (int current_process = 0; current_process < num_processes; ++current_process) {
if (current_process == process_rank) {
// Found my gpu, local_device_idx now has my target GPU index.
break;
}
if (all_hostsname_hashes[current_process] == all_hostsname_hashes[process_rank]) {
// This process ID runs on the same machine, but it's not me, skip this GPU
local_device_idx++;
}
}

free(all_hostsname_hashes);
return local_device_idx;
}

MultiGpuConfig multi_gpu_config_init(int *argc, char ***argv) {
// Initialize MPI.
MultiGpuConfig multi_gpu_config_init(int num_processes, int process_rank, int gpus_per_node, char *dfs_path) {
MultiGpuConfig result;
mpiCheck(MPI_Init(argc, argv));
mpiCheck(MPI_Comm_rank(MPI_COMM_WORLD, &result.process_rank));
mpiCheck(MPI_Comm_size(MPI_COMM_WORLD, &result.num_processes));
result.local_device_idx = multi_gpu_get_local_device_idx(result.process_rank, result.num_processes);
printf("[Process rank %d] Using GPU %d\n", result.process_rank, result.local_device_idx);
cudaCheck(cudaSetDevice(result.local_device_idx));
ncclUniqueId nccl_id;
if (result.process_rank == 0) {

result.process_rank = process_rank;
result.num_processes = num_processes;
result.device_idx = process_rank % gpus_per_node;

FILE* idFile;
static char filename[256];
snprintf(filename, sizeof(filename), "%s/ncclUniqueId.dat", dfs_path);

if (result.process_rank == 0) { // Generate the NCCL unique ID at rank 0 and write it to a file
ncclCheck(ncclGetUniqueId(&nccl_id));
idFile = fopen(filename, "wb");
assert(idFile != NULL);
fwrite(&nccl_id, sizeof(nccl_id), 1, idFile);
fclose(idFile);
} else { // Other ranks wait until the file is available and read the unique ID
do {
usleep(1000000);
idFile = fopen(filename, "rb");
if (idFile != NULL) break;
} while (idFile == NULL);
fread(&nccl_id, sizeof(nccl_id), 1, idFile);
fclose(idFile);
}
mpiCheck(MPI_Bcast((void *)&nccl_id, sizeof(nccl_id), MPI_BYTE, 0, MPI_COMM_WORLD));

printf("ProcessID:%d, NumProcess::%d, DeviceId:%d\n", result.process_rank, result.num_processes, result.device_idx);
cudaCheck(cudaSetDevice(result.device_idx));
ncclCheck(ncclCommInitRank(&result.nccl_comm, result.num_processes, nccl_id, result.process_rank));
return result;
}

void multi_gpu_config_free(const MultiGpuConfig* multi_gpu_config) {
ncclCommDestroy(multi_gpu_config->nccl_comm);
mpiCheck(MPI_Finalize());
}

float get_mean(float *arr, size_t size, int process_rank) {
Expand All @@ -134,12 +86,20 @@ float get_mean(float *arr, size_t size, int process_rank) {
return sum / size;
}

int main(int argc, char **argv) {
// CUDA kernel to set each element of the array to a specific value
__global__ void set_vector(float *array, float value, size_t num_elements) {
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num_elements) {
array[idx] = value;
}
}

int main(int argc, char *argv[]) {
// Some constants
const size_t all_reduce_buffer_size = 32 * 1024 * 1024;
const size_t threads_per_block = 1024;

MultiGpuConfig multi_gpu_config = multi_gpu_config_init(&argc, &argv);
MultiGpuConfig multi_gpu_config = multi_gpu_config_init(2, atoi(argv[1]), 8, ".");

// Allocating buffers on each of the devices.
float *all_reduce_buffer;
Expand Down
Loading