Skip to content

Commit

Permalink
Merge branch 'mpi' into refactor-mpi
Browse files Browse the repository at this point in the history
  • Loading branch information
ggerganov authored Jul 9, 2023
2 parents 9da9d26 + 03cc12b commit 81c5ddd
Show file tree
Hide file tree
Showing 22 changed files with 725 additions and 463 deletions.
33 changes: 33 additions & 0 deletions .devops/full-cuda.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1

# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip

COPY requirements.txt requirements.txt

RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1

RUN make

ENTRYPOINT ["/app/.devops/tools.sh"]
32 changes: 32 additions & 0 deletions .devops/main-cuda.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
apt-get install -y build-essential

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1

RUN make

FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

COPY --from=build /app/main /main

ENTRYPOINT [ "/main" ]
22 changes: 16 additions & 6 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@ on:
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']

env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GGML_NLOOP: 3
GGML_NITER: 1
GGML_N_THREADS: 1

jobs:
ubuntu-focal-make:
Expand Down Expand Up @@ -64,7 +67,7 @@ jobs:
id: cmake_test
run: |
cd build
ctest --verbose
ctest --verbose --timeout 900
ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest
Expand Down Expand Up @@ -99,11 +102,17 @@ jobs:
id: cmake_test
run: |
cd build
ctest --verbose
ctest --verbose --timeout 900
ubuntu-latest-cmake-mpi:
runs-on: ubuntu-latest

continue-on-error: true

strategy:
matrix:
mpi_library: [mpich, libopenmpi-dev]

steps:
- name: Clone
id: checkout
Expand All @@ -113,7 +122,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential mpich
sudo apt-get install build-essential ${{ matrix.mpi_library }}
- name: Build
id: cmake_build
Expand Down Expand Up @@ -175,10 +184,11 @@ jobs:
id: cmake_test
run: |
cd build
ctest --verbose
ctest --verbose --timeout 900
windows-latest-cmake:
runs-on: windows-latest

env:
OPENBLAS_VERSION: 0.3.23
OPENCL_VERSION: 2023.04.17
Expand Down Expand Up @@ -277,7 +287,7 @@ jobs:
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible
run: |
cd build
ctest -C Release --verbose
ctest -C Release --verbose --timeout 900
- name: Get commit hash
id: commit
Expand Down
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,9 @@ if (LLAMA_BLAS)
message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
add_compile_options(${BLAS_LINKER_FLAGS})
add_compile_definitions(GGML_USE_OPENBLAS)
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
add_compile_definitions(GGML_BLAS_USE_MKL)
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})

Expand Down
8 changes: 7 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,12 @@ ifdef LLAMA_CUBLAS
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
OBJS += ggml-cuda.o
NVCC = nvcc
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
NVCCFLAGS = --forward-unknown-to-host-compiler
ifdef CUDA_DOCKER_ARCH
NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
else
NVCCFLAGS += -arch=native
endif # CUDA_DOCKER_ARCH
ifdef LLAMA_CUDA_FORCE_DMMV
NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # LLAMA_CUDA_FORCE_DMMV
Expand All @@ -196,6 +201,7 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
else
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
endif

ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
endif # LLAMA_CUBLAS
Expand Down
41 changes: 39 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -734,7 +734,7 @@ export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH

For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.

Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.
Place your desired model into the `~/llama.cpp/models/` directory and execute the `./main (...)` script.

### Docker

Expand Down Expand Up @@ -770,6 +770,38 @@ or with a light image:
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
```

### Docker With CUDA

Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.

#### Building Locally

```bash
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
```

You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.

The defaults are:

- `CUDA_VERSION` set to `11.7.1`
- `CUDA_DOCKER_ARCH` set to `all`

The resulting images, are essentially the same as the non-CUDA images:

1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.

#### Usage

After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
```bash
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
```
### Contributing
- Contributors can open PRs
Expand All @@ -790,5 +822,10 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /mode
### Docs
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
- [main](./examples/main/README.md)
- [server](./examples/server/README.md)
- [embd-input](./examples/embd-input/README.md)
- [jeopardy](./examples/jeopardy/README.md)
- [BLIS](./docs/BLIS.md)
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
1 change: 1 addition & 0 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -828,6 +828,7 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:


SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
'BF16': DT_BF16,
'F16': DT_F16,
'F32': DT_F32,
'I32': DT_I32,
Expand Down
24 changes: 18 additions & 6 deletions examples/baby-llama/baby-llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,17 @@ float frand_normal(struct random_normal_distribution * rnd) {
return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
}

void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

if (plan.work_size > 0) {
buf.resize(plan.work_size);
plan.work_data = buf.data();
}

ggml_graph_compute(graph, &plan);
}

struct ggml_tensor * randomize_tensor(
struct ggml_tensor * tensor,
int ndims,
Expand Down Expand Up @@ -1569,6 +1580,8 @@ int main(int argc, char ** argv) {
int n_tokens = model.hparams.n_ctx;
int n_vocab = model.hparams.n_vocab;

std::vector<uint8_t> work_buffer;

for (int ex=0; ex<n_examples; ++ex) {
struct ggml_init_params params = {
/*.mem_size =*/ compute_size,
Expand All @@ -1586,7 +1599,6 @@ int main(int argc, char ** argv) {
int n_past = 0;

ggml_cgraph gf = {};
gf.n_threads = 1;

get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);

Expand All @@ -1595,7 +1607,7 @@ int main(int argc, char ** argv) {
struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);

ggml_build_forward_expand(&gf, e);
ggml_graph_compute(ctx0, &gf);
ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

float error_before_opt = ggml_get_f32_1d(e, 0);

Expand All @@ -1611,7 +1623,7 @@ int main(int argc, char ** argv) {
ggml_opt(ctx0, opt_params_lbfgs, e);
//
ggml_build_forward_expand(&gf, e);
ggml_graph_compute(ctx0, &gf);
ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

float error_after_opt = ggml_get_f32_1d(e, 0);

Expand Down Expand Up @@ -1659,13 +1671,12 @@ int main(int argc, char ** argv) {
struct ggml_context * ctx0 = ggml_init(params);

ggml_cgraph gf = {};
gf.n_threads = 1;

int n_past = 0;
struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);

ggml_build_forward_expand(&gf, logits);
ggml_graph_compute(ctx0, &gf);
ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
Expand All @@ -1687,10 +1698,11 @@ int main(int argc, char ** argv) {
}

print_matrix(model.tok_embeddings);

printf("done\n");

// ggml_free(kv_self.ctx);
// ggml_free(model_lora.ctx);
ggml_free(model.ctx);

return 0;
}
29 changes: 20 additions & 9 deletions examples/benchmark/benchmark-matmult.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,17 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

if (plan.work_size > 0) {
buf.resize(plan.work_size);
plan.work_data = buf.data();
}

ggml_graph_compute(graph, &plan);
}

float tensor_sum_elements(const ggml_tensor * tensor) {
float sum = 0;
if (tensor->type==GGML_TYPE_F32) {
Expand Down Expand Up @@ -159,13 +170,14 @@ int main(int argc, char ** argv) {
// printf("Creating compute graph\n");
struct ggml_cgraph gf = ggml_build_forward(m11xm2);

gf.n_threads=benchmark_params.n_threads;
printf("cgraph->n_threads=%i\n",gf.n_threads);
printf("n_threads=%i\n", benchmark_params.n_threads);

TENSOR_DUMP(m11);
TENSOR_DUMP(m2);

ggml_graph_compute(ctx, &gf);
std::vector<uint8_t> work_buffer;

ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads);

TENSOR_DUMP(gf.nodes[0]);

Expand All @@ -187,7 +199,6 @@ int main(int argc, char ** argv) {

// printf("Creating compute graph\n");
struct ggml_cgraph gf31 = ggml_build_forward(q31);
gf31.n_threads=benchmark_params.n_threads;

// Set up a second graph computation to make sure we override the CPU cache lines
// printf("Creating new tensor q12 & Running quantize\n");
Expand All @@ -199,8 +210,7 @@ int main(int argc, char ** argv) {

//printf("Creating compute graph\n");
struct ggml_cgraph gf32 = ggml_build_forward(q32);
gf32.n_threads=benchmark_params.n_threads;
printf("cgraph->n_threads=%i\n",gf31.n_threads);
printf("n_threads=%i\n", benchmark_params.n_threads);

const int dimx = sizex;
const int dimy = sizey;
Expand All @@ -221,14 +231,15 @@ int main(int argc, char ** argv) {

long long int start = ggml_time_us();
//printf("Running ggml_graph_compute\n");
ggml_graph_compute(ctx, &gf31);
ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads);

long long int stop = ggml_time_us();
long long int usec = stop-start;
double gflops = (double)(flops_per_matrix)/usec/1000.0;
gflops_sum += gflops;
printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
i,
gf31.n_threads,
benchmark_params.n_threads,
sizex, sizey, sizez, flops_per_matrix,
usec,gflops);

Expand All @@ -253,7 +264,7 @@ int main(int argc, char ** argv) {
}

// Running a different graph computation to make sure we override the CPU cache lines
ggml_graph_compute(ctx, &gf32);
ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads);
}
printf("\n");
printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
Expand Down
Loading

0 comments on commit 81c5ddd

Please sign in to comment.