From 8e6735ec60dc9399bf6070397a007f3343935f34 Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 17 Dec 2023 21:21:07 +0100 Subject: [PATCH 01/21] llama : initial ggml-backend integration --- Makefile | 2 +- ggml-alloc.c | 11 +- ggml-backend.c | 49 +++- ggml-backend.h | 4 + ggml-cuda.cu | 57 ++-- ggml.c | 24 +- ggml.h | 13 +- llama.cpp | 772 ++++++++++++++++++------------------------------- 8 files changed, 386 insertions(+), 546 deletions(-) diff --git a/Makefile b/Makefile index 8273f84004df6..512407a1de87b 100644 --- a/Makefile +++ b/Makefile @@ -65,7 +65,7 @@ test: $(TEST_TARGETS) ./$$test_target; \ fi; \ if [ $$? -ne 0 ]; then \ - printf 'Test $$test_target FAILED!\n\n' $$test_target; \ + printf 'Test %s FAILED!\n\n' $$test_target; \ failures=$$(( failures + 1 )); \ else \ printf 'Test %s passed.\n\n' $$test_target; \ diff --git a/ggml-alloc.c b/ggml-alloc.c index d3049efb497a0..0a2b80f31f6dc 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd if (update_backend) { view->backend = view->view_src->backend; } - view->buffer = view->view_src->buffer; + // views are initialized in the alloc buffer rather than the view_src buffer + view->buffer = alloc->buffer; view->data = (char *)view->view_src->data + view->view_offs; - // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend - // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft); if (!alloc->measure) { @@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) { } void ggml_allocr_free(ggml_allocr_t alloc) { + if (alloc == NULL) { + return; + } + ggml_gallocr_free(alloc->galloc); ggml_tallocr_free(alloc->talloc); free(alloc); @@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } if (nbytes == 0) { - fprintf(stderr, "%s: no tensors to allocate\n", __func__); + //fprintf(stderr, "%s: no tensors to allocate\n", __func__); return NULL; } diff --git a/ggml-backend.c b/ggml-backend.c index 3a22cd085eac0..2551915c33fa9 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -378,7 +378,6 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { free(buffer->context); - GGML_UNUSED(buffer); } static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { @@ -456,7 +455,7 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty } ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { - static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = { + static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = { /* .iface = */ { /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, @@ -466,8 +465,50 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { /* .context = */ NULL, }; - return &ggml_backend_buffer_type_cpu; + return &ggml_backend_cpu_buffer_type; +} + +#ifdef GGML_USE_CPU_HBM +#include + +// HBM buffer type +static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { + hbw_free(buffer->context); +} + +static ggml_backend_buffer_t 
ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + //void * ptr = hbw_malloc(size); + void * ptr; + int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size); + if (result != 0) { + fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size); + return NULL; + } + + // FIXME: this is a hack to avoid having to implement a new buffer type + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer; + + return buffer; +} + +struct ggml_backend_buffer_type_i cpu_backend_hbm_buffer_type_interface = { + /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend, +}; + +ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = { + /* .iface = */ cpu_backend_hbm_buffer_type_interface, + /* .context = */ NULL, + }; + + return &ggml_backend_cpu_buffer_type_hbm; } +#endif struct ggml_backend_cpu_context { int n_threads; @@ -505,7 +546,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); - cpu_plan->cgraph = *cgraph; + cpu_plan->cgraph = *cgraph; // FIXME: deep copy if (cpu_plan->cplan.work_size > 0) { cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); diff --git a/ggml-backend.h b/ggml-backend.h index 58d5ccae6ed10..a52dc367b6954 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -76,6 +76,10 @@ extern "C" { GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void); +#ifdef GGML_USE_CPU_HBM + GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); +#endif + // // Backend registry // diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 0a63c1ecf3bfb..f11fc4e93cba0 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -7057,6 +7057,7 @@ inline void ggml_cuda_op_upscale( (void) src1; (void) dst; + (void) src1_dd; } inline void ggml_cuda_op_pad( @@ -7073,6 +7074,7 @@ inline void ggml_cuda_op_pad( (void) src1; (void) dst; + (void) src1_dd; } inline void ggml_cuda_op_rms_norm( @@ -8958,7 +8960,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { char * buf; CUDA_CHECK(cudaMalloc(&buf, size)); - char * buf_host = (char*)data + offset_split; + char * buf_host = (char *)data + offset_split; // set padding to 0 to avoid possible NaN values if (size > original_size) { @@ -9103,11 +9105,10 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra(); - const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || - tensor->op == GGML_OP_VIEW; + const bool inplace = tensor->view_src != nullptr; - if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; + if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) { + ggml_tensor_extra_gpu * src0_extra = 
(ggml_tensor_extra_gpu * ) tensor->view_src->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; size_t view_offset = 0; if (tensor->op == GGML_OP_VIEW) { @@ -9431,9 +9432,12 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); - CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice)); + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; - UNUSED(buffer); + ggml_cuda_set_device(ctx->device); + CUDA_CHECK(cudaDeviceSynchronize()); + + CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice)); } static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { @@ -9441,9 +9445,12 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); - CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost)); + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; - UNUSED(buffer); + ggml_cuda_set_device(ctx->device); + CUDA_CHECK(cudaDeviceSynchronize()); + + CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost)); } static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { @@ -9505,7 +9512,7 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t UNUSED(buft); } -static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = { +static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, @@ -9513,27 +9520,27 @@ static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = { }; ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { - static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES]; - static bool ggml_backend_buffer_type_cuda_initialized = false; - if (!ggml_backend_buffer_type_cuda_initialized) { + static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES]; + + static bool ggml_backend_cuda_buffer_type_initialized = false; + + if (!ggml_backend_cuda_buffer_type_initialized) { for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) { - ggml_backend_buffer_type_cuda[i] = { - /* .iface = */ cuda_backend_buffer_type_interface, + ggml_backend_cuda_buffer_types[i] = { + /* .iface = */ ggml_backend_cuda_buffer_type_interface, /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i, }; } - ggml_backend_buffer_type_cuda_initialized = true; + ggml_backend_cuda_buffer_type_initialized = true; } - return &ggml_backend_buffer_type_cuda[device]; + return &ggml_backend_cuda_buffer_types[device]; } // host buffer type static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; - CUDA_CHECK(cudaFreeHost(ctx->dev_ptr)); - delete ctx; + CUDA_CHECK(cudaFreeHost(buffer->context)); } static ggml_backend_buffer_t 
ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { @@ -9546,11 +9553,9 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer; return buffer; - - UNUSED(buft); } -struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = { +struct ggml_backend_buffer_type_i ggml_backend_cuda_host_buffer_type_interface = { /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, @@ -9558,12 +9563,12 @@ struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = { }; ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { - static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = { - /* .iface = */ cuda_backend_host_buffer_type_interface, + static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = { + /* .iface = */ ggml_backend_cuda_host_buffer_type_interface, /* .context = */ nullptr, }; - return &ggml_backend_buffer_type_cuda_host; + return &ggml_backend_cuda_buffer_type_host; } // backend diff --git a/ggml.c b/ggml.c index ad546a7314a14..61725e1e9daae 100644 --- a/ggml.c +++ b/ggml.c @@ -2383,20 +2383,8 @@ size_t ggml_get_mem_size(const struct ggml_context * ctx) { size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) { size_t max_size = 0; - struct ggml_object * obj = ctx->objects_begin; - - while (obj != NULL) { - if (obj->type == GGML_OBJECT_TENSOR) { - struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs); - - const size_t size = ggml_nbytes(tensor); - - if (max_size < size) { - max_size = size; - } - } - - obj = obj->next; + for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) { + max_size = MAX(max_size, ggml_nbytes(tensor)); } return max_size; @@ -3093,7 +3081,7 @@ struct ggml_tensor * ggml_view_tensor( return result; } -struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) { +struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) { struct ggml_object * obj = ctx->objects_begin; char * const mem_buffer = ctx->mem_buffer; @@ -3109,7 +3097,7 @@ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) { return NULL; } -struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) { +struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) { struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE); obj = obj->next; @@ -19179,6 +19167,10 @@ char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) { return ctx->infos[i].name.data; } +enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) { + return ctx->infos[i].type; +} + // returns the index static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) { const int idx = gguf_find_key(ctx, key); diff --git a/ggml.h b/ggml.h index 68f7833b62343..91c5a5ddf42b1 100644 --- a/ggml.h +++ b/ggml.h @@ -729,8 +729,8 @@ extern "C" { GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src); // Context tensor enumeration and lookup - GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx); - 
GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor); + GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx); + GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); @@ -2123,10 +2123,11 @@ extern "C" { GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id); GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i); - GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx); - GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name); - GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i); - GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i); + GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx); + GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name); + GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i); + GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i); + GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i); // overrides existing values or adds a new one GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val); diff --git a/llama.cpp b/llama.cpp index fd9fd6ed9e008..0a360c92c7f3d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4,8 +4,8 @@ #include "unicode.h" #include "ggml.h" - #include "ggml-alloc.h" +#include "ggml-backend.h" #ifdef GGML_USE_CUBLAS # include "ggml-cuda.h" @@ -697,38 +697,6 @@ static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * // llama helpers // -inline void * llama_host_malloc(size_t n) { -#ifdef GGML_USE_CUBLAS - if (ggml_cublas_loaded()) { - return ggml_cuda_host_malloc(n); - } else { - return malloc(n); - } -#elif GGML_USE_METAL - return ggml_metal_host_malloc(n); -#elif GGML_USE_CPU_HBM - return hbw_malloc(n); -#else - return malloc(n); -#endif -} - -inline void llama_host_free(void * ptr) { -#ifdef GGML_USE_CUBLAS - if (ggml_cublas_loaded()) { - return ggml_cuda_host_free(ptr); - } else { - return free(ptr); - } -#elif GGML_USE_METAL - return ggml_metal_host_free(ptr); -#elif GGML_USE_CPU_HBM - return hbw_free(ptr); -#else - return free(ptr); -#endif -} - #if defined(_WIN32) static std::string llama_format_win_err(DWORD err) { LPSTR buf; @@ -743,40 +711,10 @@ static std::string llama_format_win_err(DWORD err) { } #endif -struct llama_buffer { - void * data = NULL; - size_t size = 0; - - // fallback to malloc / free - // useful in cases where CUDA can try to allocate PINNED memory - bool fallback = false; - - void resize(size_t n) { - llama_host_free(data); - - data = llama_host_malloc(n); - if (!data) { - fallback = true; - data = malloc(n); - } else { - fallback = false; - } - - GGML_ASSERT(data); - size = n; - } - - ~llama_buffer() { - if (data) { - if (fallback) { // NOLINT - free(data); - } else { - llama_host_free(data); - } - } - - data = NULL; - } +template +struct no_init { + T value; + no_init() { /* do nothing */ } }; struct llama_file { @@ -874,7 +812,7 @@ struct llama_mmap { if (prefetch) { flags |= MAP_POPULATE; } #endif addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); - if (addr == MAP_FAILED) { + if (addr 
== MAP_FAILED) { // NOLINT throw std::runtime_error(format("mmap failed: %s", strerror(errno))); } @@ -1333,13 +1271,9 @@ struct llama_kv_cache { struct ggml_context * ctx = NULL; - llama_buffer buf; + ggml_backend_buffer_t buf = NULL; ~llama_kv_cache() { - if (ctx) { - ggml_free(ctx); - } - #ifdef GGML_USE_CUBLAS if (ggml_cublas_loaded()) { for (size_t i = 0; i < k_l.size(); ++i) { @@ -1348,6 +1282,11 @@ struct llama_kv_cache { } } #endif + if (ctx) { + ggml_free(ctx); + } + + ggml_backend_buffer_free(buf); } }; @@ -1387,11 +1326,11 @@ struct llama_vocab { id special_suffix_id = 32008; id special_eot_id = 32010; - int find_bpe_rank(std::string token_left, std::string token_right) const { - GGML_ASSERT(token_left.find(" ") == std::string::npos); - GGML_ASSERT(token_left.find("\n") == std::string::npos); - GGML_ASSERT(token_right.find(" ") == std::string::npos); - GGML_ASSERT(token_right.find("\n") == std::string::npos); + int find_bpe_rank(const std::string & token_left, const std::string & token_right) const { + GGML_ASSERT(token_left.find(' ') == std::string::npos); + GGML_ASSERT(token_left.find('\n') == std::string::npos); + GGML_ASSERT(token_right.find(' ') == std::string::npos); + GGML_ASSERT(token_right.find('\n') == std::string::npos); auto it = bpe_ranks.find(std::make_pair(token_left, token_right)); if (it == bpe_ranks.end()) { @@ -1432,7 +1371,7 @@ struct llama_model { struct ggml_context * ctx = NULL; // the model memory buffer - llama_buffer buf; + ggml_backend_buffer_t buf = NULL; // model memory mapped file std::unique_ptr mapping; @@ -1448,10 +1387,6 @@ struct llama_model { int64_t t_start_us = 0; ~llama_model() { - if (ctx) { - ggml_free(ctx); - } - #ifdef GGML_USE_CUBLAS if (ggml_cublas_loaded()) { for (size_t i = 0; i < tensors_by_name.size(); ++i) { @@ -1466,24 +1401,26 @@ struct llama_model { ggml_cl_free_data(tensors_by_name[i].second); } #endif + if (ctx) { + ggml_free(ctx); + } + + ggml_backend_buffer_free(buf); } }; struct llama_context { llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {} ~llama_context() { -#ifdef GGML_USE_METAL - if (ctx_metal) { - ggml_metal_free(ctx_metal); - } -#endif - if (alloc) { - ggml_allocr_free(alloc); - } + ggml_allocr_free(alloc); + ggml_backend_buffer_free(buf_alloc); + ggml_backend_free(backend); } llama_cparams cparams; + ggml_backend_t backend = nullptr; + const llama_model & model; // key + value cache for the self attention @@ -1514,19 +1451,11 @@ struct llama_context { // input embedding (1-dimensional array: [n_embd]) std::vector embedding; - // reusable buffer for `struct ggml_graph_plan.work_data` - std::vector work_buffer; - // memory buffers used to evaluate the model - llama_buffer buf_compute; - - llama_buffer buf_alloc; + std::vector buf_compute_meta; + ggml_backend_buffer_t buf_alloc = NULL; ggml_allocr * alloc = NULL; -#ifdef GGML_USE_METAL - ggml_metal_context * ctx_metal = NULL; -#endif - #ifdef GGML_USE_MPI ggml_mpi_context * ctx_mpi = NULL; #endif @@ -1543,13 +1472,11 @@ static bool llama_kv_cache_init( ggml_type vtype, uint32_t n_ctx, int n_gpu_layers, - bool offload) { + bool offload, + ggml_backend_buffer_type_t buft) { const uint32_t n_embd = hparams.n_embd_gqa(); const uint32_t n_layer = hparams.n_layer; - const int64_t n_mem = n_layer*n_ctx; - const int64_t n_elements = n_embd*n_mem; - cache.has_shift = false; cache.head = 0; @@ -1559,13 +1486,10 @@ static bool llama_kv_cache_init( cache.cells.clear(); cache.cells.resize(n_ctx); - 
cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead()); - memset(cache.buf.data, 0, cache.buf.size); - struct ggml_init_params params; - params.mem_size = cache.buf.size; - params.mem_buffer = cache.buf.data; - params.no_alloc = false; + params.mem_size = 2u*n_layer*ggml_tensor_overhead(); + params.mem_buffer = NULL; + params.no_alloc = true; cache.ctx = ggml_init(params); @@ -1579,9 +1503,7 @@ static bool llama_kv_cache_init( cache.k_l.reserve(n_layer); cache.v_l.reserve(n_layer); - const int i_gpu_start = (int) n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start); - - GGML_UNUSED(offload); + const int i_gpu_start = (int) n_layer - n_gpu_layers; for (int i = 0; i < (int) n_layer; i++) { ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx); @@ -1594,19 +1516,33 @@ static bool llama_kv_cache_init( if (i >= i_gpu_start) { if (offload) { ggml_cuda_assign_buffers_no_scratch(k); - vram_kv_cache += ggml_nbytes(k); ggml_cuda_assign_buffers_no_scratch(v); + vram_kv_cache += ggml_nbytes(k); vram_kv_cache += ggml_nbytes(v); + // HACK: mark tensor as allocated, but crash if we try to use it from the CPU + k->data = v->data = (void *)(uintptr_t)1; } } #endif // GGML_USE_CUBLAS } + // allocate tensors + cache.buf = ggml_backend_alloc_ctx_tensors_from_buft(cache.ctx, buft); + + // buf may be NULL with full offload + if (cache.buf) { + // TODO: ggml_backend_buffer_memset + // this is only valid with CPU buffers! + memset(ggml_backend_buffer_get_base(cache.buf), 0, ggml_backend_buffer_get_size(cache.buf)); + } + if (vram_kv_cache > 0) { LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); } GGML_UNUSED(n_gpu_layers); + GGML_UNUSED(i_gpu_start); + GGML_UNUSED(offload); return true; } @@ -2057,17 +1993,16 @@ struct llama_model_loader { enum ggml_type type_max = GGML_TYPE_F32; for (int i = 0; i < n_tensors; i++) { - const char * name = gguf_get_tensor_name(ctx_gguf, i); - struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name); + enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i); - n_type[meta->type]++; + n_type[type]++; - if (n_type_max < n_type[meta->type]) { - n_type_max = n_type[meta->type]; - type_max = meta->type; + if (n_type_max < n_type[type]) { + n_type_max = n_type[type]; + type_max = type; } - LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str()); + //LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str()); } switch (type_max) { @@ -2209,30 +2144,11 @@ struct llama_model_loader { return ggml_get_tensor(ctx_meta, get_tensor_name(i)); } - void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const { - ctx_size_p = 0; - mmapped_size_p = 0; - - for (int i = 0; i < n_tensors; i++) { - struct ggml_tensor * meta = get_tensor_meta(i); - ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; - (use_mmap ? 
mmapped_size_p : ctx_size_p) += ggml_nbytes_pad(meta); - } - } - struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) { - if (backend != GGML_BACKEND_CPU) { - ggml_set_no_alloc(ctx, true); - } - struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta); tensor->backend = backend; // TODO: ggml_set_backend ggml_set_name(tensor, ggml_get_name(meta)); - if (backend != GGML_BACKEND_CPU) { - ggml_set_no_alloc(ctx, use_mmap); - } - n_created++; return tensor; @@ -2290,91 +2206,113 @@ struct llama_model_loader { return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx); } + void init_mapping(struct ggml_context * ctx) { + if (use_mmap) { + size_t size_pref = 0; // prefetch + + for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { + struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); + if (cur->backend == GGML_BACKEND_CPU) { + size_t tensor_end = gguf_get_tensor_offset(ctx_gguf, i) + ggml_nbytes(cur); + size_pref = std::max(size_pref, tensor_end); + } + } + mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa())); + } + } + + // for backwards compatibility only void load_data_for(struct ggml_tensor * cur) const { const size_t offs = file_offset(ggml_get_name(cur)); if (use_mmap) { - cur->data = (uint8_t *) mapping->addr + offs; + cur->data = (uint8_t *)mapping->addr + offs; } else { file.seek(offs, SEEK_SET); file.read_raw(cur->data, ggml_nbytes(cur)); } } - void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { - size_t size_data = 0; + void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t mmap_buf, llama_mlock * lmlock) { size_t size_lock = 0; - size_t size_pref = 0; // prefetch + size_t size_data = 0; for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); size_data += ggml_nbytes(cur); - if (cur->backend == GGML_BACKEND_CPU) { - size_pref += ggml_nbytes(cur); - } } if (use_mmap) { - mapping.reset(new llama_mmap(&file, size_pref, ggml_is_numa())); if (lmlock) { lmlock->init(mapping->addr); } } + std::vector> read_buf; + size_t done_size = 0; for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); GGML_ASSERT(cur); // unused tensors should have been caught by load_data already - if (progress_callback) { - progress_callback((float) done_size / size_data, progress_callback_user_data); - } - - // allocate temp buffer if not using mmap - if (!use_mmap && cur->data == NULL) { - GGML_ASSERT(cur->backend != GGML_BACKEND_CPU); - #ifdef GGML_USE_CPU_HBM - cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur)); - #else - cur->data = (uint8_t*)malloc(ggml_nbytes(cur)); - #endif - } - - load_data_for(cur); + const size_t offs = file_offset(ggml_get_name(cur)); switch (cur->backend) { case GGML_BACKEND_CPU: + if (use_mmap) { + if (mmap_buf) { + ggml_backend_tensor_alloc(mmap_buf, cur, (uint8_t *)mapping->addr + offs); + } else { + ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + offs, 0, ggml_nbytes(cur)); + } + } else { + file.seek(offs, SEEK_SET); + file.read_raw(cur->data, ggml_nbytes(cur)); + } + if (use_mmap && lmlock) { size_lock += ggml_nbytes(cur); lmlock->grow_to(size_lock); } break; 
-#ifdef GGML_USE_CUBLAS + case GGML_BACKEND_GPU: - case GGML_BACKEND_GPU_SPLIT: - // old code: - //ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor); - - // TODO: test if this works !! - ggml_cuda_transform_tensor(cur->data, cur); - if (!use_mmap) { - free(cur->data); + case GGML_BACKEND_GPU_SPLIT: { + // HACK: mark tensor as allocated, but crash if we try to use it from the CPU + cur->data = (void *)(uintptr_t)1; + void * data; + if (use_mmap) { + data = (uint8_t *)mapping->addr + offs; + } else { + read_buf.resize(ggml_nbytes(cur)); + file.seek(offs, SEEK_SET); + file.read_raw(read_buf.data(), ggml_nbytes(cur)); + data = read_buf.data(); } - break; + +#ifdef GGML_USE_CUBLAS + ggml_cuda_transform_tensor(data, cur); #elif defined(GGML_USE_CLBLAST) - case GGML_BACKEND_GPU: - ggml_cl_transform_tensor(cur->data, cur); - if (!use_mmap) { - free(cur->data); - } - break; + GGML_ASSERT(cur->backend == GGML_BACKEND_GPU); + ggml_cl_transform_tensor(data, cur); +#else + GGML_ASSERT(!"GPU tensor without a GPU backend"); + GGML_UNUSED(data); #endif + } break; + default: continue; } done_size += ggml_nbytes(cur); + + if (progress_callback) { + progress_callback((float) done_size / size_data, progress_callback_user_data); + } } + + // TODO: unmap GPU tensors } }; @@ -2942,6 +2880,20 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } } +// TODO: metal should be disabled with ngl=0 -> cpu_buffer_type +static ggml_backend_buffer_type_t llama_default_buffer_type() { +#ifdef GGML_USE_METAL + return ggml_backend_metal_buffer_type(); +#elif GGML_USE_CUBLAS + printf("Using " GGML_CUDA_NAME " host buffer type\n"); + return ggml_backend_cuda_host_buffer_type(); +#elif GGML_USE_CPU_HBM + return ggml_backend_cpu_hbm_buffer_type(); +#else + return ggml_backend_cpu_buffer_type(); +#endif +} + static void llm_load_tensors( llama_model_loader & ml, llama_model & model, @@ -2958,25 +2910,16 @@ static void llm_load_tensors( model.n_gpu_layers = n_gpu_layers; - size_t ctx_size; - size_t mmapped_size; + size_t ctx_size = ggml_tensor_overhead() * ml.n_tensors; - ml.calc_sizes(ctx_size, mmapped_size); - - LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0); + LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0); // create the ggml context { - model.buf.resize(ctx_size); - if (use_mlock) { - model.mlock_buf.init (model.buf.data); - model.mlock_buf.grow_to(model.buf.size); - } - struct ggml_init_params params = { - /*.mem_size =*/ model.buf.size, - /*.mem_buffer =*/ model.buf.data, - /*.no_alloc =*/ ml.use_mmap, + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, }; model.ctx = ggml_init(params); @@ -2987,7 +2930,7 @@ static void llm_load_tensors( (void) main_gpu; - enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU; + enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU; enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU; #ifdef GGML_USE_CUBLAS @@ -2995,17 +2938,16 @@ static void llm_load_tensors( LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__); ggml_cuda_set_main_device(main_gpu); - llama_backend_offload = GGML_BACKEND_GPU; + llama_backend_offload = GGML_BACKEND_GPU; llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT; } #elif defined(GGML_USE_CLBLAST) LLAMA_LOG_INFO("%s: 
using OpenCL for GPU acceleration\n", __func__); - llama_backend_offload = GGML_BACKEND_GPU; + llama_backend_offload = GGML_BACKEND_GPU; llama_backend_offload_split = GGML_BACKEND_GPU; #endif - // prepare memory for the weights - size_t vram_weights = 0; + // create tensors for the weights { const int64_t n_embd = hparams.n_embd; const int64_t n_embd_gqa = hparams.n_embd_gqa(); @@ -3034,13 +2976,6 @@ static void llm_load_tensors( model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } } const uint32_t n_ff = hparams.n_ff; @@ -3090,28 +3025,6 @@ static void llm_load_tensors( layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split); } } - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + - (layer.bq ? ggml_nbytes(layer.bq) : 0) + - (layer.bk ? ggml_nbytes(layer.bk) : 0) + - (layer.bv ? ggml_nbytes(layer.bv) : 0) + - (layer.bo ? ggml_nbytes(layer.bo) : 0) + - ggml_nbytes(layer.ffn_norm); - - if (layer.ffn_gate_inp == nullptr) { - vram_weights += - ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); - } else { - vram_weights += ggml_nbytes(layer.ffn_gate_inp); - for (uint32_t x = 0; x < hparams.n_expert; ++x) { - vram_weights += - ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]); - } - } - } } } break; case LLM_ARCH_BAICHUAN: @@ -3131,13 +3044,6 @@ static void llm_load_tensors( model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } } const uint32_t n_ff = hparams.n_ff; @@ -3164,19 +3070,10 @@ static void llm_load_tensors( layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); - } } } break; case LLM_ARCH_FALCON: { - // TODO: CPU-only for now - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output @@ -3195,14 +3092,6 @@ static void llm_load_tensors( model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); model.output = 
ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - vram_weights += ggml_nbytes(model.output_norm_b); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } } const uint32_t n_ff = hparams.n_ff; @@ -3223,11 +3112,6 @@ static void llm_load_tensors( if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) { layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend); layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(layer.attn_norm_2); - vram_weights += ggml_nbytes(layer.attn_norm_2_b); - } } layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); @@ -3235,13 +3119,6 @@ static void llm_load_tensors( layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + - ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) + - ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); - } } } break; case LLM_ARCH_STARCODER: @@ -3265,14 +3142,6 @@ static void llm_load_tensors( model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - vram_weights += ggml_nbytes(model.output_norm_b); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } } const uint32_t n_ff = hparams.n_ff; @@ -3304,16 +3173,6 @@ static void llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + - ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + - ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + - ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + - ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b) + - ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b); - } } } break; case LLM_ARCH_PERSIMMON: @@ -3335,14 +3194,6 @@ static void llm_load_tensors( model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - vram_weights += ggml_nbytes(model.output_norm_b); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } } const uint32_t n_ff = hparams.n_ff; 
@@ -3372,8 +3223,6 @@ static void llm_load_tensors( } break; case LLM_ARCH_BLOOM: { - // TODO: CPU-only for now - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); @@ -3394,14 +3243,6 @@ static void llm_load_tensors( model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - vram_weights += ggml_nbytes(model.output_norm_b); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } } const uint32_t n_ff = hparams.n_ff; @@ -3433,16 +3274,6 @@ static void llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + - ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + - ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + - ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + - ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) + - ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b); - } } } break; case LLM_ARCH_MPT: @@ -3464,13 +3295,6 @@ static void llm_load_tensors( model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } } const uint32_t n_ff = hparams.n_ff; @@ -3493,16 +3317,6 @@ static void llm_load_tensors( layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + - ggml_nbytes(layer.wqkv) + - ggml_nbytes(layer.wo) + - ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.ffn_down) + - ggml_nbytes(layer.ffn_up); - } } } break; case LLM_ARCH_STABLELM: @@ -3525,13 +3339,6 @@ static void llm_load_tensors( model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } } const uint32_t n_ff = hparams.n_ff; @@ -3563,13 +3370,6 @@ static void llm_load_tensors( layer.ffn_gate = ml.create_tensor(ctx, 
tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); - } } } break; case LLM_ARCH_QWEN: @@ -3589,14 +3389,7 @@ static void llm_load_tensors( model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } - } + } const uint32_t n_ff = hparams.n_ff / 2; @@ -3621,13 +3414,6 @@ static void llm_load_tensors( layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + - ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) + - ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); - } } } break; @@ -3638,14 +3424,63 @@ static void llm_load_tensors( ml.done_getting_tensors(); + ml.init_mapping(ctx); + + // allocate tensors + size_t vram_weights = 0; + size_t buf_size = 0; + + ggml_backend_buffer_type_t buft = llama_default_buffer_type(); + + for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + if (t->backend == GGML_BACKEND_CPU) { + buf_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), ggml_backend_buft_get_alignment(buft)); + } else { + vram_weights += ggml_nbytes(t); + } + } + + + // create backend buffer + bool sys_mem_buf = false; + +#ifdef GGML_USE_METAL + if (ml.use_mmap) { + model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size); + } else { + model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type()); + sys_mem_buf = true; + } +#else + // CPU backend, and indirectly CUDA and OpenCL + if (ml.use_mmap) { + model.buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size); + } else { + // allocate only CPU tensors + model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size); + ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(model.buf); + for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + if (t->backend == GGML_BACKEND_CPU) { + ggml_tallocr_alloc(alloc, t); + } + } + ggml_tallocr_free(alloc); + sys_mem_buf = true; + } +#endif + + if (use_mlock && sys_mem_buf) { + // TODO: CPU/metal only + model.mlock_buf.init (ggml_backend_buffer_get_base(model.buf)); + model.mlock_buf.grow_to(ggml_backend_buffer_get_size(model.buf)); + } + // print memory requirements { - // this is the total memory required to 
run the inference - size_t mem_required = - ctx_size + - mmapped_size - vram_weights; // weights in VRAM not in memory + size_t sys_mem_required = ctx_size + buf_size; - LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0); #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); @@ -3655,35 +3490,29 @@ static void llm_load_tensors( LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__); } -#ifdef GGML_USE_CUBLAS - const int max_backend_supported_layers = hparams.n_layer + 1; - const int max_offloadable_layers = hparams.n_layer + 1; -#elif GGML_USE_CLBLAST const int max_backend_supported_layers = hparams.n_layer + 1; const int max_offloadable_layers = hparams.n_layer + 1; -#endif // GGML_USE_CUBLAS LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); - LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0); #else - (void) n_gpu_layers; + GGML_UNUSED(n_gpu_layers); + GGML_UNUSED(vram_weights); + GGML_UNUSED(tensor_split); #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) } - // populate `tensors_by_name` + // populate tensors_by_name for (int i = 0; i < ml.n_tensors; ++i) { struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i)); model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); } - (void) tensor_split; #ifdef GGML_USE_CUBLAS - { - ggml_cuda_set_tensor_split(tensor_split); - } -#endif + ggml_cuda_set_tensor_split(tensor_split); +#endif // GGML_USE_CUBLAS - ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL); + // TODO: only pass buf if it is a mmap buffer + ml.load_all_data(ctx, progress_callback, progress_callback_user_data, model.buf, use_mlock ? &model.mlock_mmap : NULL); if (progress_callback) { progress_callback(1.0f, progress_callback_user_data); @@ -4112,7 +3941,7 @@ struct llm_build_context { const llm_build_cb & cb; - llama_buffer & buf_compute; + std::vector & buf_compute_meta; struct ggml_context * ctx0 = nullptr; @@ -4122,35 +3951,35 @@ struct llm_build_context { const llama_batch & batch, const llm_build_cb & cb, bool worst_case) : - model (lctx.model), - hparams (model.hparams), - cparams (lctx.cparams), - batch (batch), - kv_self (lctx.kv_self), - n_embd (hparams.n_embd), - n_layer (hparams.n_layer), - n_ctx (cparams.n_ctx), - n_head (hparams.n_head), - n_head_kv (hparams.n_head_kv), - n_embd_head (hparams.n_embd_head()), - n_embd_gqa (hparams.n_embd_gqa()), - n_expert (hparams.n_expert), - n_expert_used (hparams.n_expert_used), - freq_base (cparams.rope_freq_base), - freq_scale (cparams.rope_freq_scale), - ext_factor (cparams.yarn_ext_factor), - attn_factor (cparams.yarn_attn_factor), - beta_fast (cparams.yarn_beta_fast), - beta_slow (cparams.yarn_beta_slow), - norm_eps (hparams.f_norm_eps), - norm_rms_eps (hparams.f_norm_rms_eps), - n_tokens (batch.n_tokens), - n_kv (worst_case ? n_ctx : kv_self.n), - kv_head (worst_case ? 
n_ctx - n_tokens : kv_self.head), - n_orig_ctx (cparams.n_yarn_orig_ctx), - do_rope_shift (worst_case || kv_self.has_shift), - cb (cb), - buf_compute (lctx.buf_compute) { + model (lctx.model), + hparams (model.hparams), + cparams (lctx.cparams), + batch (batch), + kv_self (lctx.kv_self), + n_embd (hparams.n_embd), + n_layer (hparams.n_layer), + n_ctx (cparams.n_ctx), + n_head (hparams.n_head), + n_head_kv (hparams.n_head_kv), + n_embd_head (hparams.n_embd_head()), + n_embd_gqa (hparams.n_embd_gqa()), + n_expert (hparams.n_expert), + n_expert_used (hparams.n_expert_used), + freq_base (cparams.rope_freq_base), + freq_scale (cparams.rope_freq_scale), + ext_factor (cparams.yarn_ext_factor), + attn_factor (cparams.yarn_attn_factor), + beta_fast (cparams.yarn_beta_fast), + beta_slow (cparams.yarn_beta_slow), + norm_eps (hparams.f_norm_eps), + norm_rms_eps (hparams.f_norm_rms_eps), + n_tokens (batch.n_tokens), + n_kv (worst_case ? n_ctx : kv_self.n), + kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), + n_orig_ctx (cparams.n_yarn_orig_ctx), + do_rope_shift (worst_case || kv_self.has_shift), + cb (cb), + buf_compute_meta (lctx.buf_compute_meta) { GGML_ASSERT(!!kv_self.ctx); // all initializations should be done in init() @@ -4158,8 +3987,8 @@ struct llm_build_context { void init() { struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), /*.no_alloc =*/ true, }; @@ -6063,10 +5892,11 @@ static int llama_decode_internal( #ifdef GGML_USE_CUBLAS + char * buf_alloc_base = (char *)ggml_backend_buffer_get_base(lctx.buf_alloc); for (int i = 0; i < gf->n_leafs; i++) { ggml_tensor * node = gf->leafs[i]; if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data); + ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base); ggml_cuda_copy_to_device(node); } } @@ -6074,7 +5904,7 @@ static int llama_decode_internal( for (int i = 0; i < gf->n_nodes; i++) { ggml_tensor * node = gf->nodes[i]; if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data); + ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base); } } @@ -6107,14 +5937,14 @@ static int llama_decode_internal( #endif #ifdef GGML_USE_METAL - if (lctx.ctx_metal) { - ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); - ggml_metal_graph_compute(lctx.ctx_metal, gf); - } else { - ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); + if (ggml_backend_is_metal(lctx.backend)) { + ggml_backend_metal_set_n_cb(lctx.backend, n_threads); } #else - ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); + if (ggml_backend_is_cpu(lctx.backend)) { + ggml_backend_cpu_set_n_threads(lctx.backend, n_threads); + } + ggml_backend_graph_compute(lctx.backend, gf); #endif #if GGML_USE_MPI @@ -6184,7 +6014,7 @@ static int llama_decode_internal( logits_out.resize(n_vocab); memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); #ifndef NDEBUG - logits_valid[n_tokens - 1] = true; + logits_valid[0] = true; #endif } } @@ -8152,12 +7982,6 @@ void llama_beam_search(llama_context * ctx, // quantization // -template -struct no_init { - T value; - no_init() { /* do nothing */ } -}; - struct quantize_state_internal { const llama_model & model; const 
llama_model_quantize_params * params; @@ -8481,7 +8305,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } tensor->data = read_data.data(); } - ml.load_data_for(tensor); + GGML_ASSERT(!"not implemented"); + //ml.load_data_for(tensor); TODO LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", ++idx, ml.n_tensors, @@ -8709,7 +8534,8 @@ static int llama_apply_lora_from_file_internal( size_t ctx_size; size_t mmapped_size; - ml->calc_sizes(ctx_size, mmapped_size); + GGML_ASSERT(!"not implemented"); + //ml->calc_sizes(ctx_size, mmapped_size); base_buf.resize(ctx_size); @@ -8837,7 +8663,8 @@ static int llama_apply_lora_from_file_internal( } base_t = ml->create_tensor(base_ctx.get(), base_name, { dest_t->ne[0], dest_t->ne[1] }, GGML_BACKEND_CPU); - ml->load_data_for(base_t); + GGML_ASSERT(!"not implemented"); + //ml->load_data_for(base_t); // TODO } else { base_t = dest_t; } @@ -9121,7 +8948,23 @@ struct llama_context * llama_new_context_with_model( // reserve memory for context buffers if (!hparams.vocab_only) { - if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) { + // initialize backend +#ifdef GGML_USE_METAL + if (model->n_gpu_layers > 0) { + ctx->backend = ggml_backend_metal_init(); + } +#endif + if (ctx->backend == nullptr) { + ctx->backend = ggml_backend_cpu_init(); + } + + if (ctx->backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize backend\n", __func__); + delete ctx; + return nullptr; + } + + if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv, llama_default_buffer_type())) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; @@ -9157,12 +9000,11 @@ struct llama_context * llama_new_context_with_model( } { - static const size_t tensor_alignment = 32; // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data - ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead()); + ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead()); // create measure allocator - ctx->alloc = ggml_allocr_new_measure(tensor_alignment); + ctx->alloc = ggml_allocr_new_measure_from_backend(ctx->backend); // build worst-case graph int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch); @@ -9170,33 +9012,16 @@ struct llama_context * llama_new_context_with_model( llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0)); -#ifdef GGML_USE_METAL - if (model->n_gpu_layers > 0) { - ctx->ctx_metal = ggml_metal_init(1); - if (!ctx->ctx_metal) { - LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__); - llama_free(ctx); - return NULL; - } - //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false); - //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); - } -#endif // measure memory requirements for the graph - size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment; + size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf); - LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, 
(ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute_meta.size() + alloc_size) / 1024.0 / 1024.0); - // recreate allocator with exact memory requirements + // create allocator again with exact memory requirements ggml_allocr_free(ctx->alloc); - ctx->buf_alloc.resize(alloc_size); - ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment); -#ifdef GGML_USE_METAL - if (ctx->ctx_metal) { - //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); - } -#endif + ctx->buf_alloc = ggml_backend_alloc_buffer(ctx->backend, alloc_size); + ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc); #ifdef GGML_USE_CUBLAS ggml_cuda_set_scratch_size(alloc_size); LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0); @@ -9229,39 +9054,6 @@ struct llama_context * llama_new_context_with_model( ctx_vram_size / 1024.0 / 1024.0); #endif } - -#ifdef GGML_USE_METAL - if (model->n_gpu_layers > 0) { - // this allocates all Metal resources and memory buffers - - void * data_ptr = NULL; - size_t data_size = 0; - - if (ctx->model.mapping) { - data_ptr = ctx->model.mapping->addr; - data_size = ctx->model.mapping->size; - } else { - data_ptr = ggml_get_mem_buffer(ctx->model.ctx); - data_size = ggml_get_mem_size (ctx->model.ctx); - } - - const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); - - LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0); - -#define LLAMA_METAL_CHECK_BUF(result) \ - if (!(result)) { \ - LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \ - llama_free(ctx); \ - return NULL; \ - } - - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0)); -#undef LLAMA_METAL_CHECK_BUF - } -#endif } #ifdef GGML_USE_MPI @@ -9549,7 +9341,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) { const size_t s_embedding = ctx->embedding.size() * sizeof(float); const size_t s_kv_size = sizeof(size_t); const size_t s_kv_ntok = sizeof(int); - const size_t s_kv = ctx->kv_self.buf.size; + const size_t s_kv = ggml_backend_buffer_get_size(ctx->kv_self.buf); const size_t s_total = ( + s_rng_size @@ -9677,7 +9469,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat const auto n_embd = hparams.n_embd_gqa(); const auto n_ctx = cparams.n_ctx; - const size_t kv_buf_size = kv_self.buf.size; + const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf); const uint32_t kv_head = kv_self.head; const uint32_t kv_size = kv_self.size; const uint32_t kv_used = kv_self.used; @@ -9717,7 +9509,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d)); } - ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); + std::vector work_buffer; + ggml_graph_compute_helper(work_buffer, gf, ctx->cparams.n_threads); ggml_free(cpy_ctx); @@ -9824,7 +9617,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used); if (kv_buf_size) { - GGML_ASSERT(kv_self.buf.size == 
kv_buf_size); + GGML_ASSERT(ggml_backend_buffer_get_size(kv_self.buf) == kv_buf_size); const size_t elt_size = ggml_element_size(kv_self.k_l[0]); @@ -9852,7 +9645,8 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d)); } - ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); + std::vector work_buffer; + ggml_graph_compute_helper(work_buffer, gf, ctx->cparams.n_threads); ggml_free(cpy_ctx); } From 0808aa5a42c5154a21a411ea04522fcaf6804ce6 Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 19 Dec 2023 03:23:00 +0100 Subject: [PATCH 02/21] add ggml-metal --- ggml-cuda.cu | 8 --- ggml-metal.h | 3 + ggml-metal.m | 180 +++++++++++++++++++++++++++++++++++++++++++-------- llama.cpp | 33 ++++++---- 4 files changed, 176 insertions(+), 48 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index f11fc4e93cba0..dafa9282aeb5e 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -9428,8 +9428,6 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g } static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; @@ -9441,8 +9439,6 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg } static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; @@ -9600,8 +9596,6 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0])); @@ -9611,8 +9605,6 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0])); diff --git a/ggml-metal.h b/ggml-metal.h index bf52d9cd34da4..b5e02b668a0f7 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -98,7 +98,10 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void); GGML_API bool 
ggml_backend_is_metal(ggml_backend_t backend); +GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size); + GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb); + GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); // helper to check if the device supports a specific family diff --git a/ggml-metal.m b/ggml-metal.m index 465679a6bb0c8..4944ac3cdc853 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -607,12 +607,24 @@ int ggml_metal_if_optimized(struct ggml_metal_context * ctx) { } // temporarily defined here for compatibility between ggml-backend and the old API -struct ggml_backend_metal_buffer_context { - void * data; + +struct ggml_backend_metal_buffer { + void * data; + size_t size; id metal; }; +struct ggml_backend_metal_buffer_context { + void * all_data; + size_t all_size; + bool owned; + + // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap + int n_buffers; + struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; +}; + // finds the Metal buffer that contains the tensor data on the GPU device // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer @@ -622,17 +634,29 @@ int ggml_metal_if_optimized(struct ggml_metal_context * ctx) { const int64_t tsize = ggml_nbytes(t); + ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer; + // compatibility with ggml-backend - if (t->buffer && t->buffer->buft == ggml_backend_metal_buffer_type()) { - struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) t->buffer->context; + if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) { + struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context; + + // find the view that contains the tensor fully + for (int i = 0; i < buf_ctx->n_buffers; ++i) { + const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data; + + //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size); + if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) { + *offs = (size_t) ioffs; - const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->data; + //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs); - GGML_ASSERT(ioffs >= 0 && ioffs + tsize <= (int64_t) t->buffer->size); + return buf_ctx->buffers[i].metal; + } + } - *offs = (size_t) ioffs; + GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name); - return buf_ctx->metal; + return nil; } // find the view that contains the tensor fully @@ -2361,6 +2385,7 @@ void ggml_metal_graph_compute( // backend interface +// default buffer static id g_backend_device = nil; static int g_backend_device_ref_count = 0; @@ -2388,34 +2413,33 @@ static void ggml_backend_metal_free_device(void) { static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; - return ctx->data; + return ctx->all_data; } static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; - [ctx->metal 
release]; + for (int i = 0; i < ctx->n_buffers; i++) { + [ctx->buffers[i].metal release]; + } ggml_backend_metal_free_device(); - free(ctx->data); + if (ctx->owned) { + free(ctx->all_data); + } + free(ctx); UNUSED(buffer); } static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - memcpy((char *)tensor->data + offset, data, size); UNUSED(buffer); } static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - memcpy(data, (const char *)tensor->data + offset, size); UNUSED(buffer); @@ -2433,7 +2457,7 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer UNUSED(buffer); } -static struct ggml_backend_buffer_i metal_backend_buffer_i = { +static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer, /* .get_base = */ ggml_backend_metal_buffer_get_base, /* .init_tensor = */ NULL, @@ -2443,6 +2467,8 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer /* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to, }; +// default buffer type + static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context)); @@ -2453,13 +2479,30 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba size_aligned += (size_page - (size_aligned % size_page)); } - ctx->data = ggml_metal_host_malloc(size); - ctx->metal = [ggml_backend_metal_get_device() newBufferWithBytesNoCopy:ctx->data + id device = ggml_backend_metal_get_device(); + + ctx->all_data = ggml_metal_host_malloc(size_aligned); + ctx->all_size = size_aligned; + ctx->owned = true; + ctx->n_buffers = 1; + + ctx->buffers[0].data = ctx->all_data; + ctx->buffers[0].size = size; + ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; - return ggml_backend_buffer_init(buft, metal_backend_buffer_i, ctx, size); + if (ctx->buffers[0].metal == nil) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, "default", size_aligned / 1024.0 / 1024.0); + free(ctx); + ggml_backend_metal_free_device(); + return NULL; + } + + GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB\n", __func__, "default", size_aligned / 1024.0 / 1024.0); + + return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size); } static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { @@ -2470,7 +2513,7 @@ static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_t static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend); - GGML_UNUSED(buft); + UNUSED(buft); } ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { @@ -2487,6 +2530,91 @@ 
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { return &ggml_backend_buffer_type_metal; } +// buffer from ptr + +ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) { + struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context)); + + ctx->all_data = data; + ctx->all_size = size; + ctx->owned = false; + ctx->n_buffers = 0; + + const size_t size_page = sysconf(_SC_PAGESIZE); + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } + + id device = ggml_backend_metal_get_device(); + + const char * name = "from_ptr"; + + // the buffer fits into the max buffer size allowed by the device + if (size_aligned <= device.maxBufferLength) { + ctx->buffers[ctx->n_buffers].name = name; + ctx->buffers[ctx->n_buffers].data = data; + ctx->buffers[ctx->n_buffers].size = size; + + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; + + if (ctx->buffers[ctx->n_buffers].metal == nil) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + return false; + } + + GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + + ++ctx->n_buffers; + } else { + // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into + // one of the views + const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case + const size_t size_step = device.maxBufferLength - size_ovlp; + const size_t size_view = device.maxBufferLength; + + for (size_t i = 0; i < size; i += size_step) { + const size_t size_step_aligned = (i + size_view <= size) ? 
size_view : (size_aligned - i); + + ctx->buffers[ctx->n_buffers].name = name; + ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); + ctx->buffers[ctx->n_buffers].size = size_step_aligned; + + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; + + if (ctx->buffers[ctx->n_buffers].metal == nil) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); + return false; + } + + GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); + if (i + size_step < size) { + GGML_METAL_LOG_INFO("\n"); + } + + ++ctx->n_buffers; + } + } + +#if TARGET_OS_OSX + GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", + device.currentAllocatedSize / 1024.0 / 1024.0, + device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + + if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) { + GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); + } else { + GGML_METAL_LOG_INFO("\n"); + } +#else + GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0); +#endif + + return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size); +} + +// backend + static const char * ggml_backend_metal_name(ggml_backend_t backend) { return "Metal"; @@ -2499,10 +2627,6 @@ static void ggml_backend_metal_free(ggml_backend_t backend) { free(backend); } -static void ggml_backend_metal_synchronize(ggml_backend_t backend) { - UNUSED(backend); -} - static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) { return ggml_backend_metal_buffer_type(); @@ -2529,8 +2653,8 @@ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct /* .get_tensor_async = */ NULL, /* .cpy_tensor_from_async = */ NULL, /* .cpy_tensor_to_async = */ NULL, - /* .synchronize = */ ggml_backend_metal_synchronize, - /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_metal_graph_compute, diff --git a/llama.cpp b/llama.cpp index 0a360c92c7f3d..5cb23230abd98 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2881,17 +2881,20 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { } // TODO: metal should be disabled with ngl=0 -> cpu_buffer_type -static ggml_backend_buffer_type_t llama_default_buffer_type() { +static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) { #ifdef GGML_USE_METAL - return ggml_backend_metal_buffer_type(); + if (n_gpu_layers > 0) { + return ggml_backend_metal_buffer_type(); + } #elif GGML_USE_CUBLAS - printf("Using " GGML_CUDA_NAME " host buffer type\n"); return ggml_backend_cuda_host_buffer_type(); #elif GGML_USE_CPU_HBM return ggml_backend_cpu_hbm_buffer_type(); -#else - return ggml_backend_cpu_buffer_type(); #endif + + return ggml_backend_cpu_buffer_type(); + + GGML_UNUSED(n_gpu_layers); } static void llm_load_tensors( @@ -3430,7 +3433,7 @@ static void llm_load_tensors( size_t vram_weights = 0; size_t buf_size = 0; - ggml_backend_buffer_type_t buft = llama_default_buffer_type(); + 
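
To make the view arithmetic above concrete, a small self-contained example with invented numbers (the real values come from device.maxBufferLength and the model's largest tensor; illustration only):

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t size_page = 16384;        // hypothetical page size
        const size_t max_buf   = 16ull << 30;  // hypothetical device.maxBufferLength
        const size_t max_size  = 1ull  << 30;  // hypothetical largest tensor

        // round the largest tensor up to a page boundary plus one page of slack
        const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page;
        const size_t size_step = max_buf - size_ovlp; // stride between view starts

        // consecutive views overlap by size_ovlp >= max_size bytes, so any tensor
        // no larger than max_size lies entirely inside at least one view
        printf("step = %zu MiB, overlap = %zu MiB\n", size_step >> 20, size_ovlp >> 20);
    }
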
ggml_backend_buffer_type_t buft = llama_default_buffer_type(n_gpu_layers); for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { if (t->backend == GGML_BACKEND_CPU) { buf_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), ggml_backend_buft_get_alignment(buft)); } } - // create backend buffer bool sys_mem_buf = false; #ifdef GGML_USE_METAL +// todo: disable with 0 gpu layers if (ml.use_mmap) { - model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size); + const size_t max_size = ggml_get_max_tensor_size(ctx); + model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size); } else { model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type()); sys_mem_buf = true; @@ -5940,12 +5944,12 @@ static int llama_decode_internal( if (ggml_backend_is_metal(lctx.backend)) { ggml_backend_metal_set_n_cb(lctx.backend, n_threads); } -#else +#endif + if (ggml_backend_is_cpu(lctx.backend)) { ggml_backend_cpu_set_n_threads(lctx.backend, n_threads); } ggml_backend_graph_compute(lctx.backend, gf); -#endif #if GGML_USE_MPI ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); @@ -8952,6 +8956,9 @@ struct llama_context * llama_new_context_with_model( #ifdef GGML_USE_METAL if (model->n_gpu_layers > 0) { ctx->backend = ggml_backend_metal_init(); + if (ctx->backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__); + } } #endif if (ctx->backend == nullptr) { @@ -8959,12 +8966,14 @@ struct llama_context * llama_new_context_with_model( } if (ctx->backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize backend\n", __func__); + LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); delete ctx; return nullptr; } - if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv, llama_default_buffer_type())) { + if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, + cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv, + llama_default_buffer_type(model->n_gpu_layers))) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; From 0c5ee7c417418d676b76b27527c9b9f361092a52 Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 19 Dec 2023 17:55:37 +0100 Subject: [PATCH 03/21] cuda backend can be used through ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST; access all tensor data with ggml_backend_tensor_get/set --- llama.cpp | 106 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 74 insertions(+), 32 deletions(-) diff --git a/llama.cpp b/llama.cpp index 699ae936256bc..62c921ea95aa3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,4 +1,5 @@ #define LLAMA_API_INTERNAL +#define LLAMA_GGML_BACKEND_CUDA_TEST // for testing only - disables partial offloading #include "llama.h" #include "unicode.h" @@ -1289,7 +1290,7 @@ struct llama_kv_cache { ggml_backend_buffer_t buf = NULL; ~llama_kv_cache() { -#ifdef GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) if (ggml_cublas_loaded()) { for (size_t i = 0; i < k_l.size(); ++i) { ggml_cuda_free_data(k_l[i]); @@ -1403,7 +1404,7 @@ struct llama_model { int64_t t_start_us = 0; ~llama_model() { -#ifdef GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) if (ggml_cublas_loaded()) { for (size_t i = 0; i < tensors_by_name.size(); ++i) { ggml_cuda_free_data(tensors_by_name[i].second);
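
The pattern this patch applies throughout llama.cpp, in isolation: tensor data is no longer touched through cur->data (valid only for host memory) but moved with ggml_backend_tensor_set/get, which take an explicit byte offset and size. A hedged fragment, where inp_pos stands for the position input tensor and res for the logits output, as in the hunks that follow:

    // upload the batch positions into the input tensor
    ggml_backend_tensor_set(inp_pos, batch.pos, 0, n_tokens*ggml_element_size(inp_pos));

    // download the logits of the last token from the output tensor
    std::vector<float> logits(n_vocab);
    ggml_backend_tensor_get(res, logits.data(),
                            (n_vocab*(n_tokens - 1))*sizeof(float), // byte offset of the last row
                            n_vocab*sizeof(float));                 // number of bytes to copy

@@ -1472,6 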
+1473,9 @@ struct llama_context { ggml_backend_buffer_t buf_alloc = NULL; ggml_allocr * alloc = NULL; + // temporary buffer for copying data to/from the backend + std::vector> buf_copy; + #ifdef GGML_USE_MPI ggml_mpi_context * ctx_mpi = NULL; #endif @@ -1549,7 +1553,7 @@ static bool llama_kv_cache_init( if (cache.buf) { // TODO: ggml_backend_buffer_memset // this is only valid with CPU buffers! - memset(ggml_backend_buffer_get_base(cache.buf), 0, ggml_backend_buffer_get_size(cache.buf)); + //memset(ggml_backend_buffer_get_base(cache.buf), 0, ggml_backend_buffer_get_size(cache.buf)); } if (vram_kv_cache > 0) { @@ -2249,7 +2253,7 @@ struct llama_model_loader { } } - void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t mmap_buf, llama_mlock * lmlock) { + void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) { size_t size_lock = 0; size_t size_data = 0; @@ -2276,12 +2280,13 @@ struct llama_model_loader { switch (cur->backend) { case GGML_BACKEND_CPU: if (use_mmap) { - if (mmap_buf) { - ggml_backend_tensor_alloc(mmap_buf, cur, (uint8_t *)mapping->addr + offs); + if (buf_mmap) { + ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + offs); } else { ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + offs, 0, ggml_nbytes(cur)); } } else { + // FIXME: use read_buf for device buffers without unified memory file.seek(offs, SEEK_SET); file.read_raw(cur->data, ggml_nbytes(cur)); } @@ -2905,15 +2910,16 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } } -// TODO: metal should be disabled with ngl=0 -> cpu_buffer_type static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) { #ifdef GGML_USE_METAL if (n_gpu_layers > 0) { return ggml_backend_metal_buffer_type(); } -#elif GGML_USE_CUBLAS +#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) + return ggml_backend_cuda_buffer_type(0); +#elif defined(GGML_USE_CUBLAS) return ggml_backend_cuda_host_buffer_type(); -#elif GGML_USE_CPU_HBM +#elif defined(GGML_USE_CPU_HBM) return ggml_backend_cpu_hbm_buffer_type(); #endif @@ -3520,20 +3526,26 @@ static void llm_load_tensors( // create backend buffer bool sys_mem_buf = false; + ggml_backend_buffer_t buf_mmap = nullptr; #ifdef GGML_USE_METAL // todo: disable with 0 gpu layers if (ml.use_mmap) { const size_t max_size = ggml_get_max_tensor_size(ctx); model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size); + buf_mmap = model.buf; } else { model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type()); sys_mem_buf = true; } +#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) + // for testing only + model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cuda_buffer_type(0)); #else // CPU backend, and indirectly CUDA and OpenCL if (ml.use_mmap) { model.buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size); + buf_mmap = model.buf; } else { // allocate only CPU tensors model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size); @@ -3549,7 +3561,6 @@ static void llm_load_tensors( #endif if (use_mlock && sys_mem_buf) { - // TODO: CPU/metal 
only model.mlock_buf.init (ggml_backend_buffer_get_base(model.buf)); model.mlock_buf.grow_to(ggml_backend_buffer_get_size(model.buf)); } @@ -3591,7 +3602,7 @@ static void llm_load_tensors( #endif // GGML_USE_CUBLAS // TODO: only pass buf if it is a mmap buffer - ml.load_all_data(ctx, progress_callback, progress_callback_user_data, model.buf, use_mlock ? &model.mlock_mmap : NULL); + ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL); if (progress_callback) { progress_callback(1.0f, progress_callback_user_data); @@ -5690,7 +5701,7 @@ static struct ggml_cgraph * llama_build_graph( if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) { const int64_t n_tokens = cur->ne[0]; - memcpy(cur->data, batch.token, n_tokens*ggml_element_size(cur)); + ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur)); } alloc_inp_tokens = true; @@ -5703,7 +5714,7 @@ static struct ggml_cgraph * llama_build_graph( const int64_t n_embd = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; - memcpy(cur->data, batch.embd, n_tokens*n_embd*ggml_element_size(cur)); + ggml_backend_tensor_set(cur, batch.embd, 0, n_tokens*n_embd*ggml_element_size(cur)); } alloc_inp_embd = true; @@ -5715,11 +5726,8 @@ static struct ggml_cgraph * llama_build_graph( if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) { const int64_t n_tokens = cur->ne[0]; - int32_t * data = (int32_t *) cur->data; - - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } + static_assert(std::is_same::value, "llama_pos must be int32_t"); + ggml_backend_tensor_set(cur, batch.pos, 0, n_tokens*ggml_element_size(cur)); } alloc_inp_pos = true; @@ -5730,7 +5738,8 @@ static struct ggml_cgraph * llama_build_graph( if (!ggml_allocr_is_measure(lctx.alloc)) { const int64_t n_embd_head = model.hparams.n_embd_head(); - ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); + float f = 1.0f/sqrtf(float(n_embd_head)); + ggml_backend_tensor_set(cur, &f, 0, sizeof(f)); } alloc_inp_Q_scale = true; @@ -5741,13 +5750,15 @@ static struct ggml_cgraph * llama_build_graph( if (!ggml_allocr_is_measure(lctx.alloc)) { const int64_t n_embd_head = model.hparams.n_embd_head(); + float f; if (model.arch == LLM_ARCH_PHI2) { // with phi2, we scale the Q to avoid precision issues // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66 - ggml_set_f32(cur, 1.0f); + f = 1.0f; } else { - ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); + f = 1.0f/sqrtf(float(n_embd_head)); } + ggml_backend_tensor_set(cur, &f, 0, sizeof(f)); } alloc_inp_KQ_scale = true; @@ -5760,8 +5771,13 @@ static struct ggml_cgraph * llama_build_graph( const int64_t n_kv = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; - float * data = (float *) cur->data; - memset(data, 0, ggml_nbytes(cur)); + float * data; + if (/*is_sys_mem_buf(cur->buffer)*/false) { // TODO + data = (float *) cur->data; + } else { + lctx.buf_copy.resize(ggml_nbytes(cur)); + data = (float *) lctx.buf_copy.data(); + } for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { @@ -5769,12 +5785,20 @@ static struct ggml_cgraph * llama_build_graph( const llama_seq_id seq_id = batch.seq_id[j][0]; for (int i = 0; i < n_kv; ++i) { + float f; if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + f = -INFINITY; + } else { + f = 0; } + data[h*(n_kv*n_tokens) + j*n_kv + i] = f; } } } + + if (data != cur->data) { + 
ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur)); + } } alloc_inp_KQ_mask = true; @@ -5786,11 +5810,21 @@ static struct ggml_cgraph * llama_build_graph( if (!ggml_allocr_is_measure(lctx.alloc)) { const int64_t n_ctx = cur->ne[0]; - int32_t * data = (int32_t *) cur->data; + int32_t * data; + if (/*is_sys_mem_buf(cur->buffer)*/false) { // TODO + data = (int32_t *) cur->data; + } else { + lctx.buf_copy.resize(ggml_nbytes(cur)); + data = (int32_t *) lctx.buf_copy.data(); + } for (int i = 0; i < n_ctx; ++i) { data[i] = lctx.kv_self.cells[i].delta; } + + if (data != cur->data) { + ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur)); + } } alloc_inp_K_shift = true; @@ -6122,7 +6156,7 @@ static int llama_decode_internal( GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); } -#ifdef GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) char * buf_alloc_base = (char *)ggml_backend_buffer_get_base(lctx.buf_alloc); for (int i = 0; i < gf->n_leafs; i++) { ggml_tensor * node = gf->leafs[i]; @@ -6162,7 +6196,7 @@ static int llama_decode_internal( n_threads = 1; } -#if GGML_USE_MPI +#ifdef GGML_USE_MPI const int64_t n_layer = hparams.n_layer; ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); #endif @@ -6178,7 +6212,7 @@ static int llama_decode_internal( } ggml_backend_graph_compute(lctx.backend, gf); -#if GGML_USE_MPI +#ifdef GGML_USE_MPI ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); #endif @@ -6230,20 +6264,20 @@ static int llama_decode_internal( if (batch.logits[i] == 0) { continue; } - memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab); + ggml_backend_tensor_get(res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float)); #ifndef NDEBUG logits_valid[i] = true; #endif } } else if (lctx.logits_all) { logits_out.resize(n_vocab * n_tokens); - memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens); + ggml_backend_tensor_get(res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float)); #ifndef NDEBUG std::fill(logits_valid.begin(), logits_valid.end(), true); #endif } else { logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); + ggml_backend_tensor_get(res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float)); #ifndef NDEBUG logits_valid[0] = true; #endif @@ -6255,7 +6289,7 @@ static int llama_decode_internal( auto & embedding_out = lctx.embedding; embedding_out.resize(n_embd); - memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd); + ggml_backend_tensor_get(embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float)); } // measure the performance only for the single-token evals @@ -9187,8 +9221,16 @@ struct llama_context * llama_new_context_with_model( LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__); } } +#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) + // for testing only + ctx->backend = ggml_backend_cuda_init(0); + if (ctx->backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CUDA backend\n", __func__); + } #endif + if (ctx->backend == nullptr) { + // FIXME: this may fail if the model buffer is not compatible with the CPU backend ctx->backend = ggml_backend_cpu_init(); } From 1ac01fbbd10fd970d024509a98066d01086b325f Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 
19 Dec 2023 18:31:00 +0100 Subject: [PATCH 04/21] add ggml_backend_buffer_clear zero-init KV cache buffer --- ggml-backend-impl.h | 15 ++++++++------- ggml-backend.c | 10 ++++++++++ ggml-backend.h | 1 + ggml-cuda.cu | 10 ++++++++++ ggml-metal.m | 9 +++++++-- llama.cpp | 15 ++++++++------- 6 files changed, 44 insertions(+), 16 deletions(-) diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index f588af6028265..f824b2c754ef9 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -31,15 +31,16 @@ extern "C" { typedef void * ggml_backend_buffer_context_t; struct ggml_backend_buffer_i { - void (*free_buffer)(ggml_backend_buffer_t buffer); + void (*free_buffer) (ggml_backend_buffer_t buffer); //void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras - void * (*get_base) (ggml_backend_buffer_t buffer); - void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + void * (*get_base) (ggml_backend_buffer_t buffer); + void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); // (optional) copy tensor between different buffer-type, allow for single-copy tranfers - void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*clear) (ggml_backend_buffer_t buffer, uint8_t value); }; struct ggml_backend_buffer { diff --git a/ggml-backend.c b/ggml-backend.c index 2551915c33fa9..9ea9e5c1b9071 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -94,6 +94,10 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor); } +void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + buffer->iface.clear(buffer, value); +} + ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) { return buffer->buft; } @@ -410,6 +414,10 @@ static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, GGML_UNUSED(buffer); } +static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + memset(buffer->context, value, buffer->size); +} + static struct ggml_backend_buffer_i cpu_backend_buffer_i = { /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, /* .get_base = */ ggml_backend_cpu_buffer_get_base, @@ -418,6 +426,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = { /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from, /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to, + /* .clear = */ ggml_backend_cpu_buffer_clear, }; // for buffers 
from ptr, free is not called @@ -429,6 +438,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from, /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to, + /* .clear = */ ggml_backend_cpu_buffer_clear, }; static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 diff --git a/ggml-backend.h b/ggml-backend.h index a52dc367b6954..b644feb16a15f 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -29,6 +29,7 @@ extern "C" { GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer); // diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 57a128ade65b3..f69a8f5af50f5 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -9494,6 +9494,15 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost)); } +static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + + ggml_cuda_set_device(ctx->device); + CUDA_CHECK(cudaDeviceSynchronize()); + + CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size)); +} + static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_buffer_get_base, @@ -9502,6 +9511,7 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor, /* .cpy_tensor_from = */ NULL, /* .cpy_tensor_to = */ NULL, + /* .clear = */ ggml_backend_cuda_buffer_clear, }; // cuda buffer type diff --git a/ggml-metal.m b/ggml-metal.m index 4944ac3cdc853..3bbe644053a46 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -2429,8 +2429,6 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) } free(ctx); - - UNUSED(buffer); } static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { @@ -2457,6 +2455,12 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer UNUSED(buffer); } +static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + memset(ctx->all_data, value, ctx->all_size); +} + static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer, /* .get_base = */ ggml_backend_metal_buffer_get_base, @@ -2465,6 +2469,7 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer /* .get_tensor = */ ggml_backend_metal_buffer_get_tensor, /* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from, /* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to, + /* .clear = */ ggml_backend_metal_buffer_clear, }; // default buffer type diff --git 
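
With clear now part of the CPU, CUDA and Metal buffer interfaces, zero-initializing a buffer is a single call regardless of where the memory lives. A sketch of the intended use, mirroring the KV-cache change in the llama.cpp hunk below:

    // zero the KV cache buffer to avoid NaNs leaking in from the padding,
    // whether it lives in system RAM, VRAM or Metal shared memory
    if (cache.buf) {
        ggml_backend_buffer_clear(cache.buf, 0);
    }
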
a/llama.cpp b/llama.cpp index 62c921ea95aa3..988ce80044bdf 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1551,9 +1551,8 @@ static bool llama_kv_cache_init( // buf may be NULL with full offload if (cache.buf) { - // TODO: ggml_backend_buffer_memset - // this is only valid with CPU buffers! - //memset(ggml_backend_buffer_get_base(cache.buf), 0, ggml_backend_buffer_get_size(cache.buf)); + // initialize the buffer to avoid NaNs in the padding + ggml_backend_buffer_clear(cache.buf, 0); } if (vram_kv_cache > 0) { @@ -3569,8 +3568,12 @@ static void llm_load_tensors( { size_t sys_mem_required = ctx_size + buf_size; - LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0); - LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0); + { + LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0); + } + if (vram_weights > 0) { + LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0); + } #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); @@ -3586,7 +3589,6 @@ static void llm_load_tensors( LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); #else GGML_UNUSED(n_gpu_layers); - GGML_UNUSED(vram_weights); GGML_UNUSED(tensor_split); #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) } @@ -3601,7 +3603,6 @@ static void llm_load_tensors( ggml_cuda_set_tensor_split(tensor_split); #endif // GGML_USE_CUBLAS - // TODO: only pass buf if it is a mmap buffer ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL); if (progress_callback) { From c8bd5d8b65517ac3e2f5a1c25f0fcc7da8c5655a Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 19 Dec 2023 23:47:34 +0100 Subject: [PATCH 05/21] add ggml_backend_buffer_is_host, used to avoid copies if possible when accessing tensor data --- ggml-backend-impl.h | 5 ++++- ggml-backend.c | 37 ++++++++++++++++++++++++++++--------- ggml-backend.h | 2 ++ ggml-cuda.cu | 16 ++++++++--------- ggml-metal.m | 7 +++++++ llama.cpp | 28 ++++++++++++++++---------- 6 files changed, 67 insertions(+), 28 deletions(-) diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index f824b2c754ef9..05859935a3c2f 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -20,6 +20,9 @@ extern "C" { size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend + // check if tensor data is in host memory + // should be equivalent to supports_backend(buft, ggml_backend_cpu_init()) + bool (*is_host) (ggml_backend_buffer_type_t buft); }; struct ggml_backend_buffer_type { @@ -79,7 +82,7 @@ extern "C" { void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*synchronize) (ggml_backend_t backend); + void (*synchronize)(ggml_backend_t backend); // compute graph with a plan
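
The access pattern the new is_host query enables, sketched in isolation (a fragment: cur is any input tensor, n and values are assumed to be the host-side data to write):

    // fill an int32 input tensor, skipping the staging copy when the
    // buffer is host-visible
    if (ggml_backend_buffer_is_host(cur->buffer)) {
        int32_t * data = (int32_t *) cur->data;
        for (int i = 0; i < n; ++i) {
            data[i] = values[i];
        }
    } else {
        std::vector<int32_t> tmp(values, values + n);
        ggml_backend_tensor_set(cur, tmp.data(), 0, n*sizeof(int32_t));
    }

ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * 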
cgraph); diff --git a/ggml-backend.c b/ggml-backend.c index 9ea9e5c1b9071..e6f69bb23ec5b 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_ba return buft->iface.supports_backend(buft, backend); } +bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) { + if (buft->iface.is_host) { + return buft->iface.is_host(buft); + } + return false; +} + // backend buffer ggml_backend_buffer_t ggml_backend_buffer_init( @@ -98,6 +105,10 @@ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { buffer->iface.clear(buffer, value); } +bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) { + return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer)); +} + ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) { return buffer->buft; } @@ -464,6 +475,12 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty GGML_UNUSED(buft); } +static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + return true; + + GGML_UNUSED(buft); +} + ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = { /* .iface = */ { @@ -471,6 +488,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, }, /* .context = */ NULL, }; @@ -479,9 +497,11 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { } #ifdef GGML_USE_CPU_HBM + +// buffer type HBM + #include -// HBM buffer type static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { hbw_free(buffer->context); } @@ -503,16 +523,15 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_ return buffer; } -struct ggml_backend_buffer_type_i cpu_backend_hbm_buffer_type_interface = { - /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend, -}; - ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() { static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = { - /* .iface = */ cpu_backend_hbm_buffer_type_interface, + /* .iface = */ { + /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, + }, /* .context = */ NULL, }; diff --git a/ggml-backend.h b/ggml-backend.h index b644feb16a15f..a9d2fddd726a8 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -21,6 +21,7 @@ extern "C" { GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); + GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); // buffer 
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); @@ -30,6 +31,7 @@ extern "C" { GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); + GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer); // diff --git a/ggml-cuda.cu b/ggml-cuda.cu index f69a8f5af50f5..23bd073d2b290 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -9568,6 +9568,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend, + /* .is_host = */ nullptr, }; ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { @@ -9606,16 +9607,15 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm return buffer; } -struct ggml_backend_buffer_type_i ggml_backend_cuda_host_buffer_type_interface = { - /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, - /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, -}; - ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = { - /* .iface = */ ggml_backend_cuda_host_buffer_type_interface, + /* .iface = */ { + /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, /* .context = */ nullptr, }; diff --git a/ggml-metal.m b/ggml-metal.m index 3bbe644053a46..0916202a0687d 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -2521,6 +2521,12 @@ static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_ UNUSED(buft); } +static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + return true; + + UNUSED(buft); +} + ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = { /* .iface = */ { @@ -2528,6 +2534,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_metal_buffer_type_is_host, }, /* .context = */ NULL, }; diff --git a/llama.cpp b/llama.cpp index 988ce80044bdf..3ffc33b27f167 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1539,7 +1539,7 @@ static bool llama_kv_cache_init( ggml_cuda_assign_buffers_no_scratch(v); vram_kv_cache += ggml_nbytes(k); vram_kv_cache += ggml_nbytes(v); - // HACK: mark tensor as allocated, but crash if we try to use it from the CPU + // HACK: mark tensor as allocated k->data = 
v->data = (void *)(uintptr_t)1; } } @@ -2285,9 +2285,15 @@ struct llama_model_loader { ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + offs, 0, ggml_nbytes(cur)); } } else { - // FIXME: use read_buf for device buffers without unified memory - file.seek(offs, SEEK_SET); - file.read_raw(cur->data, ggml_nbytes(cur)); + if (ggml_backend_buffer_is_host(cur->buffer)) { + file.seek(offs, SEEK_SET); + file.read_raw(cur->data, ggml_nbytes(cur)); + } else { + read_buf.resize(ggml_nbytes(cur)); + file.seek(offs, SEEK_SET); + file.read_raw(read_buf.data(), ggml_nbytes(cur)); + ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur)); + } } if (use_mmap && lmlock) { @@ -2298,7 +2304,7 @@ struct llama_model_loader { case GGML_BACKEND_GPU: case GGML_BACKEND_GPU_SPLIT: { - // HACK: mark tensor as allocated, but crash if we try to use it from the CPU + // HACK: mark tensor as allocated cur->data = (void *)(uintptr_t)1; void * data; if (use_mmap) { @@ -5773,7 +5779,7 @@ static struct ggml_cgraph * llama_build_graph( const int64_t n_tokens = cur->ne[1]; float * data; - if (/*is_sys_mem_buf(cur->buffer)*/false) { // TODO + if (ggml_backend_buffer_is_host(cur->buffer)) { data = (float *) cur->data; } else { lctx.buf_copy.resize(ggml_nbytes(cur)); @@ -5812,7 +5818,7 @@ static struct ggml_cgraph * llama_build_graph( const int64_t n_ctx = cur->ne[0]; int32_t * data; - if (/*is_sys_mem_buf(cur->buffer)*/false) { // TODO + if (ggml_backend_buffer_is_host(cur->buffer)) { data = (int32_t *) cur->data; } else { lctx.buf_copy.resize(ggml_nbytes(cur)); @@ -9230,13 +9236,15 @@ struct llama_context * llama_new_context_with_model( } #endif - if (ctx->backend == nullptr) { - // FIXME: this may fail if the model buffer is not compatible with the CPU backend + if (ctx->backend == nullptr && ggml_backend_buffer_is_host(model->buf)) { ctx->backend = ggml_backend_cpu_init(); + if (ctx->backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); + } } if (ctx->backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); + LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__); delete ctx; return nullptr; } From 72a0c96621786ced835993f1e3c2b985e7d252ca Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 20 Dec 2023 02:45:54 +0100 Subject: [PATCH 06/21] disable gpu backends with ngl 0 --- ggml-cuda.cu | 2 +- llama.cpp | 292 +++++++++++++++++++++++++-------------------------- 2 files changed, 146 insertions(+), 148 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 23bd073d2b290..b890ec985401f 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -9442,7 +9442,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; if (tensor->view_src != NULL && tensor->view_offs == 0) { - assert(tensor->view_src->buffer->buft == buffer->buft); // TODO + assert(tensor->view_src->buffer->buft == buffer->buft); tensor->backend = tensor->view_src->backend; tensor->extra = tensor->view_src->extra; return; diff --git a/llama.cpp b/llama.cpp index 3ffc33b27f167..cca11e5ad3783 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,5 +1,5 @@ #define LLAMA_API_INTERNAL -#define LLAMA_GGML_BACKEND_CUDA_TEST // for testing only - disables partial offloading +//#define LLAMA_GGML_BACKEND_CUDA_TEST // for testing only - enables ggml-cuda through ggml-backend, disables partial offloading #include "llama.h" #include "unicode.h" @@ -1087,6 +1087,26 @@ static 
std::string llama_token_to_piece(const struct llama_context * ctx, llama_ return std::string(result.data(), result.size()); } +static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) { +#ifdef GGML_USE_METAL + if (n_gpu_layers > 0) { + return ggml_backend_metal_buffer_type(); + } +#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) + if (n_gpu_layers > 0) { + return ggml_backend_cuda_buffer_type(0); + } +#elif defined(GGML_USE_CUBLAS) + return ggml_backend_cuda_host_buffer_type(); +#elif defined(GGML_USE_CPU_HBM) + return ggml_backend_cpu_hbm_buffer_type(); +#endif + + return ggml_backend_cpu_buffer_type(); + + GGML_UNUSED(n_gpu_layers); +} + // // globals // @@ -1492,8 +1512,7 @@ static bool llama_kv_cache_init( ggml_type vtype, uint32_t n_ctx, int n_gpu_layers, - bool offload, - ggml_backend_buffer_type_t buft) { + bool offload) { const uint32_t n_embd = hparams.n_embd_gqa(); const uint32_t n_layer = hparams.n_layer; @@ -1532,7 +1551,7 @@ static bool llama_kv_cache_init( ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); cache.v_l.push_back(v); -#ifdef GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) if (i >= i_gpu_start) { if (offload) { ggml_cuda_assign_buffers_no_scratch(k); @@ -1547,7 +1566,7 @@ static bool llama_kv_cache_init( } // allocate tensors - cache.buf = ggml_backend_alloc_ctx_tensors_from_buft(cache.ctx, buft); + cache.buf = ggml_backend_alloc_ctx_tensors_from_buft(cache.ctx, llama_default_buffer_type(n_gpu_layers)); // buf may be NULL with full offload if (cache.buf) { @@ -1559,7 +1578,6 @@ static bool llama_kv_cache_init( LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); } - GGML_UNUSED(n_gpu_layers); GGML_UNUSED(i_gpu_start); GGML_UNUSED(offload); @@ -2252,6 +2270,8 @@ struct llama_model_loader { } } + + void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) { size_t size_lock = 0; size_t size_data = 0; @@ -2267,68 +2287,65 @@ struct llama_model_loader { } } +#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST) + const bool legacy_offload = true; +#else + const bool legacy_offload = false; +#endif + std::vector> read_buf; size_t done_size = 0; for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); GGML_ASSERT(cur); // unused tensors should have been caught by load_data already - const size_t offs = file_offset(ggml_get_name(cur)); - switch (cur->backend) { - case GGML_BACKEND_CPU: - if (use_mmap) { - if (buf_mmap) { - ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + offs); - } else { - ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + offs, 0, ggml_nbytes(cur)); - } + if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) { + if (use_mmap) { + if (buf_mmap) { + ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs); } else { - if (ggml_backend_buffer_is_host(cur->buffer)) { - file.seek(offs, SEEK_SET); - file.read_raw(cur->data, ggml_nbytes(cur)); - } else { - read_buf.resize(ggml_nbytes(cur)); - file.seek(offs, SEEK_SET); - file.read_raw(read_buf.data(), ggml_nbytes(cur)); - ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur)); - } - } - - if (use_mmap && lmlock) { - size_lock += ggml_nbytes(cur); - 
lmlock->grow_to(size_lock); + ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur)); } - break; - - case GGML_BACKEND_GPU: - case GGML_BACKEND_GPU_SPLIT: { - // HACK: mark tensor as allocated - cur->data = (void *)(uintptr_t)1; - void * data; - if (use_mmap) { - data = (uint8_t *)mapping->addr + offs; + } else { + if (ggml_backend_buffer_is_host(cur->buffer)) { + file.seek(offs, SEEK_SET); + file.read_raw(cur->data, ggml_nbytes(cur)); } else { read_buf.resize(ggml_nbytes(cur)); file.seek(offs, SEEK_SET); file.read_raw(read_buf.data(), ggml_nbytes(cur)); - data = read_buf.data(); + ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur)); } + } -#ifdef GGML_USE_CUBLAS - ggml_cuda_transform_tensor(data, cur); + if (use_mmap && lmlock) { + size_lock += ggml_nbytes(cur); + lmlock->grow_to(size_lock); + } + } else { + // HACK: mark tensor as allocated + cur->data = (void *)(uintptr_t)1; + void * data; + if (use_mmap) { + data = (uint8_t *) mapping->addr + offs; + } else { + read_buf.resize(ggml_nbytes(cur)); + file.seek(offs, SEEK_SET); + file.read_raw(read_buf.data(), ggml_nbytes(cur)); + data = read_buf.data(); + } + +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) + ggml_cuda_transform_tensor(data, cur); #elif defined(GGML_USE_CLBLAST) - GGML_ASSERT(cur->backend == GGML_BACKEND_GPU); - ggml_cl_transform_tensor(data, cur); + GGML_ASSERT(cur->backend == GGML_BACKEND_GPU); + ggml_cl_transform_tensor(data, cur); #else - GGML_ASSERT(!"GPU tensor without a GPU backend"); - GGML_UNUSED(data); + GGML_ASSERT(!"GPU tensor without a GPU backend"); + GGML_UNUSED(data); #endif - } break; - - default: - continue; } done_size += ggml_nbytes(cur); @@ -2915,24 +2932,6 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } } -static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) { -#ifdef GGML_USE_METAL - if (n_gpu_layers > 0) { - return ggml_backend_metal_buffer_type(); - } -#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) - return ggml_backend_cuda_buffer_type(0); -#elif defined(GGML_USE_CUBLAS) - return ggml_backend_cuda_host_buffer_type(); -#elif defined(GGML_USE_CPU_HBM) - return ggml_backend_cpu_hbm_buffer_type(); -#endif - - return ggml_backend_cpu_buffer_type(); - - GGML_UNUSED(n_gpu_layers); -} - static void llm_load_tensors( llama_model_loader & ml, llama_model & model, @@ -2972,7 +2971,7 @@ static void llm_load_tensors( enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU; enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU; -#ifdef GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) if (ggml_cublas_loaded()) { LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__); ggml_cuda_set_main_device(main_gpu); @@ -3522,6 +3521,7 @@ static void llm_load_tensors( ggml_backend_buffer_type_t buft = llama_default_buffer_type(n_gpu_layers); for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + // GGML_BACKEND_GPU tensors are for CUDA and OpenCL only, which are handled separately without ggml-backend if (t->backend == GGML_BACKEND_CPU) { buf_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), ggml_backend_buft_get_alignment(buft)); } else { @@ -3530,42 +3530,44 @@ static 
void llm_load_tensors( } // create backend buffer - bool sys_mem_buf = false; ggml_backend_buffer_t buf_mmap = nullptr; #ifdef GGML_USE_METAL -// todo: disable with 0 gpu layers - if (ml.use_mmap) { - const size_t max_size = ggml_get_max_tensor_size(ctx); - model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size); - buf_mmap = model.buf; - } else { - model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type()); - sys_mem_buf = true; + if (n_gpu_layers > 0) { + if (ml.use_mmap) { + const size_t max_size = ggml_get_max_tensor_size(ctx); + model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size); + buf_mmap = model.buf; + } else { + model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type()); + } } #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) // for testing only - model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cuda_buffer_type(0)); -#else - // CPU backend, and indirectly CUDA and OpenCL - if (ml.use_mmap) { - model.buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size); - buf_mmap = model.buf; - } else { - // allocate only CPU tensors - model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size); - ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(model.buf); - for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { - if (t->backend == GGML_BACKEND_CPU) { - ggml_tallocr_alloc(alloc, t); + if (n_gpu_layers > 0) { + model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cuda_buffer_type(0)); + } +#endif + + if (model.buf == nullptr) { + // CPU backend, and indirectly CUDA and OpenCL + if (ml.use_mmap) { + model.buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size); + buf_mmap = model.buf; + } else { + // allocate only CPU tensors + model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size); + ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(model.buf); + for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + if (t->backend == GGML_BACKEND_CPU) { + ggml_tallocr_alloc(alloc, t); + } } + ggml_tallocr_free(alloc); } - ggml_tallocr_free(alloc); - sys_mem_buf = true; } -#endif - if (use_mlock && sys_mem_buf) { + if (use_mlock && ggml_backend_buffer_is_host(model.buf)) { model.mlock_buf.init (ggml_backend_buffer_get_base(model.buf)); model.mlock_buf.grow_to(ggml_backend_buffer_get_size(model.buf)); } @@ -3574,7 +3576,7 @@ static void llm_load_tensors( { size_t sys_mem_required = ctx_size + buf_size; - { + if (sys_mem_required > 0) { LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0); } if (vram_weights > 0) { @@ -3593,22 +3595,21 @@ static void llm_load_tensors( const int max_offloadable_layers = hparams.n_layer + 1; LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); -#else - GGML_UNUSED(n_gpu_layers); - GGML_UNUSED(tensor_split); #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) } +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) + ggml_cuda_set_tensor_split(tensor_split); +#else + GGML_UNUSED(tensor_split); +#endif // GGML_USE_CUBLAS + // populate tensors_by_name for (int i = 0; i < ml.n_tensors; ++i) { struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i)); 
model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); } -#ifdef GGML_USE_CUBLAS - ggml_cuda_set_tensor_split(tensor_split); -#endif // GGML_USE_CUBLAS - ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL); if (progress_callback) { @@ -5680,7 +5681,7 @@ static struct ggml_cgraph * llama_build_graph( bool alloc_inp_KQ_mask = false; bool alloc_inp_K_shift = false; -#ifdef GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) const bool do_offload = true; #else const bool do_offload = true; // TODO: set to false after finishing refactoring @@ -5868,7 +5869,7 @@ static struct ggml_cgraph * llama_build_graph( static const std::unordered_map> k_offload_func_name = { { OFFLOAD_FUNC_NOP, "CPU" }, { OFFLOAD_FUNC_OUT, "CPU" }, -#ifdef GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) { OFFLOAD_FUNC, "GPU (CUDA)" }, { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" }, { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" }, @@ -5941,7 +5942,7 @@ static struct ggml_cgraph * llama_build_graph( offload_func_t func = ggml_offload_nop; // this is needed for compatibility with Metal for example -#ifdef GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc; #else static offload_func_t ggml_offload_gpu = ggml_offload_nop; @@ -8799,22 +8800,16 @@ static int llama_apply_lora_from_file_internal( std::unique_ptr ml; unique_context base_ctx(nullptr, ggml_free); - std::vector base_buf; if (path_base_model) { LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL)); - size_t ctx_size; - size_t mmapped_size; - GGML_ASSERT(!"not implemented"); - //ml->calc_sizes(ctx_size, mmapped_size); - - base_buf.resize(ctx_size); + size_t ctx_size = ggml_tensor_overhead() * ml->n_tensors; ggml_init_params base_params; - base_params.mem_size = base_buf.size(); - base_params.mem_buffer = base_buf.data(); - base_params.no_alloc = ml->use_mmap; + base_params.mem_size = ctx_size; + base_params.mem_buffer = NULL; + base_params.no_alloc = true; base_ctx.reset(ggml_init(base_params)); @@ -8913,7 +8908,7 @@ static int llama_apply_lora_from_file_internal( offload_func_t offload_func = ggml_offload_nop; offload_func_t offload_func_force_inplace = ggml_offload_nop; -#ifdef GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) { if (dest_t->type != GGML_TYPE_F16) { throw std::runtime_error(format( @@ -9230,9 +9225,11 @@ struct llama_context * llama_new_context_with_model( } #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) // for testing only - ctx->backend = ggml_backend_cuda_init(0); - if (ctx->backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CUDA backend\n", __func__); + if (model->n_gpu_layers > 0) { + ctx->backend = ggml_backend_cuda_init(0); + if (ctx->backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CUDA backend\n", __func__); + } } #endif @@ -9250,8 +9247,7 @@ struct llama_context * llama_new_context_with_model( } if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, - cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv, - llama_default_buffer_type(model->n_gpu_layers))) { + cparams.n_ctx, 
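
The LoRA path above now sizes its base-model context from the tensor count alone: with no_alloc set, ggml reserves space only for tensor metadata, so ggml_tensor_overhead() * n_tensors suffices and the old mmapped-size computation is unnecessary. In sketch form (n_tensors stands in for ml->n_tensors):

    // metadata-only context: no_alloc means no tensor data is allocated here
    ggml_init_params params;
    params.mem_size   = ggml_tensor_overhead() * n_tensors; // tensor headers only
    params.mem_buffer = NULL;                               // let ggml allocate it
    params.no_alloc   = true;                               // data comes from mmap later
    ggml_context * meta_ctx = ggml_init(params);
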
model->n_gpu_layers, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; @@ -9309,36 +9305,38 @@ struct llama_context * llama_new_context_with_model( ctx->buf_alloc = ggml_backend_alloc_buffer(ctx->backend, alloc_size); ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc); -#ifdef GGML_USE_CUBLAS - ggml_cuda_set_scratch_size(alloc_size); - LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0); - - // calculate total VRAM usage - auto add_tensor = [](const ggml_tensor * t, size_t & size) { - if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) { - size += ggml_nbytes(t); +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) + if (model->n_gpu_layers > 0) { + ggml_cuda_set_scratch_size(alloc_size); + LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0); + + // calculate total VRAM usage + auto add_tensor = [](const ggml_tensor * t, size_t & size) { + if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) { + size += ggml_nbytes(t); + } + }; + size_t model_vram_size = 0; + for (const auto & kv : model->tensors_by_name) { + add_tensor(kv.second, model_vram_size); } - }; - size_t model_vram_size = 0; - for (const auto & kv : model->tensors_by_name) { - add_tensor(kv.second, model_vram_size); - } - size_t kv_vram_size = 0; - for (auto & k : ctx->kv_self.k_l) { - add_tensor(k, kv_vram_size); - } - for (auto & v : ctx->kv_self.v_l) { - add_tensor(v, kv_vram_size); - } + size_t kv_vram_size = 0; + for (auto & k : ctx->kv_self.k_l) { + add_tensor(k, kv_vram_size); + } + for (auto & v : ctx->kv_self.v_l) { + add_tensor(v, kv_vram_size); + } - size_t ctx_vram_size = alloc_size + kv_vram_size; - size_t total_vram_size = model_vram_size + ctx_vram_size; + size_t ctx_vram_size = alloc_size + kv_vram_size; + size_t total_vram_size = model_vram_size + ctx_vram_size; - LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__, - total_vram_size / 1024.0 / 1024.0, - model_vram_size / 1024.0 / 1024.0, - ctx_vram_size / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__, + total_vram_size / 1024.0 / 1024.0, + model_vram_size / 1024.0 / 1024.0, + ctx_vram_size / 1024.0 / 1024.0); + } #endif } } From d3e7242bdb24c270dfc7074920930fc5eeaa3f44 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 20 Dec 2023 03:01:33 +0100 Subject: [PATCH 07/21] more accurate mlock --- llama.cpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/llama.cpp b/llama.cpp index cca11e5ad3783..7254d409db4e9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2270,10 +2270,7 @@ struct llama_model_loader { } } - - - void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) { - size_t size_lock = 0; + void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const { size_t size_data = 0; for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { @@ -2281,7 +2278,7 @@ struct llama_model_loader { size_data += ggml_nbytes(cur); } - if (use_mmap) { + if (use_mmap && buf_mmap) { if (lmlock) { lmlock->init(mapping->addr); } @@ -2305,6 +2302,9 @@ struct 
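
Patch 07 replaces the running size_lock counter with a lock that grows to the end of each mmapped tensor. The distinction matters because tensors are locked at their actual file offsets: a sum of tensor sizes ignores padding and ordering in the file, while offs + ggml_nbytes(cur) covers exactly the bytes that were touched. In sketch form, using the patch's own identifiers:

    // lock the mapping up to the end of the tensor just placed, not a running sum
    lmlock->init(mapping->addr);              // locked region starts at the mapping base
    lmlock->grow_to(offs + ggml_nbytes(cur)); // extend to the end of this tensor
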
llama_model_loader { if (use_mmap) { if (buf_mmap) { ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs); + if (lmlock) { + lmlock->grow_to(offs + ggml_nbytes(cur)); + } } else { ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur)); } @@ -2319,11 +2319,6 @@ struct llama_model_loader { ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur)); } } - - if (use_mmap && lmlock) { - size_lock += ggml_nbytes(cur); - lmlock->grow_to(size_lock); - } } else { // HACK: mark tensor as allocated cur->data = (void *)(uintptr_t)1; From c3678ca84f79e278cdfaf13b2d017f5c24b483c6 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 20 Dec 2023 04:12:11 +0100 Subject: [PATCH 08/21] unmap offloaded part of the model --- llama.cpp | 86 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 78 insertions(+), 8 deletions(-) diff --git a/llama.cpp b/llama.cpp index 7254d409db4e9..464ee1249ac8e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -815,6 +815,21 @@ struct llama_mmap { llama_mmap(const llama_mmap &) = delete; + static void align_offset(size_t & offset, size_t & len, size_t page_size) { + // align offset to the next page + size_t offset_in_page = offset & (page_size - 1); + size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page; + offset += offset_to_page; + + if (offset_to_page >= len) { + len = 0; + } else { + len -= offset_to_page; + // align len to the previous page + len -= len & (page_size - 1); + } + } + #ifdef _POSIX_MAPPED_FILES static constexpr bool SUPPORTED = true; @@ -849,6 +864,24 @@ struct llama_mmap { } } + void unmap(size_t offset, size_t len) { + int page_size = sysconf(_SC_PAGESIZE); + align_offset(offset, len, page_size); + if (len < (size_t)page_size) { + return; + } + + void * next_page_start = (uint8_t *) addr + offset; + // unmap and discard the pages + if (munmap(next_page_start, len)) { + fprintf(stderr, "warning: munmap failed: %s\n", strerror(errno)); + } + if (posix_madvise(next_page_start, len, POSIX_MADV_DONTNEED)) { + fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_DONTNEED) failed: %s\n", + strerror(errno)); + } + } + ~llama_mmap() { munmap(addr, size); } @@ -898,6 +931,20 @@ struct llama_mmap { } } + void unmap(size_t offset, size_t len) { + SYSTEM_INFO si; + GetSystemInfo(&si); + DWORD page_size = si.dwAllocationGranularity; + align_offset(offset, len, page_size); + + if (len < (size_t)page_size) { + return; + } + + void * next_page_start = (uint8_t *) addr + offset; + VirtualAlloc(next_page_start, len, MEM_RESET, PAGE_NOACCESS); + } + ~llama_mmap() { if (!UnmapViewOfFile(addr)) { fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", @@ -914,6 +961,13 @@ struct llama_mmap { throw std::runtime_error(std::string("mmap not supported")); } + + void unmap(size_t offset, size_t len) { + (void) offset; + (void) len; + + throw std::runtime_error(std::string("mmap not supported")); + } #endif }; @@ -2243,7 +2297,9 @@ struct llama_model_loader { return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx); } - void init_mapping(struct ggml_context * ctx) { + void init_mapping() { + /* + // prefetch only CPU tensors if (use_mmap) { size_t size_pref = 0; // prefetch @@ -2256,6 +2312,9 @@ struct llama_model_loader { } mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa())); } + */ + // prefetch the whole file - all the data is needed anyway + mapping.reset(new llama_mmap(&file, -1, ggml_is_numa())); } // for backwards 
compatibility only @@ -2292,19 +2351,25 @@ struct llama_model_loader { std::vector> read_buf; - size_t done_size = 0; + size_t size_done = 0; + + size_t mmap_first = -1; + size_t mmap_last = 0; + for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); GGML_ASSERT(cur); // unused tensors should have been caught by load_data already const size_t offs = file_offset(ggml_get_name(cur)); if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) { - if (use_mmap) { + if (use_mmap && mapping) { if (buf_mmap) { ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs); if (lmlock) { lmlock->grow_to(offs + ggml_nbytes(cur)); } + mmap_first = std::min(mmap_first, offs); + mmap_last = std::max(mmap_last, offs + ggml_nbytes(cur)); } else { ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur)); } @@ -2323,7 +2388,7 @@ struct llama_model_loader { // HACK: mark tensor as allocated cur->data = (void *)(uintptr_t)1; void * data; - if (use_mmap) { + if (use_mmap && mapping) { data = (uint8_t *) mapping->addr + offs; } else { read_buf.resize(ggml_nbytes(cur)); @@ -2343,14 +2408,19 @@ struct llama_model_loader { #endif } - done_size += ggml_nbytes(cur); + size_done += ggml_nbytes(cur); if (progress_callback) { - progress_callback((float) done_size / size_data, progress_callback_user_data); + progress_callback((float) size_done / size_data, progress_callback_user_data); } } - // TODO: unmap GPU tensors + // unmap GPU tensors + if (use_mmap && mapping) { + // unmap offloaded tensors and metadata + mapping->unmap(0, mmap_first); + mapping->unmap(mmap_last, mapping->size - mmap_last); + } } }; @@ -3507,7 +3577,7 @@ static void llm_load_tensors( ml.done_getting_tensors(); - ml.init_mapping(ctx); + ml.init_mapping(); // allocate tensors size_t vram_weights = 0; From 524104581994c3a546733dee3ab7632448003cf2 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 20 Dec 2023 12:57:45 +0100 Subject: [PATCH 09/21] use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap --- llama.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 464ee1249ac8e..0c196b6acef37 100644 --- a/llama.cpp +++ b/llama.cpp @@ -33,6 +33,7 @@ #include #if defined(_POSIX_MAPPED_FILES) #include + #include #endif #if defined(_POSIX_MEMLOCK_RANGE) #include @@ -840,6 +841,10 @@ struct llama_mmap { // prefetch/readahead impairs performance on NUMA systems if (numa) { prefetch = 0; } #ifdef __linux__ + if (posix_fadvise64(fd, 0, file->size, POSIX_FADV_SEQUENTIAL)) { + fprintf(stderr, "warning: fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n", + strerror(errno)); + } if (prefetch) { flags |= MAP_POPULATE; } #endif addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); @@ -2314,7 +2319,9 @@ struct llama_model_loader { } */ // prefetch the whole file - all the data is needed anyway - mapping.reset(new llama_mmap(&file, -1, ggml_is_numa())); + if (use_mmap) { + mapping.reset(new llama_mmap(&file, -1, ggml_is_numa())); + } } // for backwards compatibility only From bcd87ca92544ade4a51041f8d52e2022fb896a20 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 20 Dec 2023 17:15:43 +0100 Subject: [PATCH 10/21] update quantize and lora --- llama.cpp | 48 ++++++++++++++++++------------------------------ 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/llama.cpp b/llama.cpp index 0c196b6acef37..769906b299cb6 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2236,8 
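
Patch 09 (later simplified to plain posix_fadvise in patch 12) asks the kernel for aggressive readahead before the model is mapped: POSIX_FADV_SEQUENTIAL increases (on Linux, doubles) the readahead window, which pairs well with MAP_POPULATE prefetching. A self-contained sketch of the sequence, assuming fp and file_size come from the caller:

    #include <cerrno>
    #include <cstdio>
    #include <cstring>
    #include <fcntl.h>
    #include <sys/mman.h>

    void * map_sequential(FILE * fp, size_t file_size) {
        int fd = fileno(fp);
        // advise sequential access; len = 0 means "to the end of the file"
        if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
            fprintf(stderr, "warning: posix_fadvise failed: %s\n", strerror(errno));
        }
        // MAP_POPULATE prefetches the pages at mmap time (Linux-specific)
        return mmap(NULL, file_size, PROT_READ, MAP_SHARED | MAP_POPULATE, fd, 0);
    }
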
+2236,12 @@ struct llama_model_loader { return gguf_get_tensor_name(ctx_gguf, i); } + struct ggml_tensor * get_tensor_meta(const char * name) const { + return ggml_get_tensor(ctx_meta, name); + } + struct ggml_tensor * get_tensor_meta(int i) const { - return ggml_get_tensor(ctx_meta, get_tensor_name(i)); + return get_tensor_meta(get_tensor_name(i)); } struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) { @@ -2302,7 +2306,7 @@ struct llama_model_loader { return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx); } - void init_mapping() { + void init_mapping(bool prefetch = true) { /* // prefetch only CPU tensors if (use_mmap) { @@ -2320,17 +2324,19 @@ struct llama_model_loader { */ // prefetch the whole file - all the data is needed anyway if (use_mmap) { - mapping.reset(new llama_mmap(&file, -1, ggml_is_numa())); + mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa())); } } - // for backwards compatibility only + // for backwards compatibility, does not support ggml-backend void load_data_for(struct ggml_tensor * cur) const { const size_t offs = file_offset(ggml_get_name(cur)); - if (use_mmap) { + if (use_mmap && mapping) { + GGML_ASSERT(cur->data == nullptr); cur->data = (uint8_t *)mapping->addr + offs; } else { + GGML_ASSERT(cur->data != nullptr); file.seek(offs, SEEK_SET); file.read_raw(cur->data, ggml_nbytes(cur)); } @@ -8569,9 +8575,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s #endif llama_model_loader ml(fname_inp, use_mmap, NULL); - if (ml.use_mmap) { - ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa())); - } + ml.init_mapping(false); // no prefetching? llama_model model; llm_load_arch(ml, model); @@ -8650,8 +8654,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } tensor->data = read_data.data(); } - GGML_ASSERT(!"not implemented"); - //ml.load_data_for(tensor); TODO + ml.load_data_for(tensor); LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", ++idx, ml.n_tensors, @@ -8871,24 +8874,10 @@ static int llama_apply_lora_from_file_internal( // load base model std::unique_ptr ml; - unique_context base_ctx(nullptr, ggml_free); - if (path_base_model) { + if (path_base_model) { LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); - ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL)); - - size_t ctx_size = ggml_tensor_overhead() * ml->n_tensors; - - ggml_init_params base_params; - base_params.mem_size = ctx_size; - base_params.mem_buffer = NULL; - base_params.no_alloc = true; - - base_ctx.reset(ggml_init(base_params)); - - // maybe this should be in llama_model_loader - if (ml->use_mmap) { - ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa())); - } + ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr)); + ml->init_mapping(false); // no prefetching } // read tensors and apply @@ -9001,9 +8990,8 @@ static int llama_apply_lora_from_file_internal( return 1; } - base_t = ml->create_tensor(base_ctx.get(), base_name, { dest_t->ne[0], dest_t->ne[1] }, GGML_BACKEND_CPU); - GGML_ASSERT(!"not implemented"); - //ml->load_data_for(base_t); // TODO + base_t = ml->get_tensor_meta(base_name.c_str()); + ml->load_data_for(base_t); } else { base_t = dest_t; } From 24cc321931a42393b21ee199dc7724f30450ff6f Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 20 Dec 2023 
18:12:29 +0100 Subject: [PATCH 11/21] update session copy/set to use ggml-backend ggml-ci --- ggml-alloc.c | 5 ++++ ggml-backend.c | 2 +- llama.cpp | 74 +++++++++++++++++++++++++++++--------------------- 3 files changed, 49 insertions(+), 32 deletions(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index 0a2b80f31f6dc..76e9d9ee57b75 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -792,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } else { ggml_backend_view_init(buffer, t); } + } else { + if (t->view_src != NULL) { + // view of a pre-allocated tensor + ggml_backend_view_init(buffer, t); + } } } diff --git a/ggml-backend.c b/ggml-backend.c index e6f69bb23ec5b..0c8c9ec430475 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1250,7 +1250,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml // utils void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { GGML_ASSERT(tensor->buffer == NULL); - GGML_ASSERT(tensor->data == NULL); + //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized GGML_ASSERT(tensor->view_src != NULL); GGML_ASSERT(tensor->view_src->buffer != NULL); GGML_ASSERT(tensor->view_src->data != NULL); diff --git a/llama.cpp b/llama.cpp index 769906b299cb6..fa11793fc499d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3661,7 +3661,7 @@ static void llm_load_tensors( LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0); } -#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST) const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); @@ -9830,17 +9830,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); ggml_cgraph * gf = ggml_new_graph(cpy_ctx); - std::vector> kout2d_data(n_layer); - std::vector> vout2d_data(n_layer); + std::vector kout2d(n_layer); + std::vector vout2d(n_layer); for (int il = 0; il < (int) n_layer; ++il) { - ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head); - kout2d_data[il].resize(ggml_nbytes(kout2d)); - kout2d->data = kout2d_data[il].data(); - - ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd); - vout2d_data[il].resize(ggml_nbytes(vout2d)); - vout2d->data = vout2d_data[il].data(); + kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head); + vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd); ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il], n_embd, kv_head, @@ -9850,21 +9845,28 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat kv_head, n_embd, elt_size*n_ctx, 0); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d)); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d)); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il])); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il])); } - std::vector work_buffer; - ggml_graph_compute_helper(work_buffer, gf, ctx->cparams.n_threads); + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend); - ggml_free(cpy_ctx); + 
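
Patch 11 moves session save/restore onto ggml-backend: the copy graph's destination tensors now live in a backend buffer allocated with ggml_backend_alloc_ctx_tensors, and their contents move to and from host memory via ggml_backend_tensor_get/set instead of pointing tensor->data at caller-owned buffers. The round trip, in sketch form (t is any tensor in a backend buffer, data_ctx and inp as in the state functions):

    // download: backend tensor -> host staging -> output stream
    std::vector<uint8_t> tmp(ggml_nbytes(t));
    ggml_backend_tensor_get(t, tmp.data(), 0, tmp.size());
    data_ctx->write(tmp.data(), tmp.size());

    // upload: input stream -> backend tensor (inp advances past the bytes consumed)
    ggml_backend_tensor_set(t, inp, 0, ggml_nbytes(t));
    inp += ggml_nbytes(t);
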
ggml_backend_graph_compute(ctx->backend, gf); + + std::vector tmp_buf; + for (int il = 0; il < (int) n_layer; ++il) { + tmp_buf.resize(ggml_nbytes(kout2d[il])); + ggml_backend_tensor_get(kout2d[il], tmp_buf.data(), 0, tmp_buf.size()); + data_ctx->write(tmp_buf.data(), tmp_buf.size()); - // our data is now in the kout2d_data and vout2d_data buffers - // write them to file - for (uint32_t il = 0; il < n_layer; ++il) { - data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size()); - data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size()); + tmp_buf.resize(ggml_nbytes(vout2d[il])); + ggml_backend_tensor_get(vout2d[il], tmp_buf.data(), 0, tmp_buf.size()); + data_ctx->write(tmp_buf.data(), tmp_buf.size()); } + + ggml_free(cpy_ctx); + + ggml_backend_buffer_free(buf); } for (uint32_t i = 0; i < kv_size; ++i) { @@ -9969,14 +9971,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); ggml_cgraph * gf = ggml_new_graph(cpy_ctx); - for (int il = 0; il < n_layer; ++il) { - ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head); - kin2d->data = (void *) inp; - inp += ggml_nbytes(kin2d); + std::vector kin2d(n_layer); + std::vector vin2d(n_layer); - ggml_tensor * vin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd); - vin2d->data = (void *) inp; - inp += ggml_nbytes(vin2d); + for (int il = 0; il < n_layer; ++il) { + kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head); + vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd); ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il], n_embd, kv_head, @@ -9986,14 +9986,26 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { kv_head, n_embd, elt_size*n_ctx, 0); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d)); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d)); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d)); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d)); + } + + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend); + + // load data into the tensors + for (int il = 0; il < n_layer; ++il) { + ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il])); + inp += ggml_nbytes(kin2d[il]); + + ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il])); + inp += ggml_nbytes(vin2d[il]); } - std::vector work_buffer; - ggml_graph_compute_helper(work_buffer, gf, ctx->cparams.n_threads); + ggml_backend_graph_compute(ctx->backend, gf); ggml_free(cpy_ctx); + + ggml_backend_buffer_free(buf); } ctx->kv_self.head = kv_head; From f70f94dfb8daf4fd4c61d861d245f7efa85a37d4 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 20 Dec 2023 21:53:52 +0100 Subject: [PATCH 12/21] use posix_fadvise instead of posix_fadvise64 --- llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index fa11793fc499d..8f8aee3c9f518 100644 --- a/llama.cpp +++ b/llama.cpp @@ -841,8 +841,8 @@ struct llama_mmap { // prefetch/readahead impairs performance on NUMA systems if (numa) { prefetch = 0; } #ifdef __linux__ - if (posix_fadvise64(fd, 0, file->size, POSIX_FADV_SEQUENTIAL)) { - fprintf(stderr, "warning: fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n", + if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) { + fprintf(stderr, "warning: posix_fadvise(.., 
POSIX_FADV_SEQUENTIAL) failed: %s\n", strerror(errno)); } if (prefetch) { flags |= MAP_POPULATE; } From 6c045a86ed34872f3da1aa5d71ee662f04bfcb05 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 20 Dec 2023 22:05:32 +0100 Subject: [PATCH 13/21] ggml_backend_alloc_ctx_tensors_from_buft : remove old print --- ggml-alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index 76e9d9ee57b75..a97436b17ed70 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -778,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } if (nbytes == 0) { - //fprintf(stderr, "%s: no tensors to allocate\n", __func__); + // all the tensors in the context are already allocated return NULL; } From 5834a25345c0786feaef23f2c7094d409336caf8 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 20 Dec 2023 22:07:09 +0100 Subject: [PATCH 14/21] llama_mmap::align_offset : use pointers instead of references for out parameters --- llama.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/llama.cpp b/llama.cpp index 8f8aee3c9f518..1b8a04b77d9a7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -816,18 +816,18 @@ struct llama_mmap { llama_mmap(const llama_mmap &) = delete; - static void align_offset(size_t & offset, size_t & len, size_t page_size) { + static void align_offset(size_t * offset, size_t * len, size_t page_size) { // align offset to the next page - size_t offset_in_page = offset & (page_size - 1); + size_t offset_in_page = *offset & (page_size - 1); size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page; - offset += offset_to_page; + *offset += offset_to_page; - if (offset_to_page >= len) { - len = 0; + if (offset_to_page >= *len) { + *len = 0; } else { - len -= offset_to_page; + *len -= offset_to_page; // align len to the previous page - len -= len & (page_size - 1); + *len -= *len & (page_size - 1); } } @@ -871,7 +871,7 @@ struct llama_mmap { void unmap(size_t offset, size_t len) { int page_size = sysconf(_SC_PAGESIZE); - align_offset(offset, len, page_size); + align_offset(&offset, &len, page_size); if (len < (size_t)page_size) { return; } @@ -940,7 +940,7 @@ struct llama_mmap { SYSTEM_INFO si; GetSystemInfo(&si); DWORD page_size = si.dwAllocationGranularity; - align_offset(offset, len, page_size); + align_offset(&offset, &len, page_size); if (len < (size_t)page_size) { return; From ecb23d4ac584590ea9ed112228b211c9fcd0cbb9 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 20 Dec 2023 23:15:15 +0100 Subject: [PATCH 15/21] restore progress_callback behavior --- llama.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 1b8a04b77d9a7..0124e424f5de2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2372,6 +2372,11 @@ struct llama_model_loader { for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); GGML_ASSERT(cur); // unused tensors should have been caught by load_data already + + if (progress_callback) { + progress_callback((float) size_done / size_data, progress_callback_user_data); + } + const size_t offs = file_offset(ggml_get_name(cur)); if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) { @@ -2422,10 +2427,6 @@ struct llama_model_loader { } size_done += ggml_nbytes(cur); - - if (progress_callback) { - progress_callback((float) size_done / size_data, progress_callback_user_data); - } } // unmap GPU tensors From 8ed2a8eb62fbd39446817e68cef57a2e5238d59b 
Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 20 Dec 2023 23:23:21 +0100 Subject: [PATCH 16/21] move final progress_callback call to load_all_data --- llama.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 0124e424f5de2..0c9d3f74503e8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2435,6 +2435,10 @@ struct llama_model_loader { mapping->unmap(0, mmap_first); mapping->unmap(mmap_last, mapping->size - mmap_last); } + + if (progress_callback) { + progress_callback(1.0f, progress_callback_user_data); + } } }; @@ -3691,10 +3695,6 @@ static void llm_load_tensors( ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL); - if (progress_callback) { - progress_callback(1.0f, progress_callback_user_data); - } - model.mapping = std::move(ml.mapping); // loading time will be recalculate after the first eval, so From a4e191f3dfb3a906dad26217abb089672023a921 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 21 Dec 2023 14:30:50 +0200 Subject: [PATCH 17/21] cuda : fix fprintf format string (minor) --- ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index b890ec985401f..52394352b6162 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -9240,7 +9240,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ if (tensor->op == GGML_OP_MUL_MAT) { if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) { #ifndef NDEBUG - fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]); + fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]); #endif return false; } From a74b1a89b380a2012365eb581c42f2c4edd66944 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 21 Dec 2023 14:18:21 +0100 Subject: [PATCH 18/21] do not offload scales --- llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 0c9d3f74503e8..2ba6d78c77d5e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5652,8 +5652,8 @@ static const std::unordered_map k_offload_map { "pos_embd", OFFLOAD_FUNC_NR }, { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope) - { "Q_scale", OFFLOAD_FUNC_FRC }, - { "KQ_scale", OFFLOAD_FUNC_FRC }, + { "Q_scale", OFFLOAD_FUNC_NOP }, + { "KQ_scale", OFFLOAD_FUNC_NOP }, { "KQ_mask", OFFLOAD_FUNC_FRC }, { "K_shift", OFFLOAD_FUNC_FRC }, From cd4167b634725668928692314358bbfbefab59e1 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 21 Dec 2023 19:24:54 +0100 Subject: [PATCH 19/21] llama_mmap : avoid unmapping the same fragments again in the destructor --- llama.cpp | 122 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 76 insertions(+), 46 deletions(-) diff --git a/llama.cpp b/llama.cpp index 2ba6d78c77d5e..52e90a1874c3f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -816,24 +816,12 @@ struct llama_mmap { llama_mmap(const llama_mmap &) = delete; - static void align_offset(size_t * offset, size_t * len, size_t page_size) { - // align offset to the next page - size_t offset_in_page = *offset & (page_size - 1); - size_t offset_to_page = offset_in_page == 0 ? 
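
Patch 17's format-string fix above is easy to miss: PRId64 expands to the conversion specifier without the leading '%', so the '%' must appear in the adjacent string literal. Omitting it, as the old code did, prints the letters of the macro expansion instead of the value:

    #include <cinttypes>
    #include <cstdio>

    int64_t ne3 = 4;
    printf("ne[3] = "  PRId64 "\n", ne3); // wrong: prints "ne[3] = ld" (or similar)
    printf("ne[3] = %" PRId64 "\n", ne3); // right: prints "ne[3] = 4"
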
0 : page_size - offset_in_page; - *offset += offset_to_page; - - if (offset_to_page >= *len) { - *len = 0; - } else { - *len -= offset_to_page; - // align len to the previous page - *len -= *len & (page_size - 1); - } - } - #ifdef _POSIX_MAPPED_FILES static constexpr bool SUPPORTED = true; + // list of mapped fragments (first_offset, last_offset) + std::vector> mapped_fragments; + llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { size = file->size; int fd = fileno(file->fp); @@ -841,8 +829,9 @@ struct llama_mmap { // prefetch/readahead impairs performance on NUMA systems if (numa) { prefetch = 0; } #ifdef __linux__ + // advise the kernel to read the file sequentially (increases readahead) if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) { - fprintf(stderr, "warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n", + LLAMA_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n", strerror(errno)); } if (prefetch) { flags |= MAP_POPULATE; } @@ -853,9 +842,9 @@ struct llama_mmap { } if (prefetch > 0) { - // Advise the kernel to preload the mapped memory + // advise the kernel to preload the mapped memory if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) { - fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n", + LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n", strerror(errno)); } } @@ -863,32 +852,81 @@ struct llama_mmap { // advise the kernel not to use readahead // (because the next page might not belong on the same node) if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) { - fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n", + LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n", strerror(errno)); } } + + // initialize list of mapped_fragments + mapped_fragments.emplace_back(0, file->size); + } + + static void align_range(size_t * first, size_t * last, size_t page_size) { + // align first to the next page + size_t offset_in_page = *first & (page_size - 1); + size_t offset_to_page = offset_in_page == 0 ? 
0 : page_size - offset_in_page; + *first += offset_to_page; + + // align last to the previous page + *last = *last & ~(page_size - 1); + + if (*last <= *first) { + *last = *first; + } } - void unmap(size_t offset, size_t len) { + // partially unmap the file in the range [first, last) + void unmap_fragment(size_t first, size_t last) { + // note: this function must not be called multiple times with overlapping ranges + // otherwise, there is a risk of invalidating addresses that have been repurposed for other mappings int page_size = sysconf(_SC_PAGESIZE); - align_offset(&offset, &len, page_size); - if (len < (size_t)page_size) { + align_range(&first, &last, page_size); + size_t len = last - first; + + if (len == 0) { return; } - void * next_page_start = (uint8_t *) addr + offset; - // unmap and discard the pages + GGML_ASSERT(first % page_size == 0); + GGML_ASSERT(last % page_size == 0); + GGML_ASSERT(last > first); + + void * next_page_start = (uint8_t *) addr + first; + + // unmap the range if (munmap(next_page_start, len)) { - fprintf(stderr, "warning: munmap failed: %s\n", strerror(errno)); - } - if (posix_madvise(next_page_start, len, POSIX_MADV_DONTNEED)) { - fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_DONTNEED) failed: %s\n", - strerror(errno)); + LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno)); + } + + // update the list of mapped fragments to avoid unmapping the same range again in the destructor + std::vector> new_mapped_fragments; + for (const auto & frag : mapped_fragments) { + if (frag.first < first && frag.second > last) { + // the range is in the middle of the fragment, split it + new_mapped_fragments.emplace_back(frag.first, first); + new_mapped_fragments.emplace_back(last, frag.second); + } else if (frag.first < first && frag.second > first) { + // the range starts in the middle of the fragment + new_mapped_fragments.emplace_back(frag.first, first); + } else if (frag.first < last && frag.second > last) { + // the range ends in the middle of the fragment + new_mapped_fragments.emplace_back(last, frag.second); + } else if (frag.first >= first && frag.second <= last) { + // the range covers the entire fragment + } else { + // the range is outside the fragment + new_mapped_fragments.push_back(frag); + } } + mapped_fragments = std::move(new_mapped_fragments); } ~llama_mmap() { - munmap(addr, size); + for (const auto & frag : mapped_fragments) { + if (munmap((char *) addr + frag.first, frag.second - frag.first)) { + LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno)); + } + } } #elif defined(_WIN32) static constexpr bool SUPPORTED = true; @@ -936,18 +974,10 @@ struct llama_mmap { } } - void unmap(size_t offset, size_t len) { - SYSTEM_INFO si; - GetSystemInfo(&si); - DWORD page_size = si.dwAllocationGranularity; - align_offset(&offset, &len, page_size); - - if (len < (size_t)page_size) { - return; - } - - void * next_page_start = (uint8_t *) addr + offset; - VirtualAlloc(next_page_start, len, MEM_RESET, PAGE_NOACCESS); + void unmap_fragment(size_t first, size_t last) { + // not supported + GGML_UNUSED(first); + GGML_UNUSED(last); } ~llama_mmap() { @@ -2429,11 +2459,11 @@ struct llama_model_loader { size_done += ggml_nbytes(cur); } - // unmap GPU tensors + // unmap offloaded tensors and metadata if (use_mmap && mapping) { - // unmap offloaded tensors and metadata - mapping->unmap(0, mmap_first); - mapping->unmap(mmap_last, mapping->size - mmap_last); + mapping->unmap_fragment(0, mmap_first); + mapping->unmap_fragment(mmap_last, mmap_last); + 
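
The fragment bookkeeping in patch 19 is classic interval subtraction: removing [first, last) from each mapped fragment either leaves it alone, trims one end, drops it, or splits it in two. The same rules as a standalone function with a worked example (remove_range is an illustrative name; the patch applies these rules in place over mapped_fragments):

    #include <cstddef>
    #include <utility>
    #include <vector>

    using frag_list = std::vector<std::pair<size_t, size_t>>;

    // remove [first, last) from every fragment, mirroring unmap_fragment above
    static frag_list remove_range(const frag_list & frags, size_t first, size_t last) {
        frag_list out;
        for (const auto & f : frags) {
            if (f.first < first && f.second > last) {
                out.emplace_back(f.first, first); // range in the middle: split
                out.emplace_back(last, f.second);
            } else if (f.first < first && f.second > first) {
                out.emplace_back(f.first, first); // range starts inside: keep the head
            } else if (f.first < last && f.second > last) {
                out.emplace_back(last, f.second); // range ends inside: keep the tail
            } else if (f.first >= first && f.second <= last) {
                // range covers the whole fragment: drop it
            } else {
                out.push_back(f);                 // disjoint: keep as-is
            }
        }
        return out;
    }

    // remove_range({{0, 100}}, 30, 60) -> {{0, 30}, {60, 100}}
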
mapping->unmap_fragment(mmap_last, mapping->size); } if (progress_callback) { From 323881ef4bf3af4e3404150e60005f64e8257408 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 21 Dec 2023 19:37:23 +0100 Subject: [PATCH 20/21] remove unnecessary unmap --- llama.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index f444642400f89..da1175721c889 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2462,7 +2462,6 @@ struct llama_model_loader { // unmap offloaded tensors and metadata if (use_mmap && mapping) { mapping->unmap_fragment(0, mmap_first); - mapping->unmap_fragment(mmap_last, mmap_last); mapping->unmap_fragment(mmap_last, mapping->size); } From f4d884f47ee2fd15093c2e5169fb67e0714336fb Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 21 Dec 2023 20:03:40 +0100 Subject: [PATCH 21/21] metal : add default log function that prints to stderr, cleanup code ggml-ci --- ggml-metal.m | 54 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 0916202a0687d..e60b93b36a7de 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -180,7 +180,15 @@ @interface GGMLMetalClass : NSObject @implementation GGMLMetalClass @end -ggml_log_callback ggml_metal_log_callback = NULL; + +static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) { + fprintf(stderr, "%s", msg); + + UNUSED(level); + UNUSED(user_data); +} + +ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback; void * ggml_metal_log_user_data = NULL; void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) { @@ -622,7 +630,7 @@ int ggml_metal_if_optimized(struct ggml_metal_context * ctx) { // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap int n_buffers; - struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; + struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; }; // finds the Metal buffer that contains the tensor data on the GPU device @@ -2499,13 +2507,29 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba deallocator:nil]; if (ctx->buffers[0].metal == nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, "default", size_aligned / 1024.0 / 1024.0); + GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); free(ctx); ggml_backend_metal_free_device(); return NULL; } - GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB\n", __func__, "default", size_aligned / 1024.0 / 1024.0); + GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0); + + +#if TARGET_OS_OSX + GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", + device.currentAllocatedSize / 1024.0 / 1024.0, + device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + + if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) { + GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); + } else { + GGML_METAL_LOG_INFO("\n"); + } +#else + GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0); +#endif + return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size); } @@ -2560,22 +2584,19 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz id device = ggml_backend_metal_get_device(); - const 
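
Patch 21 replaces the NULL-initialized Metal log callback with a default that writes to stderr, so logging call sites never have to branch on a missing callback. The pattern in isolation, with the same signature the patch uses (ggml_log_level and ggml_log_callback as declared in ggml.h):

    // a default callback keeps the function pointer always valid
    static void default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
        fprintf(stderr, "%s", msg);
        (void) level;
        (void) user_data;
    }

    static ggml_log_callback g_log_callback = default_log_callback; // never NULL
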
char * name = "from_ptr"; - // the buffer fits into the max buffer size allowed by the device if (size_aligned <= device.maxBufferLength) { - ctx->buffers[ctx->n_buffers].name = name; ctx->buffers[ctx->n_buffers].data = data; ctx->buffers[ctx->n_buffers].size = size; ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); return false; } - GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0); ++ctx->n_buffers; } else { @@ -2588,18 +2609,17 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz for (size_t i = 0; i < size; i += size_step) { const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i); - ctx->buffers[ctx->n_buffers].name = name; ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); ctx->buffers[ctx->n_buffers].size = size_step_aligned; ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); + GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); return false; } - GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); + GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i); if (i + size_step < size) { GGML_METAL_LOG_INFO("\n"); } @@ -2673,17 +2693,7 @@ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct /* .supports_op = */ ggml_backend_metal_supports_op, }; -// TODO: make a common log callback for all backends in ggml-backend -static void ggml_backend_log_callback(enum ggml_log_level level, const char * msg, void * user_data) { - fprintf(stderr, "%s", msg); - - UNUSED(level); - UNUSED(user_data); -} - ggml_backend_t ggml_backend_metal_init(void) { - ggml_metal_log_set_callback(ggml_backend_log_callback, NULL); - struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS); if (ctx == NULL) {