From 1ad8f0d80eebdb56bac4e76100975a8fc14b1d62 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Thu, 14 Dec 2023 16:00:44 +0800
Subject: [PATCH 1/5] Fixes "Not enough space in the context's memory pool"
 encountered on certain models, which seems to be caused by some imprecision
 related to the automatic casting of floating point values

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 0e5ab044cdfa4..b8b806f5f9f09 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1555,7 +1555,7 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);

-    cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead());
+    cache.buf.resize(n_elements*((size_t)(ggml_type_sizef(ktype) + ggml_type_sizef(vtype))) + 2u*n_layer*ggml_tensor_overhead());
     memset(cache.buf.data, 0, cache.buf.size);

     struct ggml_init_params params;

From 05f7db4b29693bae138fe14c024bc960b87fa1d2 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Thu, 14 Dec 2023 16:43:34 +0800
Subject: [PATCH 2/5] do not cast to size_t, instead just use doubles

---
 ggml.c    | 4 ++--
 ggml.h    | 2 +-
 llama.cpp | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml.c b/ggml.c
index 29e18a24c76a8..cbdf93bfc2462 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2011,8 +2011,8 @@ size_t ggml_type_size(enum ggml_type type) {
     return type_traits[type].type_size;
 }

-float ggml_type_sizef(enum ggml_type type) {
-    return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
+double ggml_type_sizef(enum ggml_type type) {
+    return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
 }

 const char * ggml_type_name(enum ggml_type type) {
diff --git a/ggml.h b/ggml.h
index 1447646b13c43..f04c259e31762 100644
--- a/ggml.h
+++ b/ggml.h
@@ -643,7 +643,7 @@ extern "C" {

     GGML_API int    ggml_blck_size (enum ggml_type type);
     GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
-    GGML_API float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+    GGML_API double ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float

     GGML_API const char * ggml_type_name(enum ggml_type type);
     GGML_API const char * ggml_op_name  (enum ggml_op   op);
diff --git a/llama.cpp b/llama.cpp
index b8b806f5f9f09..0e5ab044cdfa4 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1555,7 +1555,7 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);

-    cache.buf.resize(n_elements*((size_t)(ggml_type_sizef(ktype) + ggml_type_sizef(vtype))) + 2u*n_layer*ggml_tensor_overhead());
+    cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead());
     memset(cache.buf.data, 0, cache.buf.size);

     struct ggml_init_params params;
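Why the float return type was a problem: ggml_type_sizef() reports bytes per element, which is fractional for block-quantized types (type_size/blck_size). In the llama_kv_cache_init() call site above, the element count passes through 32-bit float arithmetic, so the computed buffer size can land a few bytes below the exact requirement -- hence "Not enough space in the context's memory pool". Below is a standalone C sketch of that failure mode; the block layout (18-byte blocks of 32 elements) and the element count are illustrative values chosen to make the rounding visible, not numbers taken from ggml's type traits:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const int64_t n_elements = 357913935;    // illustrative KV cache element count
        const float   bpe_f32    = 18.0f/32.0f;  // bytes/element as the old float API returned it
        const double  bpe_f64    = 18.0 /32.0;   // bytes/element as PATCH 2 computes it

        // n_elements is rounded to the nearest representable float before the
        // multiply, so the float product can fall below the exact byte count.
        // With default round-to-nearest float evaluation this prints
        // 201326576 vs 201326588 -- the float path comes out 12 bytes short.
        printf("float : %zu bytes\n", (size_t)(n_elements*bpe_f32));
        printf("double: %zu bytes\n", (size_t)(n_elements*bpe_f64));
        return 0;
    }

Switching the return type to double (PATCH 2) makes the drift negligible for realistic sizes, but the root fix is to keep buffer sizing out of floating point entirely, which the next patch does.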
From 8a7cfaf946ebe50de7f69e7b1b86b37ffd6ef1d3 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 14 Dec 2023 13:27:50 +0200
Subject: [PATCH 3/5] ggml : add ggml_row_size(), deprecate ggml_type_sizef()

---
 examples/benchmark/benchmark-matmult.cpp | 14 +++++++-------
 ggml.c                                   |  4 ++++
 ggml.h                                   | 10 +++++++---
 llama.cpp                                | 12 ++++++------
 4 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index 284733b1035c9..475830950115f 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -129,13 +129,13 @@ int main(int argc, char ** argv)  {
     const ggml_type qtype = GGML_TYPE_Q4_1;

     size_t ctx_size = 0;
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
-    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
+    ctx_size += ggml_row_size(qtype, sizex*sizey);
+    ctx_size += ggml_row_size(qtype, sizex*sizey);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
     ctx_size += 1024*1024*16;

     printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
diff --git a/ggml.c b/ggml.c
index cbdf93bfc2462..c7845c9c2a49e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2011,6 +2011,10 @@ size_t ggml_type_size(enum ggml_type type) {
     return type_traits[type].type_size;
 }

+size_t ggml_row_size(enum ggml_type type, int64_t ne) {
+    return (ggml_type_size(type)*ne)/ggml_blck_size(type);
+}
+
 double ggml_type_sizef(enum ggml_type type) {
     return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
 }
diff --git a/ggml.h b/ggml.h
index f04c259e31762..ae8101fab3d6d 100644
--- a/ggml.h
+++ b/ggml.h
@@ -641,9 +641,13 @@ extern "C" {
     GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
     GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

-    GGML_API int    ggml_blck_size (enum ggml_type type);
-    GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
-    GGML_API double ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+    GGML_API int    ggml_blck_size(enum ggml_type type);
+    GGML_API size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
+    GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+
+    GGML_DEPRECATED(
+    GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
+    "use ggml_row_size() instead");

     GGML_API const char * ggml_type_name(enum ggml_type type);
     GGML_API const char * ggml_op_name  (enum ggml_op   op);
diff --git a/llama.cpp b/llama.cpp
index 0e5ab044cdfa4..456807d9d5a3a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1555,7 +1555,7 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);

-    cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead());
+    cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead());
     memset(cache.buf.data, 0, cache.buf.size);

     struct ggml_init_params params;
@@ -3822,8 +3822,8 @@ static void llm_build_k_shift(
         ggml_rope_custom_inplace(ctx,
                 ggml_view_3d(ctx, kv.k_l[il],
                     n_embd_head, n_head_kv, n_ctx,
-                    ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
-                    ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
+                    ggml_row_size(kv.k_l[il]->type, n_embd_head),
+                    ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
                     0),
                 K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow);
@@ -3852,7 +3852,7 @@ static void llm_build_kv_store(
     cb(v_cur_t, "v_cur_t", il);

     struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
-            (ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa)*kv_head);
+            (ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
     cb(k_cache_view, "k_cache_view", il);

     struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
@@ -4011,8 +4011,8 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * k =
         ggml_view_3d(ctx, kv.k_l[il],
                 n_embd_head, n_kv, n_head_kv,
-                ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
-                ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
+                ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
+                ggml_row_size(kv.k_l[il]->type, n_embd_head),
                 0);
     cb(k, "k", il);

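The new helper keeps the row-size computation in integer arithmetic end to end, so call sites like the llama.cpp hunks above can no longer pick up float rounding error. A minimal standalone mock of the pattern (the name mirrors ggml's, but the signature and the example layout here are illustrative, not ggml's actual type traits):

    #include <stddef.h>
    #include <stdint.h>

    // Bytes occupied by ne elements of a block-quantized type that packs
    // blck_size elements into type_size bytes. Multiply first, then divide:
    // every intermediate value stays an exact integer, with no float in sight.
    static size_t row_size(size_t type_size, int64_t blck_size, int64_t ne) {
        return type_size * (size_t) ne / (size_t) blck_size;
    }

    // e.g. a Q4_0-style layout packs 32 elements into 18-byte blocks, so
    // row_size(18, 32, 4096) == 2304 bytes, computed exactly.

Note that this mock, like the ggml.c hunk above, still silently mis-counts a row that ends mid-block; PATCH 4 below adds an assert for exactly that case.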
From ed866f550b7f752f4c6fb49b06bb18226d803ef8 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 14 Dec 2023 13:49:20 +0200
Subject: [PATCH 4/5] ggml : fix row size compute to avoid overflows

---
 ggml.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index c7845c9c2a49e..ed77054bf2255 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2012,7 +2012,8 @@ size_t ggml_type_size(enum ggml_type type) {
 }

 size_t ggml_row_size(enum ggml_type type, int64_t ne) {
-    return (ggml_type_size(type)*ne)/ggml_blck_size(type);
+    assert(ne % ggml_blck_size(type) == 0);
+    return ggml_type_size(type)*ne/ggml_blck_size(type);
 }

 double ggml_type_sizef(enum ggml_type type) {

From 80e78cea127326fc0dcf33b63641acb20a77b3af Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 14 Dec 2023 14:00:11 +0200
Subject: [PATCH 5/5] tests : fix sizey -> sizez

---
 examples/benchmark/benchmark-matmult.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index 475830950115f..434e1d6bd509e 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -131,7 +131,7 @@ int main(int argc, char ** argv)  {
     size_t ctx_size = 0;
     ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
     ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
     ctx_size += ggml_row_size(qtype, sizex*sizey);
     ctx_size += ggml_row_size(qtype, sizex*sizey);
     ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
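On the assert added in PATCH 4: integer division truncates, so without the divisibility check a row that does not end on a block boundary would be silently under-counted. Worked with the illustrative 18-byte/32-element layout from the sketch above: ne = 33 gives 18*33/32 = 594/32 = 18 bytes after truncation, while storing 33 elements actually spans two whole blocks, i.e. 36 bytes. The assert turns that silent under-allocation into a hard failure at the offending call site. PATCH 5 then repairs a slip from PATCH 3's mechanical rewrite: the third benchmark buffer is sizex by sizez, as the removed sizex*sizez term in PATCH 3's minus lines shows, so its size must use sizez rather than sizey.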