From e9bcf66a5cb611bc7a722edb35a5b38fed070f53 Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 3 Oct 2023 17:49:36 +0200 Subject: [PATCH 01/13] per-layer KV --- llama.cpp | 109 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 71 insertions(+), 38 deletions(-) diff --git a/llama.cpp b/llama.cpp index 4a61eecdd328b..acc5ec7f7cf04 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1035,6 +1035,9 @@ struct llama_kv_cache { struct ggml_tensor * k = NULL; struct ggml_tensor * v = NULL; + std::vector k_l; // per layer + + std::vector v_l; struct ggml_context * ctx = NULL; @@ -1239,6 +1242,7 @@ static bool llama_kv_cache_init( cache.cells.clear(); cache.cells.resize(n_ctx); + cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); struct ggml_init_params params; @@ -1248,34 +1252,48 @@ static bool llama_kv_cache_init( cache.ctx = ggml_init(params); + size_t vram_kv_cache = 0; + if (!cache.ctx) { LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__); return false; } - cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - ggml_set_name(cache.k, "cache_k"); - ggml_set_name(cache.v, "cache_v"); + // cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); + // cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); + // ggml_set_name(cache.k, "cache_k"); + // ggml_set_name(cache.v, "cache_v"); - (void) n_gpu_layers; + cache.k_l.reserve(n_layer); + cache.v_l.reserve(n_layer); + + const int i_gpu_start = n_layer - n_gpu_layers; + + for (uint32_t i = 0; i < n_layer; i++) { + ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, wtype, n_embd*n_ctx); + ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, wtype, n_embd*n_ctx); + ggml_format_name(k, "cache_k_l%d", i); + ggml_format_name(v, "cache_v_l%d", i); + cache.k_l.push_back(k); + cache.v_l.push_back(v); #ifdef GGML_USE_CUBLAS - size_t vram_kv_cache = 0; + if ((int)i >= i_gpu_start) { + ggml_cuda_assign_buffers_no_scratch(k); + LLAMA_LOG_INFO("%s: offloading k[%d] cache to GPU\n", __func__, i); + vram_kv_cache += ggml_nbytes(k); - if (n_gpu_layers > (int)n_layer + 1) { - ggml_cuda_assign_buffers_no_scratch(cache.v); - LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__); - vram_kv_cache += ggml_nbytes(cache.v); + ggml_cuda_assign_buffers_no_scratch(v); + LLAMA_LOG_INFO("%s: offloading v[%d] cache to GPU\n", __func__, i); + vram_kv_cache += ggml_nbytes(v); } - if (n_gpu_layers > (int)n_layer + 2) { - ggml_cuda_assign_buffers_no_scratch(cache.k); - LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__); - vram_kv_cache += ggml_nbytes(cache.k); +#endif // GGML_USE_CUBLAS } + if (vram_kv_cache > 0) { LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); } -#endif // GGML_USE_CUBLAS + + (void) n_gpu_layers; return true; } @@ -2634,17 +2652,17 @@ static struct ggml_cgraph * llm_build_llama( // offload functions set the tensor output backend to GPU // tensors are GPU-accelerated if any input or the output has been offloaded offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; offload_func_t offload_func_v = llama_nop; + offload_func_t offload_func_kq = llama_nop; #ifdef GGML_USE_CUBLAS if (n_gpu_layers > n_layer) { offload_func_nr = ggml_cuda_assign_buffers_no_alloc; } - if (n_gpu_layers > n_layer + 1) { + if (n_gpu_layers > 0) { offload_func_v = ggml_cuda_assign_buffers_no_alloc; } - if (n_gpu_layers > n_layer + 2) { + if (n_gpu_layers > 0) { 
offload_func_kq = ggml_cuda_assign_buffers_no_alloc; } #endif // GGML_USE_CUBLAS @@ -2708,11 +2726,11 @@ static struct ggml_cgraph * llm_build_llama( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, + ggml_view_3d(ctx0, kv_self.k_l[il], n_embd_head, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), + ggml_element_size(kv_self.k_l[il])*n_embd_head, + ggml_element_size(kv_self.k_l[il])*n_embd_gqa, + 0), K_shift, n_embd_head, 0, 0, freq_base, freq_scale); offload_func_kq(tmp); ggml_build_forward_expand(gf, tmp); @@ -2723,10 +2741,14 @@ static struct ggml_cgraph * llm_build_llama( ggml_format_name(inpL, "layer_inp_%d", il); offload_func_t offload_func = llama_nop; + offload_func_v = llama_nop; + offload_func_kq = llama_nop; #ifdef GGML_USE_CUBLAS if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; + offload_func = ggml_cuda_assign_buffers_no_alloc; + offload_func_v = ggml_cuda_assign_buffers_no_alloc; + offload_func_kq = ggml_cuda_assign_buffers_no_alloc; } #endif // GGML_USE_CUBLAS @@ -2775,13 +2797,13 @@ static struct ggml_cgraph * llm_build_llama( offload_func_v(Vcur); ggml_set_name(Vcur, "Vcur"); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k_l[il], n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k_l[il])*n_embd_gqa)*(kv_head)); offload_func_kq(k); ggml_set_name(k, "k"); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v_l[il], n_tokens, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v_l[il]), + kv_head*ggml_element_size(kv_self.v_l[il])); offload_func_v(v); ggml_set_name(v, "v"); @@ -2795,11 +2817,11 @@ static struct ggml_cgraph * llm_build_llama( ggml_set_name(Q, "Q"); struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, + ggml_view_3d(ctx0, kv_self.k_l[il], n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + ggml_element_size(kv_self.k_l[il])*n_embd_gqa, + ggml_element_size(kv_self.k_l[il])*n_embd_head, + 0); offload_func_kq(K); ggml_set_name(K, "K"); @@ -2826,11 +2848,11 @@ static struct ggml_cgraph * llm_build_llama( // split cached V into n_head heads struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, + ggml_view_3d(ctx0, kv_self.v_l[il], n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + ggml_element_size(kv_self.v_l[il])*n_ctx, + ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head, + 0); offload_func_v(V); ggml_set_name(V, "V"); @@ -6872,7 +6894,14 @@ struct llama_context * llama_new_context_with_model( } { - const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); + // const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); + size_t memory_size = 0; + for (auto & k : ctx->kv_self.k_l) { + memory_size += ggml_nbytes(k); + } + for (auto & v : ctx->kv_self.v_l) { + memory_size += ggml_nbytes(v); + } 
LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } @@ -6946,8 +6975,12 @@ struct llama_context * llama_new_context_with_model( } size_t kv_vram_size = 0; - add_tensor(ctx->kv_self.k, kv_vram_size); - add_tensor(ctx->kv_self.v, kv_vram_size); + for (auto & k : ctx->kv_self.k_l) { + add_tensor(k, kv_vram_size); + } + for (auto & v : ctx->kv_self.v_l) { + add_tensor(v, kv_vram_size); + } size_t ctx_vram_size = alloc_size + kv_vram_size; size_t total_vram_size = model_vram_size + ctx_vram_size; From 55f2f2fb43baf966ac37326f5fb9abe2112d38a1 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 4 Oct 2023 01:53:21 +0200 Subject: [PATCH 02/13] remove unnecessary copies --- llama.cpp | 68 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 15 deletions(-) diff --git a/llama.cpp b/llama.cpp index acc5ec7f7cf04..53793eeeb578c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2676,13 +2676,34 @@ static struct ggml_cgraph * llm_build_llama( } // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); + struct ggml_tensor * KQ_mask_gpu = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + offload_func_kq(KQ_mask_gpu); + ggml_set_name(KQ_mask_gpu, "KQ_mask_gpu"); + ggml_allocr_alloc(lctx.alloc, KQ_mask_gpu); if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); + float * data = (float *) KQ_mask_gpu->data; + memset(data, 0, ggml_nbytes(KQ_mask_gpu)); + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j]; + + for (int i = 0; i < n_kv; ++i) { + if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + } + } + } + + struct ggml_tensor * KQ_mask_cpu = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + ggml_set_name(KQ_mask_cpu, "KQ_mask_cpu"); + ggml_allocr_alloc(lctx.alloc, KQ_mask_cpu); + if (!ggml_allocr_is_measure(lctx.alloc)) { + float * data = (float *) KQ_mask_cpu->data; + memset(data, 0, ggml_nbytes(KQ_mask_cpu)); for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { @@ -2699,12 +2720,21 @@ static struct ggml_cgraph * llm_build_llama( } // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); - ggml_set_name(KQ_pos, "KQ_pos"); - ggml_allocr_alloc(lctx.alloc, KQ_pos); + struct ggml_tensor * KQ_pos_gpu = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + offload_func_kq(KQ_pos_gpu); + ggml_set_name(KQ_pos_gpu, "KQ_pos_gpu"); + ggml_allocr_alloc(lctx.alloc, KQ_pos_gpu); if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; + int * data = (int *) KQ_pos_gpu->data; + for (int i = 0; i < n_tokens; ++i) { + data[i] = batch.pos[i]; + } + } + struct ggml_tensor * KQ_pos_cpu = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_name(KQ_pos_cpu, "KQ_pos_cpu"); + ggml_allocr_alloc(lctx.alloc, KQ_pos_cpu); + if (!ggml_allocr_is_measure(lctx.alloc)) { + int * data = (int *) KQ_pos_cpu->data; for (int i = 0; i < n_tokens; ++i) { data[i] = batch.pos[i]; } @@ -2732,7 +2762,9 @@ static struct ggml_cgraph * llm_build_llama( 
ggml_element_size(kv_self.k_l[il])*n_embd_gqa, 0), K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(tmp); + if (il >= i_gpu_start) { + offload_func_kq(tmp); + } ggml_build_forward_expand(gf, tmp); } } @@ -2744,8 +2776,14 @@ static struct ggml_cgraph * llm_build_llama( offload_func_v = llama_nop; offload_func_kq = llama_nop; + struct ggml_tensor * KQ_mask = KQ_mask_cpu; + struct ggml_tensor * KQ_pos = KQ_pos_cpu; + + #ifdef GGML_USE_CUBLAS if (il >= i_gpu_start) { + KQ_mask = KQ_mask_gpu; + KQ_pos = KQ_pos_gpu; offload_func = ggml_cuda_assign_buffers_no_alloc; offload_func_v = ggml_cuda_assign_buffers_no_alloc; offload_func_kq = ggml_cuda_assign_buffers_no_alloc; @@ -2779,11 +2817,11 @@ static struct ggml_cgraph * llm_build_llama( struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); offload_func_kq(Kcur); - ggml_set_name(Kcur, "Kcur"); + ggml_format_name(Kcur, "Kcur%d", il); struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); offload_func_kq(Qcur); - ggml_set_name(Qcur, "Qcur"); + ggml_format_name(Qcur, "Qcur%d", il); // store key and value to memory { @@ -2839,7 +2877,7 @@ static struct ggml_cgraph * llm_build_llama( // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); + ggml_format_name(KQ_masked, "KQ_masked%d", il); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); From f4f9367faa1d7bf1f77933fdce5fc4a7ad670207 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 6 Oct 2023 15:44:06 +0200 Subject: [PATCH 03/13] less code duplication, offload k and v separately --- llama.cpp | 113 ++++++++++++++++++++++-------------------------------- 1 file changed, 46 insertions(+), 67 deletions(-) diff --git a/llama.cpp b/llama.cpp index 53793eeeb578c..dac32e60989cb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,3 +1,7 @@ +// TODO: move to context params +bool offload_k = true; +bool offload_v = true; + #define LLAMA_API_INTERNAL #include "llama.h" @@ -1035,9 +1039,9 @@ struct llama_kv_cache { struct ggml_tensor * k = NULL; struct ggml_tensor * v = NULL; - std::vector k_l; // per layer - std::vector v_l; + std::vector k_l; // per layer + std::vector v_l; struct ggml_context * ctx = NULL; @@ -1259,11 +1263,6 @@ static bool llama_kv_cache_init( return false; } - // cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - // cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - // ggml_set_name(cache.k, "cache_k"); - // ggml_set_name(cache.v, "cache_v"); - cache.k_l.reserve(n_layer); cache.v_l.reserve(n_layer); @@ -1278,13 +1277,14 @@ static bool llama_kv_cache_init( cache.v_l.push_back(v); #ifdef GGML_USE_CUBLAS if ((int)i >= i_gpu_start) { - ggml_cuda_assign_buffers_no_scratch(k); - LLAMA_LOG_INFO("%s: offloading k[%d] cache to GPU\n", __func__, i); - vram_kv_cache += ggml_nbytes(k); - - ggml_cuda_assign_buffers_no_scratch(v); - LLAMA_LOG_INFO("%s: offloading v[%d] cache to GPU\n", __func__, i); - vram_kv_cache += ggml_nbytes(v); + if (offload_k) { + ggml_cuda_assign_buffers_no_scratch(k); + vram_kv_cache += ggml_nbytes(k); + } + if (offload_v) { + ggml_cuda_assign_buffers_no_scratch(v); + vram_kv_cache += ggml_nbytes(v); + } } #endif // GGML_USE_CUBLAS } @@ -2659,10 +2659,10 @@ static struct 
ggml_cgraph * llm_build_llama( if (n_gpu_layers > n_layer) { offload_func_nr = ggml_cuda_assign_buffers_no_alloc; } - if (n_gpu_layers > 0) { + if (n_gpu_layers > 0 && offload_v) { offload_func_v = ggml_cuda_assign_buffers_no_alloc; } - if (n_gpu_layers > 0) { + if (n_gpu_layers > 0 && offload_k) { offload_func_kq = ggml_cuda_assign_buffers_no_alloc; } #endif // GGML_USE_CUBLAS @@ -2676,13 +2676,12 @@ static struct ggml_cgraph * llm_build_llama( } // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_gpu = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask_gpu); - ggml_set_name(KQ_mask_gpu, "KQ_mask_gpu"); - ggml_allocr_alloc(lctx.alloc, KQ_mask_gpu); + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + ggml_set_name(KQ_mask, "KQ_mask"); + ggml_allocr_alloc(lctx.alloc, KQ_mask); if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask_gpu->data; - memset(data, 0, ggml_nbytes(KQ_mask_gpu)); + float * data = (float *) KQ_mask->data; + memset(data, 0, ggml_nbytes(KQ_mask)); for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { @@ -2698,48 +2697,25 @@ static struct ggml_cgraph * llm_build_llama( } } - struct ggml_tensor * KQ_mask_cpu = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - ggml_set_name(KQ_mask_cpu, "KQ_mask_cpu"); - ggml_allocr_alloc(lctx.alloc, KQ_mask_cpu); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask_cpu->data; - memset(data, 0, ggml_nbytes(KQ_mask_cpu)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } + struct ggml_tensor * KQ_mask_gpu = ggml_view_tensor(ctx0, KQ_mask); + offload_func_kq(KQ_mask_gpu); + ggml_set_name(KQ_mask_gpu, "KQ_mask_gpu"); // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos_gpu = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos_gpu); - ggml_set_name(KQ_pos_gpu, "KQ_pos_gpu"); - ggml_allocr_alloc(lctx.alloc, KQ_pos_gpu); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos_gpu->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } - struct ggml_tensor * KQ_pos_cpu = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_name(KQ_pos_cpu, "KQ_pos_cpu"); - ggml_allocr_alloc(lctx.alloc, KQ_pos_cpu); + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_name(KQ_pos, "KQ_pos"); + ggml_allocr_alloc(lctx.alloc, KQ_pos); if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos_cpu->data; + int * data = (int *) KQ_pos->data; for (int i = 0; i < n_tokens; ++i) { data[i] = batch.pos[i]; } } + struct ggml_tensor * KQ_pos_gpu = ggml_view_tensor(ctx0, KQ_pos); + offload_func_kq(KQ_pos_gpu); + ggml_set_name(KQ_pos_gpu, "KQ_pos_gpu"); + // shift the entire K-cache if needed if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); @@ -2776,17 +2752,20 @@ static struct ggml_cgraph * llm_build_llama( offload_func_v = llama_nop; offload_func_kq = llama_nop; - struct ggml_tensor * KQ_mask = KQ_mask_cpu; - struct ggml_tensor * KQ_pos = KQ_pos_cpu; - + struct ggml_tensor * KQ_mask_l = KQ_mask; + struct 
ggml_tensor * KQ_pos_l = KQ_pos; #ifdef GGML_USE_CUBLAS if (il >= i_gpu_start) { - KQ_mask = KQ_mask_gpu; - KQ_pos = KQ_pos_gpu; - offload_func = ggml_cuda_assign_buffers_no_alloc; - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + offload_func = ggml_cuda_assign_buffers_no_alloc; + if (offload_k) { + KQ_mask_l = KQ_mask_gpu; + KQ_pos_l = KQ_pos_gpu; + offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + } + if (offload_v) { + offload_func_v = ggml_cuda_assign_buffers_no_alloc; + } } #endif // GGML_USE_CUBLAS @@ -2815,11 +2794,11 @@ static struct ggml_cgraph * llm_build_llama( offload_func_kq(tmpq); ggml_set_name(tmpq, "tmpq"); - struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); + struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos_l, n_embd_head, 0, 0, freq_base, freq_scale); offload_func_kq(Kcur); ggml_format_name(Kcur, "Kcur%d", il); - struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); + struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos_l, n_embd_head, 0, 0, freq_base, freq_scale); offload_func_kq(Qcur); ggml_format_name(Qcur, "Qcur%d", il); @@ -2875,7 +2854,7 @@ static struct ggml_cgraph * llm_build_llama( ggml_set_name(KQ_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); + struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask_l); offload_func_kq(KQ_masked); ggml_format_name(KQ_masked, "KQ_masked%d", il); From 986b3da76acee4bcbdf6eb9aaab4389d6c216cd1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 3 Dec 2023 17:18:15 +0200 Subject: [PATCH 04/13] llama : offload KV cache per-layer --- llama.cpp | 241 ++++++++++++++++++++---------------------------------- llama.h | 2 + 2 files changed, 89 insertions(+), 154 deletions(-) diff --git a/llama.cpp b/llama.cpp index f5743bbe1d0f7..f98f4a1cd50fd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,7 +1,3 @@ -// TODO: move to context params -bool offload_k = true; -bool offload_v = true; - #define LLAMA_API_INTERNAL #include "llama.h" @@ -1249,6 +1245,9 @@ struct llama_cparams { float yarn_beta_slow; bool mul_mat_q; + bool offload_k; + bool offload_v; + }; struct llama_layer { @@ -1331,8 +1330,10 @@ struct llama_kv_cache { #ifdef GGML_USE_CUBLAS if (ggml_cublas_loaded()) { - ggml_cuda_free_data(k); - ggml_cuda_free_data(v); + for (size_t i = 0; i < k_l.size(); ++i) { + ggml_cuda_free_data(k_l[i]); + ggml_cuda_free_data(v_l[i]); + } } #endif } @@ -1524,7 +1525,9 @@ static bool llama_kv_cache_init( struct llama_kv_cache & cache, ggml_type wtype, uint32_t n_ctx, - int n_gpu_layers) { + int n_gpu_layers, + bool offload_k, + bool offload_v) { const uint32_t n_embd = hparams.n_embd_gqa(); const uint32_t n_layer = hparams.n_layer; @@ -2782,14 +2785,7 @@ static void llm_load_tensors( ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? 
GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = GGML_BACKEND_CPU; @@ -2859,14 +2855,7 @@ static void llm_load_tensors( ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = GGML_BACKEND_CPU; @@ -2929,14 +2918,7 @@ static void llm_load_tensors( ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = GGML_BACKEND_CPU; @@ -3006,14 +2988,7 @@ static void llm_load_tensors( ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = GGML_BACKEND_CPU; @@ -3083,21 +3058,7 @@ static void llm_load_tensors( ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > int(n_layer + 1)) { - LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n", - __func__, n_layer + 1); - throw std::runtime_error("Persimmon CUDA offload failed"); - } -#endif - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = GGML_BACKEND_CPU; @@ -3156,14 +3117,7 @@ static void llm_load_tensors( ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? 
GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = GGML_BACKEND_CPU; @@ -3234,14 +3188,7 @@ static void llm_load_tensors( ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = GGML_BACKEND_CPU; @@ -3301,14 +3248,7 @@ static void llm_load_tensors( ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = GGML_BACKEND_CPU; @@ -3373,14 +3313,7 @@ static void llm_load_tensors( ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? 
GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = GGML_BACKEND_CPU; @@ -3456,8 +3389,8 @@ static void llm_load_tensors( } #ifdef GGML_USE_CUBLAS - const int max_backend_supported_layers = hparams.n_layer + 3; - const int max_offloadable_layers = hparams.n_layer + 3; + const int max_backend_supported_layers = hparams.n_layer + 1; + const int max_offloadable_layers = hparams.n_layer + 1; #elif GGML_USE_CLBLAST const int max_backend_supported_layers = hparams.n_layer + 1; const int max_offloadable_layers = hparams.n_layer + 1; @@ -3981,16 +3914,16 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos_host, "inp_pos_host", -1); // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); + struct ggml_tensor * KQ_scale_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale_host, "KQ_scale_host", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask_host = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask_host, "KQ_mask_host", -1); // shift the entire K-cache if needed if (do_rope_shift) { @@ -3998,6 +3931,16 @@ struct llm_build_context { } for (int il = 0; il < n_layer; ++il) { + // offloaded mirrors + struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_host); + cb(inp_pos, "inp_pos", il); + + struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_host); + cb(KQ_scale, "KQ_scale", il); + + struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_host); + cb(KQ_mask, "KQ_mask", il); + struct ggml_tensor * inpSA = inpL; // norm @@ -5165,8 +5108,6 @@ struct llm_build_context { enum llm_offload_func_e { OFFLOAD_FUNC_NOP, OFFLOAD_FUNC, - OFFLOAD_FUNC_KQ, - OFFLOAD_FUNC_V, OFFLOAD_FUNC_NR, OFFLOAD_FUNC_EMB, OFFLOAD_FUNC_OUT, @@ -5252,11 +5193,15 @@ static const std::unordered_map k_offload_map //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel { "pos_embd", OFFLOAD_FUNC_NR }, - { "inp_pos", OFFLOAD_FUNC_KQ }, // this is often used for KQ ops (e.g. rope) - { "KQ_scale", OFFLOAD_FUNC_KQ }, - { "KQ_mask", OFFLOAD_FUNC_KQ }, - { "K_shift", OFFLOAD_FUNC_KQ }, - { "K_shifted", OFFLOAD_FUNC_KQ }, + { "inp_pos_host", OFFLOAD_FUNC_NOP }, // this is often used for KQ ops (e.g. 
rope) + { "KQ_scale_host", OFFLOAD_FUNC_NOP }, + { "KQ_mask_host", OFFLOAD_FUNC_NOP }, + { "inp_pos", OFFLOAD_FUNC }, // these are offloaded versions of the tensors + { "KQ_scale", OFFLOAD_FUNC }, + { "KQ_mask", OFFLOAD_FUNC }, + + { "K_shift", OFFLOAD_FUNC }, + { "K_shifted", OFFLOAD_FUNC }, { "inp_norm", OFFLOAD_FUNC_NR }, { "inp_norm_w", OFFLOAD_FUNC_NR }, @@ -5269,38 +5214,38 @@ static const std::unordered_map k_offload_map { "attn_norm", OFFLOAD_FUNC }, { "attn_norm_2", OFFLOAD_FUNC }, - { "wqkv", OFFLOAD_FUNC_KQ }, - { "bqkv", OFFLOAD_FUNC_KQ }, - { "wqkv_clamped", OFFLOAD_FUNC_KQ }, - - { "tmpk", OFFLOAD_FUNC_KQ }, - { "tmpq", OFFLOAD_FUNC_KQ }, - { "tmpv", OFFLOAD_FUNC_V }, - { "Kcur", OFFLOAD_FUNC_KQ }, - { "Qcur", OFFLOAD_FUNC_KQ }, - { "Vcur", OFFLOAD_FUNC_V }, - - { "krot", OFFLOAD_FUNC_KQ }, - { "qrot", OFFLOAD_FUNC_KQ }, - { "kpass", OFFLOAD_FUNC_KQ }, - { "qpass", OFFLOAD_FUNC_KQ }, - { "krotated", OFFLOAD_FUNC_KQ }, - { "qrotated", OFFLOAD_FUNC_KQ }, - - { "q", OFFLOAD_FUNC_KQ }, - { "k", OFFLOAD_FUNC_KQ }, - { "kq", OFFLOAD_FUNC_KQ }, - { "kq_scaled", OFFLOAD_FUNC_KQ }, - { "kq_scaled_alibi", OFFLOAD_FUNC_KQ }, - { "kq_masked", OFFLOAD_FUNC_KQ }, - { "kq_soft_max", OFFLOAD_FUNC_V }, - { "kq_soft_max_ext", OFFLOAD_FUNC_V }, - { "v", OFFLOAD_FUNC_V }, - { "kqv", OFFLOAD_FUNC_V }, - { "kqv_merged", OFFLOAD_FUNC_V }, - { "kqv_merged_cont", OFFLOAD_FUNC_V }, - { "kqv_wo", OFFLOAD_FUNC_V }, - { "kqv_out", OFFLOAD_FUNC_V }, + { "wqkv", OFFLOAD_FUNC }, + { "bqkv", OFFLOAD_FUNC }, + { "wqkv_clamped", OFFLOAD_FUNC }, + + { "tmpk", OFFLOAD_FUNC }, + { "tmpq", OFFLOAD_FUNC }, + { "tmpv", OFFLOAD_FUNC }, + { "Kcur", OFFLOAD_FUNC }, + { "Qcur", OFFLOAD_FUNC }, + { "Vcur", OFFLOAD_FUNC }, + + { "krot", OFFLOAD_FUNC }, + { "qrot", OFFLOAD_FUNC }, + { "kpass", OFFLOAD_FUNC }, + { "qpass", OFFLOAD_FUNC }, + { "krotated", OFFLOAD_FUNC }, + { "qrotated", OFFLOAD_FUNC }, + + { "q", OFFLOAD_FUNC }, + { "k", OFFLOAD_FUNC }, + { "kq", OFFLOAD_FUNC }, + { "kq_scaled", OFFLOAD_FUNC }, + { "kq_scaled_alibi", OFFLOAD_FUNC }, + { "kq_masked", OFFLOAD_FUNC }, + { "kq_soft_max", OFFLOAD_FUNC }, + { "kq_soft_max_ext", OFFLOAD_FUNC }, + { "v", OFFLOAD_FUNC }, + { "kqv", OFFLOAD_FUNC }, + { "kqv_merged", OFFLOAD_FUNC }, + { "kqv_merged_cont", OFFLOAD_FUNC }, + { "kqv_wo", OFFLOAD_FUNC }, + { "kqv_out", OFFLOAD_FUNC }, { "ffn_inp", OFFLOAD_FUNC }, { "ffn_norm", OFFLOAD_FUNC }, @@ -5390,7 +5335,7 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_embd = true; } - if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) { + if (!alloc_inp_pos && strcmp(name, "inp_pos_host") == 0) { ggml_allocr_alloc(lctx.alloc, cur); if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) { @@ -5406,7 +5351,7 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_pos = true; } - if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) { + if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale_host") == 0) { ggml_allocr_alloc(lctx.alloc, cur); if (!ggml_allocr_is_measure(lctx.alloc)) { @@ -5417,7 +5362,7 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_KQ_scale = true; } - if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) { + if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask_host") == 0) { ggml_allocr_alloc(lctx.alloc, cur); if (!ggml_allocr_is_measure(lctx.alloc)) { @@ -5493,14 +5438,10 @@ static struct ggml_cgraph * llama_build_graph( { OFFLOAD_FUNC_OUT, "CPU" }, #ifdef GGML_USE_CUBLAS { OFFLOAD_FUNC, "GPU (CUDA)" }, - { OFFLOAD_FUNC_KQ, "GPU (CUDA) KQ" }, - { OFFLOAD_FUNC_V, "GPU (CUDA) V" }, { 
OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, #else { OFFLOAD_FUNC, "CPU" }, - { OFFLOAD_FUNC_KQ, "CPU" }, - { OFFLOAD_FUNC_V, "CPU" }, { OFFLOAD_FUNC_NR, "CPU" }, { OFFLOAD_FUNC_EMB, "CPU" }, #endif // GGML_USE_CUBLAS @@ -5538,16 +5479,6 @@ static struct ggml_cgraph * llama_build_graph( func_e = OFFLOAD_FUNC_NOP; } break; - case OFFLOAD_FUNC_V: - if (n_gpu_layers <= n_layer + 1) { - func_e = OFFLOAD_FUNC_NOP; - } - break; - case OFFLOAD_FUNC_KQ: - if (n_gpu_layers <= n_layer + 2) { - func_e = OFFLOAD_FUNC_NOP; - } - break; case OFFLOAD_FUNC_EMB: if (!offload_emb || n_gpu_layers < n_layer) { func_e = OFFLOAD_FUNC_NOP; @@ -5569,8 +5500,6 @@ static struct ggml_cgraph * llama_build_graph( case OFFLOAD_FUNC_NOP: case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break; case OFFLOAD_FUNC: - case OFFLOAD_FUNC_KQ: - case OFFLOAD_FUNC_V: case OFFLOAD_FUNC_NR: case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break; default: GGML_ASSERT(false); @@ -5806,7 +5735,7 @@ static int llama_decode_internal( n_threads = std::min(4, n_threads); } - const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3; + const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1; if (ggml_cpu_has_cublas() && fully_offloaded) { n_threads = 1; } @@ -8644,6 +8573,8 @@ struct llama_context_params llama_context_default_params() { /*.f16_kv =*/ true, /*.logits_all =*/ false, /*.embedding =*/ false, + /*.offload_k =*/ true, + /*.offload_q =*/ true, }; return result; @@ -8760,6 +8691,8 @@ struct llama_context * llama_new_context_with_model( cparams.yarn_beta_fast = params.yarn_beta_fast; cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.mul_mat_q = params.mul_mat_q; + cparams.offload_k = params.offload_k; + cparams.offload_v = params.offload_v; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? 
hparams.rope_freq_base_train : params.rope_freq_base; @@ -8797,7 +8730,7 @@ struct llama_context * llama_new_context_with_model( // reserve memory for context buffers if (!hparams.vocab_only) { - if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) { + if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers, cparams.offload_k, cparams.offload_v)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; diff --git a/llama.h b/llama.h index 89cb6198e84b8..3e2ad0560f8b2 100644 --- a/llama.h +++ b/llama.h @@ -196,6 +196,8 @@ extern "C" { bool f16_kv; // use fp16 for KV cache, fp32 otherwise bool logits_all; // the llama_eval() call computes all logits, not just the last one bool embedding; // embedding mode only + bool offload_k; + bool offload_v; }; // model quantization parameters From f3dbfb9f6084d7c7f1aa5756e0fad8c2902389be Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 3 Dec 2023 17:43:04 +0200 Subject: [PATCH 05/13] llama : offload K shift tensors --- llama.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index f98f4a1cd50fd..b423ef5012505 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3543,8 +3543,8 @@ static void llm_build_k_shift( GGML_ASSERT(n_embd_head % n_rot == 0); - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift", -1); + struct ggml_tensor * K_shift_host = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); + cb(K_shift_host, "K_shift_host", -1); int rope_type = 0; @@ -3555,6 +3555,10 @@ static void llm_build_k_shift( } for (int il = 0; il < n_layer; ++il) { + // offloaded mirrors + struct ggml_tensor * K_shift = ggml_view_tensor(ctx, K_shift_host); + cb(K_shift, "K_shift", il); + struct ggml_tensor * tmp = // we rotate only the first n_rot dimensions ggml_rope_custom_inplace(ctx, @@ -5196,6 +5200,8 @@ static const std::unordered_map k_offload_map { "inp_pos_host", OFFLOAD_FUNC_NOP }, // this is often used for KQ ops (e.g. 
rope) { "KQ_scale_host", OFFLOAD_FUNC_NOP }, { "KQ_mask_host", OFFLOAD_FUNC_NOP }, + { "K_shift_host", OFFLOAD_FUNC_NOP }, + { "inp_pos", OFFLOAD_FUNC }, // these are offloaded versions of the tensors { "KQ_scale", OFFLOAD_FUNC }, { "KQ_mask", OFFLOAD_FUNC }, @@ -5389,7 +5395,7 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_KQ_mask = true; } - if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) { + if (!alloc_inp_K_shift && strcmp(name, "K_shift_host") == 0) { ggml_allocr_alloc(lctx.alloc, cur); if (!ggml_allocr_is_measure(lctx.alloc)) { From 3d3e6bd0e44daf3b14f6974a164ef0f338783f03 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 3 Dec 2023 17:52:23 +0200 Subject: [PATCH 06/13] llama : offload for rest of the model arches --- llama.cpp | 183 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 133 insertions(+), 50 deletions(-) diff --git a/llama.cpp b/llama.cpp index b423ef5012505..3744a6463232e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4050,16 +4050,16 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos_host, "inp_pos_host", -1); // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); + struct ggml_tensor * KQ_scale_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale_host, "KQ_scale_host", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask_host = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask_host, "KQ_mask_host", -1); // shift the entire K-cache if needed if (do_rope_shift) { @@ -4067,6 +4067,16 @@ struct llm_build_context { } for (int il = 0; il < n_layer; ++il) { + // offloaded mirrors + struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_host); + cb(inp_pos, "inp_pos", il); + + struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_host); + cb(KQ_scale, "KQ_scale", il); + + struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_host); + cb(KQ_mask, "KQ_mask", il); + struct ggml_tensor * inpSA = inpL; cur = llm_build_norm(ctx0, inpL, hparams, @@ -4170,16 +4180,16 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos_host, "inp_pos_host", -1); // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); + struct ggml_tensor * KQ_scale_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale_host, "KQ_scale_host", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask_host = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask_host, "KQ_mask_host", -1); // shift the entire K-cache if needed if (do_rope_shift) { @@ -4187,6 +4197,16 @@ struct llm_build_context { } for (int il = 0; il < 
n_layer; ++il) { + // offloaded mirrors + struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_host); + cb(inp_pos, "inp_pos", il); + + struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_host); + cb(KQ_scale, "KQ_scale", il); + + struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_host); + cb(KQ_mask, "KQ_mask", il); + struct ggml_tensor * attn_norm; attn_norm = llm_build_norm(ctx0, inpL, hparams, @@ -4293,24 +4313,34 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos_host, "inp_pos_host", -1); // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); + struct ggml_tensor * KQ_scale_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale_host, "KQ_scale_host", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask_host = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask_host, "KQ_mask_host", -1); - pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); + pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos_host); cb(pos, "pos_embd", -1); inpL = ggml_add(ctx0, inpL, pos); cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { + // offloaded mirrors + struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_host); + cb(inp_pos, "inp_pos", il); + + struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_host); + cb(KQ_scale, "KQ_scale", il); + + struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_host); + cb(KQ_mask, "KQ_mask", il); + cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, @@ -4392,21 +4422,33 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); cb(inpL, "imp_embd", -1); - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos, "inp_pos", -1); + // inp_pos - contains the positions + struct ggml_tensor * inp_pos_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos_host, "inp_pos_host", -1); // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); + struct ggml_tensor * KQ_scale_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale_host, "KQ_scale_host", -1); - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask", -1); + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask_host = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask_host, "KQ_mask_host", -1); if (do_rope_shift) { llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { + // offloaded mirrors + struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_host); + cb(inp_pos, "inp_pos", il); + + struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_host); + cb(KQ_scale, "KQ_scale", il); + + struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_host); + cb(KQ_mask, "KQ_mask", il); + 
struct ggml_tensor * residual = inpL; cur = llm_build_norm(ctx0, inpL, hparams, @@ -4601,14 +4643,21 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); + struct ggml_tensor * KQ_scale_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale_host, "KQ_scale_host", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask_host = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask_host, "KQ_mask_host", -1); for (int il = 0; il < n_layer; ++il) { + // offloaded mirrors + struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_host); + cb(KQ_scale, "KQ_scale", il); + + struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_host); + cb(KQ_mask, "KQ_mask", il); + struct ggml_tensor * inpSA = inpL; cur = llm_build_norm(ctx0, inpL, hparams, @@ -4692,12 +4741,12 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); + struct ggml_tensor * KQ_scale_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale_host, "KQ_scale_host", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask_host = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask_host, "KQ_mask_host", -1); inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, @@ -4706,6 +4755,13 @@ struct llm_build_context { cb(inpL, "inp_norm", -1); for (int il = 0; il < n_layer; ++il) { + // offloaded mirrors + struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_host); + cb(KQ_scale, "KQ_scale", il); + + struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_host); + cb(KQ_mask, "KQ_mask", il); + cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, @@ -4786,14 +4842,21 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); + struct ggml_tensor * KQ_scale_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale_host, "KQ_scale_host", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask_host = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask_host, "KQ_mask_host", -1); for (int il = 0; il < n_layer; ++il) { + // offloaded mirrors + struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_host); + cb(KQ_scale, "KQ_scale", il); + + struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_host); + cb(KQ_mask, "KQ_mask", il); + struct ggml_tensor * attn_norm; attn_norm = llm_build_norm(ctx0, inpL, hparams, @@ -4885,16 +4948,16 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, 
n_tokens); + cb(inp_pos_host, "inp_pos_host", -1); // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); + struct ggml_tensor * KQ_scale_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale_host, "KQ_scale_host", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask_host = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask_host, "KQ_mask_host", -1); // shift the entire K-cache if needed if (do_rope_shift) { @@ -4902,6 +4965,16 @@ struct llm_build_context { } for (int il = 0; il < n_layer; ++il) { + // offloaded mirrors + struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_host); + cb(inp_pos, "inp_pos", il); + + struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_host); + cb(KQ_scale, "KQ_scale", il); + + struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_host); + cb(KQ_mask, "KQ_mask", il); + struct ggml_tensor * inpSA = inpL; // norm @@ -4998,16 +5071,16 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos_host, "inp_pos_host", -1); // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); + struct ggml_tensor * KQ_scale_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale_host, "KQ_scale_host", -1); - // KQ_mask (mask for 1 head, it wil be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask", -1); + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask_host = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask_host, "KQ_mask_host", -1); // shift the entire K-cache if needed if (do_rope_shift) { @@ -5015,6 +5088,16 @@ struct llm_build_context { } for (int il = 0; il < n_layer; ++il) { + // offloaded mirrors + struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_host); + cb(inp_pos, "inp_pos", il); + + struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_host); + cb(KQ_scale, "KQ_scale", il); + + struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_host); + cb(KQ_mask, "KQ_mask", il); + struct ggml_tensor * inpSA = inpL; cur = llm_build_norm(ctx0, inpL, hparams, From 1fa91a4833915214df68cf6b950657e7cc8e7b7e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 3 Dec 2023 18:36:02 +0200 Subject: [PATCH 07/13] llama : enable offload debug temporarily --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 3744a6463232e..334c1fad198bd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5508,7 +5508,7 @@ static struct ggml_cgraph * llama_build_graph( // // TODO: will be removed with backend v2 -//#define LLAMA_OFFLOAD_DEBUG +#define LLAMA_OFFLOAD_DEBUG if (!do_offload) { return; From c44bc1ee001366afaef7c9adda7fd5b720da9849 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 3 Dec 2023 19:22:47 +0200 Subject: [PATCH 08/13] llama : keep the KV related layers on the device --- llama.cpp | 210 
+++++++++++++++++++++++++++--------------------------- 1 file changed, 104 insertions(+), 106 deletions(-) diff --git a/llama.cpp b/llama.cpp index 334c1fad198bd..e56fa040722b2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3543,8 +3543,8 @@ static void llm_build_k_shift( GGML_ASSERT(n_embd_head % n_rot == 0); - struct ggml_tensor * K_shift_host = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); - cb(K_shift_host, "K_shift_host", -1); + struct ggml_tensor * K_shift_ref = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); + cb(K_shift_ref, "K_shift_ref", -1); int rope_type = 0; @@ -3555,8 +3555,7 @@ static void llm_build_k_shift( } for (int il = 0; il < n_layer; ++il) { - // offloaded mirrors - struct ggml_tensor * K_shift = ggml_view_tensor(ctx, K_shift_host); + struct ggml_tensor * K_shift = ggml_view_tensor(ctx, K_shift_ref); cb(K_shift, "K_shift", il); struct ggml_tensor * tmp = @@ -3918,16 +3917,16 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos_host, "inp_pos_host", -1); + struct ggml_tensor * inp_pos_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos_ref, "inp_pos_ref", -1); // KQ_scale - struct ggml_tensor * KQ_scale_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale_host, "KQ_scale_host", -1); + struct ggml_tensor * KQ_scale_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale_ref, "KQ_scale_ref", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_host = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask_host, "KQ_mask_host", -1); + struct ggml_tensor * KQ_mask_ref = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask_ref, "KQ_mask_ref", -1); // shift the entire K-cache if needed if (do_rope_shift) { @@ -3936,13 +3935,15 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { // offloaded mirrors - struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_host); + // TODO: this is not ideal because when we do partial offloading, we will do Device->Host copies + // for all non-offloaded layers + struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_ref); cb(inp_pos, "inp_pos", il); - struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_host); + struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_ref); cb(KQ_scale, "KQ_scale", il); - struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_host); + struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_ref); cb(KQ_mask, "KQ_mask", il); struct ggml_tensor * inpSA = inpL; @@ -4050,16 +4051,16 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos_host, "inp_pos_host", -1); + struct ggml_tensor * inp_pos_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos_ref, "inp_pos_ref", -1); // KQ_scale - struct ggml_tensor * KQ_scale_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale_host, "KQ_scale_host", -1); + struct ggml_tensor * KQ_scale_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale_ref, "KQ_scale_ref", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_host = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask_host, "KQ_mask_host", -1); + struct ggml_tensor 
* KQ_mask_ref = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask_ref, "KQ_mask_ref", -1); // shift the entire K-cache if needed if (do_rope_shift) { @@ -4068,13 +4069,13 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { // offloaded mirrors - struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_host); + struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_ref); cb(inp_pos, "inp_pos", il); - struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_host); + struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_ref); cb(KQ_scale, "KQ_scale", il); - struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_host); + struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_ref); cb(KQ_mask, "KQ_mask", il); struct ggml_tensor * inpSA = inpL; @@ -4180,16 +4181,16 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos_host, "inp_pos_host", -1); + struct ggml_tensor * inp_pos_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos_ref, "inp_pos_ref", -1); // KQ_scale - struct ggml_tensor * KQ_scale_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale_host, "KQ_scale_host", -1); + struct ggml_tensor * KQ_scale_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale_ref, "KQ_scale_ref", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_host = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask_host, "KQ_mask_host", -1); + struct ggml_tensor * KQ_mask_ref = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask_ref, "KQ_mask_ref", -1); // shift the entire K-cache if needed if (do_rope_shift) { @@ -4197,14 +4198,13 @@ struct llm_build_context { } for (int il = 0; il < n_layer; ++il) { - // offloaded mirrors - struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_host); + struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_ref); cb(inp_pos, "inp_pos", il); - struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_host); + struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_ref); cb(KQ_scale, "KQ_scale", il); - struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_host); + struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_ref); cb(KQ_mask, "KQ_mask", il); struct ggml_tensor * attn_norm; @@ -4313,32 +4313,31 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos_host, "inp_pos_host", -1); + struct ggml_tensor * inp_pos_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos_ref, "inp_pos_ref", -1); // KQ_scale - struct ggml_tensor * KQ_scale_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale_host, "KQ_scale_host", -1); + struct ggml_tensor * KQ_scale_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale_ref, "KQ_scale_ref", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_host = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask_host, "KQ_mask_host", -1); + struct ggml_tensor * KQ_mask_ref = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask_ref, "KQ_mask_ref", -1); - pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos_host); + pos = 
ggml_get_rows(ctx0, model.pos_embd, inp_pos_ref); cb(pos, "pos_embd", -1); inpL = ggml_add(ctx0, inpL, pos); cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { - // offloaded mirrors - struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_host); + struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_ref); cb(inp_pos, "inp_pos", il); - struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_host); + struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_ref); cb(KQ_scale, "KQ_scale", il); - struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_host); + struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_ref); cb(KQ_mask, "KQ_mask", il); cur = llm_build_norm(ctx0, inpL, hparams, @@ -4423,30 +4422,29 @@ struct llm_build_context { cb(inpL, "imp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos_host, "inp_pos_host", -1); + struct ggml_tensor * inp_pos_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos_ref, "inp_pos_ref", -1); // KQ_scale - struct ggml_tensor * KQ_scale_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale_host, "KQ_scale_host", -1); + struct ggml_tensor * KQ_scale_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale_ref, "KQ_scale_ref", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_host = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask_host, "KQ_mask_host", -1); + struct ggml_tensor * KQ_mask_ref = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask_ref, "KQ_mask_ref", -1); if (do_rope_shift) { llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { - // offloaded mirrors - struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_host); + struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_ref); cb(inp_pos, "inp_pos", il); - struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_host); + struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_ref); cb(KQ_scale, "KQ_scale", il); - struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_host); + struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_ref); cb(KQ_mask, "KQ_mask", il); struct ggml_tensor * residual = inpL; @@ -4643,19 +4641,18 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // KQ_scale - struct ggml_tensor * KQ_scale_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale_host, "KQ_scale_host", -1); + struct ggml_tensor * KQ_scale_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale_ref, "KQ_scale_ref", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_host = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask_host, "KQ_mask_host", -1); + struct ggml_tensor * KQ_mask_ref = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask_ref, "KQ_mask_ref", -1); for (int il = 0; il < n_layer; ++il) { - // offloaded mirrors - struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_host); + struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_ref); cb(KQ_scale, "KQ_scale", il); - struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_host); + struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_ref); cb(KQ_mask, "KQ_mask", il); 
struct ggml_tensor * inpSA = inpL; @@ -4741,12 +4738,12 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // KQ_scale - struct ggml_tensor * KQ_scale_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale_host, "KQ_scale_host", -1); + struct ggml_tensor * KQ_scale_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale_ref, "KQ_scale_ref", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_host = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask_host, "KQ_mask_host", -1); + struct ggml_tensor * KQ_mask_ref = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask_ref, "KQ_mask_ref", -1); inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, @@ -4755,11 +4752,10 @@ struct llm_build_context { cb(inpL, "inp_norm", -1); for (int il = 0; il < n_layer; ++il) { - // offloaded mirrors - struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_host); + struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_ref); cb(KQ_scale, "KQ_scale", il); - struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_host); + struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_ref); cb(KQ_mask, "KQ_mask", il); cur = llm_build_norm(ctx0, inpL, hparams, @@ -4842,19 +4838,18 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // KQ_scale - struct ggml_tensor * KQ_scale_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale_host, "KQ_scale_host", -1); + struct ggml_tensor * KQ_scale_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale_ref, "KQ_scale_ref", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_host = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask_host, "KQ_mask_host", -1); + struct ggml_tensor * KQ_mask_ref = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask_ref, "KQ_mask_ref", -1); for (int il = 0; il < n_layer; ++il) { - // offloaded mirrors - struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_host); + struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_ref); cb(KQ_scale, "KQ_scale", il); - struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_host); + struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_ref); cb(KQ_mask, "KQ_mask", il); struct ggml_tensor * attn_norm; @@ -4948,16 +4943,16 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos_host, "inp_pos_host", -1); + struct ggml_tensor * inp_pos_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos_ref, "inp_pos_ref", -1); // KQ_scale - struct ggml_tensor * KQ_scale_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale_host, "KQ_scale_host", -1); + struct ggml_tensor * KQ_scale_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale_ref, "KQ_scale_ref", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_host = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask_host, "KQ_mask_host", -1); + struct ggml_tensor * KQ_mask_ref = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask_ref, "KQ_mask_ref", -1); // shift the entire K-cache if needed if (do_rope_shift) { @@ -4965,14 +4960,13 @@ struct llm_build_context { } for (int il = 0; il < n_layer; ++il) { - // offloaded mirrors - 
struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_host); + struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_ref); cb(inp_pos, "inp_pos", il); - struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_host); + struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_ref); cb(KQ_scale, "KQ_scale", il); - struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_host); + struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_ref); cb(KQ_mask, "KQ_mask", il); struct ggml_tensor * inpSA = inpL; @@ -5071,16 +5065,16 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos_host, "inp_pos_host", -1); + struct ggml_tensor * inp_pos_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos_ref, "inp_pos_ref", -1); // KQ_scale - struct ggml_tensor * KQ_scale_host = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale_host, "KQ_scale_host", -1); + struct ggml_tensor * KQ_scale_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale_ref, "KQ_scale_ref", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_host = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask_host, "KQ_mask_host", -1); + struct ggml_tensor * KQ_mask_ref = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask_ref, "KQ_mask_ref", -1); // shift the entire K-cache if needed if (do_rope_shift) { @@ -5088,14 +5082,13 @@ struct llm_build_context { } for (int il = 0; il < n_layer; ++il) { - // offloaded mirrors - struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_host); + struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_ref); cb(inp_pos, "inp_pos", il); - struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_host); + struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_ref); cb(KQ_scale, "KQ_scale", il); - struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_host); + struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_ref); cb(KQ_mask, "KQ_mask", il); struct ggml_tensor * inpSA = inpL; @@ -5195,6 +5188,7 @@ struct llm_build_context { enum llm_offload_func_e { OFFLOAD_FUNC_NOP, OFFLOAD_FUNC, + OFFLOAD_FUNC_FRC, // force offload OFFLOAD_FUNC_NR, OFFLOAD_FUNC_EMB, OFFLOAD_FUNC_OUT, @@ -5280,16 +5274,16 @@ static const std::unordered_map k_offload_map //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel { "pos_embd", OFFLOAD_FUNC_NR }, - { "inp_pos_host", OFFLOAD_FUNC_NOP }, // this is often used for KQ ops (e.g. rope) - { "KQ_scale_host", OFFLOAD_FUNC_NOP }, - { "KQ_mask_host", OFFLOAD_FUNC_NOP }, - { "K_shift_host", OFFLOAD_FUNC_NOP }, + { "inp_pos_ref", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. 
rope) + { "KQ_scale_ref", OFFLOAD_FUNC_FRC }, + { "KQ_mask_ref", OFFLOAD_FUNC_FRC }, + { "K_shift_ref", OFFLOAD_FUNC_FRC }, - { "inp_pos", OFFLOAD_FUNC }, // these are offloaded versions of the tensors + { "inp_pos", OFFLOAD_FUNC }, { "KQ_scale", OFFLOAD_FUNC }, { "KQ_mask", OFFLOAD_FUNC }, - { "K_shift", OFFLOAD_FUNC }, + { "K_shifted", OFFLOAD_FUNC }, { "inp_norm", OFFLOAD_FUNC_NR }, @@ -5424,7 +5418,7 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_embd = true; } - if (!alloc_inp_pos && strcmp(name, "inp_pos_host") == 0) { + if (!alloc_inp_pos && strcmp(name, "inp_pos_ref") == 0) { ggml_allocr_alloc(lctx.alloc, cur); if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) { @@ -5440,7 +5434,7 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_pos = true; } - if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale_host") == 0) { + if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale_ref") == 0) { ggml_allocr_alloc(lctx.alloc, cur); if (!ggml_allocr_is_measure(lctx.alloc)) { @@ -5451,7 +5445,7 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_KQ_scale = true; } - if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask_host") == 0) { + if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask_ref") == 0) { ggml_allocr_alloc(lctx.alloc, cur); if (!ggml_allocr_is_measure(lctx.alloc)) { @@ -5478,7 +5472,7 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_KQ_mask = true; } - if (!alloc_inp_K_shift && strcmp(name, "K_shift_host") == 0) { + if (!alloc_inp_K_shift && strcmp(name, "K_shift_ref") == 0) { ggml_allocr_alloc(lctx.alloc, cur); if (!ggml_allocr_is_measure(lctx.alloc)) { @@ -5526,11 +5520,13 @@ static struct ggml_cgraph * llama_build_graph( { OFFLOAD_FUNC_NOP, "CPU" }, { OFFLOAD_FUNC_OUT, "CPU" }, #ifdef GGML_USE_CUBLAS - { OFFLOAD_FUNC, "GPU (CUDA)" }, - { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, + { OFFLOAD_FUNC, "GPU (CUDA)" }, + { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" }, + { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, #else { OFFLOAD_FUNC, "CPU" }, + { OFFLOAD_FUNC_FRC, "CPU" }, { OFFLOAD_FUNC_NR, "CPU" }, { OFFLOAD_FUNC_EMB, "CPU" }, #endif // GGML_USE_CUBLAS @@ -5555,6 +5551,7 @@ static struct ggml_cgraph * llama_build_graph( switch (func_e) { case OFFLOAD_FUNC_NOP: case OFFLOAD_FUNC_OUT: + case OFFLOAD_FUNC_FRC: break; case OFFLOAD_FUNC: if (n_gpu_layers < n_layer) { @@ -5589,6 +5586,7 @@ static struct ggml_cgraph * llama_build_graph( case OFFLOAD_FUNC_NOP: case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break; case OFFLOAD_FUNC: + case OFFLOAD_FUNC_FRC: case OFFLOAD_FUNC_NR: case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break; default: GGML_ASSERT(false); From c80b8a2bff73239b88700fd0b8d32860ad1c6986 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 3 Dec 2023 19:46:06 +0200 Subject: [PATCH 09/13] llama : remove mirrors, perform Device -> Host when partial offload --- llama.cpp | 223 ++++++++++++++++-------------------------------------- 1 file changed, 65 insertions(+), 158 deletions(-) diff --git a/llama.cpp b/llama.cpp index e56fa040722b2..8a2946fe7ed1e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3543,8 +3543,8 @@ static void llm_build_k_shift( GGML_ASSERT(n_embd_head % n_rot == 0); - struct ggml_tensor * K_shift_ref = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); - cb(K_shift_ref, "K_shift_ref", -1); + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); + cb(K_shift, "K_shift", -1); int rope_type = 0; @@ -3555,9 +3555,6 @@ static void llm_build_k_shift( } for (int il = 0; il < n_layer; ++il) { - struct 
ggml_tensor * K_shift = ggml_view_tensor(ctx, K_shift_ref); - cb(K_shift, "K_shift", il); - struct ggml_tensor * tmp = // we rotate only the first n_rot dimensions ggml_rope_custom_inplace(ctx, @@ -3917,16 +3914,16 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos_ref, "inp_pos_ref", -1); + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); // KQ_scale - struct ggml_tensor * KQ_scale_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale_ref, "KQ_scale_ref", -1); + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_ref = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask_ref, "KQ_mask_ref", -1); + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { @@ -3934,18 +3931,6 @@ struct llm_build_context { } for (int il = 0; il < n_layer; ++il) { - // offloaded mirrors - // TODO: this is not ideal because when we do partial offloading, we will do Device->Host copies - // for all non-offloaded layers - struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_ref); - cb(inp_pos, "inp_pos", il); - - struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_ref); - cb(KQ_scale, "KQ_scale", il); - - struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_ref); - cb(KQ_mask, "KQ_mask", il); - struct ggml_tensor * inpSA = inpL; // norm @@ -4051,16 +4036,16 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos_ref, "inp_pos_ref", -1); + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); // KQ_scale - struct ggml_tensor * KQ_scale_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale_ref, "KQ_scale_ref", -1); + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_ref = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask_ref, "KQ_mask_ref", -1); + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { @@ -4068,16 +4053,6 @@ struct llm_build_context { } for (int il = 0; il < n_layer; ++il) { - // offloaded mirrors - struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_ref); - cb(inp_pos, "inp_pos", il); - - struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_ref); - cb(KQ_scale, "KQ_scale", il); - - struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_ref); - cb(KQ_mask, "KQ_mask", il); - struct ggml_tensor * inpSA = inpL; cur = llm_build_norm(ctx0, inpL, hparams, @@ -4181,16 +4156,16 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos_ref, "inp_pos_ref", -1); + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, 
GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); // KQ_scale - struct ggml_tensor * KQ_scale_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale_ref, "KQ_scale_ref", -1); + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_ref = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask_ref, "KQ_mask_ref", -1); + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { @@ -4198,15 +4173,6 @@ struct llm_build_context { } for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_ref); - cb(inp_pos, "inp_pos", il); - - struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_ref); - cb(KQ_scale, "KQ_scale", il); - - struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_ref); - cb(KQ_mask, "KQ_mask", il); - struct ggml_tensor * attn_norm; attn_norm = llm_build_norm(ctx0, inpL, hparams, @@ -4313,33 +4279,24 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos_ref, "inp_pos_ref", -1); + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); // KQ_scale - struct ggml_tensor * KQ_scale_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale_ref, "KQ_scale_ref", -1); + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_ref = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask_ref, "KQ_mask_ref", -1); + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); - pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos_ref); + pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); inpL = ggml_add(ctx0, inpL, pos); cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_ref); - cb(inp_pos, "inp_pos", il); - - struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_ref); - cb(KQ_scale, "KQ_scale", il); - - struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_ref); - cb(KQ_mask, "KQ_mask", il); - cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, @@ -4422,31 +4379,22 @@ struct llm_build_context { cb(inpL, "imp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos_ref, "inp_pos_ref", -1); + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); // KQ_scale - struct ggml_tensor * KQ_scale_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale_ref, "KQ_scale_ref", -1); + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_ref = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask_ref, "KQ_mask_ref", -1); + struct ggml_tensor * 
KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); if (do_rope_shift) { llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_ref); - cb(inp_pos, "inp_pos", il); - - struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_ref); - cb(KQ_scale, "KQ_scale", il); - - struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_ref); - cb(KQ_mask, "KQ_mask", il); - struct ggml_tensor * residual = inpL; cur = llm_build_norm(ctx0, inpL, hparams, @@ -4641,20 +4589,14 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // KQ_scale - struct ggml_tensor * KQ_scale_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale_ref, "KQ_scale_ref", -1); + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_ref = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask_ref, "KQ_mask_ref", -1); + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_ref); - cb(KQ_scale, "KQ_scale", il); - - struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_ref); - cb(KQ_mask, "KQ_mask", il); - struct ggml_tensor * inpSA = inpL; cur = llm_build_norm(ctx0, inpL, hparams, @@ -4738,12 +4680,12 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // KQ_scale - struct ggml_tensor * KQ_scale_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale_ref, "KQ_scale_ref", -1); + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_ref = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask_ref, "KQ_mask_ref", -1); + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, @@ -4752,12 +4694,6 @@ struct llm_build_context { cb(inpL, "inp_norm", -1); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_ref); - cb(KQ_scale, "KQ_scale", il); - - struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_ref); - cb(KQ_mask, "KQ_mask", il); - cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, @@ -4838,20 +4774,14 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // KQ_scale - struct ggml_tensor * KQ_scale_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale_ref, "KQ_scale_ref", -1); + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_ref = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask_ref, "KQ_mask_ref", -1); + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_ref); - cb(KQ_scale, "KQ_scale", il); 
- - struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_ref); - cb(KQ_mask, "KQ_mask", il); - struct ggml_tensor * attn_norm; attn_norm = llm_build_norm(ctx0, inpL, hparams, @@ -4943,16 +4873,16 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos_ref, "inp_pos_ref", -1); + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); // KQ_scale - struct ggml_tensor * KQ_scale_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale_ref, "KQ_scale_ref", -1); + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_ref = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask_ref, "KQ_mask_ref", -1); + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { @@ -4960,15 +4890,6 @@ struct llm_build_context { } for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_ref); - cb(inp_pos, "inp_pos", il); - - struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_ref); - cb(KQ_scale, "KQ_scale", il); - - struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_ref); - cb(KQ_mask, "KQ_mask", il); - struct ggml_tensor * inpSA = inpL; // norm @@ -5065,16 +4986,16 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos_ref, "inp_pos_ref", -1); + struct ggml_tensor * inp_pos= ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); // KQ_scale - struct ggml_tensor * KQ_scale_ref = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale_ref, "KQ_scale_ref", -1); + struct ggml_tensor * KQ_scale= ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_ref = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask_ref, "KQ_mask_ref", -1); + struct ggml_tensor * KQ_mask= ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { @@ -5082,15 +5003,6 @@ struct llm_build_context { } for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inp_pos = ggml_view_tensor(ctx0, inp_pos_ref); - cb(inp_pos, "inp_pos", il); - - struct ggml_tensor * KQ_scale = ggml_view_tensor(ctx0, KQ_scale_ref); - cb(KQ_scale, "KQ_scale", il); - - struct ggml_tensor * KQ_mask = ggml_view_tensor(ctx0, KQ_mask_ref); - cb(KQ_mask, "KQ_mask", il); - struct ggml_tensor * inpSA = inpL; cur = llm_build_norm(ctx0, inpL, hparams, @@ -5274,15 +5186,10 @@ static const std::unordered_map k_offload_map //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel { "pos_embd", OFFLOAD_FUNC_NR }, - { "inp_pos_ref", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. 
rope) - { "KQ_scale_ref", OFFLOAD_FUNC_FRC }, - { "KQ_mask_ref", OFFLOAD_FUNC_FRC }, - { "K_shift_ref", OFFLOAD_FUNC_FRC }, - - { "inp_pos", OFFLOAD_FUNC }, - { "KQ_scale", OFFLOAD_FUNC }, - { "KQ_mask", OFFLOAD_FUNC }, - { "K_shift", OFFLOAD_FUNC }, + { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope) + { "KQ_scale", OFFLOAD_FUNC_FRC }, + { "KQ_mask", OFFLOAD_FUNC_FRC }, + { "K_shift", OFFLOAD_FUNC_FRC }, { "K_shifted", OFFLOAD_FUNC }, @@ -5418,7 +5325,7 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_embd = true; } - if (!alloc_inp_pos && strcmp(name, "inp_pos_ref") == 0) { + if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) { ggml_allocr_alloc(lctx.alloc, cur); if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) { @@ -5434,7 +5341,7 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_pos = true; } - if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale_ref") == 0) { + if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) { ggml_allocr_alloc(lctx.alloc, cur); if (!ggml_allocr_is_measure(lctx.alloc)) { @@ -5445,7 +5352,7 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_KQ_scale = true; } - if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask_ref") == 0) { + if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) { ggml_allocr_alloc(lctx.alloc, cur); if (!ggml_allocr_is_measure(lctx.alloc)) { @@ -5472,7 +5379,7 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_KQ_mask = true; } - if (!alloc_inp_K_shift && strcmp(name, "K_shift_ref") == 0) { + if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) { ggml_allocr_alloc(lctx.alloc, cur); if (!ggml_allocr_is_measure(lctx.alloc)) { From e262947d438b838c1e646908d33a07b3749cd136 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 3 Dec 2023 20:31:01 +0200 Subject: [PATCH 10/13] common : add command-line arg to disable KV cache offloading --- common/common.cpp | 5 +++ common/common.h | 1 + llama.cpp | 102 ++++++++++++++++++++++++++-------------------- llama.h | 11 +++-- 4 files changed, 68 insertions(+), 51 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 1dcc235eac0e6..43c374d5ce936 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -498,6 +498,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.infill = true; } else if (arg == "-dkvc" || arg == "--dump-kv-cache") { params.dump_kv_cache = true; + } else if (arg == "-nkvo" || arg == "--no-kv-offload") { + params.no_kv_offload = true; } else if (arg == "--multiline-input") { params.multiline_input = true; } else if (arg == "--simple-io") { @@ -840,6 +842,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --verbose-prompt print prompt before generation\n"); printf(" -dkvc, --dump-kv-cache\n"); printf(" verbose print of the KV cache\n"); + printf(" -nkvo, --no-kv-offload\n"); + printf(" disable KV offload\n"); printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); @@ -924,6 +928,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.yarn_beta_fast = params.yarn_beta_fast; cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.yarn_orig_ctx = params.yarn_orig_ctx; + cparams.offload_kqv = !params.no_kv_offload; return cparams; } diff --git a/common/common.h b/common/common.h index 
2f6fe48ab53d3..2664c8fc175b7 100644 --- a/common/common.h +++ b/common/common.h @@ -123,6 +123,7 @@ struct gpt_params { bool verbose_prompt = false; // print prompt tokens before generation bool infill = false; // use infill mode bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes + bool no_kv_offload = false; // disable KV offloading // multimodal models (see examples/llava) std::string mmproj = ""; // path to multimodal projector diff --git a/llama.cpp b/llama.cpp index 8a2946fe7ed1e..357f19bb0c9d9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1245,8 +1245,7 @@ struct llama_cparams { float yarn_beta_slow; bool mul_mat_q; - bool offload_k; - bool offload_v; + bool offload_kqv; }; @@ -1526,8 +1525,7 @@ static bool llama_kv_cache_init( ggml_type wtype, uint32_t n_ctx, int n_gpu_layers, - bool offload_k, - bool offload_v) { + bool offload) { const uint32_t n_embd = hparams.n_embd_gqa(); const uint32_t n_layer = hparams.n_layer; @@ -1574,11 +1572,9 @@ static bool llama_kv_cache_init( cache.v_l.push_back(v); #ifdef GGML_USE_CUBLAS if (i >= i_gpu_start) { - if (offload_k) { + if (offload) { ggml_cuda_assign_buffers_no_scratch(k); vram_kv_cache += ggml_nbytes(k); - } - if (offload_v) { ggml_cuda_assign_buffers_no_scratch(v); vram_kv_cache += ggml_nbytes(v); } @@ -5101,6 +5097,7 @@ enum llm_offload_func_e { OFFLOAD_FUNC_NOP, OFFLOAD_FUNC, OFFLOAD_FUNC_FRC, // force offload + OFFLOAD_FUNC_KQV, OFFLOAD_FUNC_NR, OFFLOAD_FUNC_EMB, OFFLOAD_FUNC_OUT, @@ -5204,38 +5201,38 @@ static const std::unordered_map k_offload_map { "attn_norm", OFFLOAD_FUNC }, { "attn_norm_2", OFFLOAD_FUNC }, - { "wqkv", OFFLOAD_FUNC }, - { "bqkv", OFFLOAD_FUNC }, - { "wqkv_clamped", OFFLOAD_FUNC }, - - { "tmpk", OFFLOAD_FUNC }, - { "tmpq", OFFLOAD_FUNC }, - { "tmpv", OFFLOAD_FUNC }, - { "Kcur", OFFLOAD_FUNC }, - { "Qcur", OFFLOAD_FUNC }, - { "Vcur", OFFLOAD_FUNC }, - - { "krot", OFFLOAD_FUNC }, - { "qrot", OFFLOAD_FUNC }, - { "kpass", OFFLOAD_FUNC }, - { "qpass", OFFLOAD_FUNC }, - { "krotated", OFFLOAD_FUNC }, - { "qrotated", OFFLOAD_FUNC }, - - { "q", OFFLOAD_FUNC }, - { "k", OFFLOAD_FUNC }, - { "kq", OFFLOAD_FUNC }, - { "kq_scaled", OFFLOAD_FUNC }, - { "kq_scaled_alibi", OFFLOAD_FUNC }, - { "kq_masked", OFFLOAD_FUNC }, - { "kq_soft_max", OFFLOAD_FUNC }, - { "kq_soft_max_ext", OFFLOAD_FUNC }, - { "v", OFFLOAD_FUNC }, - { "kqv", OFFLOAD_FUNC }, - { "kqv_merged", OFFLOAD_FUNC }, - { "kqv_merged_cont", OFFLOAD_FUNC }, - { "kqv_wo", OFFLOAD_FUNC }, - { "kqv_out", OFFLOAD_FUNC }, + { "wqkv", OFFLOAD_FUNC_KQV }, + { "bqkv", OFFLOAD_FUNC_KQV }, + { "wqkv_clamped", OFFLOAD_FUNC_KQV }, + + { "tmpk", OFFLOAD_FUNC_KQV }, + { "tmpq", OFFLOAD_FUNC_KQV }, + { "tmpv", OFFLOAD_FUNC_KQV }, + { "Kcur", OFFLOAD_FUNC_KQV }, + { "Qcur", OFFLOAD_FUNC_KQV }, + { "Vcur", OFFLOAD_FUNC_KQV }, + + { "krot", OFFLOAD_FUNC_KQV }, + { "qrot", OFFLOAD_FUNC_KQV }, + { "kpass", OFFLOAD_FUNC_KQV }, + { "qpass", OFFLOAD_FUNC_KQV }, + { "krotated", OFFLOAD_FUNC_KQV }, + { "qrotated", OFFLOAD_FUNC_KQV }, + + { "q", OFFLOAD_FUNC_KQV }, + { "k", OFFLOAD_FUNC_KQV }, + { "kq", OFFLOAD_FUNC_KQV }, + { "kq_scaled", OFFLOAD_FUNC_KQV }, + { "kq_scaled_alibi", OFFLOAD_FUNC_KQV }, + { "kq_masked", OFFLOAD_FUNC_KQV }, + { "kq_soft_max", OFFLOAD_FUNC_KQV }, + { "kq_soft_max_ext", OFFLOAD_FUNC_KQV }, + { "v", OFFLOAD_FUNC_KQV }, + { "kqv", OFFLOAD_FUNC_KQV }, + { "kqv_merged", OFFLOAD_FUNC_KQV }, + { "kqv_merged_cont", OFFLOAD_FUNC_KQV }, + { "kqv_wo", OFFLOAD_FUNC_KQV }, + { "kqv_out", OFFLOAD_FUNC_KQV }, { "ffn_inp", OFFLOAD_FUNC }, { "ffn_norm", 
OFFLOAD_FUNC }, @@ -5429,11 +5426,13 @@ static struct ggml_cgraph * llama_build_graph( #ifdef GGML_USE_CUBLAS { OFFLOAD_FUNC, "GPU (CUDA)" }, { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" }, + { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" }, { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, #else { OFFLOAD_FUNC, "CPU" }, { OFFLOAD_FUNC_FRC, "CPU" }, + { OFFLOAD_FUNC_KQV, "CPU" }, { OFFLOAD_FUNC_NR, "CPU" }, { OFFLOAD_FUNC_EMB, "CPU" }, #endif // GGML_USE_CUBLAS @@ -5458,7 +5457,6 @@ static struct ggml_cgraph * llama_build_graph( switch (func_e) { case OFFLOAD_FUNC_NOP: case OFFLOAD_FUNC_OUT: - case OFFLOAD_FUNC_FRC: break; case OFFLOAD_FUNC: if (n_gpu_layers < n_layer) { @@ -5467,6 +5465,21 @@ static struct ggml_cgraph * llama_build_graph( } } break; + case OFFLOAD_FUNC_FRC: + if (!lctx.cparams.offload_kqv) { + func_e = OFFLOAD_FUNC_NOP; + } break; + case OFFLOAD_FUNC_KQV: + if (!lctx.cparams.offload_kqv) { + func_e = OFFLOAD_FUNC_NOP; + } else { + if (n_gpu_layers < n_layer) { + if (il < i_gpu_start) { + func_e = OFFLOAD_FUNC_NOP; + } + } + } + break; case OFFLOAD_FUNC_NR: if (n_gpu_layers <= n_layer + 0) { func_e = OFFLOAD_FUNC_NOP; @@ -5493,6 +5506,7 @@ static struct ggml_cgraph * llama_build_graph( case OFFLOAD_FUNC_NOP: case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break; case OFFLOAD_FUNC: + case OFFLOAD_FUNC_KQV: case OFFLOAD_FUNC_FRC: case OFFLOAD_FUNC_NR: case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break; @@ -8567,8 +8581,7 @@ struct llama_context_params llama_context_default_params() { /*.f16_kv =*/ true, /*.logits_all =*/ false, /*.embedding =*/ false, - /*.offload_k =*/ true, - /*.offload_q =*/ true, + /*.offload_kqv =*/ true, }; return result; @@ -8685,8 +8698,7 @@ struct llama_context * llama_new_context_with_model( cparams.yarn_beta_fast = params.yarn_beta_fast; cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.mul_mat_q = params.mul_mat_q; - cparams.offload_k = params.offload_k; - cparams.offload_v = params.offload_v; + cparams.offload_kqv = params.offload_kqv; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; @@ -8724,7 +8736,7 @@ struct llama_context * llama_new_context_with_model( // reserve memory for context buffers if (!hparams.vocab_only) { - if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers, cparams.offload_k, cparams.offload_v)) { + if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; diff --git a/llama.h b/llama.h index 3e2ad0560f8b2..634969b343417 100644 --- a/llama.h +++ b/llama.h @@ -192,12 +192,11 @@ extern "C" { uint32_t yarn_orig_ctx; // YaRN original context size // Keep the booleans together to avoid misalignment during copy-by-value. 
- bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) - bool f16_kv; // use fp16 for KV cache, fp32 otherwise - bool logits_all; // the llama_eval() call computes all logits, not just the last one - bool embedding; // embedding mode only - bool offload_k; - bool offload_v; + bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) + bool f16_kv; // use fp16 for KV cache, fp32 otherwise + bool logits_all; // the llama_eval() call computes all logits, not just the last one + bool embedding; // embedding mode only + bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU }; // model quantization parameters From 66aaac98679267612d37b1fb13b118402215d3ab Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 3 Dec 2023 21:10:16 +0200 Subject: [PATCH 11/13] llama : update session save/load --- llama.cpp | 91 ++++++++++++++++++++++++++++++------------------------- llama.h | 2 +- 2 files changed, 50 insertions(+), 43 deletions(-) diff --git a/llama.cpp b/llama.cpp index 357f19bb0c9d9..d23a14469a0f0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1563,6 +1563,8 @@ static bool llama_kv_cache_init( const int i_gpu_start = n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start); + GGML_UNUSED(offload); + for (int i = 0; i < (int) n_layer; i++) { ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, wtype, n_embd*n_ctx); ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, wtype, n_embd*n_ctx); @@ -5406,7 +5408,7 @@ static struct ggml_cgraph * llama_build_graph( // // TODO: will be removed with backend v2 -#define LLAMA_OFFLOAD_DEBUG +//#define LLAMA_OFFLOAD_DEBUG if (!do_offload) { return; @@ -9297,40 +9299,45 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat data_ctx->write(&kv_used, sizeof(kv_used)); if (kv_buf_size) { -#pragma message("TODO: implement KV cache saving") -#if 0 - const size_t elt_size = ggml_element_size(kv_self.k); + const size_t elt_size = ggml_element_size(kv_self.k_l[0]); - ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); + ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); ggml_cgraph * gf = ggml_new_graph(cpy_ctx); - ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer); - std::vector kout3d_data(ggml_nbytes(kout3d), 0); - kout3d->data = kout3d_data.data(); + std::vector> kout2d_data(n_layer); + std::vector> vout2d_data(n_layer); + + for (int il = 0; il < (int) n_layer; ++il) { + ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head); + kout2d_data[il].resize(ggml_nbytes(kout2d)); + kout2d->data = kout2d_data[il].data(); - ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer); - std::vector vout3d_data(ggml_nbytes(vout3d), 0); - vout3d->data = vout3d_data.data(); + ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd); + vout2d_data[il].resize(ggml_nbytes(vout2d)); + vout2d->data = vout2d_data[il].data(); - ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, - n_embd, kv_head, n_layer, - elt_size*n_embd, elt_size*n_embd*n_ctx, 0); + ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il], + n_embd, kv_head, + elt_size*n_embd, 0); - ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v, - kv_head, n_embd, n_layer, - elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); + ggml_tensor * v2d 
= ggml_view_2d(cpy_ctx, kv_self.v_l[il], + kv_head, n_embd, + elt_size*n_ctx, 0); + + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d)); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d)); + } - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d)); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d)); ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); ggml_free(cpy_ctx); - // our data is now in the kout3d_data and vout3d_data buffers + // our data is now in the kout2d_data and vout2d_data buffers // write them to file - data_ctx->write(kout3d_data.data(), kout3d_data.size()); - data_ctx->write(vout3d_data.data(), vout3d_data.size()); -#endif + for (uint32_t il = 0; il < n_layer; ++il) { + data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size()); + data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size()); + } } for (uint32_t i = 0; i < kv_size; ++i) { @@ -9430,35 +9437,35 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { if (kv_buf_size) { GGML_ASSERT(kv_self.buf.size == kv_buf_size); -#pragma message("TODO: implement KV cache loading") -#if 0 - const size_t elt_size = ggml_element_size(kv_self.k); + const size_t elt_size = ggml_element_size(kv_self.k_l[0]); - ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); + ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); ggml_cgraph * gf = ggml_new_graph(cpy_ctx); - ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer); - kin3d->data = (void *) inp; - inp += ggml_nbytes(kin3d); + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head); + kin2d->data = (void *) inp; + inp += ggml_nbytes(kin2d); + + ggml_tensor * vin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd); + vin2d->data = (void *) inp; + inp += ggml_nbytes(vin2d); - ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer); - vin3d->data = (void *) inp; - inp += ggml_nbytes(vin3d); + ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il], + n_embd, kv_head, + elt_size*n_embd, 0); - ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, - n_embd, kv_head, n_layer, - elt_size*n_embd, elt_size*n_embd*n_ctx, 0); + ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il], + kv_head, n_embd, + elt_size*n_ctx, 0); - ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v, - kv_head, n_embd, n_layer, - elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d)); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d)); + } - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d)); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d)); ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); ggml_free(cpy_ctx); -#endif } ctx->kv_self.head = kv_head; diff --git a/llama.h b/llama.h index 634969b343417..c1593c9b03331 100644 --- a/llama.h +++ b/llama.h @@ -42,7 +42,7 @@ #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN -#define LLAMA_SESSION_VERSION 2 +#define LLAMA_SESSION_VERSION 3 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. 
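
Note (usage sketch, not part of the patch): the per-layer 2D copies above feed the existing public state API — llama_get_state_size(), llama_copy_state_data() and llama_set_state_data() — so session data written after this change carries one K and one V block per layer, which is why LLAMA_SESSION_VERSION is bumped to 3. A minimal round-trip, assuming a valid llama_context * ctx and that <vector> and llama.h are included, might look like:

    // sketch only: serialize the full context state (including the per-layer KV cache)
    std::vector<uint8_t> state(llama_get_state_size(ctx));
    llama_copy_state_data(ctx, state.data());   // writes the k_l/v_l contents layer by layer
    // ... later, restore into a context created with the same model and parameters:
    llama_set_state_data(ctx, state.data());
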
From 1a1a1c38452427b20a14da60239fef41b2c0d267 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 6 Dec 2023 13:30:20 +0200 Subject: [PATCH 12/13] llama : support quantum K cache (#4312) * llama : support quantum K cache (wip) * metal : add F32 -> Q8_0 copy kernel * cuda : add F32 -> Q8_0 copy kernel ggml-ci * cuda : use mmv kernel for quantum cache ops * llama : pass KV cache type through API * llama : fix build ggml-ci * metal : add F32 -> Q4_0 copy kernel * metal : add F32 -> Q4_1 copy kernel * cuda : wip * cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels * llama-bench : support type_k/type_v * metal : use mm kernel only for quantum KV cache * cuda : add comment * llama : remove memory_f16 and kv_f16 flags --------- Co-authored-by: slaren --- common/common.cpp | 40 ++++- common/common.h | 6 +- examples/llama-bench/llama-bench.cpp | 111 +++++++++--- examples/quantize-stats/quantize-stats.cpp | 1 - examples/server/server.cpp | 4 - ggml-cuda.cu | 186 ++++++++++++++++++-- ggml-metal.m | 32 +++- ggml-metal.metal | 192 +++++++++++++++++++++ llama.cpp | 50 ++++-- llama.h | 4 +- 10 files changed, 552 insertions(+), 74 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 43c374d5ce936..a5b5c468c802b 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -278,8 +278,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.yarn_beta_slow = std::stof(argv[i]); - } else if (arg == "--memory-f32") { - params.memory_f16 = false; } else if (arg == "--top-p") { if (++i >= argc) { invalid_param = true; @@ -500,6 +498,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.dump_kv_cache = true; } else if (arg == "-nkvo" || arg == "--no-kv-offload") { params.no_kv_offload = true; + } else if (arg == "-ctk" || arg == "--cache-type-k") { + params.cache_type_k = argv[++i]; + } else if (arg == "-ctv" || arg == "--cache-type-v") { + params.cache_type_v = argv[++i]; } else if (arg == "--multiline-input") { params.multiline_input = true; } else if (arg == "--simple-io") { @@ -800,8 +802,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); printf(" --no-penalize-nl do not penalize newline token\n"); - printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp); printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n"); printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); @@ -844,6 +844,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" verbose print of the KV cache\n"); printf(" -nkvo, --no-kv-offload\n"); printf(" disable KV offload\n"); + printf(" -ctk TYPE, --cache-type-k TYPE\n"); + printf(" KV cache data type for K (default: %s)\n", params.cache_type_k.c_str()); + printf(" -ctv TYPE, --cache-type-v TYPE\n"); + printf(" KV cache data type for V (default: %s)\n", params.cache_type_v.c_str()); printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); 
printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); @@ -908,6 +912,29 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & return mparams; } +static ggml_type kv_cache_type_from_str(const std::string & s) { + if (s == "f16") { + return GGML_TYPE_F16; + } + if (s == "q8_0") { + return GGML_TYPE_Q8_0; + } + if (s == "q4_0") { + return GGML_TYPE_Q4_0; + } + if (s == "q4_1") { + return GGML_TYPE_Q4_1; + } + if (s == "q5_0") { + return GGML_TYPE_Q5_0; + } + if (s == "q5_1") { + return GGML_TYPE_Q5_1; + } + + throw std::runtime_error("Invalid cache type: " + s); +} + struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { auto cparams = llama_context_default_params(); @@ -917,7 +944,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; cparams.mul_mat_q = params.mul_mat_q; cparams.seed = params.seed; - cparams.f16_kv = params.memory_f16; cparams.logits_all = params.logits_all; cparams.embedding = params.embedding; cparams.rope_scaling_type = params.rope_scaling_type; @@ -930,6 +956,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.yarn_orig_ctx = params.yarn_orig_ctx; cparams.offload_kqv = !params.no_kv_offload; + cparams.type_k = kv_cache_type_from_str(params.cache_type_k); + cparams.type_v = kv_cache_type_from_str(params.cache_type_v); + return cparams; } @@ -1341,7 +1370,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l } fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); - fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? 
"true" : "false"); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); diff --git a/common/common.h b/common/common.h index 2664c8fc175b7..4cf471c7a8a16 100644 --- a/common/common.h +++ b/common/common.h @@ -98,7 +98,6 @@ struct gpt_params { size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS - bool memory_f16 = true; // use f16 instead of f32 for memory kv bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode @@ -125,9 +124,12 @@ struct gpt_params { bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes bool no_kv_offload = false; // disable KV offloading + std::string cache_type_k = "f16"; // KV cache data type for the K + std::string cache_type_v = "f16"; // KV cache data type for the V + // multimodal models (see examples/llava) std::string mmproj = ""; // path to multimodal projector - std::string image = ""; // path to an image file + std::string image = ""; // path to an image file }; bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 9bd82d565834a..6617c050ddfec 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -53,6 +53,13 @@ static std::vector split(const std::string & str, char delim) { return values; } +template +static std::vector transform_to_str(const std::vector & values, F f) { + std::vector str_values; + std::transform(values.begin(), values.end(), std::back_inserter(str_values), f); + return str_values; +} + template static T avg(const std::vector & v) { if (v.empty()) { @@ -126,7 +133,8 @@ struct cmd_params { std::vector n_prompt; std::vector n_gen; std::vector n_batch; - std::vector f32_kv; + std::vector type_k; + std::vector type_v; std::vector n_threads; std::vector n_gpu_layers; std::vector main_gpu; @@ -142,7 +150,8 @@ static const cmd_params cmd_params_defaults = { /* n_prompt */ {512}, /* n_gen */ {128}, /* n_batch */ {512}, - /* f32_kv */ {false}, + /* type_k */ {GGML_TYPE_F16}, + /* type_v */ {GGML_TYPE_F16}, /* n_threads */ {get_num_physical_cores()}, /* n_gpu_layers */ {99}, /* main_gpu */ {0}, @@ -162,7 +171,8 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); - printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str()); + printf(" -ctk , --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); + printf(" -ctv , --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); printf(" -mg, --main-gpu (default: %s)\n", 
join(cmd_params_defaults.main_gpu, ",").c_str()); @@ -173,9 +183,32 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf("\n"); printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); +} +static ggml_type ggml_type_from_name(const std::string & s) { + if (s == "f16") { + return GGML_TYPE_F16; + } + if (s == "q8_0") { + return GGML_TYPE_Q8_0; + } + if (s == "q4_0") { + return GGML_TYPE_Q4_0; + } + if (s == "q4_1") { + return GGML_TYPE_Q4_1; + } + if (s == "q5_0") { + return GGML_TYPE_Q5_0; + } + if (s == "q5_1") { + return GGML_TYPE_Q5_1; + } + + return GGML_TYPE_COUNT; } + static cmd_params parse_cmd_params(int argc, char ** argv) { cmd_params params; std::string arg; @@ -224,13 +257,38 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = split(argv[i], split_delim); params.n_batch.insert(params.n_batch.end(), p.begin(), p.end()); - } else if (arg == "--memory-f32") { + } else if (arg == "-ctk" || arg == "--cache-type-k") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); - params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end()); + auto p = split(argv[i], split_delim); + std::vector types; + for (const auto & t : p) { + ggml_type gt = ggml_type_from_name(t); + if (gt == GGML_TYPE_COUNT) { + invalid_param = true; + break; + } + types.push_back(gt); + } + params.type_k.insert(params.type_k.end(), types.begin(), types.end()); + } else if (arg == "-ctv" || arg == "--cache-type-v") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + std::vector types; + for (const auto & t : p) { + ggml_type gt = ggml_type_from_name(t); + if (gt == GGML_TYPE_COUNT) { + invalid_param = true; + break; + } + types.push_back(gt); + } + params.type_v.insert(params.type_v.end(), types.begin(), types.end()); } else if (arg == "-t" || arg == "--threads") { if (++i >= argc) { invalid_param = true; @@ -321,7 +379,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; } if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; } if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; } - if (params.f32_kv.empty()) { params.f32_kv = cmd_params_defaults.f32_kv; } + if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; } + if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; } if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; } @@ -336,7 +395,8 @@ struct cmd_params_instance { int n_prompt; int n_gen; int n_batch; - bool f32_kv; + ggml_type type_k; + ggml_type type_v; int n_threads; int n_gpu_layers; int main_gpu; @@ -365,7 +425,8 @@ struct cmd_params_instance { cparams.n_ctx = n_prompt + n_gen; cparams.n_batch = n_batch; - cparams.f16_kv = !f32_kv; + cparams.type_k = type_k; + cparams.type_v = type_v; cparams.mul_mat_q = mul_mat_q; return cparams; @@ -380,7 +441,8 @@ static std::vector get_cmd_params_instances_int(const cmd_p for (const auto & mg : params.main_gpu) for (const auto & ts : params.tensor_split) for (const auto & nb : params.n_batch) - for (const auto & fk : 
params.f32_kv) + for (const auto & tk : params.type_k) + for (const auto & tv : params.type_v) for (const auto & mmq : params.mul_mat_q) for (const auto & nt : params.n_threads) { cmd_params_instance instance = { @@ -388,7 +450,8 @@ static std::vector get_cmd_params_instances_int(const cmd_p /* .n_prompt = */ n_prompt, /* .n_gen = */ n_gen, /* .n_batch = */ nb, - /* .f32_kv = */ fk, + /* .type_k = */ tk, + /* .type_v = */ tv, /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, /* .main_gpu = */ mg, @@ -410,7 +473,8 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & mg : params.main_gpu) for (const auto & ts : params.tensor_split) for (const auto & nb : params.n_batch) - for (const auto & fk : params.f32_kv) + for (const auto & tk : params.type_k) + for (const auto & tv : params.type_v) for (const auto & mmq : params.mul_mat_q) for (const auto & nt : params.n_threads) { for (const auto & n_prompt : params.n_prompt) { @@ -422,7 +486,8 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_prompt = */ n_prompt, /* .n_gen = */ 0, /* .n_batch = */ nb, - /* .f32_kv = */ fk, + /* .type_k = */ tk, + /* .type_v = */ tv, /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, /* .main_gpu = */ mg, @@ -441,7 +506,8 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_prompt = */ 0, /* .n_gen = */ n_gen, /* .n_batch = */ nb, - /* .f32_kv = */ fk, + /* .type_k = */ tk, + /* .type_v = */ tv, /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, /* .main_gpu = */ mg, @@ -489,7 +555,8 @@ struct test { uint64_t model_n_params; int n_batch; int n_threads; - bool f32_kv; + ggml_type type_k; + ggml_type type_v; int n_gpu_layers; int main_gpu; bool mul_mat_q; @@ -508,7 +575,8 @@ struct test { model_n_params = llama_model_n_params(lmodel); n_batch = inst.n_batch; n_threads = inst.n_threads; - f32_kv = inst.f32_kv; + type_k = inst.type_k; + type_v = inst.type_v; n_gpu_layers = inst.n_gpu_layers; main_gpu = inst.main_gpu; mul_mat_q = inst.mul_mat_q; @@ -571,7 +639,7 @@ struct test { "cuda", "opencl", "metal", "gpu_blas", "blas", "cpu_info", "gpu_info", "model_filename", "model_type", "model_size", "model_n_params", - "n_batch", "n_threads", "f16_kv", + "n_batch", "n_threads", "type_k", "type_v", "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", @@ -621,7 +689,7 @@ struct test { std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas), cpu_info, gpu_info, model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), - std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv), + std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str, std::to_string(n_prompt), std::to_string(n_gen), test_time, std::to_string(avg_ns()), std::to_string(stdev_ns()), @@ -805,8 +873,11 @@ struct markdown_printer : public printer { if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) { fields.push_back("n_batch"); } - if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) { - fields.push_back("f16_kv"); + if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) { + fields.push_back("type_k"); + } + if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) { + fields.push_back("type_v"); } if 
(params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) { fields.push_back("main_gpu"); diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 2712824774ae7..773024160f839 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -321,7 +321,6 @@ int main(int argc, char ** argv) { auto cparams = llama_context_default_params(); cparams.n_ctx = 256; cparams.seed = 1; - cparams.f16_kv = false; ctx = llama_new_context_with_model(model, cparams); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 911f7bbe1f85a..ef2a95004f453 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2108,10 +2108,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.yarn_beta_slow = std::stof(argv[i]); } - else if (arg == "--memory-f32" || arg == "--memory_f32") - { - params.memory_f16 = false; - } else if (arg == "--threads" || arg == "-t") { if (++i >= argc) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 9019a849f0bff..1200d1c888b42 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -7,6 +7,7 @@ #include #include #include +#include #if defined(GGML_USE_HIPBLAS) #include @@ -4559,6 +4560,116 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, cpy_1(cx + x_offset, cdst + dst_offset); } +static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q8_0 * dsti = (block_q8_0 *) cdsti; + + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = xi[j]; + amax = fmaxf(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + dsti->d = d; + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = xi[j]*id; + + dsti->qs[j] = roundf(x0); + } +} + +static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q4_0 * dsti = (block_q4_0 *) cdsti; + + float amax = 0.0f; + float vmax = 0.0f; + + for (int j = 0; j < QK4_0; ++j) { + const float v = xi[j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + vmax = v; + } + } + + const float d = vmax / -8; + const float id = d ? 1.0f/d : 0.0f; + + dsti->d = d; + + for (int j = 0; j < QK4_0/2; ++j) { + const float x0 = xi[0 + j]*id; + const float x1 = xi[QK4_0/2 + j]*id; + + const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f)); + + dsti->qs[j] = xi0; + dsti->qs[j] |= xi1 << 4; + } +} + +static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q4_1 * dsti = (block_q4_1 *) cdsti; + + float vmin = FLT_MAX; + float vmax = -FLT_MAX; + + for (int j = 0; j < QK4_1; ++j) { + const float v = xi[j]; + + if (v < vmin) vmin = v; + if (v > vmax) vmax = v; + } + + const float d = (vmax - vmin) / ((1 << 4) - 1); + const float id = d ? 
1.0f/d : 0.0f;
+
+    dsti->dm.x = d;
+    dsti->dm.y = vmin;
+
+    for (int j = 0; j < QK4_1/2; ++j) {
+        const float x0 = (xi[0       + j] - vmin)*id;
+        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
+
+        const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
+        const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
+
+        dsti->qs[j]  = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+template <cpy_kernel_t cpy_blck, int qk>
+static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
+                                 const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+                                 const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
+    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int i02 = i / (ne00*ne01);
+    const int i01 = (i - i02*ne01*ne00) / ne00;
+    const int i00 = (i - i02*ne01*ne00 - i01*ne00);
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+
+    const int i12 = i / (ne10*ne11);
+    const int i11 = (i - i12*ne10*ne11) / ne10;
+    const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+
+    cpy_blck(cx + x_offset, cdst + dst_offset);
+}
+
 static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
     const float y = (i0 / 2 - low) / max(0.001f, high - low);
     return 1.0f - min(1.0f, max(0.0f, y));
@@ -5737,6 +5848,39 @@ static void ggml_cpy_f32_f16_cuda(
         (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
 }
 
+static void ggml_cpy_f32_q8_0_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK8_0 == 0);
+    const int num_blocks = ne / QK8_0;
+    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
+static void ggml_cpy_f32_q4_0_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK4_0 == 0);
+    const int num_blocks = ne / QK4_0;
+    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
+static void ggml_cpy_f32_q4_1_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK4_1 == 0);
+    const int num_blocks = ne / QK4_1;
+    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
 static void ggml_cpy_f16_f16_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@@ -6093,20 +6237,21 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     const enum ggml_type type = src->type;
     const int64_t ts = ggml_type_size(type);
     const int64_t bs = ggml_blck_size(type);
-    int64_t i1_diff = i1_high - i1_low;
+    const int64_t i1_diff = i1_high - i1_low;
 
     const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
-    if (nb0 == ts && nb1 == ts*ne0/bs) {
+    if (nb0 == ts && nb1 == ts*(ne0/bs)) {
         return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
     }
     if (nb0 == ts) {
-        return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
+        return cudaMemcpy2DAsync(dst_ptr,
ts*(ne0/bs), x, nb1, ts*(ne0/bs), i1_diff, kind, stream); } + GGML_ASSERT(bs == 1 && "TODO: implement bs != 1"); for (int64_t i1 = 0; i1 < i1_diff; i1++) { const void * rx = (const void *) ((const char *) x + i1*nb1); - void * rd = (void *) (dst_ptr + i1*ts*ne0/bs); + void * rd = (void *) (dst_ptr + i1*ts*ne0); // pretend the row is a matrix with cols=1 - cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream); + cudaError_t r = cudaMemcpy2DAsync(rd, ts, rx, nb0, ts, ne0, kind, stream); if (r != cudaSuccess) { return r; } } return cudaSuccess; @@ -6474,6 +6619,8 @@ inline void ggml_cuda_op_mul_mat_vec_q( const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, const cudaStream_t & stream) { + GGML_ASSERT(ggml_nrows(src1) == 1); + const int64_t ne00 = src0->ne[0]; const int64_t row_diff = row_high - row_low; @@ -6533,7 +6680,8 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec( size_t ash; dfloat * src1_dfloat = nullptr; // dfloat == half - bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || + bool src1_convert_f16 = + src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; @@ -7103,10 +7251,9 @@ static void ggml_cuda_op_mul_mat( const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; const bool src0_is_contiguous = ggml_is_contiguous(src0); - const bool src1_is_contiguous = ggml_is_contiguous(src1); - const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ? - ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING; + + const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING); const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; GGML_ASSERT(!(split && ne02 > 1)); @@ -7231,7 +7378,7 @@ static void ggml_cuda_op_mul_mat( const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs; // for split tensors the data begins at i0 == i0_offset_low - char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs; + char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs; float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10; char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset; float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? 
ne0 : row_diff); @@ -7698,10 +7845,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 #ifdef GGML_CUDA_FORCE_DMMV const bool use_mul_mat_vec_q = false; #else - const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type); + const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1; #endif // GGML_CUDA_FORCE_DMMV if (use_mul_mat_vec_q) { + // NOTE: this kernel does not support ggml_nrows(src1) > 1 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true); } else { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); @@ -7770,14 +7918,17 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg char * src1_ddc = (char *) src1_extra->data_device[g_main_device]; if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { - ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, - ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, - ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { + ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { + ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { + ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, - ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); } else { fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); @@ -7788,6 +7939,7 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg } static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + // TODO: why do we pass dst as src1 here? 
ggml_cuda_cpy(src0, dst, nullptr); (void) src1; } diff --git a/ggml-metal.m b/ggml-metal.m index 3343bc8a3af37..be4ab0f2ed47c 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -118,6 +118,11 @@ GGML_METAL_DECL_KERNEL(im2col_f16); GGML_METAL_DECL_KERNEL(cpy_f32_f16); GGML_METAL_DECL_KERNEL(cpy_f32_f32); + GGML_METAL_DECL_KERNEL(cpy_f32_q8_0); + GGML_METAL_DECL_KERNEL(cpy_f32_q4_0); + GGML_METAL_DECL_KERNEL(cpy_f32_q4_1); + //GGML_METAL_DECL_KERNEL(cpy_f32_q5_0); + //GGML_METAL_DECL_KERNEL(cpy_f32_q5_1); GGML_METAL_DECL_KERNEL(cpy_f16_f16); GGML_METAL_DECL_KERNEL(concat); GGML_METAL_DECL_KERNEL(sqr); @@ -324,6 +329,11 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(im2col_f16); GGML_METAL_ADD_KERNEL(cpy_f32_f16); GGML_METAL_ADD_KERNEL(cpy_f32_f32); + GGML_METAL_ADD_KERNEL(cpy_f32_q8_0); + GGML_METAL_ADD_KERNEL(cpy_f32_q4_0); + GGML_METAL_ADD_KERNEL(cpy_f32_q4_1); + //GGML_METAL_ADD_KERNEL(cpy_f32_q5_0); + //GGML_METAL_ADD_KERNEL(cpy_f32_q5_1); GGML_METAL_ADD_KERNEL(cpy_f16_f16); GGML_METAL_ADD_KERNEL(concat); GGML_METAL_ADD_KERNEL(sqr); @@ -425,6 +435,11 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(im2col_f16); GGML_METAL_DEL_KERNEL(cpy_f32_f16); GGML_METAL_DEL_KERNEL(cpy_f32_f32); + GGML_METAL_DEL_KERNEL(cpy_f32_q8_0); + GGML_METAL_DEL_KERNEL(cpy_f32_q4_0); + GGML_METAL_DEL_KERNEL(cpy_f32_q4_1); + //GGML_METAL_DEL_KERNEL(cpy_f32_q5_0); + //GGML_METAL_DEL_KERNEL(cpy_f32_q5_1); GGML_METAL_DEL_KERNEL(cpy_f16_f16); GGML_METAL_DEL_KERNEL(concat); GGML_METAL_DEL_KERNEL(sqr); @@ -1114,7 +1129,7 @@ void ggml_metal_graph_compute( !ggml_is_transposed(src1) && src1t == GGML_TYPE_F32 && ne00 % 32 == 0 && ne00 >= 64 && - ne11 > ne11_mm_min) { + (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) { //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); switch (src0->type) { case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break; @@ -1549,14 +1564,23 @@ void ggml_metal_graph_compute( case GGML_OP_CPY: case GGML_OP_CONT: { - const int nth = MIN(1024, ne00); + GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); + + int nth = MIN(1024, ne00/ggml_blck_size(src0->type)); switch (src0t) { case GGML_TYPE_F32: { + GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0); + switch (dstt) { - case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break; - case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break; + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break; + case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break; + case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q8_0]; break; + case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q4_0]; break; + case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q4_1]; break; + //case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q5_0]; break; + //case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q5_1]; break; default: GGML_ASSERT(false && "not implemented"); }; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index 9a79f815f3a72..9f5ffcbafe8fc 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -3,6 +3,7 @@ using namespace metal; #define MAX(x, y) ((x) > (y) ? (x) : (y)) +#define MIN(x, y) ((x) < (y) ? 
(x) : (y)) #define QK4_0 32 #define QR4_0 2 @@ -1460,6 +1461,197 @@ kernel void kernel_cpy_f32_f32( } } +kernel void kernel_cpy_f32_q8_0( + device const float * src0, + device void * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK8_0; + + device block_q8_0 * dst_data = (device block_q8_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x*QK8_0; i00 < ne00; i00 += ntg.x*QK8_0) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = src[j]; + amax = MAX(amax, fabs(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + dst_data[i00/QK8_0].d = d; + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = src[j]*id; + + dst_data[i00/QK8_0].qs[j] = round(x0); + } + } +} + +kernel void kernel_cpy_f32_q4_0( + device const float * src0, + device void * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_0; + + device block_q4_0 * dst_data = (device block_q4_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x*QK4_0; i00 < ne00; i00 += ntg.x*QK4_0) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < QK4_0; j++) { + const float v = src[j]; + if (amax < fabs(v)) { + amax = fabs(v); + max = v; + } + } + + const float d = max / -8; + const float id = d ? 
1.0f/d : 0.0f; + + dst_data[i00/QK4_0].d = d; + + for (int j = 0; j < QK4_0/2; ++j) { + const float x0 = src[0 + j]*id; + const float x1 = src[QK4_0/2 + j]*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); + + dst_data[i00/QK4_0].qs[j] = xi0; + dst_data[i00/QK4_0].qs[j] |= xi1 << 4; + } + } +} + +kernel void kernel_cpy_f32_q4_1( + device const float * src0, + device void * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_1; + + device block_q4_1 * dst_data = (device block_q4_1 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x*QK4_1; i00 < ne00; i00 += ntg.x*QK4_1) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < QK4_1; j++) { + const float v = src[j]; + if (min > v) min = v; + if (max < v) max = v; + } + + const float d = (max - min) / ((1 << 4) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + dst_data[i00/QK4_1].d = d; + dst_data[i00/QK4_1].m = min; + + for (int j = 0; j < QK4_1/2; ++j) { + const float x0 = (src[0 + j] - min)*id; + const float x1 = (src[QK4_1/2 + j] - min)*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); + + dst_data[i00/QK4_1].qs[j] = xi0; + dst_data[i00/QK4_1].qs[j] |= xi1 << 4; + } + } +} + kernel void kernel_concat( device const char * src0, device const char * src1, diff --git a/llama.cpp b/llama.cpp index d23a14469a0f0..800951ab8de72 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1246,7 +1246,6 @@ struct llama_cparams { bool mul_mat_q; bool offload_kqv; - }; struct llama_layer { @@ -1522,7 +1521,8 @@ struct llama_context { static bool llama_kv_cache_init( const struct llama_hparams & hparams, struct llama_kv_cache & cache, - ggml_type wtype, + ggml_type ktype, + ggml_type vtype, uint32_t n_ctx, int n_gpu_layers, bool offload) { @@ -1541,7 +1541,7 @@ static bool llama_kv_cache_init( cache.cells.clear(); cache.cells.resize(n_ctx); - cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*n_layer*ggml_tensor_overhead()); + cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead()); memset(cache.buf.data, 0, cache.buf.size); struct ggml_init_params params; @@ -1561,13 +1561,13 @@ static bool llama_kv_cache_init( cache.k_l.reserve(n_layer); cache.v_l.reserve(n_layer); - const int i_gpu_start = n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start); + const int i_gpu_start = (int) n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start); GGML_UNUSED(offload); for (int i = 0; i < (int) n_layer; i++) { - ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, wtype, n_embd*n_ctx); - ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, wtype, n_embd*n_ctx); + ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx); + ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); @@ -3558,8 +3558,8 @@ static void llm_build_k_shift( ggml_rope_custom_inplace(ctx, ggml_view_3d(ctx, kv.k_l[il], n_embd_head, n_head_kv, n_ctx, - ggml_element_size(kv.k_l[il])*n_embd_head, - ggml_element_size(kv.k_l[il])*n_embd_gqa, + ggml_type_sizef(kv.k_l[il]->type)*n_embd_head, + ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa, 0), K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); @@ -3588,7 +3588,7 @@ static void llm_build_kv_store( cb(v_cur_t, "v_cur_t", il); struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa, - (ggml_element_size(kv.k_l[il])*n_embd_gqa)*kv_head); + (ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa)*kv_head); cb(k_cache_view, "k_cache_view", il); struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa, @@ -3747,8 +3747,8 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * k = ggml_view_3d(ctx, kv.k_l[il], n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv.k_l[il])*n_embd_gqa, - ggml_element_size(kv.k_l[il])*n_embd_head, + ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa, + ggml_type_sizef(kv.k_l[il]->type)*n_embd_head, 0); cb(k, "k", il); @@ -5695,6 +5695,7 @@ static int llama_decode_internal( // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, 
GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); + //kv_self.n = llama_kv_cache_cell_max(kv_self); //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); @@ -8579,8 +8580,9 @@ struct llama_context_params llama_context_default_params() { /*.yarn_beta_fast =*/ 32.0f, /*.yarn_beta_slow =*/ 1.0f, /*.yarn_orig_ctx =*/ 0, + /*.type_k =*/ GGML_TYPE_F16, + /*.type_v =*/ GGML_TYPE_F16, /*.mul_mat_q =*/ true, - /*.f16_kv =*/ true, /*.logits_all =*/ false, /*.embedding =*/ false, /*.offload_kqv =*/ true, @@ -8734,26 +8736,36 @@ struct llama_context * llama_new_context_with_model( ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; - ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; + const ggml_type type_k = params.type_k; + const ggml_type type_v = params.type_v; + + GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_k) == 0); + GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_v) == 0); // reserve memory for context buffers if (!hparams.vocab_only) { - if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) { + if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; } { - // const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); - size_t memory_size = 0; + size_t memory_size_k = 0; + size_t memory_size_v = 0; + for (auto & k : ctx->kv_self.k_l) { - memory_size += ggml_nbytes(k); + memory_size_k += ggml_nbytes(k); } + for (auto & v : ctx->kv_self.v_l) { - memory_size += ggml_nbytes(v); + memory_size_v += ggml_nbytes(v); } - LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0); + + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } // resized during inference diff --git a/llama.h b/llama.h index c1593c9b03331..ead37562e37f3 100644 --- a/llama.h +++ b/llama.h @@ -191,9 +191,11 @@ extern "C" { float yarn_beta_slow; // YaRN high correction dim uint32_t yarn_orig_ctx; // YaRN original context size + enum ggml_type type_k; // data type for K cache + enum ggml_type type_v; // data type for V cache + // Keep the booleans together to avoid misalignment during copy-by-value. 
     bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-    bool f16_kv;      // use fp16 for KV cache, fp32 otherwise
     bool logits_all;  // the llama_eval() call computes all logits, not just the last one
     bool embedding;   // embedding mode only
     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU

From fc5f334689e25be2efdd6c43a734bcc6337be53e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 7 Dec 2023 12:35:02 +0200
Subject: [PATCH 13/13] readme : add API change notice

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index dac971ae5dfe5..ce026b8d1d851 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 ### Hot topics
 
+- **llama.h API change for handling KV cache offloading and data type: https://github.com/ggerganov/llama.cpp/pull/4309**
 - Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225
 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
 - Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167
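
Editor's note: for readers of this series, below is a minimal sketch of how a caller picks up the new `type_k`/`type_v` fields that replace the removed `f16_kv` boolean. It is not part of the patch. The `type_k`, `type_v`, `offload_kqv` fields, `llama_context_default_params()`, `llama_new_context_with_model()` and `llama_free()` appear in the diffs above; the model-loading calls (`llama_model_default_params()`, `llama_load_model_from_file()`, `llama_free_model()`) and the "model.gguf" path are assumed from the llama.h API of this period and are placeholders for illustration.

// Sketch only: select a quantized K cache through llama_context_params.
// Error handling omitted; paths and loader calls are assumptions, see note above.
#include "llama.h"

int main(void) {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx       = 4096;
    cparams.type_k      = GGML_TYPE_Q8_0; // K cache written through the new f32 -> q8_0 copy kernels
    cparams.type_v      = GGML_TYPE_F16;  // V cache left at f16
    cparams.offload_kqv = true;           // keep the KV cache and KQV ops on the GPU

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    // ... decode as usual; the "KV self size" log now reports K and V sizes separately ...
    llama_free(ctx);
    llama_free_model(model);
    return 0;
}

The same choice is exposed on the command line by the llama-bench changes above via `-ctk/--cache-type-k` and `-ctv/--cache-type-v`, whose parser accepts the names f16, q8_0, q4_0, q4_1, q5_0 and q5_1 (see `ggml_type_from_name`).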