From 9cdfe9368f8476ce260f7a2cbcd571afbc9b13a7 Mon Sep 17 00:00:00 2001
From: Brandon Roberts
Date: Sat, 16 Dec 2023 20:10:39 -0800
Subject: [PATCH] Complete removal of f16_kv, add offload_kqv field

This addresses two issues:

- #995 which just requests to add the KV cache offloading param
- #1006 a NULL ptr exception when using the embeddings (introduced by
  leaving f16_kv in the fields struct)
---
 llama_cpp/llama_cpp.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 82c7187e62..538e3ff160 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -432,9 +432,9 @@ class llama_context_params(Structure):
         type_k (int): data type for K cache
         type_v (int): data type for V cache
         mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-        f16_kv (bool): use fp16 for KV cache, fp32 otherwise
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        embedding (bool): embedding mode only"""
+        embedding (bool): embedding mode only
+        offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU"""
     _fields_ = [
         ("seed", c_uint32),
         ("n_ctx", c_uint32),
@@ -452,9 +452,9 @@ class llama_context_params(Structure):
         ("type_k", c_int),
         ("type_v", c_int),
         ("mul_mat_q", c_bool),
-        ("f16_kv", c_bool),
         ("logits_all", c_bool),
         ("embedding", c_bool),
+        ("offload_kqv", c_bool),
     ]
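
Below is a minimal usage sketch (not part of the patch) showing how a caller
would toggle the new offload_kqv field through llama-cpp-python's low-level
ctypes bindings. The model path is hypothetical; llama_backend_init,
llama_context_default_params, llama_model_default_params,
llama_load_model_from_file, and llama_new_context_with_model are the
library's existing low-level API as of this version.

    import llama_cpp

    llama_cpp.llama_backend_init(numa=False)

    # Context params are a ctypes Structure; its _fields_ layout must match
    # the C-side llama_context_params exactly. The stale f16_kv entry removed
    # by this patch shifted every field after it, which is what broke the
    # embeddings path in #1006.
    ctx_params = llama_cpp.llama_context_default_params()
    ctx_params.offload_kqv = True  # offload KQV ops (incl. KV cache) to GPU

    model_params = llama_cpp.llama_model_default_params()
    model = llama_cpp.llama_load_model_from_file(
        b"/path/to/model.gguf",  # hypothetical path, adjust for your setup
        model_params,
    )
    ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)

Because ctypes resolves attribute access by the declared field order, a
single extra entry like f16_kv misaligns every field after it rather than
failing loudly, which is why the symptom in #1006 surfaced as a NULL ptr
exception in an unrelated code path.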