Condense lcpp cache types to f16, q4_0 and q8_0 to match prebuilt support. Disallow type mixing.
dinerburger committed Dec 11, 2024
1 parent a0405f5 commit 82ced8e
Showing 8 changed files with 41 additions and 48 deletions.
4 changes: 2 additions & 2 deletions modules/exllamav2.py
@@ -60,8 +60,8 @@ def from_pretrained(self, path_to_model):
 
         # Determine the correct cache type
         kv_cache_type = 'fp16'
-        if shared.args.cache_kv_type:
-            kv_cache_type = shared.args.cache_kv_type.lower()
+        if shared.args.exl_cache_type:
+            kv_cache_type = shared.args.exl_cache_type.lower()
 
         if kv_cache_type == 'fp16':
            cache_type = ExLlamaV2Cache
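Only the 'fp16' branch is visible in the hunk above. As a rough sketch of how the remaining exl_cache_type values might map onto ExLlamaV2 cache classes (the fp8/q8/q6/q4 branches and the error handling are assumptions, not shown in this commit):

```python
# Sketch only: the hunk above shows just the 'fp16' branch; the rest of this
# mapping and the error handling are assumed, not taken from the commit.
from exllamav2 import (
    ExLlamaV2Cache,
    ExLlamaV2Cache_8bit,
    ExLlamaV2Cache_Q4,
    ExLlamaV2Cache_Q6,
    ExLlamaV2Cache_Q8,
)


def pick_exl2_cache_class(kv_cache_type: str):
    """Map a lowered --exl_cache_type string to an ExLlamaV2 cache class."""
    mapping = {
        'fp16': ExLlamaV2Cache,
        'fp8': ExLlamaV2Cache_8bit,
        'q8': ExLlamaV2Cache_Q8,
        'q6': ExLlamaV2Cache_Q6,
        'q4': ExLlamaV2Cache_Q4,
    }
    if kv_cache_type not in mapping:
        raise ValueError(f"Invalid cache type for ExLlamaV2: {kv_cache_type}")
    return mapping[kv_cache_type]
```

Keeping the ExLlamaV2 option separate from the llama.cpp one lets each loader expose only the cache formats its backend actually supports.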
4 changes: 2 additions & 2 deletions modules/exllamav2_hf.py
@@ -48,8 +48,8 @@ def __init__(self, config: ExLlamaV2Config):
 
         # Determine the correct cache type
         kv_cache_type = 'fp16'
-        if shared.args.cache_kv_type:
-            kv_cache_type = shared.args.cache_kv_type.lower()
+        if shared.args.exl_cache_type:
+            kv_cache_type = shared.args.exl_cache_type.lower()
 
         if kv_cache_type == 'fp16':
            cache_type = ExLlamaV2Cache
7 changes: 3 additions & 4 deletions modules/llamacpp_hf.py
@@ -197,10 +197,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
             'flash_attn': shared.args.flash_attn
         }
 
-        if shared.args.cache_k_type:
-            params["type_k"] = get_llamacpp_quant_type_for_string(shared.args.cache_k_type)
-        if shared.args.cache_v_type:
-            params["type_v"] = get_llamacpp_quant_type_for_string(shared.args.cache_v_type)
+        if shared.args.lcpp_cache_type:
+            params["type_k"] = get_llamacpp_quant_type_for_string(shared.args.lcpp_cache_type)
+            params["type_v"] = get_llamacpp_quant_type_for_string(shared.args.lcpp_cache_type)
 
         Llama = llama_cpp_lib().Llama
         model = Llama(**params)
9 changes: 5 additions & 4 deletions modules/llamacpp_model.py
@@ -12,6 +12,7 @@
 
 
 llamacpp_quant_mapping = {
+    'f32': 0,
     'fp16': 1,
     'q4_0': 2,
     'q4_1': 3,
@@ -25,6 +26,7 @@
     'q5_k': 13,
     'q6_k': 14,
     'q8_k': 15,
+    'iq4_nl': 20,
     'bf16': 30,
 }
 
@@ -101,10 +103,9 @@ def from_pretrained(self, path):
             'flash_attn': shared.args.flash_attn
         }
 
-        if shared.args.cache_k_type:
-            params["type_k"] = get_llamacpp_quant_type_for_string(shared.args.cache_k_type)
-        if shared.args.cache_v_type:
-            params["type_v"] = get_llamacpp_quant_type_for_string(shared.args.cache_v_type)
+        if shared.args.lcpp_cache_type:
+            params["type_k"] = get_llamacpp_quant_type_for_string(shared.args.lcpp_cache_type)
+            params["type_v"] = get_llamacpp_quant_type_for_string(shared.args.lcpp_cache_type)
 
         result.model = Llama(**params)
         if cache_capacity > 0:
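The helper get_llamacpp_quant_type_for_string() used above is not part of this diff; a minimal sketch of what such a lookup against llamacpp_quant_mapping could look like (names reused from the hunks above, body assumed):

```python
# Minimal sketch, assuming the helper is a plain lookup into the mapping above;
# the real implementation is not shown in this commit.
def get_llamacpp_quant_type_for_string(quant_type: str) -> int:
    """Translate a cache-type string such as 'q8_0' into the ggml enum value
    that llama-cpp-python expects for its type_k / type_v parameters."""
    key = quant_type.lower()
    if key not in llamacpp_quant_mapping:
        raise ValueError(f"Unknown llama.cpp cache type: {quant_type}")
    return llamacpp_quant_mapping[key]
```

Under that assumption, --lcpp_cache_type q8_0 would set both type_k and type_v to 8 (GGML_TYPE_Q8_0), which is why mixing K and V cache types is no longer possible.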
10 changes: 4 additions & 6 deletions modules/loaders.py
@@ -31,8 +31,7 @@
     'llama.cpp': [
         'n_ctx',
         'n_gpu_layers',
-        'cache_k_type',
-        'cache_v_type',
+        'lcpp_cache_type',
         'tensor_split',
         'n_batch',
         'threads',
@@ -54,8 +53,7 @@
     'llamacpp_HF': [
         'n_ctx',
         'n_gpu_layers',
-        'cache_k_type',
-        'cache_v_type',
+        'lcpp_cache_type',
         'tensor_split',
         'n_batch',
         'threads',
@@ -87,7 +85,7 @@
         'no_xformers',
         'no_sdpa',
         'num_experts_per_token',
-        'cache_kv_type',
+        'exl_cache_type',
         'autosplit',
         'enable_tp',
         'alpha_value',
@@ -102,7 +100,7 @@
         'no_xformers',
         'no_sdpa',
         'num_experts_per_token',
-        'cache_kv_type',
+        'exl_cache_type',
         'autosplit',
         'enable_tp',
         'alpha_value',
45 changes: 21 additions & 24 deletions modules/shared.py
@@ -47,9 +47,8 @@
     'max_updates_second': 0,
     'prompt_lookup_num_tokens': 0,
     'custom_stopping_strings': '',
-    'cache_k_type': 'FP16',
-    'cache_v_type': 'FP16',
-    'cache_kv_type': 'FP16',
+    'lcpp_cache_type': 'fp16',
+    'exl_cache_type': 'fp16',
     'custom_token_bans': '',
     'auto_max_new_tokens': False,
     'ban_eos_token': False,
@@ -128,8 +127,7 @@
 group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
 group.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.')
 group.add_argument('--tensor_split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.')
-group.add_argument('--cache_k_type', type=str, default='fp16', help='KV cache K-quant type.')
-group.add_argument('--cache_v_type', type=str, default='fp16', help='KV cache V-quant type.')
+group.add_argument('--lcpp_cache_type', type=str, default='fp16', help='KV cache type. May be one of fp16, q8_0, q4_0.')
 group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
 group.add_argument('--logits_all', action='store_true', help='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
 group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
@@ -148,7 +146,7 @@
 group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')
 group.add_argument('--no_xformers', action='store_true', help='Force xformers to not be used.')
 group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to not be used.')
-group.add_argument('--cache_kv_type', type=str, default='fp16', help='KV cache type; may be one of FP16, FP8, Q8, Q6 or Q4.')
+group.add_argument('--exl_cache_type', type=str, default='fp16', help='KV cache type; may be one of FP16, FP8, Q8, Q6 or Q4.')
 group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
 group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.')
 
@@ -288,10 +286,13 @@ def set(key, value):
         else:
             setattr(opts, key, value)
 
-    def del_key(key):
-        # only remove from user dict, can't modify argparse.Namespace
-        if type(opts) is dict and key in opts:
-            del opts[key]
+    def del_key(key, fallback_set):
+        # only remove from user dict, can't delete from argparse.Namespace
+        if type(opts) is dict:
+            if key in opts:
+                del opts[key]
+        else:
+            setattr(opts, key, fallback_set)
 
     loader = get('loader')
     cache_8bit = get('cache_8bit')
@@ -305,28 +306,24 @@ def del_key(key):
     # prevent as much breakage as possible.
     if not loader:
         if cache_8bit:
-            set('cache_k_type', 'q8_0')
-            set('cache_v_type', 'q8_0')
-            set('cache_kv_type', 'fp8')
+            set('lcpp_cache_type', 'q8_0')
+            set('exl_cache_type', 'fp8')
         elif cache_4bit:
-            set('cache_k_type', 'q4_0')
-            set('cache_v_type', 'q4_0')
-            set('cache_kv_type', 'q4')
+            set('lcpp_cache_type', 'q4_0')
+            set('exl_cache_type', 'q4')
     elif loader.lower() in ['exllamav2', 'exllamav2_hf']:
         if cache_8bit:
-            set('cache_kv_type', 'fp8')
+            set('exl_cache_type', 'fp8')
         elif cache_4bit:
-            set('cache_kv_type', 'q4')
+            set('exl_cache_type', 'q4')
     elif loader.lower() in ['llama.cpp', 'llamacpp_hf']:
         if cache_4bit:
-            set('cache_k_type', 'q4_0')
-            set('cache_v_type', 'q4_0')
+            set('lcpp_cache_type', 'q4_0')
         elif cache_8bit:
-            set('cache_k_type', 'q8_0')
-            set('cache_v_type', 'q8_0')
+            set('lcpp_cache_type', 'q8_0')
 
-    del_key('cache_4bit')
-    del_key('cache_8bit')
+    del_key('cache_4bit', False)
+    del_key('cache_8bit', False)
     return opts


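To make the legacy-flag migration above concrete, here is a standalone sketch of the dict path (the enclosing function in shared.py is not named in this diff, so the function name below is illustrative):

```python
# Illustrative restatement of the migration logic in the hunk above for the
# dict case; the function name is hypothetical.
def migrate_legacy_cache_flags(opts: dict) -> dict:
    loader = (opts.get('loader') or '').lower()
    cache_8bit = opts.get('cache_8bit', False)
    cache_4bit = opts.get('cache_4bit', False)

    if not loader:
        # Loader unknown: set both options and let the active loader pick one up.
        if cache_8bit:
            opts['lcpp_cache_type'] = 'q8_0'
            opts['exl_cache_type'] = 'fp8'
        elif cache_4bit:
            opts['lcpp_cache_type'] = 'q4_0'
            opts['exl_cache_type'] = 'q4'
    elif loader in ('exllamav2', 'exllamav2_hf'):
        if cache_8bit:
            opts['exl_cache_type'] = 'fp8'
        elif cache_4bit:
            opts['exl_cache_type'] = 'q4'
    elif loader in ('llama.cpp', 'llamacpp_hf'):
        if cache_4bit:
            opts['lcpp_cache_type'] = 'q4_0'
        elif cache_8bit:
            opts['lcpp_cache_type'] = 'q8_0'

    # The legacy flags are dropped once translated.
    opts.pop('cache_4bit', None)
    opts.pop('cache_8bit', None)
    return opts


# e.g. migrate_legacy_cache_flags({'loader': 'llama.cpp', 'cache_8bit': True})
# -> {'loader': 'llama.cpp', 'lcpp_cache_type': 'q8_0'}
```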
5 changes: 2 additions & 3 deletions modules/ui.py
@@ -87,9 +87,8 @@ def list_model_elements():
         'no_xformers',
         'no_sdpa',
         'num_experts_per_token',
-        'cache_k_type',
-        'cache_v_type',
-        'cache_kv_type',
+        'lcpp_cache_type',
+        'exl_cache_type',
         'autosplit',
         'enable_tp',
         'threads',
5 changes: 2 additions & 3 deletions modules/ui_model_menu.py
@@ -118,9 +118,8 @@ def create_ui():
             shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
             shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
             shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This may increase performance on newer cards.')
-            shared.gradio['cache_k_type'] = gr.Dropdown(label="cache_k_type", value=shared.args.cache_k_type, info='KV cache K type', choices=['fp16', 'q4_0', 'q4_1', 'q5_0', 'q5_1', 'q8_0', 'q8_1', 'q2_k', 'q3_k', 'q4_k', 'q5_k', 'q6_k', 'q8_k', 'bf16'])
-            shared.gradio['cache_v_type'] = gr.Dropdown(label="cache_v_type", value=shared.args.cache_v_type, info='KV cache V type', choices=['fp16', 'q4_0', 'q4_1', 'q5_0', 'q5_1', 'q8_0', 'q8_1', 'q2_k', 'q3_k', 'q4_k', 'q5_k', 'q6_k', 'q8_k', 'bf16'])
-            shared.gradio['cache_kv_type'] = gr.Dropdown(label="cache_kv_type", value=shared.args.cache_kv_type, info='KV cache type', choices=['fp16', 'fp8', 'q8', 'q6', 'q4'])
+            shared.gradio['lcpp_cache_type'] = gr.Dropdown(label="cache_type", value=shared.args.lcpp_cache_type, info='KV cache type', choices=['fp16', 'q8_0', 'q4_0'])
+            shared.gradio['exl_cache_type'] = gr.Dropdown(label="cache_type", value=shared.args.exl_cache_type, info='KV cache type', choices=['fp16', 'fp8', 'q8', 'q6', 'q4'])
             shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='(experimental) Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
             shared.gradio['attention_sink_size'] = gr.Number(label="attention_sink_size", value=shared.args.attention_sink_size, precision=0, info='StreamingLLM: number of sink tokens. Only used if the trimmed prompt doesn\'t share a prefix with the old prompt.')
             shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
