From 82ced8ef68eaf89e4c58077d8b777f2f229b2eee Mon Sep 17 00:00:00 2001
From: Diner Burger
Date: Tue, 10 Dec 2024 10:49:21 -0500
Subject: [PATCH] Condense lcpp cache types to fp16, q4_0 and q8_0 to match
 prebuilt support. Disallow type mixing.

---
 modules/exllamav2.py      |  4 ++--
 modules/exllamav2_hf.py   |  4 ++--
 modules/llamacpp_hf.py    |  7 +++---
 modules/llamacpp_model.py |  9 ++++----
 modules/loaders.py        | 10 ++++-----
 modules/shared.py         | 45 ++++++++++++++++++---------------------
 modules/ui.py             |  5 ++---
 modules/ui_model_menu.py  |  5 ++---
 8 files changed, 41 insertions(+), 48 deletions(-)

diff --git a/modules/exllamav2.py b/modules/exllamav2.py
index 8feb06b20c..7b9b2ce94a 100644
--- a/modules/exllamav2.py
+++ b/modules/exllamav2.py
@@ -60,8 +60,8 @@ def from_pretrained(self, path_to_model):
 
         # Determine the correct cache type
         kv_cache_type = 'fp16'
-        if shared.args.cache_kv_type:
-            kv_cache_type = shared.args.cache_kv_type.lower()
+        if shared.args.exl_cache_type:
+            kv_cache_type = shared.args.exl_cache_type.lower()
 
         if kv_cache_type == 'fp16':
             cache_type = ExLlamaV2Cache
diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py
index 352ac23d66..88ec772f17 100644
--- a/modules/exllamav2_hf.py
+++ b/modules/exllamav2_hf.py
@@ -48,8 +48,8 @@ def __init__(self, config: ExLlamaV2Config):
 
         # Determine the correct cache type
         kv_cache_type = 'fp16'
-        if shared.args.cache_kv_type:
-            kv_cache_type = shared.args.cache_kv_type.lower()
+        if shared.args.exl_cache_type:
+            kv_cache_type = shared.args.exl_cache_type.lower()
 
         if kv_cache_type == 'fp16':
             cache_type = ExLlamaV2Cache
diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py
index 51f207023c..15e92f2e16 100644
--- a/modules/llamacpp_hf.py
+++ b/modules/llamacpp_hf.py
@@ -197,10 +197,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
             'flash_attn': shared.args.flash_attn
         }
 
-        if shared.args.cache_k_type:
-            params["type_k"] = get_llamacpp_quant_type_for_string(shared.args.cache_k_type)
-        if shared.args.cache_v_type:
-            params["type_v"] = get_llamacpp_quant_type_for_string(shared.args.cache_v_type)
+        if shared.args.lcpp_cache_type:
+            params["type_k"] = get_llamacpp_quant_type_for_string(shared.args.lcpp_cache_type)
+            params["type_v"] = get_llamacpp_quant_type_for_string(shared.args.lcpp_cache_type)
 
         Llama = llama_cpp_lib().Llama
         model = Llama(**params)
diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
index 7fc714d1fc..f6bdc8d526 100644
--- a/modules/llamacpp_model.py
+++ b/modules/llamacpp_model.py
@@ -12,6 +12,7 @@
 
 
 llamacpp_quant_mapping = {
+    'f32': 0,
     'fp16': 1,
     'q4_0': 2,
     'q4_1': 3,
@@ -25,6 +26,7 @@
     'q5_k': 13,
     'q6_k': 14,
     'q8_k': 15,
+    'iq4_nl': 20,
     'bf16': 30,
 }
 
@@ -101,10 +103,9 @@ def from_pretrained(self, path):
             'flash_attn': shared.args.flash_attn
         }
 
-        if shared.args.cache_k_type:
-            params["type_k"] = get_llamacpp_quant_type_for_string(shared.args.cache_k_type)
-        if shared.args.cache_v_type:
-            params["type_v"] = get_llamacpp_quant_type_for_string(shared.args.cache_v_type)
+        if shared.args.lcpp_cache_type:
+            params["type_k"] = get_llamacpp_quant_type_for_string(shared.args.lcpp_cache_type)
+            params["type_v"] = get_llamacpp_quant_type_for_string(shared.args.lcpp_cache_type)
 
         result.model = Llama(**params)
         if cache_capacity > 0:
diff --git a/modules/loaders.py b/modules/loaders.py
index 499de6bef9..73d15948f4 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -31,8 +31,7 @@
     'llama.cpp': [
         'n_ctx',
         'n_gpu_layers',
-        'cache_k_type',
-        'cache_v_type',
+        'lcpp_cache_type',
         'tensor_split',
         'n_batch',
         'threads',
@@ -54,8 +53,7 @@
     'llamacpp_HF': [
         'n_ctx',
         'n_gpu_layers',
-        'cache_k_type',
-        'cache_v_type',
+        'lcpp_cache_type',
         'tensor_split',
         'n_batch',
         'threads',
@@ -87,7 +85,7 @@
         'no_xformers',
         'no_sdpa',
         'num_experts_per_token',
-        'cache_kv_type',
+        'exl_cache_type',
         'autosplit',
         'enable_tp',
         'alpha_value',
@@ -102,7 +100,7 @@
         'no_xformers',
         'no_sdpa',
         'num_experts_per_token',
-        'cache_kv_type',
+        'exl_cache_type',
         'autosplit',
         'enable_tp',
         'alpha_value',
diff --git a/modules/shared.py b/modules/shared.py
index cd7e15c5ef..926b54fb7f 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -47,9 +47,8 @@
     'max_updates_second': 0,
     'prompt_lookup_num_tokens': 0,
     'custom_stopping_strings': '',
-    'cache_k_type': 'FP16',
-    'cache_v_type': 'FP16',
-    'cache_kv_type': 'FP16',
+    'lcpp_cache_type': 'fp16',
+    'exl_cache_type': 'fp16',
     'custom_token_bans': '',
     'auto_max_new_tokens': False,
     'ban_eos_token': False,
@@ -128,8 +127,7 @@
 group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
 group.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.')
 group.add_argument('--tensor_split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.')
-group.add_argument('--cache_k_type', type=str, default='fp16', help='KV cache K-quant type.')
-group.add_argument('--cache_v_type', type=str, default='fp16', help='KV cache V-quant type.')
+group.add_argument('--lcpp_cache_type', type=str, default='fp16', help='KV cache type for llama.cpp. May be one of fp16, q8_0, q4_0.')
 group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
 group.add_argument('--logits_all', action='store_true', help='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
 group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
@@ -148,7 +146,7 @@
 group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')
 group.add_argument('--no_xformers', action='store_true', help='Force xformers to not be used.')
 group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to not be used.')
-group.add_argument('--cache_kv_type', type=str, default='fp16', help='KV cache type; may be one of FP16, FP8, Q8, Q6 or Q4.')
+group.add_argument('--exl_cache_type', type=str, default='fp16', help='KV cache type; may be one of FP16, FP8, Q8, Q6 or Q4.')
 group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
 group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.')
 
@@ -288,10 +286,13 @@ def set(key, value):
         else:
             setattr(opts, key, value)
 
-    def del_key(key):
-        # only remove from user dict, can't modify argparse.Namespace
-        if type(opts) is dict and key in opts:
-            del opts[key]
+    def del_key(key, fallback_set):
+        # only remove from user dict, can't delete from argparse.Namespace
+        if type(opts) is dict:
+            if key in opts:
+                del opts[key]
+        else:
+            setattr(opts, key, fallback_set)
 
     loader = get('loader')
     cache_8bit = get('cache_8bit')
@@ -305,28 +306,24 @@ def del_key(key):
 
     # prevent as much breakage as possible.
     if not loader:
         if cache_8bit:
-            set('cache_k_type', 'q8_0')
-            set('cache_v_type', 'q8_0')
-            set('cache_kv_type', 'fp8')
+            set('lcpp_cache_type', 'q8_0')
+            set('exl_cache_type', 'fp8')
         elif cache_4bit:
-            set('cache_k_type', 'q4_0')
-            set('cache_v_type', 'q4_0')
-            set('cache_kv_type', 'q4')
+            set('lcpp_cache_type', 'q4_0')
+            set('exl_cache_type', 'q4')
     elif loader.lower() in ['exllamav2', 'exllamav2_hf']:
         if cache_8bit:
-            set('cache_kv_type', 'fp8')
+            set('exl_cache_type', 'fp8')
         elif cache_4bit:
-            set('cache_kv_type', 'q4')
+            set('exl_cache_type', 'q4')
     elif loader.lower() in ['llama.cpp', 'llamacpp_hf']:
         if cache_4bit:
-            set('cache_k_type', 'q4_0')
-            set('cache_v_type', 'q4_0')
+            set('lcpp_cache_type', 'q4_0')
         elif cache_8bit:
-            set('cache_k_type', 'q8_0')
-            set('cache_v_type', 'q8_0')
+            set('lcpp_cache_type', 'q8_0')
 
-    del_key('cache_4bit')
-    del_key('cache_8bit')
+    del_key('cache_4bit', False)
+    del_key('cache_8bit', False)
 
     return opts
diff --git a/modules/ui.py b/modules/ui.py
index e2e7861553..49b2487e54 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -87,9 +87,8 @@ def list_model_elements():
         'no_xformers',
         'no_sdpa',
         'num_experts_per_token',
-        'cache_k_type',
-        'cache_v_type',
-        'cache_kv_type',
+        'lcpp_cache_type',
+        'exl_cache_type',
         'autosplit',
         'enable_tp',
         'threads',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 61cfb3f220..4ea1f5df74 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -118,9 +118,8 @@ def create_ui():
             shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
             shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
             shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This may increase performance on newer cards.')
-            shared.gradio['cache_k_type'] = gr.Dropdown(label="cache_k_type", value=shared.args.cache_k_type, info='KV cache K type', choices=['fp16', 'q4_0', 'q4_1', 'q5_0', 'q5_1', 'q8_0', 'q8_1', 'q2_k', 'q3_k', 'q4_k', 'q5_k', 'q6_k', 'q8_k', 'bf16' ] )
-            shared.gradio['cache_v_type'] = gr.Dropdown(label="cache_v_type", value=shared.args.cache_v_type, info='KV cache V type', choices=['fp16', 'q4_0', 'q4_1', 'q5_0', 'q5_1', 'q8_0', 'q8_1', 'q2_k', 'q3_k', 'q4_k', 'q5_k', 'q6_k', 'q8_k', 'bf16' ] )
-            shared.gradio['cache_kv_type'] = gr.Dropdown(label="cache_kv_type", value=shared.args.cache_kv_type, info='KV cache type', choices=['fp16', 'fp8', 'q8', 'q6', 'q4'])
+            shared.gradio['lcpp_cache_type'] = gr.Dropdown(label="cache_type", value=shared.args.lcpp_cache_type, info='KV cache type', choices=['fp16', 'q8_0', 'q4_0'])
+            shared.gradio['exl_cache_type'] = gr.Dropdown(label="cache_type", value=shared.args.exl_cache_type, info='KV cache type', choices=['fp16', 'fp8', 'q8', 'q6', 'q4'])
             shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='(experimental) Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
             shared.gradio['attention_sink_size'] = gr.Number(label="attention_sink_size", value=shared.args.attention_sink_size, precision=0, info='StreamingLLM: number of sink tokens. Only used if the trimmed prompt doesn\'t share a prefix with the old prompt.')
             shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
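
Below is a minimal, self-contained Python sketch (not part of the patch) of how the condensed llama.cpp option is expected to behave: a single --lcpp_cache_type string (fp16, q8_0 or q4_0) is resolved through llamacpp_quant_mapping and applied to both type_k and type_v, which is what "Disallow type mixing" amounts to. The body of get_llamacpp_quant_type_for_string shown here is an assumption for illustration only; the real helper lives in modules/llamacpp_model.py and is not shown in this diff, and the q8_0 entry uses ggml's standard type id since that part of the mapping is outside these hunks.

# Illustrative sketch, not the patch author's code.
llamacpp_quant_mapping = {
    'f32': 0,
    'fp16': 1,
    'q4_0': 2,
    'q8_0': 8,      # ggml's standard id for Q8_0; not visible in these hunks
    'iq4_nl': 20,
    'bf16': 30,
}


def get_llamacpp_quant_type_for_string(quant_type: str) -> int:
    # Assumed behaviour: case-insensitive lookup, falling back to fp16.
    return llamacpp_quant_mapping.get(quant_type.lower(), llamacpp_quant_mapping['fp16'])


# One cache-type string now drives both K and V, so they can never diverge.
params = {}
lcpp_cache_type = 'q8_0'  # stands in for shared.args.lcpp_cache_type
if lcpp_cache_type:
    params['type_k'] = get_llamacpp_quant_type_for_string(lcpp_cache_type)
    params['type_v'] = get_llamacpp_quant_type_for_string(lcpp_cache_type)

print(params)  # {'type_k': 8, 'type_v': 8}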