From d1259b7b35f5c29154645344e781a8e894b7a4fb Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 10 Dec 2023 13:00:13 +0200
Subject: [PATCH] llama : do not quantize expert gating tensors

---
 llama.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 4ac46193caf7c..0a5f755ca569d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8443,6 +8443,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= params->quantize_output_tensor || name != "output.weight";
         quantize &= !params->only_copy;
 
+        // do not quantize expert gating tensors
+        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
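
For readers skimming the diff, the sketch below shows how this kind of name-substring filter behaves in isolation. It is not llama.cpp code: should_quantize, quantize_params, and the sample tensor names are hypothetical stand-ins modeled on the lines the patch touches. The likely motivation is that the expert gating (MoE router) tensors are tiny relative to the model but decide which experts each token is routed to, so keeping them at original precision costs almost nothing while avoiding routing errors.

    // Minimal sketch (not llama.cpp code): a name-based filter like the one
    // in this patch, deciding whether a tensor should be quantized. Tensor
    // names below are assumptions modeled on llama.cpp's naming scheme.
    #include <cstdio>
    #include <string>

    // Hypothetical stand-in for the relevant llama_model_quantize_params fields.
    struct quantize_params {
        bool quantize_output_tensor = true;
        bool only_copy              = false;
    };

    static bool should_quantize(const std::string & name, const quantize_params & params) {
        bool quantize = true;
        quantize &= params.quantize_output_tensor || name != "output.weight";
        quantize &= !params.only_copy;
        // the patch's addition: skip expert gating (MoE router) tensors
        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
        return quantize;
    }

    int main() {
        const quantize_params params;
        const char * names[] = {
            "blk.0.attn_q.weight",       // regular weight -> quantized
            "blk.0.ffn_gate_inp.weight", // expert router  -> kept as-is
            "output.weight",             // gated by quantize_output_tensor
        };
        for (const char * n : names) {
            std::printf("%-28s quantize=%d\n", n, should_quantize(n, params) ? 1 : 0);
        }
        return 0;
    }

Because the check uses std::string::npos with find(), it matches the substring anywhere in the tensor name, so per-layer names such as blk.7.ffn_gate_inp.weight are all excluded by a single condition.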