From d1259b7b35f5c29154645344e781a8e894b7a4fb Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 10 Dec 2023 13:00:13 +0200
Subject: [PATCH] llama : do not quantize expert gating tensors

---
 llama.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 4ac46193caf7c..0a5f755ca569d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8443,6 +8443,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= params->quantize_output_tensor || name != "output.weight";
         quantize &= !params->only_copy;
 
+        // do not quantize expert gating tensors
+        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
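
For readers skimming the diff, the sketch below shows how this kind of name-substring filter behaves in isolation. It is not llama.cpp code: should_quantize, quantize_params, and the sample tensor names are hypothetical stand-ins modeled on the lines the patch touches. The likely motivation is that the expert gating (MoE router) tensors are tiny relative to the model but decide which experts each token is routed to, so keeping them at original precision costs almost nothing while avoiding routing errors.

    // Minimal sketch (not llama.cpp code): a name-based filter like the one
    // in this patch, deciding whether a tensor should be quantized. Tensor
    // names below are assumptions modeled on llama.cpp's naming scheme.
    #include <cstdio>
    #include <string>

    // Hypothetical stand-in for the relevant llama_model_quantize_params fields.
    struct quantize_params {
        bool quantize_output_tensor = true;
        bool only_copy              = false;
    };

    static bool should_quantize(const std::string & name, const quantize_params & params) {
        bool quantize = true;
        quantize &= params.quantize_output_tensor || name != "output.weight";
        quantize &= !params.only_copy;
        // the patch's addition: skip expert gating (MoE router) tensors
        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
        return quantize;
    }

    int main() {
        const quantize_params params;
        const char * names[] = {
            "blk.0.attn_q.weight",       // regular weight -> quantized
            "blk.0.ffn_gate_inp.weight", // expert router  -> kept as-is
            "output.weight",             // gated by quantize_output_tensor
        };
        for (const char * n : names) {
            std::printf("%-28s quantize=%d\n", n, should_quantize(n, params) ? 1 : 0);
        }
        return 0;
    }

Because the check uses std::string::npos with find(), it matches the substring anywhere in the tensor name, so per-layer names such as blk.7.ffn_gate_inp.weight are all excluded by a single condition.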