From 002e36eaec1507818af0411d64a55b0288a36362 Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Wed, 28 Feb 2024 17:33:41 +0000 Subject: [PATCH 01/28] Arm AArch64: optimized GEMV and GEMM kernels for q4_0_q8_0, and q8_0_q8_0 quantization --- ggml/include/ggml.h | 15 + ggml/src/ggml-impl.h | 5 + ggml/src/ggml-quants.c | 924 +++++++++++++++++++++++++++++++++++++++++ ggml/src/ggml-quants.h | 264 ++++++++++++ ggml/src/ggml.c | 395 +++++++++++++++++- src/llama.cpp | 27 ++ 6 files changed, 1617 insertions(+), 13 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index d895c9acdb596..2d377267387e2 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1,3 +1,4 @@ +// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd. #pragma once // @@ -602,6 +603,11 @@ extern "C" { void * extra; // extra things e.g. for ggml-cuda.cu // char padding[4]; + char padding[9]; + + void * rearranged_weight_gemv; + void * rearranged_weight_gemm; + bool weight_rearranged; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -2422,6 +2428,15 @@ extern "C" { GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); + GGML_API void rearrange_q4_0_weights_blocked8_neon(struct ggml_tensor * cur); + GGML_API void rearrange_q4_0_weights_blocked8_sve(struct ggml_tensor * cur); + GGML_API void rearrange_q4_0_weights_for_gemv(struct ggml_tensor * cur); + GGML_API void rearrange_q4_0_weights_for_gemm(struct ggml_tensor * cur); + GGML_API void rearrange_q8_0_weights_blocked8_neon(struct ggml_tensor * cur); + GGML_API void rearrange_q8_0_weights_blocked8_sve(struct ggml_tensor * cur); + GGML_API void rearrange_q8_0_weights_for_gemv(struct ggml_tensor * cur); + GGML_API void rearrange_q8_0_weights_for_gemm(struct ggml_tensor * cur); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 1d23361906c34..23a85229afaf2 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -1,3 +1,4 @@ +// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd. #pragma once #include "ggml.h" @@ -609,6 +610,10 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { #endif // defined(__ARM_NEON) && (!defined(__MSC_VER) +#ifdef __ARM_FEATURE_SVE +#include +#endif // __ARM_FEATURE_SVE + // precomputed f32 table for f16 (256 KB) // defined in ggml.c, initialized in ggml_init() extern float ggml_table_f32_f16[1 << 16]; diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 0eb52e485089f..2c0e89d4dfd7a 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -1,3 +1,4 @@ +// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd. #define GGML_COMMON_IMPL_C #include "ggml-common.h" @@ -14706,6 +14707,929 @@ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) assert(k % QK_K == 0); block_iq2_s * restrict y = vy; quantize_row_iq2_s_reference(x, y, k); + +// Routines to create the blocked formats +// Note input is array of pointers. +// The exact interleaving format needed is different for GEMM (using SMMLA) +// and GEMV (using SDOT) cases. For GEMM, we interleave 8 pairs of values +// at a time (with the two nibbles separated at runtime to give 2x2x8 +// matrices). For GEMV, we need to interleave 4 pairs of values instead. 
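+// A reference-only scalar sketch of the index arithmetic that the
+// make_block_* routines below apply inline (this helper is purely
+// illustrative and is not called by the kernels). With `rows`
+// interleaved rows and runs of `block_len` bytes per row, output
+// byte i comes from:
+//   src_id     = (i % (rows * block_len)) / block_len
+//   src_offset = (i / (rows * block_len)) * block_len + (i % block_len)
+// e.g. rows = 4, block_len = 4 (SDOT layout): i = 0..3  -> row 0, bytes 0..3,
+//      i = 4..7 -> row 1, bytes 0..3, ..., i = 16..19 -> row 0, bytes 4..7.
+static inline void blocked_src_index(int i, int rows, int block_len,
+                                     int * src_id, int * src_offset) {
+    *src_id     = (i % (rows * block_len)) / block_len;
+    *src_offset = (i / (rows * block_len)) * block_len + (i % block_len);
+}
+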
+block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len) { + block_q4_0x4 out; + + for (int i = 0; i < 4; i++) { + out.d[i] = in[i]->d; + } + + for (int i = 0; i < QK4_0 * 2; i++) { + // We are interleaving 4 rows in blocks of 8, making a total of 32 + // output bytes per block (2 MMLA input vectors). This repeats + // until we have processed the whole block. + // + // Per the comment above, for GEMV cases a similar process is used + // but with blocks of 4 instead, giving a single DOT input vector. + // + // In the case of q4, we add on 128 to convert the top nibble from + // "bias offset" form to pure sign form (this saves a subtract when + // we unpack it). + int src_offset = (i / (4 * block_len)) * block_len; + int src_id = (i % (4 * block_len)) / block_len; + src_offset += (i % block_len); + + out.qs[i] = in[src_id]->qs[src_offset] + 0x80; + } + + return out; +} + +// 8-block version - see comments in code above +block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len) { + block_q4_0x8 out; + + for (int i = 0; i < 8; i++) { + out.d[i] = in[i]->d; + } + + for (int i = 0; i < QK4_0 * 4; i++) { + int src_offset = (i / (8 * block_len)) * block_len; + int src_id = (i % (8 * block_len)) / block_len; + src_offset += (i % block_len); + + out.qs[i] = in[src_id]->qs[src_offset] + 0x80; + } + + return out; +} + +block_q8_0x4 make_block_q8_0x4(const block_q8_0 * const in[4], unsigned int block_len) { + block_q8_0x4 out; + + for (int i = 0; i < 4; i++) { + out.d[i] = in[i]->d; + } + + for (int i = 0; i < QK8_0 * 4; i++) { + int src_offset = (i / (4 * block_len)) * block_len; + int src_id = (i % (4 * block_len)) / block_len; + src_offset += (i % block_len); + + out.qs[i] = in[src_id]->qs[src_offset]; + } + + return out; +} + +// 8-block version - see comments in code above +block_q8_0x8 make_block_q8_0x8(const block_q8_0 * const in[8], unsigned int block_len) { + block_q8_0x8 out; + + for (int i = 0; i < 8; i++) { + out.d[i] = in[i]->d; + } + + for (int i = 0; i < QK8_0 * 8; i++) { + int src_offset = (i / (8 * block_len)) * block_len; + int src_id = (i % (8 * block_len)) / block_len; + src_offset += (i % block_len); + + out.qs[i] = in[src_id]->qs[src_offset]; + } + + return out; +} + +void quantize_row_q8_0_and_make_block_q8_0x2(const float * restrict x, void * restrict vy, int k, int rows_interleaved) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0x2 * restrict y = vy; + +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv[rows_interleaved][8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + float id[rows_interleaved]; + + for (int row_iter = 0; row_iter < rows_interleaved; row_iter++) { + for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]); + + for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]); + for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]); + for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ? 
1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + } + + for (int j = 0; j < 4; j++) { + float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]); + int32x4_t vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 3] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 4] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 5] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 6] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 7] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[1][2 * j], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 8] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 9] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 10] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 11] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 12] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 13] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 14] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3); + } + } +#endif +} + +void quantize_row_q8_0_and_make_block_q8_0x4(const float * restrict x, void * restrict vy, int k, int rows_interleaved) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0x4 * restrict y = vy; + +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv[rows_interleaved][8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + float id[rows_interleaved]; + + for (int row_iter = 0; row_iter < rows_interleaved; row_iter++) { + for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]); + + for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]); + for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]); + for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ? 
1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + } + + for (int j = 0; j < 4; j++) { + float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]); + int32x4_t vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 3] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 4] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 5] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 6] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 7] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[1][2 * j], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 8] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 9] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 10] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 11] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 12] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 13] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 14] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 15] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[2][2 * j], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 16] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 17] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 18] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 19] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[2][2 * j + 1], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 20] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 21] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 22] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 23] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[3][2 * j], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 24] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 25] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 26] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 27] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[3][2 * j + 1], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 28] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 29] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 30] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3); + } + } +#endif +} + +inline int64_t roundup(const int64_t a, const int64_t b) { + int64_t rem = a % b; + + if (rem) { + return a + b - rem; + } else { + return a; + } +} + +void ggml_gemv_q4_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +#if defined(__ARM_NEON) + int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)8); + int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)8); + + int64_t nb = n / QK4_0; + int64_t a_nb = n / QK8_0; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const block_q4_0x8 * b_ptr_start = vx; + const block_q8_0 * a_ptr_start = vy; + + for (int64_t y = 0; y < input_width; y++) { + for (int64_t x = x0 / 8; x < xend / 8; x++) { + // Pointers to LHS blocks + const block_q8_0 * a_ptr = a_ptr_start + (y * a_nb); + // Pointers to RHS blocks + const block_q4_0x8 * b_ptr = b_ptr_start + (x * nb); + // Master FP accumulator + float32x4_t acc_row[2]; + acc_row[0] = acc_row[1] = vdupq_n_f32(0.0f); + + for (int64_t b = 0; b < nb; b++) { + // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) + const uint8x16_t rhs_raw_vec_0_0 = vld1q_u8(b_ptr[b].qs); + const uint8x16_t rhs_raw_vec_1_0 = vld1q_u8(b_ptr[b].qs + 16); 
+ const uint8x16_t rhs_raw_vec_0_1 = vld1q_u8(b_ptr[b].qs + 32); + const uint8x16_t rhs_raw_vec_1_1 = vld1q_u8(b_ptr[b].qs + 48); + const uint8x16_t rhs_raw_vec_0_2 = vld1q_u8(b_ptr[b].qs + 64); + const uint8x16_t rhs_raw_vec_1_2 = vld1q_u8(b_ptr[b].qs + 80); + const uint8x16_t rhs_raw_vec_0_3 = vld1q_u8(b_ptr[b].qs + 96); + const uint8x16_t rhs_raw_vec_1_3 = vld1q_u8(b_ptr[b].qs + 112); + + const int8x16_t rhs_vec_0_0_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_vec_0_0, m4b)), s8b); + const int8x16_t rhs_vec_0_1_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_vec_0_1, m4b)), s8b); + const int8x16_t rhs_vec_0_2_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_vec_0_2, m4b)), s8b); + const int8x16_t rhs_vec_0_3_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_vec_0_3, m4b)), s8b); + const int8x16_t rhs_vec_1_0_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_vec_1_0, m4b)), s8b); + const int8x16_t rhs_vec_1_1_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_vec_1_1, m4b)), s8b); + const int8x16_t rhs_vec_1_2_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_vec_1_2, m4b)), s8b); + const int8x16_t rhs_vec_1_3_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_vec_1_3, m4b)), s8b); + + const int8x16_t rhs_vec_0_0_1 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_vec_0_0), 4); + const int8x16_t rhs_vec_0_1_1 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_vec_0_1), 4); + const int8x16_t rhs_vec_0_2_1 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_vec_0_2), 4); + const int8x16_t rhs_vec_0_3_1 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_vec_0_3), 4); + const int8x16_t rhs_vec_1_0_1 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_vec_1_0), 4); + const int8x16_t rhs_vec_1_1_1 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_vec_1_1), 4); + const int8x16_t rhs_vec_1_2_1 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_vec_1_2), 4); + const int8x16_t rhs_vec_1_3_1 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_vec_1_3), 4); + + // Scale values - assemble the four row/column scales into a (64-bit) vector, then expand to FP32 + const float16x8_t col_scale_f16 = vld1q_f16(b_ptr[b].d); + const float32x4_t col_scale_f32_0 = vcvt_f32_f16(vget_low_f16(col_scale_f16)); + const float32x4_t col_scale_f32_1 = vcvt_f32_f16(vget_high_f16(col_scale_f16)); + + const float16x4_t row_scale_f16 = vld1_dup_f16(&(a_ptr[b].d)); + const float32x4_t row_scale_f32 = vcvt_f32_f16(row_scale_f16); + + const int8x16_t lhs_vec_0 = vld1q_s8(a_ptr[b].qs); + const int8x16_t lhs_vec_1 = vld1q_s8(a_ptr[b].qs + 16); + + int32x4_t iacc0 = vdupq_n_s32(0); + int32x4_t iacc1 = vdupq_n_s32(0); + + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_0_0, lhs_vec_0, 0); + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_0_1, lhs_vec_1, 0); + + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_0_0, lhs_vec_0, 0); + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_0_1, lhs_vec_1, 0); + + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_1_0, lhs_vec_0, 1); + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_1_1, lhs_vec_1, 1); + + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_1_0, lhs_vec_0, 1); + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_1_1, lhs_vec_1, 1); + + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_2_0, lhs_vec_0, 2); + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_2_1, lhs_vec_1, 2); + + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_2_0, lhs_vec_0, 2); + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_2_1, lhs_vec_1, 2); + + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_3_0, lhs_vec_0, 3); + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_3_1, lhs_vec_1, 3); + + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_3_0, lhs_vec_0, 3); + iacc1 = vdotq_laneq_s32(iacc1, 
rhs_vec_1_3_1, lhs_vec_1, 3); + + acc_row[0] = vfmaq_f32(acc_row[0], vcvtq_f32_s32(iacc0), vmulq_f32(col_scale_f32_0, row_scale_f32)); + acc_row[1] = vfmaq_f32(acc_row[1], vcvtq_f32_s32(iacc1), vmulq_f32(col_scale_f32_1, row_scale_f32)); + } + + vst1q_f32(s + (y * output_channels + x * 8), acc_row[0]); + vst1q_f32(s + (y * output_channels + x * 8 + 4), acc_row[1]); + } + } +#endif +} + +void ggml_gemv_q4_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +#if defined(__ARM_FEATURE_SVE) + int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)8); + int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)8); + + int64_t nb = n / QK4_0; + int64_t a_nb = n / QK8_0; + + const svuint8_t m4b = svdup_u8(0x0F); + const svint8_t s8b = svdup_s8(0x8); + + const svbool_t ptrue = svptrue_b8(); + + const block_q4_0x8 * b_ptr_start = vx; + const block_q8_0 * a_ptr_start = vy; + + for (int64_t y = 0; y < input_width; y++) { + for (int64_t x = x0 / 8; x < xend / 8; x++) { + // Pointers to LHS blocks + const block_q8_0 * a_ptr = a_ptr_start + (y * a_nb); + // Pointers to RHS blocks + const block_q4_0x8 * b_ptr = b_ptr_start + (x * nb); + + // Master FP accumulator + svfloat32_t acc_row = svdup_f32(0.0f); + + for (int64_t b = 0; b < nb; b++) { + // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) + const svuint8_t rhs_raw_vec_0_0 = svld1_u8(ptrue, b_ptr[b].qs); + const svuint8_t rhs_raw_vec_0_1 = svld1_vnum_u8(ptrue, b_ptr[b].qs, 1); + const svuint8_t rhs_raw_vec_0_2 = svld1_vnum_u8(ptrue, b_ptr[b].qs, 2); + const svuint8_t rhs_raw_vec_0_3 = svld1_vnum_u8(ptrue, b_ptr[b].qs, 3); + + const svint8_t rhs_vec_0_0_1 = svasr_n_s8_x(ptrue, svreinterpret_s8_u8(rhs_raw_vec_0_0), 4); + const svint8_t rhs_vec_0_1_1 = svasr_n_s8_x(ptrue, svreinterpret_s8_u8(rhs_raw_vec_0_1), 4); + const svint8_t rhs_vec_0_2_1 = svasr_n_s8_x(ptrue, svreinterpret_s8_u8(rhs_raw_vec_0_2), 4); + const svint8_t rhs_vec_0_3_1 = svasr_n_s8_x(ptrue, svreinterpret_s8_u8(rhs_raw_vec_0_3), 4); + + const svint8_t rhs_vec_0_0_0 = svsub_s8_x(ptrue, svreinterpret_s8_u8(svand_u8_x(ptrue, rhs_raw_vec_0_0, m4b)), s8b); + const svint8_t rhs_vec_0_1_0 = svsub_s8_x(ptrue, svreinterpret_s8_u8(svand_u8_x(ptrue, rhs_raw_vec_0_1, m4b)), s8b); + const svint8_t rhs_vec_0_2_0 = svsub_s8_x(ptrue, svreinterpret_s8_u8(svand_u8_x(ptrue, rhs_raw_vec_0_2, m4b)), s8b); + const svint8_t rhs_vec_0_3_0 = svsub_s8_x(ptrue, svreinterpret_s8_u8(svand_u8_x(ptrue, rhs_raw_vec_0_3, m4b)), s8b); + + // Scale values + const svfloat16_t col_scale_f16 = svreinterpret_f16_u32(svld1uh_u32(ptrue, (const uint16_t *) b_ptr[b].d)); + const svfloat32_t col_scale_f32 = svcvt_f32_f16_x(ptrue, col_scale_f16); + + const svfloat16_t row_scale_f16 = svdup_f16(a_ptr[b].d); + const svfloat32_t row_scale_f32 = svcvt_f32_f16_x(ptrue, row_scale_f16); + + const svint8_t lhs_vec_0 = svld1rq_s8(ptrue, a_ptr[b].qs); + const svint8_t lhs_vec_1 = svld1rq_s8(ptrue, a_ptr[b].qs + 16); + + svint32_t iacc = svdup_s32(0); + + iacc = svdot_lane(iacc, rhs_vec_0_0_0, lhs_vec_0, 0); + iacc = svdot_lane(iacc, rhs_vec_0_0_1, lhs_vec_1, 0); + + iacc = svdot_lane(iacc, rhs_vec_0_1_0, lhs_vec_0, 1); + iacc = svdot_lane(iacc, rhs_vec_0_1_1, lhs_vec_1, 1); + + iacc = svdot_lane(iacc, rhs_vec_0_2_0, lhs_vec_0, 2); + iacc = svdot_lane(iacc, rhs_vec_0_2_1, lhs_vec_1, 2); + + iacc = svdot_lane(iacc, rhs_vec_0_3_0, lhs_vec_0, 3); + iacc = svdot_lane(iacc, rhs_vec_0_3_1, lhs_vec_1, 3); + 
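+                // Convert the int32 dot-product accumulator to FP32 and accumulate, scaled by (column scale * row scale)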
+ acc_row = svmla_x(ptrue, acc_row, svcvt_f32_s32_x(ptrue, iacc), svmul_x(ptrue, col_scale_f32, row_scale_f32)); + } + + svst1(ptrue, s + (y * output_channels + x * 8), acc_row); + } + } +#endif +} + +void ggml_gemm_q4_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +#if defined(__ARM_FEATURE_MATMUL_INT8) + int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); + int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); + + int64_t nb = n / QK4_0; + int64_t a_nb = n / QK8_0; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const block_q4_0x4 * b_ptr_start = vx; + const block_q8_0x4 * a_ptr_start = vy; + + for (int64_t y = 0; y < input_width / 4; y += rows / 4) { + for (int64_t x = x0 / 4; x < xend / 4; x++) { + const block_q8_0x4 * a_ptrs[rows / 4]; + + a_ptrs[0] = a_ptr_start + (y * a_nb); + for (int i = 0; i < (rows / 4) - 1; i++) { + a_ptrs[i + 1] = a_ptrs[i] + a_nb; + } + + const block_q4_0x4 * b_ptr = b_ptr_start + (x * nb); + + // Master FP accumulators + float32x4_t acc_rows[rows]; + for (int i = 0; i < rows; i++) { + acc_rows[i] = vdupq_n_f32(0.0f); + } + + for (int64_t b = 0; b < nb; b++) { + // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) + const uint8x16_t rhs_raw_mat_01_0 = vld1q_u8(b_ptr[b].qs); + const uint8x16_t rhs_raw_mat_23_0 = vld1q_u8(b_ptr[b].qs + 16); + const uint8x16_t rhs_raw_mat_01_1 = vld1q_u8(b_ptr[b].qs + 32); + const uint8x16_t rhs_raw_mat_23_1 = vld1q_u8(b_ptr[b].qs + 48); + + // 4-bit -> 8-bit + const int8x16_t rhs_mat_01_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_01_0, m4b)), s8b); + const int8x16_t rhs_mat_23_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_23_0, m4b)), s8b); + const int8x16_t rhs_mat_01_1 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_01_1, m4b)), s8b); + const int8x16_t rhs_mat_23_1 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_23_1, m4b)), s8b); + const int8x16_t rhs_mat_01_2 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_01_0), 4); + const int8x16_t rhs_mat_23_2 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_23_0), 4); + const int8x16_t rhs_mat_01_3 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_01_1), 4); + const int8x16_t rhs_mat_23_3 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_23_1), 4); + + // Scale values - assemble the four row/column scales into a (64-bit) vector, then expand to FP32 + const float16x4_t col_scale_f16 = vld1_f16(b_ptr[b].d); + const float32x4_t col_scale_f32 = vcvt_f32_f16(col_scale_f16); + + // Process LHS in pairs of rows + for (int rp = 0; rp < rows / 4; rp++) { + const int8x16_t lhs_mat_01_0 = vld1q_s8(a_ptrs[rp][b].qs); + const int8x16_t lhs_mat_23_0 = vld1q_s8(a_ptrs[rp][b].qs + 16); + const int8x16_t lhs_mat_01_1 = vld1q_s8(a_ptrs[rp][b].qs + 32); + const int8x16_t lhs_mat_23_1 = vld1q_s8(a_ptrs[rp][b].qs + 48); + + const int8x16_t lhs_mat_01_2 = vld1q_s8(a_ptrs[rp][b].qs + 64); + const int8x16_t lhs_mat_23_2 = vld1q_s8(a_ptrs[rp][b].qs + 80); + const int8x16_t lhs_mat_01_3 = vld1q_s8(a_ptrs[rp][b].qs + 96); + const int8x16_t lhs_mat_23_3 = vld1q_s8(a_ptrs[rp][b].qs + 112); + + // Do the MMLAs into 2x2 matrices + const int32x4_t iacc_mat_00 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_01_0), lhs_mat_01_1, rhs_mat_01_1), lhs_mat_01_2, rhs_mat_01_2), lhs_mat_01_3, rhs_mat_01_3); + const int32x4_t iacc_mat_01 = + 
vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_23_0), lhs_mat_01_1, rhs_mat_23_1), lhs_mat_01_2, rhs_mat_23_2), lhs_mat_01_3, rhs_mat_23_3); + const int32x4_t iacc_mat_10 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_23_0, rhs_mat_01_0), lhs_mat_23_1, rhs_mat_01_1), lhs_mat_23_2, rhs_mat_01_2), lhs_mat_23_3, rhs_mat_01_3); + const int32x4_t iacc_mat_11 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_23_0, rhs_mat_23_0), lhs_mat_23_1, rhs_mat_23_1), lhs_mat_23_2, rhs_mat_23_2), lhs_mat_23_3, rhs_mat_23_3); + + // Straighten out to make 4 row vectors + const int32x4_t iacc_row_0 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); + const int32x4_t iacc_row_1 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); + const int32x4_t iacc_row_2 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_10), vreinterpretq_u64_s32(iacc_mat_11))); + const int32x4_t iacc_row_3 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_10), vreinterpretq_u64_s32(iacc_mat_11))); + + const float16x4_t row_scale_f16 = vld1_f16(a_ptrs[rp][b].d); + const float32x4_t row_scale_f32 = vcvt_f32_f16(row_scale_f16); + + acc_rows[rp * 4] = vfmaq_f32(acc_rows[rp * 4], vcvtq_f32_s32(iacc_row_0), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 0)); + acc_rows[rp * 4 + 1] = vfmaq_f32(acc_rows[rp * 4 + 1], vcvtq_f32_s32(iacc_row_1), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 1)); + acc_rows[rp * 4 + 2] = vfmaq_f32(acc_rows[rp * 4 + 2], vcvtq_f32_s32(iacc_row_2), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 2)); + acc_rows[rp * 4 + 3] = vfmaq_f32(acc_rows[rp * 4 + 3], vcvtq_f32_s32(iacc_row_3), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 3)); + } + } + + for (int i = 0; i < rows; i++) { + vst1q_f32(s + ((y * 4 + i) * output_channels + x * 4), acc_rows[i]); + } + } + } +#endif +} + +void ggml_gemm_q4_0_q8_0_2x4blocked_mmla(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +#if defined(__ARM_FEATURE_MATMUL_INT8) + int rows = 2; + int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); + int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); + + int64_t nb = n / QK4_0; + int64_t a_nb = n / QK8_0; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const block_q4_0x4 * b_ptr_start = vx; + const block_q8_0x2 * a_ptr_start = vy; + + for (int64_t y = 0; y < input_width / 2; y += rows / 2) { + for (int64_t x = x0 / 4; x < xend / 4; x++) { + const block_q8_0x2 * a_ptrs[rows / 2]; + + a_ptrs[0] = a_ptr_start + (y * a_nb); + + const block_q4_0x4 * b_ptr = b_ptr_start + (x * nb); + + // Master FP accumulators + float32x4_t acc_rows[rows]; + acc_rows[0] = vdupq_n_f32(0.0f); + acc_rows[1] = vdupq_n_f32(0.0f); + + for (int64_t b = 0; b < nb; b++) { + // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) + const uint8x16_t rhs_raw_mat_01_0 = vld1q_u8(b_ptr[b].qs); + const uint8x16_t rhs_raw_mat_23_0 = vld1q_u8(b_ptr[b].qs + 16); + const uint8x16_t rhs_raw_mat_01_1 = vld1q_u8(b_ptr[b].qs + 32); + const uint8x16_t rhs_raw_mat_23_1 = vld1q_u8(b_ptr[b].qs + 48); + + const int8x16_t rhs_mat_01_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_01_0, m4b)), s8b); + const int8x16_t rhs_mat_23_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_23_0, m4b)), 
s8b); + const int8x16_t rhs_mat_01_1 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_01_1, m4b)), s8b); + const int8x16_t rhs_mat_23_1 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_23_1, m4b)), s8b); + + const int8x16_t rhs_mat_01_2 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_01_0), 4); + const int8x16_t rhs_mat_23_2 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_23_0), 4); + const int8x16_t rhs_mat_01_3 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_01_1), 4); + const int8x16_t rhs_mat_23_3 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_23_1), 4); + + // Scale values - assemble the four row/column scales into a (64-bit) vector, then expand to FP32 + const float16x4_t col_scale_f16 = vld1_f16(b_ptr[b].d); + const float32x4_t col_scale_f32 = vcvt_f32_f16(col_scale_f16); + + // Process LHS in pairs of rows + int rp = 0; + const int8x16_t lhs_mat_01_0 = vld1q_s8(a_ptrs[rp][b].qs); + const int8x16_t lhs_mat_01_1 = vld1q_s8(a_ptrs[rp][b].qs + 16); + + const int8x16_t lhs_mat_01_2 = vld1q_s8(a_ptrs[rp][b].qs + 32); + const int8x16_t lhs_mat_01_3 = vld1q_s8(a_ptrs[rp][b].qs + 48); + + // Do the MMLAs into 2x2 matrices + const int32x4_t iacc_mat_00 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_01_0), lhs_mat_01_1, rhs_mat_01_1), lhs_mat_01_2, rhs_mat_01_2), lhs_mat_01_3, rhs_mat_01_3); + const int32x4_t iacc_mat_01 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_23_0), lhs_mat_01_1, rhs_mat_23_1), lhs_mat_01_2, rhs_mat_23_2), lhs_mat_01_3, rhs_mat_23_3); + + // Straighten out to make 2 row vectors + const int32x4_t iacc_row_0 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); + const int32x4_t iacc_row_1 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); + + const float16x4_t row_scale_f16_0 = vld1_dup_f16(&(a_ptrs[rp][b].d[0])); + const float32x4_t row_scale_f32_0 = vcvt_f32_f16(row_scale_f16_0); + const float16x4_t row_scale_f16_1 = vld1_dup_f16(&(a_ptrs[rp][b].d[1])); + const float32x4_t row_scale_f32_1 = vcvt_f32_f16(row_scale_f16_1); + + acc_rows[rp * 2] = vfmaq_f32(acc_rows[rp * 2], vcvtq_f32_s32(iacc_row_0), vmulq_f32(col_scale_f32, row_scale_f32_0)); + acc_rows[rp * 2 + 1] = vfmaq_f32(acc_rows[rp * 2 + 1], vcvtq_f32_s32(iacc_row_1), vmulq_f32(col_scale_f32, row_scale_f32_1)); + } + + vst1q_f32(s + ((y * 2) * output_channels + x * 4), acc_rows[0]); + vst1q_f32(s + ((y * 2 + 1) * output_channels + x * 4), acc_rows[1]); + } + } +#endif +} + +void ggml_gemv_q8_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +#if defined(__ARM_NEON) + int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)8); + int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)8); + + int64_t nb = n / QK8_0; + int64_t a_nb = n / QK8_0; + + const block_q8_0x8 * b_ptr_start = vx; + const block_q8_0 * a_ptr_start = vy; + + for (int64_t y = 0; y < input_width; y++) { + for (int64_t x = x0 / 8; x < xend / 8; x++) { + // Pointers to LHS blocks + const block_q8_0 * a_ptr = a_ptr_start + (y * a_nb); + // Pointers to RHS blocks + const block_q8_0x8 * b_ptr = b_ptr_start + (x * nb); + // Master FP accumulator + float32x4_t acc_row[2]; + acc_row[0] = acc_row[1] = vdupq_n_f32(0.0f); + + for (int64_t b = 0; b < nb; b++) { + // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) + const int8x16_t 
rhs_vec_0_0_0 = vld1q_s8(b_ptr[b].qs); + const int8x16_t rhs_vec_1_0_0 = vld1q_s8(b_ptr[b].qs + 16); + const int8x16_t rhs_vec_0_1_0 = vld1q_s8(b_ptr[b].qs + 32); + const int8x16_t rhs_vec_1_1_0 = vld1q_s8(b_ptr[b].qs + 48); + const int8x16_t rhs_vec_0_2_0 = vld1q_s8(b_ptr[b].qs + 64); + const int8x16_t rhs_vec_1_2_0 = vld1q_s8(b_ptr[b].qs + 80); + const int8x16_t rhs_vec_0_3_0 = vld1q_s8(b_ptr[b].qs + 96); + const int8x16_t rhs_vec_1_3_0 = vld1q_s8(b_ptr[b].qs + 112); + const int8x16_t rhs_vec_0_0_1 = vld1q_s8(b_ptr[b].qs + 128); + const int8x16_t rhs_vec_1_0_1 = vld1q_s8(b_ptr[b].qs + 144); + const int8x16_t rhs_vec_0_1_1 = vld1q_s8(b_ptr[b].qs + 160); + const int8x16_t rhs_vec_1_1_1 = vld1q_s8(b_ptr[b].qs + 176); + const int8x16_t rhs_vec_0_2_1 = vld1q_s8(b_ptr[b].qs + 192); + const int8x16_t rhs_vec_1_2_1 = vld1q_s8(b_ptr[b].qs + 208); + const int8x16_t rhs_vec_0_3_1 = vld1q_s8(b_ptr[b].qs + 224); + const int8x16_t rhs_vec_1_3_1 = vld1q_s8(b_ptr[b].qs + 240); + + // Scale values - assemble the four row/column scales into a (64-bit) vector, then expand to FP32 + const float16x8_t col_scale_f16 = vld1q_f16(b_ptr[b].d); + const float32x4_t col_scale_f32_0 = vcvt_f32_f16(vget_low_f16(col_scale_f16)); + const float32x4_t col_scale_f32_1 = vcvt_f32_f16(vget_high_f16(col_scale_f16)); + + const float16x4_t row_scale_f16 = vld1_dup_f16(&(a_ptr[b].d)); + const float32x4_t row_scale_f32 = vcvt_f32_f16(row_scale_f16); + + const int8x16_t lhs_vec_0 = vld1q_s8(a_ptr[b].qs); + const int8x16_t lhs_vec_1 = vld1q_s8(a_ptr[b].qs + 16); + + int32x4_t iacc0 = vdupq_n_s32(0); + int32x4_t iacc1 = vdupq_n_s32(0); + + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_0_0, lhs_vec_0, 0); + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_0_1, lhs_vec_1, 0); + + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_0_0, lhs_vec_0, 0); + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_0_1, lhs_vec_1, 0); + + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_1_0, lhs_vec_0, 1); + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_1_1, lhs_vec_1, 1); + + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_1_0, lhs_vec_0, 1); + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_1_1, lhs_vec_1, 1); + + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_2_0, lhs_vec_0, 2); + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_2_1, lhs_vec_1, 2); + + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_2_0, lhs_vec_0, 2); + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_2_1, lhs_vec_1, 2); + + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_3_0, lhs_vec_0, 3); + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_3_1, lhs_vec_1, 3); + + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_3_0, lhs_vec_0, 3); + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_3_1, lhs_vec_1, 3); + + acc_row[0] = vfmaq_f32(acc_row[0], vcvtq_f32_s32(iacc0), vmulq_f32(col_scale_f32_0, row_scale_f32)); + acc_row[1] = vfmaq_f32(acc_row[1], vcvtq_f32_s32(iacc1), vmulq_f32(col_scale_f32_1, row_scale_f32)); + } + + vst1q_f32(s + (y * output_channels + x * 8), acc_row[0]); + vst1q_f32(s + (y * output_channels + x * 8 + 4), acc_row[1]); + } + } +#endif +} + +void ggml_gemv_q8_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +#if defined(__ARM_FEATURE_SVE) + int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)8); + int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)8); + + int64_t nb = n / QK8_0; + int64_t a_nb = n / QK8_0; + + const svbool_t ptrue = svptrue_b8(); + + const block_q8_0x8 * b_ptr_start = vx; + const block_q8_0 * a_ptr_start = vy; + + for 
(int64_t y = 0; y < input_width; y++) { + for (int64_t x = x0 / 8; x < xend / 8; x++) { + // Pointers to LHS blocks + const block_q8_0 * a_ptr = a_ptr_start + (y * a_nb); + // Pointers to RHS blocks + const block_q8_0x8 * b_ptr = b_ptr_start + (x * nb); + + // Master FP accumulator + svfloat32_t acc_row = svdup_f32(0.0f); + + for (int64_t b = 0; b < nb; b++) { + // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) + const svint8_t rhs_vec_0_0_0 = svld1_s8(ptrue, b_ptr[b].qs); + const svint8_t rhs_vec_0_1_0 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 1); + const svint8_t rhs_vec_0_2_0 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 2); + const svint8_t rhs_vec_0_3_0 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 3); + const svint8_t rhs_vec_0_0_1 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 4); + const svint8_t rhs_vec_0_1_1 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 5); + const svint8_t rhs_vec_0_2_1 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 6); + const svint8_t rhs_vec_0_3_1 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 7); + + // Scale values + const svfloat16_t col_scale_f16 = svreinterpret_f16_u32(svld1uh_u32(ptrue, (const uint16_t *) b_ptr[b].d)); + const svfloat32_t col_scale_f32 = svcvt_f32_f16_x(ptrue, col_scale_f16); + + const svfloat16_t row_scale_f16 = svdup_f16(a_ptr[b].d); + const svfloat32_t row_scale_f32 = svcvt_f32_f16_x(ptrue, row_scale_f16); + + const svint8_t lhs_vec_0 = svld1rq_s8(ptrue, a_ptr[b].qs); + const svint8_t lhs_vec_1 = svld1rq_s8(ptrue, a_ptr[b].qs + 16); + + svint32_t iacc = svdup_s32(0); + + iacc = svdot_lane(iacc, rhs_vec_0_0_0, lhs_vec_0, 0); + iacc = svdot_lane(iacc, rhs_vec_0_0_1, lhs_vec_1, 0); + + iacc = svdot_lane(iacc, rhs_vec_0_1_0, lhs_vec_0, 1); + iacc = svdot_lane(iacc, rhs_vec_0_1_1, lhs_vec_1, 1); + + iacc = svdot_lane(iacc, rhs_vec_0_2_0, lhs_vec_0, 2); + iacc = svdot_lane(iacc, rhs_vec_0_2_1, lhs_vec_1, 2); + + iacc = svdot_lane(iacc, rhs_vec_0_3_0, lhs_vec_0, 3); + iacc = svdot_lane(iacc, rhs_vec_0_3_1, lhs_vec_1, 3); + + acc_row = svmla_x(ptrue, acc_row, svcvt_f32_s32_x(ptrue, iacc), svmul_x(ptrue, col_scale_f32, row_scale_f32)); + } + + svst1(ptrue, s + (y * output_channels + x * 8), acc_row); + } + } +#endif +} + +void ggml_gemm_q8_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +#if defined(__ARM_FEATURE_MATMUL_INT8) + int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); + int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); + + int64_t nb = n / QK8_0; + int64_t a_nb = n / QK8_0; + + const block_q8_0x4 * b_ptr_start = vx; + const block_q8_0x4 * a_ptr_start = vy; + + for (int64_t y = 0; y < input_width / 4; y += rows / 4) { + for (int64_t x = x0 / 4; x < xend / 4; x++) { + const block_q8_0x4 * a_ptrs[rows / 4]; + + a_ptrs[0] = a_ptr_start + (y * a_nb); + for (int i = 0; i < (rows / 4) - 1; i++) { + a_ptrs[i + 1] = a_ptrs[i] + a_nb; + } + + const block_q8_0x4 * b_ptr = b_ptr_start + (x * nb); + + // Master FP accumulators + float32x4_t acc_rows[rows]; + for (int i = 0; i < rows; i++) { + acc_rows[i] = vdupq_n_f32(0.0f); + } + + for (int64_t b = 0; b < nb; b++) { + // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) + const int8x16_t rhs_mat_01_0 = vld1q_s8(b_ptr[b].qs); + const int8x16_t rhs_mat_23_0 = vld1q_s8(b_ptr[b].qs + 16); + const int8x16_t rhs_mat_01_1 = vld1q_s8(b_ptr[b].qs + 32); + const int8x16_t rhs_mat_23_1 = vld1q_s8(b_ptr[b].qs + 48); + const int8x16_t rhs_mat_01_2 = vld1q_s8(b_ptr[b].qs + 64); + const int8x16_t 
rhs_mat_23_2 = vld1q_s8(b_ptr[b].qs + 80); + const int8x16_t rhs_mat_01_3 = vld1q_s8(b_ptr[b].qs + 96); + const int8x16_t rhs_mat_23_3 = vld1q_s8(b_ptr[b].qs + 112); + + // Scale values - assemble the four row/column scales into a (64-bit) vector, then expand to FP32 + const float16x4_t col_scale_f16 = vld1_f16(b_ptr[b].d); + const float32x4_t col_scale_f32 = vcvt_f32_f16(col_scale_f16); + + // Process LHS in pairs of rows + for (int rp = 0; rp < rows / 4; rp++) { + const int8x16_t lhs_mat_01_0 = vld1q_s8(a_ptrs[rp][b].qs); + const int8x16_t lhs_mat_23_0 = vld1q_s8(a_ptrs[rp][b].qs + 16); + const int8x16_t lhs_mat_01_1 = vld1q_s8(a_ptrs[rp][b].qs + 32); + const int8x16_t lhs_mat_23_1 = vld1q_s8(a_ptrs[rp][b].qs + 48); + + const int8x16_t lhs_mat_01_2 = vld1q_s8(a_ptrs[rp][b].qs + 64); + const int8x16_t lhs_mat_23_2 = vld1q_s8(a_ptrs[rp][b].qs + 80); + const int8x16_t lhs_mat_01_3 = vld1q_s8(a_ptrs[rp][b].qs + 96); + const int8x16_t lhs_mat_23_3 = vld1q_s8(a_ptrs[rp][b].qs + 112); + + // Do the MMLAs into 2x2 matrices + const int32x4_t iacc_mat_00 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_01_0), lhs_mat_01_1, rhs_mat_01_1), lhs_mat_01_2, rhs_mat_01_2), lhs_mat_01_3, rhs_mat_01_3); + const int32x4_t iacc_mat_01 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_23_0), lhs_mat_01_1, rhs_mat_23_1), lhs_mat_01_2, rhs_mat_23_2), lhs_mat_01_3, rhs_mat_23_3); + const int32x4_t iacc_mat_10 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_23_0, rhs_mat_01_0), lhs_mat_23_1, rhs_mat_01_1), lhs_mat_23_2, rhs_mat_01_2), lhs_mat_23_3, rhs_mat_01_3); + const int32x4_t iacc_mat_11 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_23_0, rhs_mat_23_0), lhs_mat_23_1, rhs_mat_23_1), lhs_mat_23_2, rhs_mat_23_2), lhs_mat_23_3, rhs_mat_23_3); + + // Straighten out to make 4 row vectors + const int32x4_t iacc_row_0 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); + const int32x4_t iacc_row_1 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); + const int32x4_t iacc_row_2 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_10), vreinterpretq_u64_s32(iacc_mat_11))); + const int32x4_t iacc_row_3 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_10), vreinterpretq_u64_s32(iacc_mat_11))); + + const float16x4_t row_scale_f16 = vld1_f16(a_ptrs[rp][b].d); + const float32x4_t row_scale_f32 = vcvt_f32_f16(row_scale_f16); + + acc_rows[rp * 4] = vfmaq_f32(acc_rows[rp * 4], vcvtq_f32_s32(iacc_row_0), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 0)); + acc_rows[rp * 4 + 1] = vfmaq_f32(acc_rows[rp * 4 + 1], vcvtq_f32_s32(iacc_row_1), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 1)); + acc_rows[rp * 4 + 2] = vfmaq_f32(acc_rows[rp * 4 + 2], vcvtq_f32_s32(iacc_row_2), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 2)); + acc_rows[rp * 4 + 3] = vfmaq_f32(acc_rows[rp * 4 + 3], vcvtq_f32_s32(iacc_row_3), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 3)); + } + } + + for (int i = 0; i < rows; i++) { + vst1q_f32(s + ((y * 4 + i) * output_channels + x * 4), acc_rows[i]); + } + } + } +#endif +} + +void ggml_gemm_q8_0_q8_0_2x4blocked_mmla(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +#if defined(__ARM_FEATURE_MATMUL_INT8) + int rows = 2; + int64_t x0 = roundup((ith * 
output_channels) / nth, (int64_t)4); + int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); + + int64_t nb = n / QK8_0; + int64_t a_nb = n / QK8_0; + + const block_q8_0x4 * b_ptr_start = vx; + const block_q8_0x2 * a_ptr_start = vy; + + for (int64_t y = 0; y < input_width / 2; y += rows / 2) { + for (int64_t x = x0 / 4; x < xend / 4; x++) { + const block_q8_0x2 * a_ptrs[rows / 2]; + + a_ptrs[0] = a_ptr_start + (y * a_nb); + + const block_q8_0x4 * b_ptr = b_ptr_start + (x * nb); + + // Master FP accumulators + float32x4_t acc_rows[rows]; + acc_rows[0] = vdupq_n_f32(0.0f); + acc_rows[1] = vdupq_n_f32(0.0f); + + for (int64_t b = 0; b < nb; b++) { + const int8x16_t rhs_mat_01_0 = vld1q_s8(b_ptr[b].qs); + const int8x16_t rhs_mat_23_0 = vld1q_s8(b_ptr[b].qs + 16); + const int8x16_t rhs_mat_01_1 = vld1q_s8(b_ptr[b].qs + 32); + const int8x16_t rhs_mat_23_1 = vld1q_s8(b_ptr[b].qs + 48); + const int8x16_t rhs_mat_01_2 = vld1q_s8(b_ptr[b].qs + 64); + const int8x16_t rhs_mat_23_2 = vld1q_s8(b_ptr[b].qs + 80); + const int8x16_t rhs_mat_01_3 = vld1q_s8(b_ptr[b].qs + 96); + const int8x16_t rhs_mat_23_3 = vld1q_s8(b_ptr[b].qs + 112); + + // Scale values - assemble the four row/column scales into a (64-bit) vector, then expand to FP32 + const float16x4_t col_scale_f16 = vld1_f16(b_ptr[b].d); + const float32x4_t col_scale_f32 = vcvt_f32_f16(col_scale_f16); + + // Process LHS in pairs of rows + int rp = 0; + const int8x16_t lhs_mat_01_0 = vld1q_s8(a_ptrs[rp][b].qs); + const int8x16_t lhs_mat_01_1 = vld1q_s8(a_ptrs[rp][b].qs + 16); + + const int8x16_t lhs_mat_01_2 = vld1q_s8(a_ptrs[rp][b].qs + 32); + const int8x16_t lhs_mat_01_3 = vld1q_s8(a_ptrs[rp][b].qs + 48); + + // Do the MMLAs into 2x2 matrices + const int32x4_t iacc_mat_00 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_01_0), lhs_mat_01_1, rhs_mat_01_1), lhs_mat_01_2, rhs_mat_01_2), lhs_mat_01_3, rhs_mat_01_3); + const int32x4_t iacc_mat_01 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_23_0), lhs_mat_01_1, rhs_mat_23_1), lhs_mat_01_2, rhs_mat_23_2), lhs_mat_01_3, rhs_mat_23_3); + + // Straighten out to make 2 row vectors + const int32x4_t iacc_row_0 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); + const int32x4_t iacc_row_1 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); + + const float16x4_t row_scale_f16_0 = vld1_dup_f16(&(a_ptrs[rp][b].d[0])); + const float32x4_t row_scale_f32_0 = vcvt_f32_f16(row_scale_f16_0); + const float16x4_t row_scale_f16_1 = vld1_dup_f16(&(a_ptrs[rp][b].d[1])); + const float32x4_t row_scale_f32_1 = vcvt_f32_f16(row_scale_f16_1); + + acc_rows[rp * 2] = vfmaq_f32(acc_rows[rp * 2], vcvtq_f32_s32(iacc_row_0), vmulq_f32(col_scale_f32, row_scale_f32_0)); + acc_rows[rp * 2 + 1] = vfmaq_f32(acc_rows[rp * 2 + 1], vcvtq_f32_s32(iacc_row_1), vmulq_f32(col_scale_f32, row_scale_f32_1)); + } + vst1q_f32(s + ((y * 2) * output_channels + x * 4), acc_rows[0]); + vst1q_f32(s + ((y * 2 + 1) * output_channels + x * 4), acc_rows[1]); + } + } +#endif } static bool validate_float(float f, size_t i) { diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 30983b8728fa2..852263da609b8 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -1,3 +1,4 @@ +// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd. 
#pragma once #define GGML_COMMON_DECL_C @@ -7,6 +8,250 @@ // GGML internal header +#include +#include + +#define QK4_0 32 +typedef struct { + ggml_fp16_t d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; +static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); + +#define QK4_1 32 +typedef struct { + ggml_fp16_t d; // delta + ggml_fp16_t m; // min + uint8_t qs[QK4_1 / 2]; // nibbles / quants +} block_q4_1; +static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding"); + +#define QK5_0 32 +typedef struct { + ggml_fp16_t d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; +static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); + +#define QK5_1 32 +typedef struct { + ggml_fp16_t d; // delta + ggml_fp16_t m; // min + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_1 / 2]; // nibbles / quants +} block_q5_1; +static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); + +#define QK8_0 32 +typedef struct { + ggml_fp16_t d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; +static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); + +#define QK8_1 32 +typedef struct { + float d; // delta + float s; // d * sum(qs[i]) + int8_t qs[QK8_1]; // quants +} block_q8_1; +static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding"); + +typedef struct { + ggml_fp16_t d[4]; // deltas for 4 q4_0 blocks + uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks +} block_q4_0x4; +static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_fp16_t) + QK4_0 * 2, "wrong q4_0x4 block size/padding"); + +typedef struct { + ggml_fp16_t d[8]; // deltas for 8 q4_0 blocks + uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks +} block_q4_0x8; +static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_fp16_t) + QK4_0 * 4, "wrong q4_0x8 block size/padding"); + +typedef struct { + ggml_fp16_t d[16]; // deltas for 16 q4_0 blocks + uint8_t qs[QK4_0 * 8]; // nibbles / quants for 16 q4_0 blocks +} block_q4_0x16; +static_assert(sizeof(block_q4_0x16) == 16 * sizeof(ggml_fp16_t) + QK4_0 * 8, "wrong q4_0x16 block size/padding"); + +typedef struct { + ggml_fp16_t d[64]; // deltas for 64 q4_0 blocks + uint8_t qs[QK4_0 * 32];// nibbles / quants for 64 q4_0 blocks +} block_q4_0x64; +static_assert(sizeof(block_q4_0x64) == 64 * sizeof(ggml_fp16_t) + QK4_0 * 32, "wrong q4_0x64 block size/padding"); + +typedef struct { + ggml_fp16_t d[2]; // deltas for 2 q8_0 blocks + int8_t qs[QK8_0 * 2]; // quants for 2 q8_0 blocks +} block_q8_0x2; +static_assert(sizeof(block_q8_0x2) == 2 * sizeof(ggml_fp16_t) + QK8_0 * 2, "wrong q8_0x2 block size/padding"); + +typedef struct { + ggml_fp16_t d[4]; // deltas for 4 q8_0 blocks + int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks +} block_q8_0x4; +static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_fp16_t) + QK8_0 * 4, "wrong q8_0x4 block size/padding"); + +typedef struct { + ggml_fp16_t d[8]; // deltas for 8 q8_0 blocks + int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks +} block_q8_0x8; +static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_fp16_t) + QK8_0 * 8, "wrong q8_0x8 block size/padding"); + +// +// Super-block quantization structures +// + +// Super-block size +#ifdef GGML_QKK_64 +#define QK_K 64 +#define 
K_SCALE_SIZE 4 +#else +#define QK_K 256 +#define K_SCALE_SIZE 12 +#endif + +// 2-bit quantization +// weight is represented as x = a * q + b +// 16 blocks of 16 elements each +// Effectively 2.625 bits per weight +typedef struct { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + ggml_fp16_t d; // super-block scale for quantized scales + ggml_fp16_t dmin; // super-block scale for quantized mins +} block_q2_K; +static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); + +// 3-bit quantization +// weight is represented as x = a * q +// 16 blocks of 16 elements each +// Effectively 3.4375 bits per weight +#ifdef GGML_QKK_64 +typedef struct { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits + uint8_t scales[2]; + ggml_fp16_t d; // super-block scale +} block_q3_K; +static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding"); +#else +typedef struct { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits + uint8_t scales[12]; // scales, quantized with 6 bits + ggml_fp16_t d; // super-block scale +} block_q3_K; +static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding"); +#endif + +// 4-bit quantization +// 8 blocks of 32 elements each +// weight is represented as x = a * q + b +// Effectively 4.5 bits per weight +#ifdef GGML_QKK_64 +typedef struct { + ggml_fp16_t d[2]; // super-block scales/mins + uint8_t scales[2]; // 4-bit block scales/mins + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; +static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding"); +#else +typedef struct { + ggml_fp16_t d; // super-block scale for quantized scales + ggml_fp16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; +static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding"); +#endif + +// 5-bit quantization +// 8 blocks of 32 elements each +// weight is represented as x = a * q + b +// Effectively 5.5 bits per weight +#ifdef GGML_QKK_64 +typedef struct { + ggml_fp16_t d; // super-block scale + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding"); +#else +typedef struct { + ggml_fp16_t d; // super-block scale for quantized scales + ggml_fp16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); +#endif + +// 6-bit quantization +// weight is represented as x = a * q +// 16 blocks of 16 elements each +// Effectively 6.5625 bits per weight +typedef struct { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + ggml_fp16_t d; // super-block scale +} block_q6_K; +static_assert(sizeof(block_q6_K) == 
sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding"); + +// This is only used for intermediate quantization and dot products +typedef struct { + float d; // delta + int8_t qs[QK_K]; // quants + int16_t bsums[QK_K/16]; // sum of quants in groups of 16 +} block_q8_K; +static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding"); + +// (Almost) "true" 2-bit quantization. +// Due to the need to use blocks as per ggml design, it ends up using +// 2.0625 bpw because of the 16-bit scale for each block of 256. +typedef struct { + ggml_fp16_t d; + uint16_t qs[QK_K/8]; +} block_iq2_xxs; +static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding"); + +// 2.3125 bpw quants +typedef struct { + ggml_fp16_t d; + uint16_t qs[QK_K/8]; + uint8_t scales[QK_K/32]; +} block_iq2_xs; +static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); + +// (Almost) "true" 3-bit quantization. +// Due to the need to use blocks as per ggml design, it ends up using +// 3.0625 bpw because of the 16-bit scale for each block of 256. +typedef struct { + ggml_fp16_t d; + uint8_t qs[3*QK_K/8]; +} block_iq3_xxs; +static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); + +typedef struct { + ggml_fp16_t d; + uint8_t qs[QK_K/8]; + uint8_t scales[QK_K/16]; +} block_iq1_s; +static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding"); + +// Non-linear quants +#define QK4_NL 32 +typedef struct { + ggml_fp16_t d; + uint8_t qs[QK4_NL/2]; +} block_iq4_nl; +static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding"); + #ifdef __cplusplus extern "C" { #endif @@ -127,6 +372,25 @@ void iq2xs_free_impl(enum ggml_type type); void iq3xs_init_impl(int grid_size); void iq3xs_free_impl(int grid_size); +block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len); +block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len); +block_q8_0x4 make_block_q8_0x4(const block_q8_0 * const in[4], unsigned int block_len); +block_q8_0x8 make_block_q8_0x8(const block_q8_0 * const in[8], unsigned int block_len); +void quantize_row_q8_0_and_make_block_q8_0x2(const float * restrict x, void * restrict vy, int k, int rows_interleaved); +void quantize_row_q8_0_and_make_block_q8_0x4(const float * restrict x, void * restrict vy, int k, int rows_interleaved); + +// GEMV +void ggml_gemv_q4_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); +void ggml_gemv_q4_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); +void ggml_gemv_q8_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); +void ggml_gemv_q8_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); + +// GEMM +void ggml_gemm_q4_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int 
nth); +void ggml_gemm_q4_0_q8_0_2x4blocked_mmla(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); +void ggml_gemm_q8_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); +void ggml_gemm_q8_0_q8_0_2x4blocked_mmla(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index bc91ac3a726ab..8b613a6a09534 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1,3 +1,4 @@ +// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd. #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows #define _USE_MATH_DEFINES // For M_PI on MSVC @@ -473,6 +474,204 @@ int64_t ggml_cycles_per_ms(void) { return CLOCKS_PER_SEC/1000; } +#ifdef GGML_PERF +#define ggml_perf_time_ms() ggml_time_ms() +#define ggml_perf_time_us() ggml_time_us() +#define ggml_perf_cycles() ggml_cycles() +#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms() +#else +#define ggml_perf_time_ms() 0 +#define ggml_perf_time_us() 0 +#define ggml_perf_cycles() 0 +#define ggml_perf_cycles_per_ms() 0 +#endif + +void rearrange_q4_0_weights_blocked8_neon(struct ggml_tensor * cur) { + block_q4_0x8 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data; + block_q4_0x8 * out_ptr_B_start = out_ptr_B; + int64_t nb = cur->ne[0] / QK4_0; + + for (int y_out = 0; y_out < cur->ne[1] / 8; y_out++) { + const block_q4_0 * in_ptrs[8]; + + in_ptrs[0] = (block_q4_0 *) cur->data + (y_out * 8 * nb); + for (int i = 0; i < 7; i++) { + in_ptrs[i + 1] = in_ptrs[i] + nb; + } + + for (int64_t x = 0; x < nb; x++) { + *out_ptr_B = make_block_q4_0x8(in_ptrs, 4); // block_len=4 for SDOT + out_ptr_B++; + + for (int i = 0; i < 8; i++) { + in_ptrs[i]++; + } + } + } + cur->rearranged_weight_gemv = (uint8_t *) out_ptr_B_start; +} + +void rearrange_q4_0_weights_blocked8_sve(struct ggml_tensor * cur) { +#if defined(__ARM_FEATURE_SVE) + if (svcntw() != 8) { + printf("ggml_gemv_q4_0_q8_0_blocked8_sve: SVE VL != 256 - aborting. 
Use Arm Neon GEMV kernels\n"); + exit(1); + } + + block_q4_0x8 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data; + block_q4_0x8 * out_ptr_B_start = out_ptr_B; + int64_t nb = cur->ne[0] / QK4_0; + + for (int y_out = 0; y_out < cur->ne[1] / 8; y_out++) { + const block_q4_0 * in_ptrs[8]; + + in_ptrs[0] = (block_q4_0 *) cur->data + (y_out * 8 * nb); + for (int i = 0; i < 7; i++) { + in_ptrs[i + 1] = in_ptrs[i] + nb; + } + + for (int64_t x = 0; x < nb; x++) { + *out_ptr_B = make_block_q4_0x8(in_ptrs, 4); // block_len=4 for SDOT + out_ptr_B++; + + for (int i = 0; i < 8; i++) { + in_ptrs[i]++; + } + } + } + cur->rearranged_weight_gemv = (uint8_t *) out_ptr_B_start; +#endif +} + +#if defined(__ARM_FEATURE_SVE) +static void (*_rearrange_q4_0_weights_for_gemv)(struct ggml_tensor *) = &rearrange_q4_0_weights_blocked8_sve; +#elif defined(__ARM_NEON) +static void (*_rearrange_q4_0_weights_for_gemv)(struct ggml_tensor *) = &rearrange_q4_0_weights_blocked8_neon; +#endif + +#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) +void rearrange_q4_0_weights_for_gemv(struct ggml_tensor * cur) { _rearrange_q4_0_weights_for_gemv(cur); } +#endif + +void rearrange_q4_0_weights_for_gemm(struct ggml_tensor * cur) { + block_q4_0x4 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data; + block_q4_0x4 * out_ptr_B_start = out_ptr_B; + int64_t nb = cur->ne[0] / QK4_0; + + for (int y_out = 0; y_out < cur->ne[1] / 4; y_out++) { + const block_q4_0 * in_ptrs[4]; + + in_ptrs[0] = (block_q4_0 *) cur->data + (y_out * 4 * nb); + for (int i = 0; i < 3; i++) { + in_ptrs[i + 1] = in_ptrs[i] + nb; + } + + for (int64_t x = 0; x < nb; x++) { + *out_ptr_B = + make_block_q4_0x4(in_ptrs, 8); // block_len=8 for SMMLA + out_ptr_B++; + + for (int i = 0; i < 4; i++) { + in_ptrs[i]++; + } + } + } + cur->rearranged_weight_gemm = (uint8_t *) out_ptr_B_start; +} + +void rearrange_q8_0_weights_blocked8_neon(struct ggml_tensor * cur) { + block_q8_0x8 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data; + block_q8_0x8 * out_ptr_B_start = out_ptr_B; + int64_t nb = cur->ne[0] / QK8_0; + + for (int y_out = 0; y_out < cur->ne[1] / 8; y_out++) { + const block_q8_0 * in_ptrs[8]; + + in_ptrs[0] = (block_q8_0 *) cur->data + (y_out * 8 * nb); + for (int i = 0; i < 7; i++) { + in_ptrs[i + 1] = in_ptrs[i] + nb; + } + + for (int64_t x = 0; x < nb; x++) { + *out_ptr_B = make_block_q8_0x8(in_ptrs, 4); // block_len=4 for SDOT + out_ptr_B++; + + for (int i = 0; i < 8; i++) { + in_ptrs[i]++; + } + } + } + cur->rearranged_weight_gemv = (uint8_t *) out_ptr_B_start; +} + +void rearrange_q8_0_weights_blocked8_sve(struct ggml_tensor * cur) { +#if defined(__ARM_FEATURE_SVE) + if (svcntw() != 8) { + printf("ggml_gemv_q8_0_q8_0_blocked8_sve: SVE VL != 256 - aborting. 
Use Arm Neon GEMV kernels\n"); + exit(1); + } + + block_q8_0x8 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data; + block_q8_0x8 * out_ptr_B_start = out_ptr_B; + int64_t nb = cur->ne[0] / QK8_0; + + for (int y_out = 0; y_out < cur->ne[1] / 8; y_out++) { + const block_q8_0 * in_ptrs[8]; + + in_ptrs[0] = (block_q8_0 *) cur->data + (y_out * 8 * nb); + for (int i = 0; i < 7; i++) { + in_ptrs[i + 1] = in_ptrs[i] + nb; + } + + for (int64_t x = 0; x < nb; x++) { + *out_ptr_B = make_block_q8_0x8(in_ptrs, 4); // block_len=4 for SDOT + out_ptr_B++; + + for (int i = 0; i < 8; i++) { + in_ptrs[i]++; + } + } + } + cur->rearranged_weight_gemv = (uint8_t *) out_ptr_B_start; +#endif +} + +#if defined(__ARM_FEATURE_SVE) +static void (*_rearrange_q8_0_weights_for_gemv)(struct ggml_tensor *) = &rearrange_q8_0_weights_blocked8_sve; +#elif defined(__ARM_NEON) +static void (*_rearrange_q8_0_weights_for_gemv)(struct ggml_tensor *) = &rearrange_q8_0_weights_blocked8_neon; +#endif + +#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) +void rearrange_q8_0_weights_for_gemv(struct ggml_tensor * cur) { _rearrange_q8_0_weights_for_gemv(cur); } +#endif + +void rearrange_q8_0_weights_for_gemm(struct ggml_tensor * cur) { + block_q8_0x4 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data; + block_q8_0x4 * out_ptr_B_start = out_ptr_B; + int64_t nb = cur->ne[0] / QK8_0; + + for (int y_out = 0; y_out < cur->ne[1] / 4; y_out++) { + const block_q8_0 * in_ptrs[4]; + + in_ptrs[0] = (block_q8_0 *) cur->data + (y_out * 4 * nb); + for (int i = 0; i < 3; i++) { + in_ptrs[i + 1] = in_ptrs[i] + nb; + } + + for (int64_t x = 0; x < nb; x++) { + *out_ptr_B = + make_block_q8_0x4(in_ptrs, 8); // block_len=8 for SMMLA + out_ptr_B++; + + for (int i = 0; i < 4; i++) { + in_ptrs[i]++; + } + } + } + cur->rearranged_weight_gemm = (uint8_t *) out_ptr_B_start; +} + // // cross-platform UTF-8 file paths // @@ -2605,6 +2804,10 @@ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) { *s = idx; } +static void ggml_gemv_q4_0_q8_0(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); + +static void ggml_gemv_q8_0_q8_0(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); + // // data types // @@ -3647,6 +3850,9 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.name =*/ { 0 }, /*.extra =*/ NULL, ///*.padding =*/ { 0 }, + /*.rearranged_weight_gemv =*/ NULL, + /*.rearranged_weight_gemm =*/ NULL, + /*.weight_rearranged =*/ false, }; #ifdef __clang__ @@ -12199,7 +12405,32 @@ UseGgmlGemm1:; } } } - } +#if defined(__ARM_FEATURE_MATMUL_INT8) + if ((src0->weight_rearranged == true) && (ne11 >= 4) && (ne12 == 1) && (ne13 == 1)) { + for (int64_t i11 = 0; i11 < ne11 / 4; ++i11) { + quantize_row_q8_0_and_make_block_q8_0x4((float *)((char *) src1->data + i11 * 4 * nb11), (void *) wdata, ne10, 4); + wdata += row_size * 4; + } + for (int64_t i11 = (ne11 / 4) * 4; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i11 * nb11), (void *) wdata, ne10); + wdata += row_size; + } + } +#endif +#if defined(__ARM_FEATURE_MATMUL_INT8) + else { +#endif + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } + } + } +#if 
defined(__ARM_FEATURE_MATMUL_INT8) + } +#endif if (ith == 0) { // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. @@ -12275,25 +12506,141 @@ UseGgmlGemm2:; // The first chunk comes from our thread_id, the rest will get auto-assigned. int current_chunk = ith; + //if (ith == 0) + // printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1); + +#if defined(__ARM_FEATURE_MATMUL_INT8) && (defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE)) + if ((ggml_n_dims(src0) == 2) && (ne11 == 1) && (src0->weight_rearranged == true)) { + if (src0->type == GGML_TYPE_Q4_0) { + ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data), (const char *) src0->rearranged_weight_gemv, (const char *) wdata, ith, nth); // use Arm Neon/SVE GEMV kernels + } else if (src0->type == GGML_TYPE_Q8_0) { + ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data), (const char *) src0->rearranged_weight_gemv, (const char *) wdata, ith, nth); // use Arm Neon/SVE GEMV kernels + } + } + else if ((ggml_n_dims(src0) == 2) && (ne11 >= 16) && (src0->weight_rearranged == true)) { + // use batch-sized 16, 8, and 4 GEMM kernels + if (src0->type == GGML_TYPE_Q4_0) { + for (int row_iter = 0; row_iter < ne11 / 16; row_iter++) { + ggml_gemm_q4_0_q8_0(ne00, 16, ne01, 16, (float *)((char *) dst->data + (row_iter * 16 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 16) * row_size : (row_iter * 16 * nb11)), ith, nth); + } + int rows_processed = (ne11 / 16) * 16; + for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 8; row_iter++) { + ggml_gemm_q4_0_q8_0(ne00, 8, ne01, 8, (float *)((char *) dst->data + ((rows_processed + row_iter * 8) * nb1)), (const char *) src0->rearranged_weight_gemm, + (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 8) * row_size : ((rows_processed + row_iter * 8) * nb11)), ith, nth); + } + rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8; + for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { + ggml_gemm_q4_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->rearranged_weight_gemm, + (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth); + } + rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4; + for (int row_iter = rows_processed; row_iter < ne11; row_iter++) { + ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); + } + } else if (src0->type == GGML_TYPE_Q8_0) { + for (int row_iter = 0; row_iter < ne11 / 16; row_iter++) { + ggml_gemm_q8_0_q8_0(ne00, 16, ne01, 16, (float *)((char *) dst->data + (row_iter * 16 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? 
(row_iter * 16) * row_size : (row_iter * 16 * nb11)), ith, nth); + } + int rows_processed = (ne11 / 16) * 16; + for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 8; row_iter++) { + ggml_gemm_q8_0_q8_0(ne00, 8, ne01, 8, (float *)((char *) dst->data + ((rows_processed + row_iter * 8) * nb1)), (const char *) src0->rearranged_weight_gemm, + (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 8) * row_size : ((rows_processed + row_iter * 8) * nb11)), ith, nth); + } + rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8; + for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { + ggml_gemm_q8_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->rearranged_weight_gemm, + (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth); + } + rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4; + for (int row_iter = rows_processed; row_iter < ne11; row_iter++) { + ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); + } + } + } else if ((ggml_n_dims(src0) == 2) && (ne11 >= 8) && (src0->weight_rearranged == true)) { + // use batch-sized 8, and 4 GEMM kernels + if (src0->type == GGML_TYPE_Q4_0) { + for (int row_iter = 0; row_iter < ne11 / 8; row_iter++) { + ggml_gemm_q4_0_q8_0(ne00, 8, ne01, 8, (float *)((char *) dst->data + (row_iter * 8 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 8) * row_size : (row_iter * 8 * nb11)), ith, nth); + } + int rows_processed = (ne11 / 8) * 8; + for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { + ggml_gemm_q4_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->rearranged_weight_gemm, + (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth); + } + for (int row_iter = ((ne11 / 8) * 8) + ((ne11 - rows_processed) / 4 * 4); row_iter < ne11; row_iter++) { + ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); + } + } else if (src0->type == GGML_TYPE_Q8_0) { + for (int row_iter = 0; row_iter < ne11 / 8; row_iter++) { + ggml_gemm_q8_0_q8_0(ne00, 8, ne01, 8, (float *)((char *) dst->data + (row_iter * 8 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 8) * row_size : (row_iter * 8 * nb11)), ith, nth); + } + int rows_processed = (ne11 / 8) * 8; + for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { + ggml_gemm_q8_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->rearranged_weight_gemm, + (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? 
(rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth); + } + for (int row_iter = ((ne11 / 8) * 8) + ((ne11 - rows_processed) / 4 * 4); row_iter < ne11; row_iter++) { + ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); + } + } + } else if ((ggml_n_dims(src0) == 2) && (ne11 >= 4) && (src0->weight_rearranged == true)) { + // use batch-sized 4 GEMM kernel + if (src0->type == GGML_TYPE_Q4_0) { + for (int row_iter = 0; row_iter < ne11 / 4; row_iter++) { + ggml_gemm_q4_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + (row_iter * 4 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 4) * row_size : (row_iter * 4 * nb11)), ith, nth); + } + for (int row_iter = (ne11 / 4) * 4; row_iter < ne11; row_iter++) { + ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); + } + } else if (src0->type == GGML_TYPE_Q8_0) { + for (int row_iter = 0; row_iter < ne11 / 4; row_iter++) { + ggml_gemm_q8_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + (row_iter * 4 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 4) * row_size : (row_iter * 4 * nb11)), ith, nth); + } + for (int row_iter = (ne11 / 4) * 4; row_iter < ne11; row_iter++) { + ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); + } + } + } +#elif defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) + if ((ggml_n_dims(src0) == 2) && (src0->weight_rearranged == true)) { + if (src0->type == GGML_TYPE_Q4_0) { + for (int row_iter = 0; row_iter < ne11; row_iter++) { + ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); + } + } else if (src0->type == GGML_TYPE_Q8_0) { + for (int row_iter = 0; row_iter < ne11; row_iter++) { + ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); + } + } + } +#endif +#if defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) + else { +#endif + // The first chunk comes from our thread_id, the rest will get auto-assigned. 
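// ---------------------------------------------------------------------------
// Editor's sketch (assumption, not part of the patch): for batches of at least
// four activation rows, the Q4_0/Q8_0 branches above all reduce to the same
// greedy tiling -- take 16-row, then 8-row, then 4-row GEMM tiles, and hand any
// leftover rows to the GEMV kernel. The gemm()/gemv() callbacks below are
// placeholders standing in for the ggml_gemm_*_q8_0 / ggml_gemv_*_q8_0 calls
// (threading and pointer arithmetic omitted).
static void dispatch_rows_sketch(int ne11,
                                 void (*gemm)(int rows, int first_row),
                                 void (*gemv)(int row)) {
    int row = 0;
    for (int tile = 16; tile >= 4; tile /= 2) { // greedy 16 -> 8 -> 4 row tiles
        while (ne11 - row >= tile) {
            gemm(tile, row);                    // one MMLA GEMM call per tile
            row += tile;
        }
    }
    for (; row < ne11; row++) {
        gemv(row);                              // SDOT GEMV for the remaining rows
    }
}
// ---------------------------------------------------------------------------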
+ int current_chunk = ith; - while (current_chunk < nchunk0 * nchunk1) { - const int64_t ith0 = current_chunk % nchunk0; - const int64_t ith1 = current_chunk / nchunk0; + while (current_chunk < nchunk0 * nchunk1) { + const int64_t ith0 = current_chunk % nchunk0; + const int64_t ith1 = current_chunk / nchunk0; - const int64_t ir0_start = dr0 * ith0; - const int64_t ir0_end = MIN(ir0_start + dr0, nr0); + const int64_t ir0_start = dr0 * ith0; + const int64_t ir0_end = MIN(ir0_start + dr0, nr0); - const int64_t ir1_start = dr1 * ith1; - const int64_t ir1_end = MIN(ir1_start + dr1, nr1); + const int64_t ir1_start = dr1 * ith1; + const int64_t ir1_end = MIN(ir1_start + dr1, nr1); - ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); + ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); - if (nth >= nchunk0 * nchunk1) { - break; - } + if (nth >= nchunk0 * nchunk1) { + break; + } - current_chunk = atomic_fetch_add(¶ms->shared->current_chunk, 1); + current_chunk = atomic_fetch_add(¶ms->shared->current_chunk, 1); + } +#if defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) } +#endif } // ggml_compute_forward_mul_mat_id @@ -21891,4 +22238,26 @@ int ggml_cpu_has_matmul_int8(void) { #endif } +#if defined(__ARM_FEATURE_SVE) +static void (*_ggml_gemv_q4_0_q8_0)(const int, int, int, float * restrict, const void * restrict, const void * restrict, int, int) = &ggml_gemv_q4_0_q8_0_blocked8_sve; +#elif defined(__ARM_NEON) +static void (*_ggml_gemv_q4_0_q8_0)(const int, int, int, float * restrict, const void * restrict, const void * restrict, int, int) = &ggml_gemv_q4_0_q8_0_blocked8_neon; +#endif + +#if defined(__ARM_FEATURE_SVE) +static void (*_ggml_gemv_q8_0_q8_0)(const int, int, int, float * restrict, const void * restrict, const void * restrict, int, int) = &ggml_gemv_q8_0_q8_0_blocked8_sve; +#elif defined(__ARM_NEON) +static void (*_ggml_gemv_q8_0_q8_0)(const int, int, int, float * restrict, const void * restrict, const void * restrict, int, int) = &ggml_gemv_q8_0_q8_0_blocked8_neon; +#endif + +#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) +static void ggml_gemv_q4_0_q8_0(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { + _ggml_gemv_q4_0_q8_0(n, output_channels, input_width, s, vx, vy, ith, nth); +} + +static void ggml_gemv_q8_0_q8_0(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { + _ggml_gemv_q8_0_q8_0(n, output_channels, input_width, s, vx, vy, ith, nth); +} +#endif + //////////////////////////////////////////////////////////////////////////////// diff --git a/src/llama.cpp b/src/llama.cpp index 2b9ace2858457..7aecda2f594e5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1,3 +1,4 @@ +// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd. 
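// ---------------------------------------------------------------------------
// Editor's sketch (assumption, not part of the patch): the SVE vs. NEON kernel
// selection above assumes a 256-bit SVE vector length, i.e. svcntw() == 8
// 32-bit lanes per vector. A check of this shape is what selects the 8-row
// blocked layout and SVE kernels, with the 4-row NEON layouts used otherwise.
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
static inline int sve_vl_is_256_sketch(void) {
    return svcntw() == 8; // 8 x 32-bit words per vector == 256-bit SVE
}
#endif
// ---------------------------------------------------------------------------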
#define LLAMA_API_INTERNAL #include "llama.h" @@ -4358,6 +4359,32 @@ struct llama_model_loader { } } +#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) + if ((cur->type == GGML_TYPE_Q4_0) && (cur->ne[1] % 4 == 0)) { + cur->weight_rearranged = true; +#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) + rearrange_q4_0_weights_for_gemv(cur); // rearrange weights for Arm Neon/SVE GEMV kernels +#endif +#if defined(__ARM_FEATURE_MATMUL_INT8) + rearrange_q4_0_weights_for_gemm(cur); // rearrange weights for GEMM MMLA kernels +#endif + } + else if ((cur->type == GGML_TYPE_Q8_0) && (cur->ne[1] % 4 == 0)) { + cur->weight_rearranged = true; +#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) + rearrange_q8_0_weights_for_gemv(cur); // rearrange weights for Arm Neon/SVE GEMV kernels +#endif +#if defined(__ARM_FEATURE_MATMUL_INT8) + rearrange_q8_0_weights_for_gemm(cur); // rearrange weights for GEMM MMLA kernels +#endif + } + else { + cur->weight_rearranged = false; + } +#else + cur->weight_rearranged = false; +#endif + size_done += n_size; } From 340ef07fca904bc77ac46aa3fec34436e60400e2 Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Mon, 22 Apr 2024 08:08:17 +0000 Subject: [PATCH 02/28] Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions --- examples/quantize/quantize.cpp | 1 + ggml/include/ggml.h | 23 +- ggml/src/ggml-quants.c | 2135 +++++++++++++++++++++++++++----- ggml/src/ggml-quants.h | 46 +- ggml/src/ggml.c | 398 ++---- include/llama.h | 1 + src/llama.cpp | 39 +- 7 files changed, 1922 insertions(+), 721 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 76e2052d55d79..214edb03c56b1 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -46,6 +46,7 @@ static const std::vector QUANT_OPTIONS = { { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", }, + { "Q4_0_AARCH64", LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", }, { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 2d377267387e2..bea898c32bdb6 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -384,6 +384,7 @@ extern "C" { GGML_TYPE_F64 = 28, GGML_TYPE_IQ1_M = 29, GGML_TYPE_BF16 = 30, + GGML_TYPE_Q4_0_AARCH64 = 31, GGML_TYPE_COUNT, }; @@ -425,6 +426,7 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0_AARCH64 = 25, // except 1d tensors }; // available tensor operations: @@ -603,11 +605,6 @@ extern "C" { void * extra; // extra things e.g. 
for ggml-cuda.cu // char padding[4]; - char padding[9]; - - void * rearranged_weight_gemv; - void * rearranged_weight_gemm; - bool weight_rearranged; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -2397,6 +2394,7 @@ extern "C" { GGML_API int ggml_cpu_has_rpc (void); GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_matmul_int8(void); + GGML_API int ggml_cpu_has_sve (void); // // Internal types and functions exposed for tests and benchmarks @@ -2412,6 +2410,9 @@ extern "C" { typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, const void * GGML_RESTRICT y, size_t by, int nrc); + typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k, int n, int b); + typedef void (*ggml_gemv_t) (size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); + typedef void (*ggml_gemm_t) (size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); typedef struct { const char * type_name; @@ -2424,19 +2425,13 @@ extern "C" { ggml_vec_dot_t vec_dot; enum ggml_type vec_dot_type; int64_t nrows; // number of rows to process simultaneously; + ggml_from_float_to_mat_t from_float_to_mat; + ggml_gemv_t gemv; + ggml_gemm_t gemm; } ggml_type_traits_t; GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); - GGML_API void rearrange_q4_0_weights_blocked8_neon(struct ggml_tensor * cur); - GGML_API void rearrange_q4_0_weights_blocked8_sve(struct ggml_tensor * cur); - GGML_API void rearrange_q4_0_weights_for_gemv(struct ggml_tensor * cur); - GGML_API void rearrange_q4_0_weights_for_gemm(struct ggml_tensor * cur); - GGML_API void rearrange_q8_0_weights_blocked8_neon(struct ggml_tensor * cur); - GGML_API void rearrange_q8_0_weights_blocked8_sve(struct ggml_tensor * cur); - GGML_API void rearrange_q8_0_weights_for_gemv(struct ggml_tensor * cur); - GGML_API void rearrange_q8_0_weights_for_gemm(struct ggml_tensor * cur); - #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 2c0e89d4dfd7a..f774810375211 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -700,6 +700,64 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) { quantize_row_q4_0_reference(x, y, k); } +void quantize_row_q4_0_aarch64(const float * src, void * dst, int n, int k) { + int nrows_interleaved, blocklen_per_row; + typedef block_q4_0x8 block_q4_0xn; + typedef block_q4_0xn (*make_block_q4_0xn_t)(const block_q4_0 *, unsigned int, unsigned int); + make_block_q4_0xn_t make_block_q4_0xn = make_block_q4_0x8; + + if (ggml_cpu_has_sve() && (svcntw() == 8)) { + nrows_interleaved = 8; + blocklen_per_row = 8; + typedef block_q4_0x8 block_q4_0xn; + make_block_q4_0xn = make_block_q4_0x8; + } + else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { + nrows_interleaved = 4; + blocklen_per_row = 8; + typedef block_q4_0x4 block_q4_0xn; + make_block_q4_0xn = make_block_q4_0x4; + } + else if (ggml_cpu_has_neon()) { + nrows_interleaved = 4; + blocklen_per_row = 4; + typedef block_q4_0x4 block_q4_0xn; + make_block_q4_0xn = make_block_q4_0x4; + } + else { + assert(false); + } + + assert(k % QK4_0 == 0); + const int nb = k / 
QK4_0; + + block_q4_0xn * out_ptr_B = (block_q4_0xn *) malloc(sizeof(block_q4_0xn) * nb); + block_q4_0xn * out_ptr_B_start = out_ptr_B; + + for (int b = 0; b < n; b += nrows_interleaved * k) { + const block_q4_0 * in_ptrs[nrows_interleaved]; + + for (int i = 0; i < nrows_interleaved; i++ ) { + in_ptrs[i] = (block_q4_0 *) dst + (b + i * k) / QK4_0; + quantize_row_q4_0_reference(src + b + i * k, in_ptrs[i], k); + } + + for (int64_t x = 0; x < nb; x++) { + *out_ptr_B = make_block_q4_0xn(in_ptrs, blocklen_per_row, 0x88); + out_ptr_B++; + + for (int i = 0; i < nrows_interleaved; i++) { + in_ptrs[i]++; + } + } + out_ptr_B = out_ptr_B_start; + memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, sizeof(block_q4_0xn) * nb); + } + if (out_ptr_B_start) free(out_ptr_B_start); + + return (n / QK4_0 * sizeof(block_q4_0)); +} + void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int64_t k) { const int qk = QK4_1; @@ -3307,6 +3365,76 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr return nrow * row_size; } +size_t quantize_q4_0_aarch64(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + if (!quant_weights) { + //quantize_row_q4_0_reference(src, dst, (int64_t)nrow*n_per_row); + //return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row); + + int nrows_interleaved, blocklen_per_row; + typedef block_q4_0x8 block_q4_0xn; + typedef block_q4_0xn (*make_block_q4_0xn_t)(const block_q4_0 *, unsigned int, unsigned int); + make_block_q4_0xn_t make_block_q4_0xn = make_block_q4_0x8; + + if (ggml_cpu_has_sve() && (svcntw() == 8)) { + nrows_interleaved = 8; + blocklen_per_row = 8; + typedef block_q4_0x8 block_q4_0xn; + make_block_q4_0xn = make_block_q4_0x8; + } + else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { + nrows_interleaved = 4; + blocklen_per_row = 8; + typedef block_q4_0x4 block_q4_0xn; + make_block_q4_0xn = make_block_q4_0x4; + } + else if (ggml_cpu_has_neon()) { + nrows_interleaved = 4; + blocklen_per_row = 4; + typedef block_q4_0x4 block_q4_0xn; + make_block_q4_0xn = make_block_q4_0x4; + } + else { + assert(false); + } + + assert(n_per_row % QK4_0 == 0); + const int nb = n_per_row / QK4_0; + + block_q4_0xn * out_ptr_B = (block_q4_0xn *) malloc(sizeof(block_q4_0xn) * nb); + block_q4_0xn * out_ptr_B_start = out_ptr_B; + + for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) { + const block_q4_0 * in_ptrs[nrows_interleaved]; + + for (int i = 0; i < nrows_interleaved; i++ ) { + in_ptrs[i] = (block_q4_0 *) dst + (b + i * n_per_row) / QK4_0; + quantize_row_q4_0_reference(src + b + i * n_per_row, in_ptrs[i], n_per_row); + } + + for (int64_t x = 0; x < nb; x++) { + *out_ptr_B = make_block_q4_0xn(in_ptrs, blocklen_per_row, 0x88); + out_ptr_B++; + + for (int i = 0; i < nrows_interleaved; i++) { + in_ptrs[i]++; + } + } + out_ptr_B = out_ptr_B_start; + memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, sizeof(block_q4_0xn) * nb); + } + if (out_ptr_B_start) free(out_ptr_B_start); + return (nrow * n_per_row / QK4_0 * sizeof(block_q4_0)); + } + size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row); + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += row_size; + } + return nrow * row_size; +} + // ====================== "True" 2-bit (de)-quantization void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * 
restrict y, int64_t k) { @@ -14714,7 +14842,7 @@ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) // and GEMV (using SDOT) cases. For GEMM, we interleave 8 pairs of values // at a time (with the two nibbles separated at runtime to give 2x2x8 // matrices). For GEMV, we need to interleave 4 pairs of values instead. -block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len) { +block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len, unsigned int xor_mask) { block_q4_0x4 out; for (int i = 0; i < 4; i++) { @@ -14736,14 +14864,14 @@ block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int bloc int src_id = (i % (4 * block_len)) / block_len; src_offset += (i % block_len); - out.qs[i] = in[src_id]->qs[src_offset] + 0x80; + out.qs[i] = in[src_id]->qs[src_offset] ^ xor_mask; } return out; } // 8-block version - see comments in code above -block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len) { +block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len, unsigned int xor_mask) { block_q4_0x8 out; for (int i = 0; i < 8; i++) { @@ -14755,7 +14883,7 @@ block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int bloc int src_id = (i % (8 * block_len)) / block_len; src_offset += (i % block_len); - out.qs[i] = in[src_id]->qs[src_offset] + 0x80; + out.qs[i] = in[src_id]->qs[src_offset] ^ xor_mask; } return out; @@ -14798,68 +14926,7 @@ block_q8_0x8 make_block_q8_0x8(const block_q8_0 * const in[8], unsigned int bloc return out; } -void quantize_row_q8_0_and_make_block_q8_0x2(const float * restrict x, void * restrict vy, int k, int rows_interleaved) { - assert(QK8_0 == 32); - assert(k % QK8_0 == 0); - const int nb = k / QK8_0; - - block_q8_0x2 * restrict y = vy; - -#if defined(__ARM_NEON) - for (int i = 0; i < nb; i++) { - float32x4_t srcv[rows_interleaved][8]; - float32x4_t asrcv[8]; - float32x4_t amaxv[8]; - float id[rows_interleaved]; - - for (int row_iter = 0; row_iter < rows_interleaved; row_iter++) { - for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j); - for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]); - - for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]); - for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]); - for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]); - - const float amax = vmaxvq_f32(amaxv[0]); - - const float d = amax / ((1 << 7) - 1); - id[row_iter] = d ? 
1.0f / d : 0.0f; - - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); - } - - for (int j = 0; j < 4; j++) { - float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]); - int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 3] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]); - vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 4] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 5] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 6] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 7] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[1][2 * j], id[1]); - vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 8] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 9] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 10] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 11] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]); - vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 12] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 13] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 14] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3); - } - } -#endif -} - -void quantize_row_q8_0_and_make_block_q8_0x4(const float * restrict x, void * restrict vy, int k, int rows_interleaved) { +void quantize_row_q8_0_aarch64(const float * restrict x, void * restrict vy, int k, int nrows_interleaved, int blocklen_per_row) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; @@ -14868,12 +14935,12 @@ void quantize_row_q8_0_and_make_block_q8_0x4(const float * restrict x, void * re #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { - float32x4_t srcv[rows_interleaved][8]; + float32x4_t srcv[nrows_interleaved][8]; float32x4_t asrcv[8]; float32x4_t amaxv[8]; - float id[rows_interleaved]; + float id[nrows_interleaved]; - for (int row_iter = 0; row_iter < rows_interleaved; row_iter++) { + for (int row_iter = 0; row_iter < nrows_interleaved; row_iter++) { for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j); for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]); @@ -14889,58 +14956,91 @@ void quantize_row_q8_0_and_make_block_q8_0x4(const float * restrict x, void * re y[i].d[row_iter] = GGML_FP32_TO_FP16(d); } - for (int j = 0; j < 4; j++) { - float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]); - int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 3] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 4] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 5] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 6] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 7] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[1][2 * j], id[1]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 8] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 9] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 10] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 11] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 12] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 13] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 14] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 15] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[2][2 * j], id[2]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 16] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 17] = 
vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 18] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 19] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[2][2 * j + 1], id[2]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 20] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 21] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 22] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 23] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[3][2 * j], id[3]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 24] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 25] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 26] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 27] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[3][2 * j + 1], id[3]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 28] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 29] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 30] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3); + if (blocklen_per_row == 8) { + for (int j = 0; j < 4; j++) { + float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]); + int32x4_t vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 3] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 4] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 5] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 6] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 7] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[1][2 * j], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 8] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 9] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 10] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 11] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 12] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 13] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 14] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 15] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[2][2 * j], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 16] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 17] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 18] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 19] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[2][2 * j + 1], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 20] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 21] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 22] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 23] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[3][2 * j], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 24] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 25] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 26] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 27] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[3][2 * j + 1], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 28] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 29] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 30] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3); + } + } + else if (blocklen_per_row == 4) { + for (int j = 0; j < 8; j++) { + float32x4_t v = vmulq_n_f32(srcv[0][j], id[0]); + int32x4_t vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 3] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[1][j], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 4] = vgetq_lane_s32(vi, 0); + 
y[i].qs[16 * j + 5] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 6] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 7] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[2][j], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 8] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 9] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 10] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 11] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[3][j], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 12] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 13] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 14] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3); + } } } #endif @@ -15134,184 +15234,227 @@ void ggml_gemv_q4_0_q8_0_blocked8_sve(const int n, int output_channels, int inpu #endif } -void ggml_gemm_q4_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { -#if defined(__ARM_FEATURE_MATMUL_INT8) +void ggml_gemv_q4_0_q8_0_aarch64_sve256(size_t depth, size_t output_channels, size_t height, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +#if defined(__ARM_FEATURE_SVE) + int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)8); + int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)8); + size_t width = xend - x0; + + int64_t nb = depth / QK4_0; + const void * b_ptr = (void *)((block_q4_0x8 *) vx + ((x0 / 8) * nb)); + const void * a_ptr = vy; + float * res_ptr = s + x0; + + assert(depth % 32 == 0); + assert(width % 8 == 0); + + size_t num_blocks = depth / 32; + + __asm__ __volatile__( + "ptrue p0.b\n" + "add %x[b_ptr], %x[b_ptr], #0x10\n" + "1:" // Column loop + "add x22, %x[a_ptr], #0x2\n" + "mov z31.b, #0x0\n" + "mov x21, %x[num_blocks]\n" + "2:" // Block loop + "ld1b { z30.b }, p0/Z, [%x[b_ptr]]\n" + "ld1b { z29.b }, p0/Z, [%x[b_ptr], #1, MUL VL]\n" + "mov z28.s, #0x0\n" + "mov z27.s, #0x0\n" + "ld1rd { z26.d }, p0/Z, [x22]\n" + "ld1b { z25.b }, p0/Z, [%x[b_ptr], #2, MUL VL]\n" + "sub x20, x22, #0x2\n" + "sub x21, x21, #0x1\n" + "ld1b { z24.b }, p0/Z, [%x[b_ptr], #3, MUL VL]\n" + "ld1rd { z23.d }, p0/Z, [x22, #8]\n" + "lsl z22.b, z30.b, #0x4\n" + "lsl z16.b, z29.b, #0x4\n" + "and z30.b, z30.b, #0xf0\n" + "and z29.b, z29.b, #0xf0\n" + "ld1rd { z21.d }, p0/Z, [x22, #16]\n" + "ld1rd { z20.d }, p0/Z, [x22, #24]\n" + "lsl z19.b, z25.b, #0x4\n" + "and z25.b, z25.b, #0xf0\n" + "ld1rh { z17.h }, p0/Z, [x20]\n" + "ld1h { z18.s }, p0/Z, [%x[b_ptr], #-1, MUL VL]\n" + "sdot z28.s, z22.b, z26.b\n" + "sdot z27.s, z16.b, z26.b\n" + "lsl z16.b, z24.b, #0x4\n" + "add x22, x22, #0x22\n" + "and z24.b, z24.b, #0xf0\n" + "add %x[b_ptr], %x[b_ptr], #0x90\n" + "fcvt z17.s, p0/m, z17.h\n" + "fcvt z18.s, p0/m, z18.h\n" + "sdot z28.s, z19.b, z23.b\n" + "sdot z27.s, z16.b, z23.b\n" + "fmul z18.s, z18.s, z17.s\n" + "sdot z28.s, z30.b, z21.b\n" + "sdot z27.s, z29.b, z21.b\n" + "sdot z28.s, z25.b, z20.b\n" + "sdot z27.s, z24.b, z20.b\n" + "uzp1 z17.s, z28.s, z27.s\n" + "uzp2 z16.s, z28.s, z27.s\n" + "add z17.s, z17.s, z16.s\n" + "asr z17.s, z17.s, #0x4\n" + "scvtf z17.s, p0/m, z17.s\n" + "fmla z31.s, p0/M, z17.s, z18.s\n" + "cbnz x21, 2b\n" + "sub %x[width], %x[width], #0x8\n" + "st1w { z31.s }, p0, [%x[res_ptr]]\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "cbnz %x[width], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) + : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) + : "memory", "p0", "x20", "x21", "x22", "z16", 
"z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +#endif +} + +void ggml_gemv_q4_0_q8_0_aarch64_neon(size_t depth, size_t output_channels, size_t height, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +#if defined(__ARM_NEON) int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); - - int64_t nb = n / QK4_0; - int64_t a_nb = n / QK8_0; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - const int8x16_t s8b = vdupq_n_s8(0x8); - - const block_q4_0x4 * b_ptr_start = vx; - const block_q8_0x4 * a_ptr_start = vy; - - for (int64_t y = 0; y < input_width / 4; y += rows / 4) { - for (int64_t x = x0 / 4; x < xend / 4; x++) { - const block_q8_0x4 * a_ptrs[rows / 4]; - - a_ptrs[0] = a_ptr_start + (y * a_nb); - for (int i = 0; i < (rows / 4) - 1; i++) { - a_ptrs[i + 1] = a_ptrs[i] + a_nb; - } - - const block_q4_0x4 * b_ptr = b_ptr_start + (x * nb); - - // Master FP accumulators - float32x4_t acc_rows[rows]; - for (int i = 0; i < rows; i++) { - acc_rows[i] = vdupq_n_f32(0.0f); - } - - for (int64_t b = 0; b < nb; b++) { - // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) - const uint8x16_t rhs_raw_mat_01_0 = vld1q_u8(b_ptr[b].qs); - const uint8x16_t rhs_raw_mat_23_0 = vld1q_u8(b_ptr[b].qs + 16); - const uint8x16_t rhs_raw_mat_01_1 = vld1q_u8(b_ptr[b].qs + 32); - const uint8x16_t rhs_raw_mat_23_1 = vld1q_u8(b_ptr[b].qs + 48); - - // 4-bit -> 8-bit - const int8x16_t rhs_mat_01_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_01_0, m4b)), s8b); - const int8x16_t rhs_mat_23_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_23_0, m4b)), s8b); - const int8x16_t rhs_mat_01_1 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_01_1, m4b)), s8b); - const int8x16_t rhs_mat_23_1 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_23_1, m4b)), s8b); - const int8x16_t rhs_mat_01_2 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_01_0), 4); - const int8x16_t rhs_mat_23_2 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_23_0), 4); - const int8x16_t rhs_mat_01_3 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_01_1), 4); - const int8x16_t rhs_mat_23_3 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_23_1), 4); - - // Scale values - assemble the four row/column scales into a (64-bit) vector, then expand to FP32 - const float16x4_t col_scale_f16 = vld1_f16(b_ptr[b].d); - const float32x4_t col_scale_f32 = vcvt_f32_f16(col_scale_f16); - - // Process LHS in pairs of rows - for (int rp = 0; rp < rows / 4; rp++) { - const int8x16_t lhs_mat_01_0 = vld1q_s8(a_ptrs[rp][b].qs); - const int8x16_t lhs_mat_23_0 = vld1q_s8(a_ptrs[rp][b].qs + 16); - const int8x16_t lhs_mat_01_1 = vld1q_s8(a_ptrs[rp][b].qs + 32); - const int8x16_t lhs_mat_23_1 = vld1q_s8(a_ptrs[rp][b].qs + 48); - - const int8x16_t lhs_mat_01_2 = vld1q_s8(a_ptrs[rp][b].qs + 64); - const int8x16_t lhs_mat_23_2 = vld1q_s8(a_ptrs[rp][b].qs + 80); - const int8x16_t lhs_mat_01_3 = vld1q_s8(a_ptrs[rp][b].qs + 96); - const int8x16_t lhs_mat_23_3 = vld1q_s8(a_ptrs[rp][b].qs + 112); - - // Do the MMLAs into 2x2 matrices - const int32x4_t iacc_mat_00 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_01_0), lhs_mat_01_1, rhs_mat_01_1), lhs_mat_01_2, rhs_mat_01_2), lhs_mat_01_3, rhs_mat_01_3); - const int32x4_t iacc_mat_01 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_23_0), lhs_mat_01_1, rhs_mat_23_1), lhs_mat_01_2, 
rhs_mat_23_2), lhs_mat_01_3, rhs_mat_23_3); - const int32x4_t iacc_mat_10 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_23_0, rhs_mat_01_0), lhs_mat_23_1, rhs_mat_01_1), lhs_mat_23_2, rhs_mat_01_2), lhs_mat_23_3, rhs_mat_01_3); - const int32x4_t iacc_mat_11 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_23_0, rhs_mat_23_0), lhs_mat_23_1, rhs_mat_23_1), lhs_mat_23_2, rhs_mat_23_2), lhs_mat_23_3, rhs_mat_23_3); - - // Straighten out to make 4 row vectors - const int32x4_t iacc_row_0 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); - const int32x4_t iacc_row_1 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); - const int32x4_t iacc_row_2 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_10), vreinterpretq_u64_s32(iacc_mat_11))); - const int32x4_t iacc_row_3 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_10), vreinterpretq_u64_s32(iacc_mat_11))); - - const float16x4_t row_scale_f16 = vld1_f16(a_ptrs[rp][b].d); - const float32x4_t row_scale_f32 = vcvt_f32_f16(row_scale_f16); - - acc_rows[rp * 4] = vfmaq_f32(acc_rows[rp * 4], vcvtq_f32_s32(iacc_row_0), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 0)); - acc_rows[rp * 4 + 1] = vfmaq_f32(acc_rows[rp * 4 + 1], vcvtq_f32_s32(iacc_row_1), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 1)); - acc_rows[rp * 4 + 2] = vfmaq_f32(acc_rows[rp * 4 + 2], vcvtq_f32_s32(iacc_row_2), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 2)); - acc_rows[rp * 4 + 3] = vfmaq_f32(acc_rows[rp * 4 + 3], vcvtq_f32_s32(iacc_row_3), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 3)); - } - } - - for (int i = 0; i < rows; i++) { - vst1q_f32(s + ((y * 4 + i) * output_channels + x * 4), acc_rows[i]); - } - } - } + size_t width = xend - x0; + + int64_t nb = depth / QK4_0; + const void * b_ptr = (void *)((block_q4_0x4 *) vx + ((x0 / 4) * nb)); + const void * a_ptr = vy; + float * res_ptr = s + x0; + + assert(depth % 32 == 0); + assert(width % 4 == 0); + + size_t num_blocks = depth / 32; + + __asm__ __volatile__( + "movi v2.16b, #0x4\n" + "movi v1.16b, #0xf0\n" + "add %x[b_ptr], %x[b_ptr], #0x8\n" + "1:" // Column loop + "add x23, %x[a_ptr], #0x2\n" + "movi v0.16b, #0x0\n" + "mov x22, %x[num_blocks]\n" + "2:" // Block loop + "ldr q31, [%x[b_ptr], #0x0]\n" + "ldr q30, [%x[b_ptr], #0x10]\n" + "mov x21, x23\n" + "movi v29.4s, #0x0\n" + "ldr q28, [%x[b_ptr], #0x20]\n" + "ldr q27, [%x[b_ptr], #0x30]\n" + "movi v26.4s, #0x0\n" + "sub x20, x23, #0x2\n" + "ld1r { v25.8h }, [x20]\n" + "ldr q24, [%x[b_ptr], #-0x8]\n" + "sub x22, x22, #0x1\n" + "add x23, x23, #0x22\n" + "ld1r { v23.2d }, [x21], #0x8\n" + "sshl v22.16b, v31.16b, v2.16b\n" + "sshl v16.16b, v30.16b, v2.16b\n" + "add %x[b_ptr], %x[b_ptr], #0x48\n" + "ld1r { v21.2d }, [x21], #0x8\n" + "sshl v20.16b, v28.16b, v2.16b\n" + "sshl v19.16b, v27.16b, v2.16b\n" + "ld1r { v18.2d }, [x21], #0x8\n" + "ld1r { v17.2d }, [x21], #0x8\n" + "and v31.16b, v31.16b, v1.16b\n" + "and v30.16b, v30.16b, v1.16b\n" + ".inst 0x4e9796dd // sdot v29.4s, v22.16b, v23.16b\n" + ".inst 0x4e97961a // sdot v26.4s, v16.16b, v23.16b\n" + "and v28.16b, v28.16b, v1.16b\n" + "and v27.16b, v27.16b, v1.16b\n" + "fcvtl v25.4s, v25.4h\n" + "fcvtl v16.4s, v24.4h\n" + ".inst 0x4e95969d // sdot v29.4s, v20.16b, v21.16b\n" + ".inst 0x4e95967a // sdot v26.4s, v19.16b, v21.16b\n" + "fmul v16.4s, v16.4s, v25.4s\n" + ".inst 0x4e9297fd // sdot v29.4s, v31.16b, v18.16b\n" + ".inst 
0x4e9297da // sdot v26.4s, v30.16b, v18.16b\n" + ".inst 0x4e91979d // sdot v29.4s, v28.16b, v17.16b\n" + ".inst 0x4e91977a // sdot v26.4s, v27.16b, v17.16b\n" + "addp v29.4s, v29.4s, v26.4s\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "fmla v0.4s, v29.4s, v16.4s\n" + "cbnz x22, 2b\n" + "sub %x[width], %x[width], #0x4\n" + "str q0, [%x[res_ptr], #0x0]\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "cbnz %x[width], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) + : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) + : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23" + ); #endif } -void ggml_gemm_q4_0_q8_0_2x4blocked_mmla(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { -#if defined(__ARM_FEATURE_MATMUL_INT8) - int rows = 2; +void ggml_gemv_q4_0_q8_0_aarch64_neon_noi8mm(size_t depth, size_t output_channels, size_t height, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +#if defined(__ARM_NEON) int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); - - int64_t nb = n / QK4_0; - int64_t a_nb = n / QK8_0; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - const int8x16_t s8b = vdupq_n_s8(0x8); - - const block_q4_0x4 * b_ptr_start = vx; - const block_q8_0x2 * a_ptr_start = vy; - - for (int64_t y = 0; y < input_width / 2; y += rows / 2) { - for (int64_t x = x0 / 4; x < xend / 4; x++) { - const block_q8_0x2 * a_ptrs[rows / 2]; - - a_ptrs[0] = a_ptr_start + (y * a_nb); - - const block_q4_0x4 * b_ptr = b_ptr_start + (x * nb); - - // Master FP accumulators - float32x4_t acc_rows[rows]; - acc_rows[0] = vdupq_n_f32(0.0f); - acc_rows[1] = vdupq_n_f32(0.0f); - - for (int64_t b = 0; b < nb; b++) { - // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) - const uint8x16_t rhs_raw_mat_01_0 = vld1q_u8(b_ptr[b].qs); - const uint8x16_t rhs_raw_mat_23_0 = vld1q_u8(b_ptr[b].qs + 16); - const uint8x16_t rhs_raw_mat_01_1 = vld1q_u8(b_ptr[b].qs + 32); - const uint8x16_t rhs_raw_mat_23_1 = vld1q_u8(b_ptr[b].qs + 48); - - const int8x16_t rhs_mat_01_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_01_0, m4b)), s8b); - const int8x16_t rhs_mat_23_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_23_0, m4b)), s8b); - const int8x16_t rhs_mat_01_1 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_01_1, m4b)), s8b); - const int8x16_t rhs_mat_23_1 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_23_1, m4b)), s8b); - - const int8x16_t rhs_mat_01_2 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_01_0), 4); - const int8x16_t rhs_mat_23_2 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_23_0), 4); - const int8x16_t rhs_mat_01_3 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_01_1), 4); - const int8x16_t rhs_mat_23_3 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_23_1), 4); - - // Scale values - assemble the four row/column scales into a (64-bit) vector, then expand to FP32 - const float16x4_t col_scale_f16 = vld1_f16(b_ptr[b].d); - const float32x4_t col_scale_f32 = vcvt_f32_f16(col_scale_f16); - - // Process LHS in pairs of rows - int rp = 0; - const int8x16_t lhs_mat_01_0 = vld1q_s8(a_ptrs[rp][b].qs); - const int8x16_t lhs_mat_01_1 = vld1q_s8(a_ptrs[rp][b].qs + 16); - - const int8x16_t lhs_mat_01_2 = vld1q_s8(a_ptrs[rp][b].qs + 32); - const int8x16_t 
lhs_mat_01_3 = vld1q_s8(a_ptrs[rp][b].qs + 48); - - // Do the MMLAs into 2x2 matrices - const int32x4_t iacc_mat_00 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_01_0), lhs_mat_01_1, rhs_mat_01_1), lhs_mat_01_2, rhs_mat_01_2), lhs_mat_01_3, rhs_mat_01_3); - const int32x4_t iacc_mat_01 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_23_0), lhs_mat_01_1, rhs_mat_23_1), lhs_mat_01_2, rhs_mat_23_2), lhs_mat_01_3, rhs_mat_23_3); - - // Straighten out to make 2 row vectors - const int32x4_t iacc_row_0 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); - const int32x4_t iacc_row_1 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); - - const float16x4_t row_scale_f16_0 = vld1_dup_f16(&(a_ptrs[rp][b].d[0])); - const float32x4_t row_scale_f32_0 = vcvt_f32_f16(row_scale_f16_0); - const float16x4_t row_scale_f16_1 = vld1_dup_f16(&(a_ptrs[rp][b].d[1])); - const float32x4_t row_scale_f32_1 = vcvt_f32_f16(row_scale_f16_1); - - acc_rows[rp * 2] = vfmaq_f32(acc_rows[rp * 2], vcvtq_f32_s32(iacc_row_0), vmulq_f32(col_scale_f32, row_scale_f32_0)); - acc_rows[rp * 2 + 1] = vfmaq_f32(acc_rows[rp * 2 + 1], vcvtq_f32_s32(iacc_row_1), vmulq_f32(col_scale_f32, row_scale_f32_1)); - } - - vst1q_f32(s + ((y * 2) * output_channels + x * 4), acc_rows[0]); - vst1q_f32(s + ((y * 2 + 1) * output_channels + x * 4), acc_rows[1]); - } - } + size_t width = xend - x0; + + int64_t nb = depth / QK4_0; + const void * b_ptr = (void *)((block_q4_0x4 *) vx + ((x0 / 4) * nb)); + const void * a_ptr = vy; + float * res_ptr = s + x0; + + assert(depth % 32 == 0); + assert(width % 4 == 0); + + size_t num_blocks = depth / 32; + + __asm__ __volatile__( + "movi v31.16b, #0x4\n" + "movi v30.16b, #0xf0\n" + "add %x[b_ptr], %x[b_ptr], #0x8\n" + "1:" // Column loop + "add x22, %x[a_ptr], #0x2\n" + "movi v29.16b, #0x0\n" + "mov x21, %x[num_blocks]\n" + "2:" // Block loop + "ldr q28, [%x[b_ptr], #0x0]\n" + "ldr q27, [x22, #0x0]\n" + "movi v26.4s, #0x0\n" + "sub x20, x22, #0x2\n" + "ldr q25, [x22, #0x10]\n" + "ldr q24, [%x[b_ptr], #0x10]\n" + "sub x21, x21, #0x1\n" + "add x22, x22, #0x22\n" + "ldr q23, [%x[b_ptr], #0x20]\n" + "ldr q22, [%x[b_ptr], #0x30]\n" + "ld1r { v21.8h }, [x20]\n" + "ldr q20, [%x[b_ptr], #-0x8]\n" + "sshl v16.16b, v28.16b, v31.16b\n" + "and v28.16b, v28.16b, v30.16b\n" + "sshl v19.16b, v24.16b, v31.16b\n" + "and v24.16b, v24.16b, v30.16b\n" + "add %x[b_ptr], %x[b_ptr], #0x48\n" + "sshl v18.16b, v23.16b, v31.16b\n" + "and v23.16b, v23.16b, v30.16b\n" + ".inst 0x4f9be21a // sdot v26.4s, v16.16b, v27.4b[0]\n" + "sshl v17.16b, v22.16b, v31.16b\n" + "and v22.16b, v22.16b, v30.16b\n" + "fcvtl v21.4s, v21.4h\n" + "fcvtl v16.4s, v20.4h\n" + ".inst 0x4f99e39a // sdot v26.4s, v28.16b, v25.4b[0]\n" + "fmul v16.4s, v16.4s, v21.4s\n" + ".inst 0x4fbbe27a // sdot v26.4s, v19.16b, v27.4b[1]\n" + ".inst 0x4fb9e31a // sdot v26.4s, v24.16b, v25.4b[1]\n" + ".inst 0x4f9bea5a // sdot v26.4s, v18.16b, v27.4b[2]\n" + ".inst 0x4f99eafa // sdot v26.4s, v23.16b, v25.4b[2]\n" + ".inst 0x4fbbea3a // sdot v26.4s, v17.16b, v27.4b[3]\n" + ".inst 0x4fb9eada // sdot v26.4s, v22.16b, v25.4b[3]\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "fmla v29.4s, v26.4s, v16.4s\n" + "cbnz x21, 2b\n" + "sub %x[width], %x[width], #0x4\n" + "str q29, [%x[res_ptr], #0x0]\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "cbnz %x[width], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), 
[width] "+&r" (width) + : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) + : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22" + ); #endif } @@ -15471,15 +15614,18 @@ void ggml_gemv_q8_0_q8_0_blocked8_sve(const int n, int output_channels, int inpu #endif } -void ggml_gemm_q8_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +void ggml_gemm_q4_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { #if defined(__ARM_FEATURE_MATMUL_INT8) int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); - int64_t nb = n / QK8_0; + int64_t nb = n / QK4_0; int64_t a_nb = n / QK8_0; - const block_q8_0x4 * b_ptr_start = vx; + const uint8x16_t m4b = vdupq_n_u8(0x0F); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const block_q4_0x4 * b_ptr_start = vx; const block_q8_0x4 * a_ptr_start = vy; for (int64_t y = 0; y < input_width / 4; y += rows / 4) { @@ -15491,7 +15637,7 @@ void ggml_gemm_q8_0_q8_0(const int n, int rows, int output_channels, int input_w a_ptrs[i + 1] = a_ptrs[i] + a_nb; } - const block_q8_0x4 * b_ptr = b_ptr_start + (x * nb); + const block_q4_0x4 * b_ptr = b_ptr_start + (x * nb); // Master FP accumulators float32x4_t acc_rows[rows]; @@ -15501,14 +15647,20 @@ void ggml_gemm_q8_0_q8_0(const int n, int rows, int output_channels, int input_w for (int64_t b = 0; b < nb; b++) { // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) - const int8x16_t rhs_mat_01_0 = vld1q_s8(b_ptr[b].qs); - const int8x16_t rhs_mat_23_0 = vld1q_s8(b_ptr[b].qs + 16); - const int8x16_t rhs_mat_01_1 = vld1q_s8(b_ptr[b].qs + 32); - const int8x16_t rhs_mat_23_1 = vld1q_s8(b_ptr[b].qs + 48); - const int8x16_t rhs_mat_01_2 = vld1q_s8(b_ptr[b].qs + 64); - const int8x16_t rhs_mat_23_2 = vld1q_s8(b_ptr[b].qs + 80); - const int8x16_t rhs_mat_01_3 = vld1q_s8(b_ptr[b].qs + 96); - const int8x16_t rhs_mat_23_3 = vld1q_s8(b_ptr[b].qs + 112); + const uint8x16_t rhs_raw_mat_01_0 = vld1q_u8(b_ptr[b].qs); + const uint8x16_t rhs_raw_mat_23_0 = vld1q_u8(b_ptr[b].qs + 16); + const uint8x16_t rhs_raw_mat_01_1 = vld1q_u8(b_ptr[b].qs + 32); + const uint8x16_t rhs_raw_mat_23_1 = vld1q_u8(b_ptr[b].qs + 48); + + // 4-bit -> 8-bit + const int8x16_t rhs_mat_01_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_01_0, m4b)), s8b); + const int8x16_t rhs_mat_23_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_23_0, m4b)), s8b); + const int8x16_t rhs_mat_01_1 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_01_1, m4b)), s8b); + const int8x16_t rhs_mat_23_1 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_23_1, m4b)), s8b); + const int8x16_t rhs_mat_01_2 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_01_0), 4); + const int8x16_t rhs_mat_23_2 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_23_0), 4); + const int8x16_t rhs_mat_01_3 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_01_1), 4); + const int8x16_t rhs_mat_23_3 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_23_1), 4); // Scale values - assemble the four row/column scales into a (64-bit) vector, then expand to FP32 const float16x4_t col_scale_f16 = vld1_f16(b_ptr[b].d); @@ -15560,9 +15712,1310 @@ void ggml_gemm_q8_0_q8_0(const int n, int rows, int output_channels, int input_w #endif } -void 
ggml_gemm_q8_0_q8_0_2x4blocked_mmla(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +void ggml_gemm_q4_0_q8_0_aarch64_sve256(size_t depth, size_t output_channels, size_t height, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) + int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)8); + int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)8); + size_t width = xend - x0; + + int64_t nb = depth / QK4_0; + const void * b_ptr = (void *)((block_q4_0x8 *) vx + ((x0 / 8) * nb)); + const void * a_ptr = vy; + float * res_ptr = s + x0; + size_t res_stride = output_channels * sizeof(float); + + assert(depth % 32 == 0); + assert(width % 8 == 0); + + size_t num_blocks = depth / 32; + + __asm__ __volatile__( + "mov x20, #0x4\n" + "mov x13, %x[height]\n" + "mov z28.s, #-0x4\n" + "mov x12, #0x88\n" + "ptrue p1.b\n" + "whilelt p0.s, XZR, x20\n" + "cmp x13, #0x10\n" + "mul x12, %x[num_blocks], x12\n" + "blt 4f\n" + "1:" // Row loop + "add x11, %x[b_ptr], #0x10\n" + "mov x10, %x[width]\n" + "add x9, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x28, %x[a_ptr], #0x8\n" + "mov z24.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov x27, %x[num_blocks]\n" + "add x26, x28, x12\n" + "mov z12.b, #0x0\n" + "mov z0.b, #0x0\n" + "add x25, x26, x12\n" + "mov z13.b, #0x0\n" + "mov z1.b, #0x0\n" + "add x24, x25, x12\n" + "mov z20.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z8.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z10.b, #0x0\n" + "3:" // Block loop + "ld1b { z30.b }, p1/Z, [x11]\n" + "ld1b { z21.b }, p1/Z, [x11, #1, MUL VL]\n" + "mov z18.s, #0x0\n" + "mov z7.s, #0x0\n" + "ld1rqb { z3.b }, p1/Z, [x28]\n" + "ld1rqb { z5.b }, p1/Z, [x28, #16]\n" + "mov z9.s, #0x0\n" + "mov z22.s, #0x0\n" + "ld1b { z4.b }, p1/Z, [x11, #2, MUL VL]\n" + "ld1b { z17.b }, p1/Z, [x11, #3, MUL VL]\n" + "sub x20, x11, #0x10\n" + "sub x23, x28, #0x8\n" + "lsl z31.b, z30.b, #0x4\n" + "lsl z6.b, z21.b, #0x4\n" + "ld1h { z23.s }, p1/Z, [x20]\n" + "sub x22, x26, #0x8\n" + "and z30.b, z30.b, #0xf0\n" + "and z21.b, z21.b, #0xf0\n" + "sub x21, x25, #0x8\n" + "sub x20, x24, #0x8\n" + "lsl z14.b, z4.b, #0x4\n" + "lsl z2.b, z17.b, #0x4\n" + "subs x27, x27, #0x1\n" + "add x11, x11, #0x90\n" + ".inst 0x451f9872 // smmla z18.s, z3.b, z31.b\n" + ".inst 0x45069867 // smmla z7.s, z3.b, z6.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #32]\n" + "and z4.b, z4.b, #0xf0\n" + ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" + ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #48]\n" + "and z17.b, z17.b, #0xf0\n" + "fcvt z23.s, p1/m, z23.h\n" + ".inst 0x450e9872 // smmla z18.s, z3.b, z14.b\n" + ".inst 0x45029867 // smmla z7.s, z3.b, z2.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #64]\n" + ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" + ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #80]\n" + "fscale z23.s, p1/m, z23.s, z28.s\n" + ".inst 0x451e9872 // smmla z18.s, z3.b, z30.b\n" + ".inst 0x45159867 // smmla z7.s, z3.b, z21.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #96]\n" + ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" + ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #112]\n" + "add x28, x28, #0x88\n" + ".inst 0x45049872 // smmla z18.s, z3.b, z4.b\n" 
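+ // Remaining high-nibble SMMLAs for this block; the resulting 2x2 int32 tiles are unzipped into row vectors, converted to float and scaled into the accumulators below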
+ ".inst 0x45119867 // smmla z7.s, z3.b, z17.b\n" + "ld1h { z3.s }, p0/Z, [x23]\n" + ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" + ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" + "fcvt z3.s, p1/m, z3.h\n" + "uzp1 z5.d, z18.d, z7.d\n" + "uzp2 z18.d, z18.d, z7.d\n" + "mov z3.q, z3.q[0]\n" + "uzp1 z7.d, z9.d, z22.d\n" + "uzp2 z22.d, z9.d, z22.d\n" + "fmul z9.s, z23.s, z3.s[0]\n" + "scvtf z5.s, p1/m, z5.s\n" + "scvtf z18.s, p1/m, z18.s\n" + "scvtf z7.s, p1/m, z7.s\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z24.s, p1/M, z5.s, z9.s\n" + "ld1rqb { z5.b }, p1/Z, [x26]\n" + "fmul z9.s, z23.s, z3.s[1]\n" + "fmla z15.s, p1/M, z18.s, z9.s\n" + "ld1rqb { z18.b }, p1/Z, [x26, #16]\n" + "fmul z9.s, z23.s, z3.s[2]\n" + "fmul z3.s, z23.s, z3.s[3]\n" + "fmla z12.s, p1/M, z7.s, z9.s\n" + "mov z9.s, #0x0\n" + "ld1h { z7.s }, p0/Z, [x22]\n" + ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" + "fmla z0.s, p1/M, z22.s, z3.s\n" + "mov z22.s, #0x0\n" + "ld1h { z3.s }, p0/Z, [x21]\n" + ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #32]\n" + "fcvt z7.s, p1/m, z7.h\n" + "fcvt z3.s, p1/m, z3.h\n" + ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" + ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #64]\n" + "mov z7.q, z7.q[0]\n" + "mov z3.q, z3.q[0]\n" + ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" + ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #96]\n" + ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" + ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" + "uzp1 z5.d, z9.d, z22.d\n" + "scvtf z5.s, p1/m, z5.s\n" + "uzp2 z22.d, z9.d, z22.d\n" + "fmul z9.s, z23.s, z7.s[0]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z13.s, p1/M, z5.s, z9.s\n" + "ld1rqb { z9.b }, p1/Z, [x25]\n" + "fmul z5.s, z23.s, z7.s[1]\n" + "fmla z1.s, p1/M, z22.s, z5.s\n" + "mov z5.s, #0x0\n" + "mov z22.s, #0x0\n" + ".inst 0x451f9a45 // smmla z5.s, z18.b, z31.b\n" + ".inst 0x45069a56 // smmla z22.s, z18.b, z6.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #48]\n" + ".inst 0x450e9a45 // smmla z5.s, z18.b, z14.b\n" + ".inst 0x45029a56 // smmla z22.s, z18.b, z2.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #80]\n" + ".inst 0x451e9a45 // smmla z5.s, z18.b, z30.b\n" + ".inst 0x45159a56 // smmla z22.s, z18.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #112]\n" + "add x26, x26, #0x88\n" + ".inst 0x45049a45 // smmla z5.s, z18.b, z4.b\n" + ".inst 0x45119a56 // smmla z22.s, z18.b, z17.b\n" + "uzp1 z18.d, z5.d, z22.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp2 z22.d, z5.d, z22.d\n" + "fmul z5.s, z23.s, z7.s[2]\n" + "fmul z7.s, z23.s, z7.s[3]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z20.s, p1/M, z18.s, z5.s\n" + "ld1rqb { z18.b }, p1/Z, [x25, #16]\n" + "ld1h { z5.s }, p0/Z, [x20]\n" + "fcvt z5.s, p1/m, z5.h\n" + "fmla z25.s, p1/M, z22.s, z7.s\n" + "mov z22.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9936 // smmla z22.s, z9.b, z31.b\n" + ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #32]\n" + "mov z5.q, z5.q[0]\n" + ".inst 0x450e9936 // smmla z22.s, z9.b, z14.b\n" + ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #64]\n" + ".inst 0x451e9936 // smmla z22.s, z9.b, z30.b\n" + ".inst 0x45159927 // smmla z7.s, z9.b, z21.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #96]\n" + ".inst 0x45049936 // smmla z22.s, z9.b, z4.b\n" + ".inst 0x45119927 // smmla z7.s, z9.b, z17.b\n" + "uzp1 z9.d, z22.d, z7.d\n" + "scvtf z9.s, p1/m, z9.s\n" + "uzp2 z22.d, z22.d, z7.d\n" + "fmul z7.s, z23.s, z3.s[0]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla 
z11.s, p1/M, z9.s, z7.s\n" + "ld1rqb { z9.b }, p1/Z, [x24]\n" + "fmul z7.s, z23.s, z3.s[1]\n" + "fmla z16.s, p1/M, z22.s, z7.s\n" + "mov z22.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9a56 // smmla z22.s, z18.b, z31.b\n" + ".inst 0x45069a47 // smmla z7.s, z18.b, z6.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #48]\n" + ".inst 0x450e9a56 // smmla z22.s, z18.b, z14.b\n" + ".inst 0x45029a47 // smmla z7.s, z18.b, z2.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #80]\n" + ".inst 0x451e9a56 // smmla z22.s, z18.b, z30.b\n" + ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #112]\n" + "add x25, x25, #0x88\n" + ".inst 0x45049a56 // smmla z22.s, z18.b, z4.b\n" + ".inst 0x45119a47 // smmla z7.s, z18.b, z17.b\n" + "uzp1 z18.d, z22.d, z7.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp2 z7.d, z22.d, z7.d\n" + "fmul z22.s, z23.s, z3.s[2]\n" + "fmul z3.s, z23.s, z3.s[3]\n" + "scvtf z7.s, p1/m, z7.s\n" + "fmla z19.s, p1/M, z18.s, z22.s\n" + "ld1rqb { z18.b }, p1/Z, [x24, #16]\n" + "fmul z22.s, z23.s, z5.s[0]\n" + "fmla z26.s, p1/M, z7.s, z3.s\n" + "mov z3.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9923 // smmla z3.s, z9.b, z31.b\n" + ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" + "ld1rqb { z9.b }, p1/Z, [x24, #32]\n" + ".inst 0x450e9923 // smmla z3.s, z9.b, z14.b\n" + ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" + "mov z9.s, #0x0\n" + ".inst 0x451f9a49 // smmla z9.s, z18.b, z31.b\n" + "mov z31.s, #0x0\n" + ".inst 0x45069a5f // smmla z31.s, z18.b, z6.b\n" + "ld1rqb { z6.b }, p1/Z, [x24, #48]\n" + "ld1rqb { z18.b }, p1/Z, [x24, #64]\n" + ".inst 0x450e98c9 // smmla z9.s, z6.b, z14.b\n" + "fmul z14.s, z23.s, z5.s[1]\n" + ".inst 0x450298df // smmla z31.s, z6.b, z2.b\n" + "ld1rqb { z6.b }, p1/Z, [x24, #80]\n" + "fmul z2.s, z23.s, z5.s[2]\n" + "fmul z23.s, z23.s, z5.s[3]\n" + ".inst 0x451e9a43 // smmla z3.s, z18.b, z30.b\n" + ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x24, #96]\n" + ".inst 0x451e98c9 // smmla z9.s, z6.b, z30.b\n" + ".inst 0x451598df // smmla z31.s, z6.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x24, #112]\n" + "add x24, x24, #0x88\n" + ".inst 0x450498a3 // smmla z3.s, z5.b, z4.b\n" + ".inst 0x451198a7 // smmla z7.s, z5.b, z17.b\n" + ".inst 0x45049a49 // smmla z9.s, z18.b, z4.b\n" + ".inst 0x45119a5f // smmla z31.s, z18.b, z17.b\n" + "uzp1 z18.d, z3.d, z7.d\n" + "uzp2 z5.d, z3.d, z7.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp1 z6.d, z9.d, z31.d\n" + "uzp2 z9.d, z9.d, z31.d\n" + "scvtf z5.s, p1/m, z5.s\n" + "fmla z8.s, p1/M, z18.s, z22.s\n" + "scvtf z6.s, p1/m, z6.s\n" + "scvtf z9.s, p1/m, z9.s\n" + "fmla z29.s, p1/M, z5.s, z14.s\n" + "fmla z27.s, p1/M, z6.s, z2.s\n" + "fmla z10.s, p1/M, z9.s, z23.s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x10, x10, #0x8\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "st1w { z24.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z15.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z12.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z0.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z13.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z1.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z20.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z25.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z11.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z16.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z19.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z26.s 
}, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z8.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z29.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z27.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z10.s }, p1, [x20]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x13, x13, #0x10\n" + "cmp x13, #0x10\n" + "mov %x[res_ptr], x9\n" + "madd %x[a_ptr], x20, x12, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x13, 9f\n" + "5:" // Row tail: Row loop + "add x25, %x[b_ptr], #0x10\n" + "mov x24, %x[width]\n" + "add x23, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "mov z24.b, #0x0\n" + "mov z15.b, #0x0\n" + "add x28, %x[a_ptr], #0x8\n" + "mov x22, %x[num_blocks]\n" + "mov z12.b, #0x0\n" + "mov z0.b, #0x0\n" + "7:" // Row tail: Block loop + "ld1b { z3.b }, p1/Z, [x25]\n" + "ld1b { z6.b }, p1/Z, [x25, #1, MUL VL]\n" + "mov z2.s, #0x0\n" + "mov z25.s, #0x0\n" + "ld1rqb { z26.b }, p1/Z, [x28]\n" + "ld1rqb { z21.b }, p1/Z, [x28, #16]\n" + "mov z27.s, #0x0\n" + "mov z19.s, #0x0\n" + "ld1b { z29.b }, p1/Z, [x25, #2, MUL VL]\n" + "ld1b { z16.b }, p1/Z, [x25, #3, MUL VL]\n" + "sub x21, x25, #0x10\n" + "sub x20, x28, #0x8\n" + "lsl z20.b, z3.b, #0x4\n" + "lsl z4.b, z6.b, #0x4\n" + "ld1rqb { z10.b }, p1/Z, [x28, #32]\n" + "ld1rqb { z23.b }, p1/Z, [x28, #48]\n" + "and z3.b, z3.b, #0xf0\n" + "and z6.b, z6.b, #0xf0\n" + "ld1rqb { z11.b }, p1/Z, [x28, #64]\n" + "ld1rqb { z7.b }, p1/Z, [x28, #80]\n" + "lsl z8.b, z29.b, #0x4\n" + "lsl z14.b, z16.b, #0x4\n" + "ld1rqb { z18.b }, p1/Z, [x28, #96]\n" + "ld1rqb { z30.b }, p1/Z, [x28, #112]\n" + ".inst 0x45149b42 // smmla z2.s, z26.b, z20.b\n" + ".inst 0x45049b59 // smmla z25.s, z26.b, z4.b\n" + "and z29.b, z29.b, #0xf0\n" + "ld1h { z17.s }, p1/Z, [x21]\n" + ".inst 0x45149abb // smmla z27.s, z21.b, z20.b\n" + ".inst 0x45049ab3 // smmla z19.s, z21.b, z4.b\n" + "and z16.b, z16.b, #0xf0\n" + "ld1h { z4.s }, p0/Z, [x20]\n" + "subs x22, x22, #0x1\n" + "add x28, x28, #0x88\n" + "fcvt z17.s, p1/m, z17.h\n" + "add x25, x25, #0x90\n" + ".inst 0x45089942 // smmla z2.s, z10.b, z8.b\n" + ".inst 0x450e9959 // smmla z25.s, z10.b, z14.b\n" + "fcvt z4.s, p1/m, z4.h\n" + ".inst 0x45089afb // smmla z27.s, z23.b, z8.b\n" + ".inst 0x450e9af3 // smmla z19.s, z23.b, z14.b\n" + "fscale z17.s, p1/m, z17.s, z28.s\n" + "mov z4.q, z4.q[0]\n" + ".inst 0x45039962 // smmla z2.s, z11.b, z3.b\n" + ".inst 0x45069979 // smmla z25.s, z11.b, z6.b\n" + "fmul z23.s, z17.s, z4.s[0]\n" + "fmul z9.s, z17.s, z4.s[1]\n" + "fmul z21.s, z17.s, z4.s[2]\n" + "fmul z4.s, z17.s, z4.s[3]\n" + ".inst 0x450398fb // smmla z27.s, z7.b, z3.b\n" + ".inst 0x450698f3 // smmla z19.s, z7.b, z6.b\n" + ".inst 0x451d9a42 // smmla z2.s, z18.b, z29.b\n" + ".inst 0x45109a59 // smmla z25.s, z18.b, z16.b\n" + ".inst 0x451d9bdb // smmla z27.s, z30.b, z29.b\n" + ".inst 0x45109bd3 // smmla z19.s, z30.b, z16.b\n" + "uzp1 z31.d, z2.d, z25.d\n" + "uzp2 z13.d, z2.d, z25.d\n" + "scvtf z31.s, p1/m, z31.s\n" + "uzp1 z17.d, z27.d, z19.d\n" + "uzp2 z18.d, z27.d, z19.d\n" + "scvtf z13.s, p1/m, z13.s\n" + "fmla z24.s, p1/M, z31.s, z23.s\n" + "scvtf z17.s, p1/m, z17.s\n" + "scvtf z18.s, p1/m, z18.s\n" + "fmla z15.s, p1/M, z13.s, z9.s\n" + "fmla z12.s, p1/M, z17.s, z21.s\n" + "fmla z0.s, p1/M, z18.s, z4.s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x13, #0x1\n" + "st1w { z24.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x13, #0x2\n" + "st1w { z15.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x13, 
#0x3\n" + "st1w { z12.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "st1w { z0.s }, p1, [x20]\n" + "8:" // Row tail: Accumulator store skip + "subs x24, x24, #0x8\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "bne 6b\n" + "subs x13, x13, #0x4\n" + "add %x[a_ptr], %x[a_ptr], x12\n" + "mov %x[res_ptr], x23\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [height] "r" (height), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) + : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +#endif +} + +void ggml_gemm_q4_0_q8_0_aarch64_neon(size_t depth, size_t output_channels, size_t height, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); + int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); + size_t width = xend - x0; + + int64_t nb = depth / QK4_0; + const void * b_ptr = (void *)((block_q4_0x4 *) vx + ((x0 / 4) * nb)); + const void * a_ptr = vy; + float * res_ptr = s + x0; + size_t res_stride = output_channels * sizeof(float); + + assert(depth % 32 == 0); + assert(width % 4 == 0); + + size_t num_blocks = depth / 32; + + __asm__ __volatile__( + "mov x10, %x[height]\n" + "mov x9, #0x88\n" + "cmp x10, #0x10\n" + "mul x9, %x[num_blocks], x9\n" + "blt 4f\n" + "1:" // Row loop + "add x28, %x[b_ptr], #0x8\n" + "mov x27, %x[width]\n" + "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x25, %x[a_ptr], #0x8\n" + "movi v2.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "mov x24, %x[num_blocks]\n" + "add x23, x25, x9\n" + "movi v12.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "add x22, x23, x9\n" + "movi v11.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "add x21, x22, x9\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v5.16b, #0x0\n" + "movi v7.16b, #0x0\n" + "movi v4.16b, #0x0\n" + "movi v6.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "3:" // Block loop + "ldr q21, [x28, #0x0]\n" + "ldr q16, [x28, #0x10]\n" + "movi v1.16b, #0x4\n" + "movi v19.4s, #0x0\n" + "ldr q27, [x25, #0x0]\n" + "ldr q15, [x25, #0x10]\n" + "movi v26.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "ldr q29, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" + "movi v17.4s, #0x0\n" + "movi v0.16b, #0xf0\n" + "ldr d20, [x25, #-0x8]\n" + "ldr d9, [x23, #-0x8]\n" + "sshl v8.16b, v21.16b, v1.16b\n" + "sshl v31.16b, v16.16b, v1.16b\n" + "and v21.16b, v21.16b, v0.16b\n" + "and v16.16b, v16.16b, v0.16b\n" + "sub x20, x28, #0x8\n" + "subs x24, x24, #0x1\n" + "add x28, x28, #0x48\n" + ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n" + ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n" + "ldr q27, [x25, #0x20]\n" + ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n" + ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n" + "sshl v15.16b, v29.16b, v1.16b\n" + "sshl v1.16b, v3.16b, v1.16b\n" + "and v29.16b, v29.16b, v0.16b\n" + "and v3.16b, v3.16b, v0.16b\n" + "ldr q0, [x25, #0x30]\n" + "fcvtl v20.4s, v20.4h\n" + ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, 
v15.16b\n" + "fcvtl v9.4s, v9.4h\n" + ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n" + "ldr q27, [x25, #0x40]\n" + ".inst 0x4e8fa412 // smmla v18.4s, v0.16b, v15.16b\n" + ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n" + "ldr q0, [x25, #0x50]\n" + ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n" + ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n" + "ldr q27, [x25, #0x60]\n" + ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n" + ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n" + "ldr q0, [x25, #0x70]\n" + "add x25, x25, #0x88\n" + ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n" + ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n" + "ldr d27, [x20, #0x0]\n" + ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n" + ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n" + "fcvtl v27.4s, v27.4h\n" + "uzp1 v0.2d, v19.2d, v26.2d\n" + "uzp2 v26.2d, v19.2d, v26.2d\n" + "fmul v19.4s, v27.4s, v20.s[0]\n" + "scvtf v0.4s, v0.4s, #0x4\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "fmla v2.4s, v0.4s, v19.4s\n" + "ldr q19, [x23, #0x0]\n" + "uzp1 v0.2d, v18.2d, v17.2d\n" + "uzp2 v18.2d, v18.2d, v17.2d\n" + "fmul v17.4s, v27.4s, v20.s[1]\n" + "scvtf v0.4s, v0.4s, #0x4\n" + "scvtf v18.4s, v18.4s, #0x4\n" + "fmla v10.4s, v26.4s, v17.4s\n" + "ldr q17, [x23, #0x10]\n" + "fmul v26.4s, v27.4s, v20.s[2]\n" + "fmul v20.4s, v27.4s, v20.s[3]\n" + "fmla v12.4s, v0.4s, v26.4s\n" + "ldr d0, [x22, #-0x8]\n" + "ldr d26, [x21, #-0x8]\n" + "fcvtl v0.4s, v0.4h\n" + "fmla v28.4s, v18.4s, v20.4s\n" + "movi v20.4s, #0x0\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" + ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" + "ldr q19, [x23, #0x20]\n" + "fcvtl v26.4s, v26.4h\n" + ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" + ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" + "ldr q19, [x23, #0x40]\n" + ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" + ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" + "ldr q19, [x23, #0x60]\n" + ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n" + ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n" + "uzp1 v19.2d, v20.2d, v18.2d\n" + "scvtf v19.4s, v19.4s, #0x4\n" + "uzp2 v20.2d, v20.2d, v18.2d\n" + "fmul v18.4s, v27.4s, v9.s[0]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v11.4s, v19.4s, v18.4s\n" + "ldr q18, [x22, #0x0]\n" + "fmul v19.4s, v27.4s, v9.s[1]\n" + "fmla v13.4s, v20.4s, v19.4s\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n" + ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n" + "ldr q17, [x23, #0x30]\n" + ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n" + ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n" + "ldr q17, [x23, #0x50]\n" + ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n" + ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n" + "ldr q17, [x23, #0x70]\n" + "add x23, x23, #0x88\n" + ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n" + ".inst 0x4e83a634 // smmla v20.4s, v17.16b, v3.16b\n" + "uzp1 v17.2d, v19.2d, v20.2d\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "uzp2 v20.2d, v19.2d, v20.2d\n" + "fmul v19.4s, v27.4s, v9.s[2]\n" + "fmul v9.4s, v27.4s, v9.s[3]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v22.4s, v17.4s, v19.4s\n" + "ldr q17, [x22, #0x10]\n" + "movi v19.4s, #0x0\n" + ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n" + "fmla v23.4s, v20.4s, v9.4s\n" + "movi v20.4s, #0x0\n" + "movi v9.4s, #0x0\n" + ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n" + "ldr q18, [x22, #0x20]\n" + 
".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" + ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n" + ".inst 0x4e81a654 // smmla v20.4s, v18.16b, v1.16b\n" + "ldr q18, [x22, #0x40]\n" + ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n" + ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n" + "ldr q18, [x22, #0x60]\n" + ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n" + ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n" + "ldr q17, [x22, #0x30]\n" + ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" + ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n" + "ldr q17, [x22, #0x50]\n" + ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n" + ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n" + "ldr q17, [x22, #0x70]\n" + "add x22, x22, #0x88\n" + ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n" + ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n" + "uzp1 v17.2d, v19.2d, v20.2d\n" + "uzp2 v20.2d, v19.2d, v20.2d\n" + "fmul v19.4s, v27.4s, v0.s[0]\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v25.4s, v17.4s, v19.4s\n" + "ldr q19, [x21, #0x0]\n" + "fmul v17.4s, v27.4s, v0.s[1]\n" + "fmla v5.4s, v20.4s, v17.4s\n" + "ldr q17, [x21, #0x10]\n" + "uzp1 v20.2d, v9.2d, v18.2d\n" + "uzp2 v9.2d, v9.2d, v18.2d\n" + "fmul v18.4s, v27.4s, v0.s[2]\n" + "fmul v0.4s, v27.4s, v0.s[3]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "scvtf v9.4s, v9.4s, #0x4\n" + "fmla v7.4s, v20.4s, v18.4s\n" + "movi v20.4s, #0x0\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" + ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" + "ldr q19, [x21, #0x20]\n" + "fmla v4.4s, v9.4s, v0.4s\n" + "movi v9.4s, #0x0\n" + "movi v0.4s, #0x0\n" + ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" + "fmul v8.4s, v27.4s, v26.s[0]\n" + ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n" + "ldr q17, [x21, #0x30]\n" + ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" + "fmul v31.4s, v27.4s, v26.s[1]\n" + ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" + "ldr q19, [x21, #0x40]\n" + ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" + "fmul v15.4s, v27.4s, v26.s[2]\n" + "fmul v27.4s, v27.4s, v26.s[3]\n" + ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n" + "ldr q1, [x21, #0x50]\n" + ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" + ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" + "ldr q26, [x21, #0x60]\n" + ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n" + ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n" + "ldr q21, [x21, #0x70]\n" + "add x21, x21, #0x88\n" + ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n" + ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n" + ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n" + ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n" + "uzp1 v29.2d, v20.2d, v18.2d\n" + "uzp2 v21.2d, v20.2d, v18.2d\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "uzp1 v18.2d, v9.2d, v0.2d\n" + "uzp2 v16.2d, v9.2d, v0.2d\n" + "scvtf v21.4s, v21.4s, #0x4\n" + "fmla v6.4s, v29.4s, v8.4s\n" + "scvtf v18.4s, v18.4s, #0x4\n" + "scvtf v16.4s, v16.4s, #0x4\n" + "fmla v30.4s, v21.4s, v31.4s\n" + "fmla v24.4s, v18.4s, v15.4s\n" + "fmla v14.4s, v16.4s, v27.4s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x27, x27, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "str q2, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q10, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q12, [x20, #0x0]\n" + "add 
x20, x20, %x[res_stride]\n" + "str q28, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q11, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q13, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q22, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q23, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q25, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q5, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q7, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q4, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q6, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q30, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q24, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q14, [x20, #0x0]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x10, x10, #0x10\n" + "cmp x10, #0x10\n" + "mov %x[res_ptr], x26\n" + "madd %x[a_ptr], x20, x9, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x10, 9f\n" + "5:" // Row tail: Row loop + "add x24, %x[b_ptr], #0x8\n" + "mov x23, %x[width]\n" + "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "movi v2.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "add x25, %x[a_ptr], #0x8\n" + "mov x21, %x[num_blocks]\n" + "movi v12.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "7:" // Row tail: Block loop + "ldr q6, [x24, #0x0]\n" + "ldr q5, [x24, #0x10]\n" + "movi v17.16b, #0x4\n" + "movi v8.4s, #0x0\n" + "ldr q4, [x25, #0x0]\n" + "ldr q13, [x25, #0x10]\n" + "movi v27.4s, #0x0\n" + "movi v0.4s, #0x0\n" + "ldr q31, [x24, #0x20]\n" + "ldr q14, [x24, #0x30]\n" + "movi v29.4s, #0x0\n" + "movi v22.16b, #0xf0\n" + "ldr q11, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "sshl v21.16b, v6.16b, v17.16b\n" + "sshl v16.16b, v5.16b, v17.16b\n" + "ldr q20, [x25, #0x40]\n" + "ldr q26, [x25, #0x50]\n" + "and v6.16b, v6.16b, v22.16b\n" + "and v5.16b, v5.16b, v22.16b\n" + "ldr q25, [x25, #0x60]\n" + "ldr q3, [x25, #0x70]\n" + "sshl v19.16b, v31.16b, v17.16b\n" + "sshl v18.16b, v14.16b, v17.16b\n" + "ldr d17, [x25, #-0x8]\n" + ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n" + ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n" + "and v31.16b, v31.16b, v22.16b\n" + ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n" + ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n" + "and v14.16b, v14.16b, v22.16b\n" + "sub x20, x24, #0x8\n" + "ldr d16, [x20, #0x0]\n" + "subs x21, x21, #0x1\n" + "add x25, x25, #0x88\n" + "fcvtl v17.4s, v17.4h\n" + "add x24, x24, #0x48\n" + ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n" + ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n" + ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n" + ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n" + "fcvtl v16.4s, v16.4h\n" + ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n" + ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n" + "fmul v23.4s, v16.4s, v17.s[0]\n" + "fmul v21.4s, v16.4s, v17.s[1]\n" + "fmul v1.4s, v16.4s, v17.s[2]\n" + "fmul v20.4s, v16.4s, v17.s[3]\n" + ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n" + ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n" + ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n" + ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n" + ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n" + ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n" + "uzp1 v19.2d, v8.2d, v27.2d\n" + "uzp2 v18.2d, v8.2d, v27.2d\n" + "scvtf v19.4s, v19.4s, #0x4\n" + "uzp1 v17.2d, v0.2d, v29.2d\n" + "uzp2 v16.2d, v0.2d, v29.2d\n" + "scvtf v18.4s, 
v18.4s, #0x4\n" + "fmla v2.4s, v19.4s, v23.4s\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "scvtf v16.4s, v16.4s, #0x4\n" + "fmla v10.4s, v18.4s, v21.4s\n" + "fmla v12.4s, v17.4s, v1.4s\n" + "fmla v28.4s, v16.4s, v20.4s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x10, #0x1\n" + "str q2, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x2\n" + "str q10, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x3\n" + "str q12, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "str q28, [x20, #0x0]\n" + "8:" // Row tail: Accumulator store skip + "subs x23, x23, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "bne 6b\n" + "subs x10, x10, #0x4\n" + "add %x[a_ptr], %x[a_ptr], x9\n" + "mov %x[res_ptr], x22\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [height] "r" (height), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +#endif +} + +void ggml_gemm_q4_0_q8_0_aarch64_neon_noi8mm(size_t depth, size_t output_channels, size_t height, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +#if defined(__ARM_NEON) + int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); + int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); + size_t width = xend - x0; + + int64_t nb = depth / QK4_0; + void * b_ptr = (void *)((block_q4_0x4 *) vx + ((x0/4) * nb)); + void * a_ptr = vy; + float * res_ptr = s + x0; + size_t res_stride = output_channels * sizeof(float); + + assert(depth % 32 == 0); + assert(width % 4 == 0); + + size_t num_blocks = depth / 32; + + __asm__ __volatile__( + "mov x10, %x[height]\n" + "mov x9, #0x88\n" + "cmp x10, #0x10\n" + "mul x9, %x[num_blocks], x9\n" + "blt 4f\n" + "1:" // Row loop + "add x28, %x[b_ptr], #0x8\n" + "mov x27, %x[width]\n" + "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x25, %x[a_ptr], #0x8\n" + "movi v15.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "mov x24, %x[num_blocks]\n" + "add x23, x25, x9\n" + "movi v18.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "add x22, x23, x9\n" + "movi v11.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "add x21, x22, x9\n" + "movi v23.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v7.16b, #0x0\n" + "movi v0.16b, #0x0\n" + "movi v4.16b, #0x0\n" + "movi v5.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v8.16b, #0x0\n" + "movi v1.16b, #0x0\n" + "3:" // Block loop + "ldr q3, [x28, #0x0]\n" + "ldr q31, [x25, #0x0]\n" + "movi v28.16b, #0x4\n" + "movi v10.4s, #0x0\n" + "ldr q22, [x28, #0x10]\n" + "ldr q6, [x25, #0x10]\n" + "movi v29.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "ldr q27, [x28, #0x20]\n" + "ldr q30, [x28, #0x30]\n" + "movi v20.4s, #0x0\n" + "movi v24.16b, #0xf0\n" + "ldr d2, [x25, #-0x8]\n" + "ldr d26, [x23, #-0x8]\n" + "sshl v12.16b, v3.16b, v28.16b\n" + "sub x20, x28, #0x8\n" + "ldr d17, [x20, #0x0]\n" + "and v3.16b, v3.16b, v24.16b\n" + "subs x24, x24, #0x1\n" + "add x28, x28, #0x48\n" + ".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n" + ".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n" + ".inst 0x4f9fe989 // sdot v9.4s, v12.16b, 
v31.4b[2]\n" + ".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n" + "sshl v31.16b, v22.16b, v28.16b\n" + "and v22.16b, v22.16b, v24.16b\n" + "fcvtl v17.4s, v17.4h\n" + "fcvtl v2.4s, v2.4h\n" + "fcvtl v26.4s, v26.4h\n" + ".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n" + ".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n" + ".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n" + ".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n" + "sshl v6.16b, v27.16b, v28.16b\n" + "sshl v28.16b, v30.16b, v28.16b\n" + "and v27.16b, v27.16b, v24.16b\n" + "and v30.16b, v30.16b, v24.16b\n" + "ldr q24, [x25, #0x20]\n" + ".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n" + ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n" + ".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n" + ".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x30]\n" + ".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n" + ".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n" + ".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n" + ".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x40]\n" + ".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n" + ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n" + ".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n" + ".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x50]\n" + ".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n" + ".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n" + ".inst 0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n" + ".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x60]\n" + ".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n" + ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n" + ".inst 0x4f98eb69 // sdot v9.4s, v27.16b, v24.4b[2]\n" + ".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x70]\n" + "add x25, x25, #0x88\n" + ".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n" + ".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n" + ".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n" + ".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n" + "fmul v24.4s, v17.4s, v2.s[0]\n" + "scvtf v10.4s, v10.4s, #0x4\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "scvtf v9.4s, v9.4s, #0x4\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v15.4s, v10.4s, v24.4s\n" + "ldr q24, [x23, #0x0]\n" + "fmul v10.4s, v17.4s, v2.s[1]\n" + "fmla v19.4s, v29.4s, v10.4s\n" + "ldr q10, [x23, #0x10]\n" + "fmul v29.4s, v17.4s, v2.s[2]\n" + "fmul v2.4s, v17.4s, v2.s[3]\n" + "fmla v18.4s, v9.4s, v29.4s\n" + "movi v9.4s, #0x0\n" + "movi v29.4s, #0x0\n" + ".inst 0x4f98e189 // sdot v9.4s, v12.16b, v24.4b[0]\n" + ".inst 0x4fb8e19d // sdot v29.4s, v12.16b, v24.4b[1]\n" + "fmla v14.4s, v20.4s, v2.4s\n" + "movi v20.4s, #0x0\n" + "movi v2.4s, #0x0\n" + ".inst 0x4f98e994 // sdot v20.4s, v12.16b, v24.4b[2]\n" + ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n" + "ldr q24, [x23, #0x20]\n" + ".inst 0x4f8ae3e9 // sdot v9.4s, v31.16b, v10.4b[0]\n" + ".inst 0x4faae3fd // sdot v29.4s, v31.16b, v10.4b[1]\n" + ".inst 0x4f8aebf4 // sdot v20.4s, v31.16b, v10.4b[2]\n" + ".inst 0x4faaebe2 // sdot v2.4s, v31.16b, v10.4b[3]\n" + "ldr q10, [x23, #0x30]\n" + ".inst 0x4f98e0c9 // sdot v9.4s, v6.16b, v24.4b[0]\n" + ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n" + ".inst 0x4f98e8d4 // sdot v20.4s, v6.16b, v24.4b[2]\n" + ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n" + "ldr q24, [x23, #0x40]\n" + ".inst 0x4f8ae389 // sdot v9.4s, v28.16b, v10.4b[0]\n" + ".inst 
0x4faae39d // sdot v29.4s, v28.16b, v10.4b[1]\n" + ".inst 0x4f8aeb94 // sdot v20.4s, v28.16b, v10.4b[2]\n" + ".inst 0x4faaeb82 // sdot v2.4s, v28.16b, v10.4b[3]\n" + "ldr q10, [x23, #0x50]\n" + ".inst 0x4f98e069 // sdot v9.4s, v3.16b, v24.4b[0]\n" + ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n" + ".inst 0x4f98e874 // sdot v20.4s, v3.16b, v24.4b[2]\n" + ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n" + "ldr q24, [x23, #0x60]\n" + ".inst 0x4f8ae2c9 // sdot v9.4s, v22.16b, v10.4b[0]\n" + ".inst 0x4faae2dd // sdot v29.4s, v22.16b, v10.4b[1]\n" + ".inst 0x4f8aead4 // sdot v20.4s, v22.16b, v10.4b[2]\n" + ".inst 0x4faaeac2 // sdot v2.4s, v22.16b, v10.4b[3]\n" + "ldr q10, [x23, #0x70]\n" + "add x23, x23, #0x88\n" + ".inst 0x4f98e369 // sdot v9.4s, v27.16b, v24.4b[0]\n" + ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n" + ".inst 0x4f98eb74 // sdot v20.4s, v27.16b, v24.4b[2]\n" + ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x0]\n" + ".inst 0x4f8ae3c9 // sdot v9.4s, v30.16b, v10.4b[0]\n" + ".inst 0x4faae3dd // sdot v29.4s, v30.16b, v10.4b[1]\n" + ".inst 0x4f8aebd4 // sdot v20.4s, v30.16b, v10.4b[2]\n" + ".inst 0x4faaebc2 // sdot v2.4s, v30.16b, v10.4b[3]\n" + "fmul v10.4s, v17.4s, v26.s[0]\n" + "scvtf v9.4s, v9.4s, #0x4\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "scvtf v2.4s, v2.4s, #0x4\n" + "fmla v11.4s, v9.4s, v10.4s\n" + "ldr q9, [x22, #0x10]\n" + "fmul v10.4s, v17.4s, v26.s[1]\n" + "fmla v13.4s, v29.4s, v10.4s\n" + "ldr d29, [x22, #-0x8]\n" + "fmul v10.4s, v17.4s, v26.s[2]\n" + "fmul v26.4s, v17.4s, v26.s[3]\n" + "fcvtl v29.4s, v29.4h\n" + "fmla v23.4s, v20.4s, v10.4s\n" + "movi v20.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "fmla v16.4s, v2.4s, v26.4s\n" + "movi v26.4s, #0x0\n" + "movi v2.4s, #0x0\n" + ".inst 0x4f98e194 // sdot v20.4s, v12.16b, v24.4b[0]\n" + ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n" + ".inst 0x4f98e99a // sdot v26.4s, v12.16b, v24.4b[2]\n" + ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x20]\n" + ".inst 0x4f89e3f4 // sdot v20.4s, v31.16b, v9.4b[0]\n" + ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n" + ".inst 0x4f89ebfa // sdot v26.4s, v31.16b, v9.4b[2]\n" + ".inst 0x4fa9ebe2 // sdot v2.4s, v31.16b, v9.4b[3]\n" + "ldr q9, [x22, #0x30]\n" + ".inst 0x4f98e0d4 // sdot v20.4s, v6.16b, v24.4b[0]\n" + ".inst 0x4fb8e0ca // sdot v10.4s, v6.16b, v24.4b[1]\n" + ".inst 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n" + ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x40]\n" + ".inst 0x4f89e394 // sdot v20.4s, v28.16b, v9.4b[0]\n" + ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n" + ".inst 0x4f89eb9a // sdot v26.4s, v28.16b, v9.4b[2]\n" + ".inst 0x4fa9eb82 // sdot v2.4s, v28.16b, v9.4b[3]\n" + "ldr q9, [x22, #0x50]\n" + ".inst 0x4f98e074 // sdot v20.4s, v3.16b, v24.4b[0]\n" + ".inst 0x4fb8e06a // sdot v10.4s, v3.16b, v24.4b[1]\n" + ".inst 0x4f98e87a // sdot v26.4s, v3.16b, v24.4b[2]\n" + ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x60]\n" + ".inst 0x4f89e2d4 // sdot v20.4s, v22.16b, v9.4b[0]\n" + ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n" + ".inst 0x4f89eada // sdot v26.4s, v22.16b, v9.4b[2]\n" + ".inst 0x4fa9eac2 // sdot v2.4s, v22.16b, v9.4b[3]\n" + "ldr q9, [x22, #0x70]\n" + "add x22, x22, #0x88\n" + ".inst 0x4f98e374 // sdot v20.4s, v27.16b, v24.4b[0]\n" + ".inst 0x4fb8e36a // sdot v10.4s, v27.16b, v24.4b[1]\n" + ".inst 0x4f98eb7a // sdot v26.4s, v27.16b, v24.4b[2]\n" + ".inst 0x4fb8eb62 // 
sdot v2.4s, v27.16b, v24.4b[3]\n" + "ldr q24, [x21, #0x0]\n" + ".inst 0x4f89e3d4 // sdot v20.4s, v30.16b, v9.4b[0]\n" + ".inst 0x4fa9e3ca // sdot v10.4s, v30.16b, v9.4b[1]\n" + ".inst 0x4f89ebda // sdot v26.4s, v30.16b, v9.4b[2]\n" + ".inst 0x4fa9ebc2 // sdot v2.4s, v30.16b, v9.4b[3]\n" + "fmul v9.4s, v17.4s, v29.s[0]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "scvtf v10.4s, v10.4s, #0x4\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "scvtf v2.4s, v2.4s, #0x4\n" + "fmla v25.4s, v20.4s, v9.4s\n" + "ldr q9, [x21, #0x10]\n" + "fmul v20.4s, v17.4s, v29.s[1]\n" + "fmla v7.4s, v10.4s, v20.4s\n" + "ldr d20, [x21, #-0x8]\n" + "fmul v10.4s, v17.4s, v29.s[2]\n" + "fmul v29.4s, v17.4s, v29.s[3]\n" + "fcvtl v20.4s, v20.4h\n" + "fmla v0.4s, v26.4s, v10.4s\n" + "movi v26.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "fmla v4.4s, v2.4s, v29.4s\n" + "movi v2.4s, #0x0\n" + "movi v29.4s, #0x0\n" + ".inst 0x4f98e19a // sdot v26.4s, v12.16b, v24.4b[0]\n" + ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n" + ".inst 0x4f98e982 // sdot v2.4s, v12.16b, v24.4b[2]\n" + ".inst 0x4fb8e99d // sdot v29.4s, v12.16b, v24.4b[3]\n" + "ldr q12, [x21, #0x20]\n" + "fmul v24.4s, v17.4s, v20.s[0]\n" + ".inst 0x4f89e3fa // sdot v26.4s, v31.16b, v9.4b[0]\n" + ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n" + ".inst 0x4f89ebe2 // sdot v2.4s, v31.16b, v9.4b[2]\n" + ".inst 0x4fa9ebfd // sdot v29.4s, v31.16b, v9.4b[3]\n" + "ldr q9, [x21, #0x30]\n" + "fmul v31.4s, v17.4s, v20.s[1]\n" + ".inst 0x4f8ce0da // sdot v26.4s, v6.16b, v12.4b[0]\n" + ".inst 0x4face0ca // sdot v10.4s, v6.16b, v12.4b[1]\n" + ".inst 0x4f8ce8c2 // sdot v2.4s, v6.16b, v12.4b[2]\n" + ".inst 0x4face8dd // sdot v29.4s, v6.16b, v12.4b[3]\n" + "ldr q12, [x21, #0x40]\n" + "fmul v6.4s, v17.4s, v20.s[2]\n" + "fmul v20.4s, v17.4s, v20.s[3]\n" + ".inst 0x4f89e39a // sdot v26.4s, v28.16b, v9.4b[0]\n" + ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n" + ".inst 0x4f89eb82 // sdot v2.4s, v28.16b, v9.4b[2]\n" + ".inst 0x4fa9eb9d // sdot v29.4s, v28.16b, v9.4b[3]\n" + "ldr q9, [x21, #0x50]\n" + ".inst 0x4f8ce07a // sdot v26.4s, v3.16b, v12.4b[0]\n" + ".inst 0x4face06a // sdot v10.4s, v3.16b, v12.4b[1]\n" + ".inst 0x4f8ce862 // sdot v2.4s, v3.16b, v12.4b[2]\n" + ".inst 0x4face87d // sdot v29.4s, v3.16b, v12.4b[3]\n" + "ldr q12, [x21, #0x60]\n" + ".inst 0x4f89e2da // sdot v26.4s, v22.16b, v9.4b[0]\n" + ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n" + ".inst 0x4f89eac2 // sdot v2.4s, v22.16b, v9.4b[2]\n" + ".inst 0x4fa9eadd // sdot v29.4s, v22.16b, v9.4b[3]\n" + "ldr q17, [x21, #0x70]\n" + "add x21, x21, #0x88\n" + ".inst 0x4f8ce37a // sdot v26.4s, v27.16b, v12.4b[0]\n" + ".inst 0x4face36a // sdot v10.4s, v27.16b, v12.4b[1]\n" + ".inst 0x4f8ceb62 // sdot v2.4s, v27.16b, v12.4b[2]\n" + ".inst 0x4faceb7d // sdot v29.4s, v27.16b, v12.4b[3]\n" + ".inst 0x4f91e3da // sdot v26.4s, v30.16b, v17.4b[0]\n" + ".inst 0x4fb1e3ca // sdot v10.4s, v30.16b, v17.4b[1]\n" + ".inst 0x4f91ebc2 // sdot v2.4s, v30.16b, v17.4b[2]\n" + ".inst 0x4fb1ebdd // sdot v29.4s, v30.16b, v17.4b[3]\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "scvtf v10.4s, v10.4s, #0x4\n" + "fmla v5.4s, v26.4s, v24.4s\n" + "scvtf v2.4s, v2.4s, #0x4\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "fmla v21.4s, v10.4s, v31.4s\n" + "fmla v8.4s, v2.4s, v6.4s\n" + "fmla v1.4s, v29.4s, v20.4s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x27, x27, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "str q15, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q19, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q18, [x20, #0x0]\n" + 
"add x20, x20, %x[res_stride]\n" + "str q14, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q11, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q13, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q23, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q16, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q25, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q7, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q0, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q4, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q5, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q21, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q8, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q1, [x20, #0x0]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x10, x10, #0x10\n" + "cmp x10, #0x10\n" + "mov %x[res_ptr], x26\n" + "madd %x[a_ptr], x20, x9, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x10, 9f\n" + "5:" // Row tail: Row loop + "add x24, %x[b_ptr], #0x8\n" + "mov x23, %x[width]\n" + "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "movi v15.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "add x25, %x[a_ptr], #0x8\n" + "mov x21, %x[num_blocks]\n" + "movi v18.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "7:" // Row tail: Block loop + "ldr q7, [x24, #0x0]\n" + "ldr q5, [x25, #0x0]\n" + "movi v9.16b, #0x4\n" + "movi v4.4s, #0x0\n" + "ldr q3, [x24, #0x10]\n" + "ldr q2, [x25, #0x10]\n" + "movi v1.4s, #0x0\n" + "movi v0.4s, #0x0\n" + "ldr q13, [x24, #0x20]\n" + "ldr q31, [x25, #0x20]\n" + "movi v30.4s, #0x0\n" + "movi v29.16b, #0xf0\n" + "ldr q28, [x24, #0x30]\n" + "ldr q27, [x25, #0x30]\n" + "sshl v20.16b, v7.16b, v9.16b\n" + "sub x20, x24, #0x8\n" + "ldr q26, [x25, #0x40]\n" + "ldr q25, [x25, #0x50]\n" + "sshl v17.16b, v3.16b, v9.16b\n" + "and v7.16b, v7.16b, v29.16b\n" + "ldr q24, [x25, #0x60]\n" + "ldr q16, [x25, #0x70]\n" + "sshl v22.16b, v13.16b, v9.16b\n" + "and v3.16b, v3.16b, v29.16b\n" + "ldr d21, [x20, #0x0]\n" + "ldr d12, [x25, #-0x8]\n" + ".inst 0x4f85e284 // sdot v4.4s, v20.16b, v5.4b[0]\n" + ".inst 0x4fa5e281 // sdot v1.4s, v20.16b, v5.4b[1]\n" + ".inst 0x4f85ea80 // sdot v0.4s, v20.16b, v5.4b[2]\n" + ".inst 0x4fa5ea9e // sdot v30.4s, v20.16b, v5.4b[3]\n" + "sshl v9.16b, v28.16b, v9.16b\n" + "subs x21, x21, #0x1\n" + "and v13.16b, v13.16b, v29.16b\n" + "and v28.16b, v28.16b, v29.16b\n" + "add x25, x25, #0x88\n" + "add x24, x24, #0x48\n" + "fcvtl v21.4s, v21.4h\n" + "fcvtl v12.4s, v12.4h\n" + ".inst 0x4f82e224 // sdot v4.4s, v17.16b, v2.4b[0]\n" + ".inst 0x4fa2e221 // sdot v1.4s, v17.16b, v2.4b[1]\n" + ".inst 0x4f82ea20 // sdot v0.4s, v17.16b, v2.4b[2]\n" + ".inst 0x4fa2ea3e // sdot v30.4s, v17.16b, v2.4b[3]\n" + "fmul v11.4s, v21.4s, v12.s[0]\n" + "fmul v23.4s, v21.4s, v12.s[1]\n" + "fmul v17.4s, v21.4s, v12.s[2]\n" + ".inst 0x4f9fe2c4 // sdot v4.4s, v22.16b, v31.4b[0]\n" + "fmul v6.4s, v21.4s, v12.s[3]\n" + ".inst 0x4fbfe2c1 // sdot v1.4s, v22.16b, v31.4b[1]\n" + ".inst 0x4f9feac0 // sdot v0.4s, v22.16b, v31.4b[2]\n" + ".inst 0x4fbfeade // sdot v30.4s, v22.16b, v31.4b[3]\n" + ".inst 0x4f9be124 // sdot v4.4s, v9.16b, v27.4b[0]\n" + ".inst 0x4fbbe121 // sdot v1.4s, v9.16b, v27.4b[1]\n" + ".inst 0x4f9be920 // sdot v0.4s, v9.16b, v27.4b[2]\n" + ".inst 0x4fbbe93e // sdot v30.4s, v9.16b, v27.4b[3]\n" + ".inst 0x4f9ae0e4 // sdot v4.4s, v7.16b, v26.4b[0]\n" + ".inst 0x4fbae0e1 // sdot v1.4s, v7.16b, v26.4b[1]\n" + ".inst 0x4f9ae8e0 // sdot v0.4s, v7.16b, v26.4b[2]\n" + ".inst 
0x4fbae8fe // sdot v30.4s, v7.16b, v26.4b[3]\n" + ".inst 0x4f99e064 // sdot v4.4s, v3.16b, v25.4b[0]\n" + ".inst 0x4fb9e061 // sdot v1.4s, v3.16b, v25.4b[1]\n" + ".inst 0x4f99e860 // sdot v0.4s, v3.16b, v25.4b[2]\n" + ".inst 0x4fb9e87e // sdot v30.4s, v3.16b, v25.4b[3]\n" + ".inst 0x4f98e1a4 // sdot v4.4s, v13.16b, v24.4b[0]\n" + ".inst 0x4fb8e1a1 // sdot v1.4s, v13.16b, v24.4b[1]\n" + ".inst 0x4f98e9a0 // sdot v0.4s, v13.16b, v24.4b[2]\n" + ".inst 0x4fb8e9be // sdot v30.4s, v13.16b, v24.4b[3]\n" + ".inst 0x4f90e384 // sdot v4.4s, v28.16b, v16.4b[0]\n" + ".inst 0x4fb0e381 // sdot v1.4s, v28.16b, v16.4b[1]\n" + ".inst 0x4f90eb80 // sdot v0.4s, v28.16b, v16.4b[2]\n" + ".inst 0x4fb0eb9e // sdot v30.4s, v28.16b, v16.4b[3]\n" + "scvtf v4.4s, v4.4s, #0x4\n" + "scvtf v1.4s, v1.4s, #0x4\n" + "scvtf v0.4s, v0.4s, #0x4\n" + "fmla v15.4s, v4.4s, v11.4s\n" + "scvtf v30.4s, v30.4s, #0x4\n" + "fmla v19.4s, v1.4s, v23.4s\n" + "fmla v18.4s, v0.4s, v17.4s\n" + "fmla v14.4s, v30.4s, v6.4s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x10, #0x1\n" + "str q15, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x2\n" + "str q19, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x3\n" + "str q18, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "str q14, [x20, #0x0]\n" + "8:" // Row tail: Accumulator store skip + "subs x23, x23, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "bne 6b\n" + "subs x10, x10, #0x4\n" + "add %x[a_ptr], %x[a_ptr], x9\n" + "mov %x[res_ptr], x22\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [height] "r" (height), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +#endif +} + +void ggml_gemm_q8_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { #if defined(__ARM_FEATURE_MATMUL_INT8) - int rows = 2; int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); @@ -15570,22 +17023,27 @@ void ggml_gemm_q8_0_q8_0_2x4blocked_mmla(const int n, int output_channels, int i int64_t a_nb = n / QK8_0; const block_q8_0x4 * b_ptr_start = vx; - const block_q8_0x2 * a_ptr_start = vy; + const block_q8_0x4 * a_ptr_start = vy; - for (int64_t y = 0; y < input_width / 2; y += rows / 2) { + for (int64_t y = 0; y < input_width / 4; y += rows / 4) { for (int64_t x = x0 / 4; x < xend / 4; x++) { - const block_q8_0x2 * a_ptrs[rows / 2]; + const block_q8_0x4 * a_ptrs[rows / 4]; a_ptrs[0] = a_ptr_start + (y * a_nb); + for (int i = 0; i < (rows / 4) - 1; i++) { + a_ptrs[i + 1] = a_ptrs[i] + a_nb; + } const block_q8_0x4 * b_ptr = b_ptr_start + (x * nb); // Master FP accumulators float32x4_t acc_rows[rows]; - acc_rows[0] = vdupq_n_f32(0.0f); - acc_rows[1] = vdupq_n_f32(0.0f); + for (int i = 0; i < rows; i++) { + acc_rows[i] = vdupq_n_f32(0.0f); + } for (int64_t b = 0; b < nb; b++) { + // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) const int8x16_t rhs_mat_01_0 = vld1q_s8(b_ptr[b].qs); const int8x16_t rhs_mat_23_0 = 
vld1q_s8(b_ptr[b].qs + 16); const int8x16_t rhs_mat_01_1 = vld1q_s8(b_ptr[b].qs + 32); @@ -15600,33 +17058,46 @@ void ggml_gemm_q8_0_q8_0_2x4blocked_mmla(const int n, int output_channels, int i const float32x4_t col_scale_f32 = vcvt_f32_f16(col_scale_f16); // Process LHS in pairs of rows - int rp = 0; - const int8x16_t lhs_mat_01_0 = vld1q_s8(a_ptrs[rp][b].qs); - const int8x16_t lhs_mat_01_1 = vld1q_s8(a_ptrs[rp][b].qs + 16); + for (int rp = 0; rp < rows / 4; rp++) { + const int8x16_t lhs_mat_01_0 = vld1q_s8(a_ptrs[rp][b].qs); + const int8x16_t lhs_mat_23_0 = vld1q_s8(a_ptrs[rp][b].qs + 16); + const int8x16_t lhs_mat_01_1 = vld1q_s8(a_ptrs[rp][b].qs + 32); + const int8x16_t lhs_mat_23_1 = vld1q_s8(a_ptrs[rp][b].qs + 48); - const int8x16_t lhs_mat_01_2 = vld1q_s8(a_ptrs[rp][b].qs + 32); - const int8x16_t lhs_mat_01_3 = vld1q_s8(a_ptrs[rp][b].qs + 48); + const int8x16_t lhs_mat_01_2 = vld1q_s8(a_ptrs[rp][b].qs + 64); + const int8x16_t lhs_mat_23_2 = vld1q_s8(a_ptrs[rp][b].qs + 80); + const int8x16_t lhs_mat_01_3 = vld1q_s8(a_ptrs[rp][b].qs + 96); + const int8x16_t lhs_mat_23_3 = vld1q_s8(a_ptrs[rp][b].qs + 112); - // Do the MMLAs into 2x2 matrices - const int32x4_t iacc_mat_00 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_01_0), lhs_mat_01_1, rhs_mat_01_1), lhs_mat_01_2, rhs_mat_01_2), lhs_mat_01_3, rhs_mat_01_3); - const int32x4_t iacc_mat_01 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_23_0), lhs_mat_01_1, rhs_mat_23_1), lhs_mat_01_2, rhs_mat_23_2), lhs_mat_01_3, rhs_mat_23_3); + // Do the MMLAs into 2x2 matrices + const int32x4_t iacc_mat_00 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_01_0), lhs_mat_01_1, rhs_mat_01_1), lhs_mat_01_2, rhs_mat_01_2), lhs_mat_01_3, rhs_mat_01_3); + const int32x4_t iacc_mat_01 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_23_0), lhs_mat_01_1, rhs_mat_23_1), lhs_mat_01_2, rhs_mat_23_2), lhs_mat_01_3, rhs_mat_23_3); + const int32x4_t iacc_mat_10 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_23_0, rhs_mat_01_0), lhs_mat_23_1, rhs_mat_01_1), lhs_mat_23_2, rhs_mat_01_2), lhs_mat_23_3, rhs_mat_01_3); + const int32x4_t iacc_mat_11 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_23_0, rhs_mat_23_0), lhs_mat_23_1, rhs_mat_23_1), lhs_mat_23_2, rhs_mat_23_2), lhs_mat_23_3, rhs_mat_23_3); + + // Straighten out to make 4 row vectors + const int32x4_t iacc_row_0 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); + const int32x4_t iacc_row_1 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); + const int32x4_t iacc_row_2 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_10), vreinterpretq_u64_s32(iacc_mat_11))); + const int32x4_t iacc_row_3 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_10), vreinterpretq_u64_s32(iacc_mat_11))); - // Straighten out to make 2 row vectors - const int32x4_t iacc_row_0 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); - const int32x4_t iacc_row_1 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); + const float16x4_t row_scale_f16 = vld1_f16(a_ptrs[rp][b].d); + const float32x4_t row_scale_f32 = vcvt_f32_f16(row_scale_f16); - const float16x4_t row_scale_f16_0 = 
vld1_dup_f16(&(a_ptrs[rp][b].d[0])); - const float32x4_t row_scale_f32_0 = vcvt_f32_f16(row_scale_f16_0); - const float16x4_t row_scale_f16_1 = vld1_dup_f16(&(a_ptrs[rp][b].d[1])); - const float32x4_t row_scale_f32_1 = vcvt_f32_f16(row_scale_f16_1); + acc_rows[rp * 4] = vfmaq_f32(acc_rows[rp * 4], vcvtq_f32_s32(iacc_row_0), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 0)); + acc_rows[rp * 4 + 1] = vfmaq_f32(acc_rows[rp * 4 + 1], vcvtq_f32_s32(iacc_row_1), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 1)); + acc_rows[rp * 4 + 2] = vfmaq_f32(acc_rows[rp * 4 + 2], vcvtq_f32_s32(iacc_row_2), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 2)); + acc_rows[rp * 4 + 3] = vfmaq_f32(acc_rows[rp * 4 + 3], vcvtq_f32_s32(iacc_row_3), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 3)); + } + } - acc_rows[rp * 2] = vfmaq_f32(acc_rows[rp * 2], vcvtq_f32_s32(iacc_row_0), vmulq_f32(col_scale_f32, row_scale_f32_0)); - acc_rows[rp * 2 + 1] = vfmaq_f32(acc_rows[rp * 2 + 1], vcvtq_f32_s32(iacc_row_1), vmulq_f32(col_scale_f32, row_scale_f32_1)); + for (int i = 0; i < rows; i++) { + vst1q_f32(s + ((y * 4 + i) * output_channels + x * 4), acc_rows[i]); } - vst1q_f32(s + ((y * 2) * output_channels + x * 4), acc_rows[0]); - vst1q_f32(s + ((y * 2 + 1) * output_channels + x * 4), acc_rows[1]); } } #endif diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 852263da609b8..61b8ce421ee3b 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -70,24 +70,6 @@ typedef struct { } block_q4_0x8; static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_fp16_t) + QK4_0 * 4, "wrong q4_0x8 block size/padding"); -typedef struct { - ggml_fp16_t d[16]; // deltas for 16 q4_0 blocks - uint8_t qs[QK4_0 * 8]; // nibbles / quants for 16 q4_0 blocks -} block_q4_0x16; -static_assert(sizeof(block_q4_0x16) == 16 * sizeof(ggml_fp16_t) + QK4_0 * 8, "wrong q4_0x16 block size/padding"); - -typedef struct { - ggml_fp16_t d[64]; // deltas for 64 q4_0 blocks - uint8_t qs[QK4_0 * 32];// nibbles / quants for 64 q4_0 blocks -} block_q4_0x64; -static_assert(sizeof(block_q4_0x64) == 64 * sizeof(ggml_fp16_t) + QK4_0 * 32, "wrong q4_0x64 block size/padding"); - -typedef struct { - ggml_fp16_t d[2]; // deltas for 2 q8_0 blocks - int8_t qs[QK8_0 * 2]; // quants for 2 q8_0 blocks -} block_q8_0x2; -static_assert(sizeof(block_q8_0x2) == 2 * sizeof(ggml_fp16_t) + QK8_0 * 2, "wrong q8_0x2 block size/padding"); - typedef struct { ggml_fp16_t d[4]; // deltas for 4 q8_0 blocks int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks @@ -366,30 +348,34 @@ size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q4_0_aarch64(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); void iq2xs_init_impl(enum ggml_type type); void iq2xs_free_impl(enum ggml_type type); void iq3xs_init_impl(int grid_size); void iq3xs_free_impl(int grid_size); -block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len); -block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len); +block_q4_0x4 make_block_q4_0x4(const 
block_q4_0 * const in[4], unsigned int block_len, unsigned int xor_mask); +block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len, unsigned int xor_mask); block_q8_0x4 make_block_q8_0x4(const block_q8_0 * const in[4], unsigned int block_len); block_q8_0x8 make_block_q8_0x8(const block_q8_0 * const in[8], unsigned int block_len); -void quantize_row_q8_0_and_make_block_q8_0x2(const float * restrict x, void * restrict vy, int k, int rows_interleaved); -void quantize_row_q8_0_and_make_block_q8_0x4(const float * restrict x, void * restrict vy, int k, int rows_interleaved); +void quantize_row_q8_0_aarch64(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k, int nrows_interleaved, int blocklen_per_row); // GEMV -void ggml_gemv_q4_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); -void ggml_gemv_q4_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); -void ggml_gemv_q8_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); -void ggml_gemv_q8_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); +void ggml_gemv_q4_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); +void ggml_gemv_q4_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); +void ggml_gemv_q4_0_q8_0_aarch64_sve256(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); +void ggml_gemv_q4_0_q8_0_aarch64_neon(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); +void ggml_gemv_q4_0_q8_0_aarch64_neon_noi8mm(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); +void ggml_gemv_q8_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); +void ggml_gemv_q8_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); // GEMM -void ggml_gemm_q4_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); -void ggml_gemm_q4_0_q8_0_2x4blocked_mmla(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); -void ggml_gemm_q8_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); -void ggml_gemm_q8_0_q8_0_2x4blocked_mmla(const int n, int output_channels, int input_width, float * restrict s, const void * restrict 
vx, const void * restrict vy, int ith, int nth); +void ggml_gemm_q4_0_q8_0(const int n, int rows, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); +void ggml_gemm_q4_0_q8_0_aarch64_sve256(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); +void ggml_gemm_q4_0_q8_0_aarch64_neon(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); +void ggml_gemm_q4_0_q8_0_aarch64_neon_noi8mm(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); +void ggml_gemm_q8_0_q8_0(const int n, int rows, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); #ifdef __cplusplus } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 8b613a6a09534..ddeda43364dda 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -486,192 +486,6 @@ int64_t ggml_cycles_per_ms(void) { #define ggml_perf_cycles_per_ms() 0 #endif -void rearrange_q4_0_weights_blocked8_neon(struct ggml_tensor * cur) { - block_q4_0x8 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data; - block_q4_0x8 * out_ptr_B_start = out_ptr_B; - int64_t nb = cur->ne[0] / QK4_0; - - for (int y_out = 0; y_out < cur->ne[1] / 8; y_out++) { - const block_q4_0 * in_ptrs[8]; - - in_ptrs[0] = (block_q4_0 *) cur->data + (y_out * 8 * nb); - for (int i = 0; i < 7; i++) { - in_ptrs[i + 1] = in_ptrs[i] + nb; - } - - for (int64_t x = 0; x < nb; x++) { - *out_ptr_B = make_block_q4_0x8(in_ptrs, 4); // block_len=4 for SDOT - out_ptr_B++; - - for (int i = 0; i < 8; i++) { - in_ptrs[i]++; - } - } - } - cur->rearranged_weight_gemv = (uint8_t *) out_ptr_B_start; -} - -void rearrange_q4_0_weights_blocked8_sve(struct ggml_tensor * cur) { -#if defined(__ARM_FEATURE_SVE) - if (svcntw() != 8) { - printf("ggml_gemv_q4_0_q8_0_blocked8_sve: SVE VL != 256 - aborting. 
Use Arm Neon GEMV kernels\n"); - exit(1); - } - - block_q4_0x8 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data; - block_q4_0x8 * out_ptr_B_start = out_ptr_B; - int64_t nb = cur->ne[0] / QK4_0; - - for (int y_out = 0; y_out < cur->ne[1] / 8; y_out++) { - const block_q4_0 * in_ptrs[8]; - - in_ptrs[0] = (block_q4_0 *) cur->data + (y_out * 8 * nb); - for (int i = 0; i < 7; i++) { - in_ptrs[i + 1] = in_ptrs[i] + nb; - } - - for (int64_t x = 0; x < nb; x++) { - *out_ptr_B = make_block_q4_0x8(in_ptrs, 4); // block_len=4 for SDOT - out_ptr_B++; - - for (int i = 0; i < 8; i++) { - in_ptrs[i]++; - } - } - } - cur->rearranged_weight_gemv = (uint8_t *) out_ptr_B_start; -#endif -} - -#if defined(__ARM_FEATURE_SVE) -static void (*_rearrange_q4_0_weights_for_gemv)(struct ggml_tensor *) = &rearrange_q4_0_weights_blocked8_sve; -#elif defined(__ARM_NEON) -static void (*_rearrange_q4_0_weights_for_gemv)(struct ggml_tensor *) = &rearrange_q4_0_weights_blocked8_neon; -#endif - -#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) -void rearrange_q4_0_weights_for_gemv(struct ggml_tensor * cur) { _rearrange_q4_0_weights_for_gemv(cur); } -#endif - -void rearrange_q4_0_weights_for_gemm(struct ggml_tensor * cur) { - block_q4_0x4 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data; - block_q4_0x4 * out_ptr_B_start = out_ptr_B; - int64_t nb = cur->ne[0] / QK4_0; - - for (int y_out = 0; y_out < cur->ne[1] / 4; y_out++) { - const block_q4_0 * in_ptrs[4]; - - in_ptrs[0] = (block_q4_0 *) cur->data + (y_out * 4 * nb); - for (int i = 0; i < 3; i++) { - in_ptrs[i + 1] = in_ptrs[i] + nb; - } - - for (int64_t x = 0; x < nb; x++) { - *out_ptr_B = - make_block_q4_0x4(in_ptrs, 8); // block_len=8 for SMMLA - out_ptr_B++; - - for (int i = 0; i < 4; i++) { - in_ptrs[i]++; - } - } - } - cur->rearranged_weight_gemm = (uint8_t *) out_ptr_B_start; -} - -void rearrange_q8_0_weights_blocked8_neon(struct ggml_tensor * cur) { - block_q8_0x8 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data; - block_q8_0x8 * out_ptr_B_start = out_ptr_B; - int64_t nb = cur->ne[0] / QK8_0; - - for (int y_out = 0; y_out < cur->ne[1] / 8; y_out++) { - const block_q8_0 * in_ptrs[8]; - - in_ptrs[0] = (block_q8_0 *) cur->data + (y_out * 8 * nb); - for (int i = 0; i < 7; i++) { - in_ptrs[i + 1] = in_ptrs[i] + nb; - } - - for (int64_t x = 0; x < nb; x++) { - *out_ptr_B = make_block_q8_0x8(in_ptrs, 4); // block_len=4 for SDOT - out_ptr_B++; - - for (int i = 0; i < 8; i++) { - in_ptrs[i]++; - } - } - } - cur->rearranged_weight_gemv = (uint8_t *) out_ptr_B_start; -} - -void rearrange_q8_0_weights_blocked8_sve(struct ggml_tensor * cur) { -#if defined(__ARM_FEATURE_SVE) - if (svcntw() != 8) { - printf("ggml_gemv_q8_0_q8_0_blocked8_sve: SVE VL != 256 - aborting. 
Use Arm Neon GEMV kernels\n"); - exit(1); - } - - block_q8_0x8 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data; - block_q8_0x8 * out_ptr_B_start = out_ptr_B; - int64_t nb = cur->ne[0] / QK8_0; - - for (int y_out = 0; y_out < cur->ne[1] / 8; y_out++) { - const block_q8_0 * in_ptrs[8]; - - in_ptrs[0] = (block_q8_0 *) cur->data + (y_out * 8 * nb); - for (int i = 0; i < 7; i++) { - in_ptrs[i + 1] = in_ptrs[i] + nb; - } - - for (int64_t x = 0; x < nb; x++) { - *out_ptr_B = make_block_q8_0x8(in_ptrs, 4); // block_len=4 for SDOT - out_ptr_B++; - - for (int i = 0; i < 8; i++) { - in_ptrs[i]++; - } - } - } - cur->rearranged_weight_gemv = (uint8_t *) out_ptr_B_start; -#endif -} - -#if defined(__ARM_FEATURE_SVE) -static void (*_rearrange_q8_0_weights_for_gemv)(struct ggml_tensor *) = &rearrange_q8_0_weights_blocked8_sve; -#elif defined(__ARM_NEON) -static void (*_rearrange_q8_0_weights_for_gemv)(struct ggml_tensor *) = &rearrange_q8_0_weights_blocked8_neon; -#endif - -#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) -void rearrange_q8_0_weights_for_gemv(struct ggml_tensor * cur) { _rearrange_q8_0_weights_for_gemv(cur); } -#endif - -void rearrange_q8_0_weights_for_gemm(struct ggml_tensor * cur) { - block_q8_0x4 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data; - block_q8_0x4 * out_ptr_B_start = out_ptr_B; - int64_t nb = cur->ne[0] / QK8_0; - - for (int y_out = 0; y_out < cur->ne[1] / 4; y_out++) { - const block_q8_0 * in_ptrs[4]; - - in_ptrs[0] = (block_q8_0 *) cur->data + (y_out * 4 * nb); - for (int i = 0; i < 3; i++) { - in_ptrs[i + 1] = in_ptrs[i] + nb; - } - - for (int64_t x = 0; x < nb; x++) { - *out_ptr_B = - make_block_q8_0x4(in_ptrs, 8); // block_len=8 for SMMLA - out_ptr_B++; - - for (int i = 0; i < 4; i++) { - in_ptrs[i]++; - } - } - } - cur->rearranged_weight_gemm = (uint8_t *) out_ptr_B_start; -} - // // cross-platform UTF-8 file paths // @@ -891,6 +705,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { #else .nrows = 1, #endif + .from_float_to_mat = quantize_row_q8_0_aarch64, }, [GGML_TYPE_Q8_1] = { .type_name = "q8_1", @@ -1088,6 +903,32 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16, .vec_dot_type = GGML_TYPE_BF16, .nrows = 1, + }, + [GGML_TYPE_Q4_0_AARCH64] = { + .type_name = "q4_0_aarch64", + .blck_size = QK4_0, + .type_size = sizeof(block_q4_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_0, + .from_float = quantize_row_q4_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference, + .vec_dot = ggml_vec_dot_q4_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, +#if defined (__ARM_FEATURE_MATMUL_INT8) + .nrows = 2, +#else + .nrows = 1, +#endif +#if defined(__ARM_FEATURE_SVE) + .gemv = ggml_gemv_q4_0_q8_0_aarch64_sve256, + .gemm = ggml_gemm_q4_0_q8_0_aarch64_sve256, +#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + .gemv = ggml_gemv_q4_0_q8_0_aarch64_neon, + .gemm = ggml_gemm_q4_0_q8_0_aarch64_neon, +#elif defined(__ARM_NEON) + .gemv = ggml_gemv_q4_0_q8_0_aarch64_neon_noi8mm, + .gemm = ggml_gemm_q4_0_q8_0_aarch64_neon_noi8mm, +#endif } }; @@ -2804,10 +2645,6 @@ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) { *s = idx; } -static void ggml_gemv_q4_0_q8_0(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); - -static void ggml_gemv_q8_0_q8_0(const int n, int output_channels, int input_width, float * 
restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); - // // data types // @@ -3391,6 +3228,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break; case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break; case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break; + case GGML_FTYPE_MOSTLY_Q4_0_AARCH64: wtype = GGML_TYPE_Q4_0_AARCH64; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -3850,9 +3688,6 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.name =*/ { 0 }, /*.extra =*/ NULL, ///*.padding =*/ { 0 }, - /*.rearranged_weight_gemv =*/ NULL, - /*.rearranged_weight_gemm =*/ NULL, - /*.weight_rearranged =*/ false, }; #ifdef __clang__ @@ -9638,6 +9473,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_AARCH64: { ggml_compute_forward_add_q_f32(params, dst); } break; @@ -10013,6 +9849,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_AARCH64: { ggml_compute_forward_add1_q_f32(params, dst); } break; @@ -10138,6 +9975,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_AARCH64: default: { GGML_ASSERT(false); @@ -12340,6 +12178,9 @@ static void ggml_compute_forward_mul_mat( enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; int64_t const vec_dot_num_rows = type_traits[type].nrows; + ggml_from_float_to_mat_t const from_float_to_mat = type_traits[vec_dot_type].from_float_to_mat; + ggml_gemv_t const gemv = type_traits[type].gemv; + ggml_gemm_t const gemm = type_traits[type].gemm; GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne1 == ne11); @@ -12405,10 +12246,9 @@ UseGgmlGemm1:; } } } -#if defined(__ARM_FEATURE_MATMUL_INT8) - if ((src0->weight_rearranged == true) && (ne11 >= 4) && (ne12 == 1) && (ne13 == 1)) { + if ((type == GGML_TYPE_Q4_0_AARCH64) && (ne11 >= 4) && (ne12 == 1) && (ne13 == 1)) { for (int64_t i11 = 0; i11 < ne11 / 4; ++i11) { - quantize_row_q8_0_and_make_block_q8_0x4((float *)((char *) src1->data + i11 * 4 * nb11), (void *) wdata, ne10, 4); + from_float_to_mat((float *)((char *) src1->data + i11 * 4 * nb11), (void *) wdata, ne10, 4, ggml_cpu_has_matmul_int8() ? 8 : 4); wdata += row_size * 4; } for (int64_t i11 = (ne11 / 4) * 4; i11 < ne11; ++i11) { @@ -12416,10 +12256,7 @@ UseGgmlGemm1:; wdata += row_size; } } -#endif -#if defined(__ARM_FEATURE_MATMUL_INT8) else { -#endif for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i11 = 0; i11 < ne11; ++i11) { @@ -12428,9 +12265,7 @@ UseGgmlGemm1:; } } } -#if defined(__ARM_FEATURE_MATMUL_INT8) } -#endif if (ith == 0) { // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. @@ -12509,114 +12344,50 @@ UseGgmlGemm2:; //if (ith == 0) // printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. 
Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1); -#if defined(__ARM_FEATURE_MATMUL_INT8) && (defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE)) - if ((ggml_n_dims(src0) == 2) && (ne11 == 1) && (src0->weight_rearranged == true)) { - if (src0->type == GGML_TYPE_Q4_0) { - ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data), (const char *) src0->rearranged_weight_gemv, (const char *) wdata, ith, nth); // use Arm Neon/SVE GEMV kernels - } else if (src0->type == GGML_TYPE_Q8_0) { - ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data), (const char *) src0->rearranged_weight_gemv, (const char *) wdata, ith, nth); // use Arm Neon/SVE GEMV kernels - } + if ((ggml_n_dims(src0) == 2) && (ne11 == 1) && (type == GGML_TYPE_Q4_0_AARCH64)) { + gemv(ne00, ne01, 1, (float *)((char *) dst->data), (const char *) src0->data, (const char *) wdata, ith, nth); // use Arm Neon/SVE GEMV kernels } - else if ((ggml_n_dims(src0) == 2) && (ne11 >= 16) && (src0->weight_rearranged == true)) { + else if ((ggml_n_dims(src0) == 2) && (ne11 >= 16) && (type == GGML_TYPE_Q4_0_AARCH64)) { // use batch-sized 16, 8, and 4 GEMM kernels - if (src0->type == GGML_TYPE_Q4_0) { - for (int row_iter = 0; row_iter < ne11 / 16; row_iter++) { - ggml_gemm_q4_0_q8_0(ne00, 16, ne01, 16, (float *)((char *) dst->data + (row_iter * 16 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 16) * row_size : (row_iter * 16 * nb11)), ith, nth); - } - int rows_processed = (ne11 / 16) * 16; - for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 8; row_iter++) { - ggml_gemm_q4_0_q8_0(ne00, 8, ne01, 8, (float *)((char *) dst->data + ((rows_processed + row_iter * 8) * nb1)), (const char *) src0->rearranged_weight_gemm, - (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 8) * row_size : ((rows_processed + row_iter * 8) * nb11)), ith, nth); - } - rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8; - for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { - ggml_gemm_q4_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->rearranged_weight_gemm, - (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth); - } - rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4; - for (int row_iter = rows_processed; row_iter < ne11; row_iter++) { - ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); - } - } else if (src0->type == GGML_TYPE_Q8_0) { - for (int row_iter = 0; row_iter < ne11 / 16; row_iter++) { - ggml_gemm_q8_0_q8_0(ne00, 16, ne01, 16, (float *)((char *) dst->data + (row_iter * 16 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? 
(row_iter * 16) * row_size : (row_iter * 16 * nb11)), ith, nth); - } - int rows_processed = (ne11 / 16) * 16; - for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 8; row_iter++) { - ggml_gemm_q8_0_q8_0(ne00, 8, ne01, 8, (float *)((char *) dst->data + ((rows_processed + row_iter * 8) * nb1)), (const char *) src0->rearranged_weight_gemm, - (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 8) * row_size : ((rows_processed + row_iter * 8) * nb11)), ith, nth); - } - rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8; - for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { - ggml_gemm_q8_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->rearranged_weight_gemm, - (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth); - } - rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4; - for (int row_iter = rows_processed; row_iter < ne11; row_iter++) { - ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); - } + for (int row_iter = 0; row_iter < ne11 / 16; row_iter++) { + gemm(ne00, ne01, 16, (float *)((char *) dst->data + (row_iter * 16 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 16) * row_size : (row_iter * 16 * nb11)), ith, nth); + } + int rows_processed = (ne11 / 16) * 16; + for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 8; row_iter++) { + gemm(ne00, ne01, 8, (float *)((char *) dst->data + ((rows_processed + row_iter * 8) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 8) * row_size : ((rows_processed + row_iter * 8) * nb11)), ith, nth); + } + rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8; + for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { + gemm(ne00, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth); + } + rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4; + for (int row_iter = rows_processed; row_iter < ne11; row_iter++) { + gemv(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); } - } else if ((ggml_n_dims(src0) == 2) && (ne11 >= 8) && (src0->weight_rearranged == true)) { + } + else if ((ggml_n_dims(src0) == 2) && (ne11 >= 8) && (type == GGML_TYPE_Q4_0_AARCH64)) { // use batch-sized 8, and 4 GEMM kernels - if (src0->type == GGML_TYPE_Q4_0) { - for (int row_iter = 0; row_iter < ne11 / 8; row_iter++) { - ggml_gemm_q4_0_q8_0(ne00, 8, ne01, 8, (float *)((char *) dst->data + (row_iter * 8 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? 
(row_iter * 8) * row_size : (row_iter * 8 * nb11)), ith, nth); - } - int rows_processed = (ne11 / 8) * 8; - for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { - ggml_gemm_q4_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->rearranged_weight_gemm, - (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth); - } - for (int row_iter = ((ne11 / 8) * 8) + ((ne11 - rows_processed) / 4 * 4); row_iter < ne11; row_iter++) { - ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); - } - } else if (src0->type == GGML_TYPE_Q8_0) { - for (int row_iter = 0; row_iter < ne11 / 8; row_iter++) { - ggml_gemm_q8_0_q8_0(ne00, 8, ne01, 8, (float *)((char *) dst->data + (row_iter * 8 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 8) * row_size : (row_iter * 8 * nb11)), ith, nth); - } - int rows_processed = (ne11 / 8) * 8; - for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { - ggml_gemm_q8_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->rearranged_weight_gemm, - (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth); - } - for (int row_iter = ((ne11 / 8) * 8) + ((ne11 - rows_processed) / 4 * 4); row_iter < ne11; row_iter++) { - ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); - } + for (int row_iter = 0; row_iter < ne11 / 8; row_iter++) { + gemm(ne00, ne01, 8, (float *)((char *) dst->data + (row_iter * 8 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 8) * row_size : (row_iter * 8 * nb11)), ith, nth); + } + int rows_processed = (ne11 / 8) * 8; + for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { + gemm(ne00, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth); } - } else if ((ggml_n_dims(src0) == 2) && (ne11 >= 4) && (src0->weight_rearranged == true)) { + for (int row_iter = ((ne11 / 8) * 8) + ((ne11 - rows_processed) / 4 * 4); row_iter < ne11; row_iter++) { + gemv(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? 
(row_iter)*row_size : (row_iter * nb11)), ith, nth); + } + } + else if ((ggml_n_dims(src0) == 2) && (ne11 >= 4) && (type == GGML_TYPE_Q4_0_AARCH64)) { // use batch-sized 4 GEMM kernel - if (src0->type == GGML_TYPE_Q4_0) { - for (int row_iter = 0; row_iter < ne11 / 4; row_iter++) { - ggml_gemm_q4_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + (row_iter * 4 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 4) * row_size : (row_iter * 4 * nb11)), ith, nth); - } - for (int row_iter = (ne11 / 4) * 4; row_iter < ne11; row_iter++) { - ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); - } - } else if (src0->type == GGML_TYPE_Q8_0) { - for (int row_iter = 0; row_iter < ne11 / 4; row_iter++) { - ggml_gemm_q8_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + (row_iter * 4 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 4) * row_size : (row_iter * 4 * nb11)), ith, nth); - } - for (int row_iter = (ne11 / 4) * 4; row_iter < ne11; row_iter++) { - ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); - } + for (int row_iter = 0; row_iter < ne11 / 4; row_iter++) { + gemm(ne00, ne01, 4, (float *)((char *) dst->data + (row_iter * 4 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 4) * row_size : (row_iter * 4 * nb11)), ith, nth); } - } -#elif defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) - if ((ggml_n_dims(src0) == 2) && (src0->weight_rearranged == true)) { - if (src0->type == GGML_TYPE_Q4_0) { - for (int row_iter = 0; row_iter < ne11; row_iter++) { - ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); - } - } else if (src0->type == GGML_TYPE_Q8_0) { - for (int row_iter = 0; row_iter < ne11; row_iter++) { - ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); - } + for (int row_iter = (ne11 / 4) * 4; row_iter < ne11; row_iter++) { + gemv(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); } } -#endif -#if defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) else { -#endif // The first chunk comes from our thread_id, the rest will get auto-assigned. 
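// For GGML_TYPE_Q4_0_AARCH64, the branches above consume batches of at least four
// src1 rows greedily in tiles of 16, 8 and 4, handing any remainder to the GEMV
// kernel one row at a time (a single row goes straight to GEMV). For example,
// ne11 == 23 is processed as:
//     gemm(ne00, ne01, 16, ...)   // rows  0..15
//     gemm(ne00, ne01,  4, ...)   // rows 16..19  (7 rows left, so no 8-row tile)
//     gemv(ne00, ne01,  1, ...)   // rows 20, 21, 22 -- one call per row
// The activations for these kernels were packed earlier by from_float_to_mat(),
// which interleaves groups of 4 src1 rows into the q8_0x4 layout using a block
// length of 8 when int8 matmul (i8mm) is available and 4 otherwise; rows past the
// last multiple of 4 go through the regular from_float_to_vec_dot() path. Batches
// of two or three rows, and every other type, fall through to the generic chunked
// loop below.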
int current_chunk = ith; @@ -12638,9 +12409,7 @@ UseGgmlGemm2:; current_chunk = atomic_fetch_add(¶ms->shared->current_chunk, 1); } -#if defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) } -#endif } // ggml_compute_forward_mul_mat_id @@ -13051,6 +12820,7 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_AARCH64: { ggml_compute_forward_out_prod_q_f32(params, dst); } break; @@ -13236,6 +13006,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_AARCH64: default: { GGML_ASSERT(false); @@ -13495,6 +13266,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_AARCH64: { ggml_compute_forward_get_rows_q(params, dst); } break; @@ -14081,6 +13853,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: case GGML_TYPE_Q8_K: + case GGML_TYPE_Q4_0_AARCH64: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -20804,6 +20577,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q4_0_AARCH64: result = quantize_q4_0_aarch64(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); @@ -22238,26 +22012,12 @@ int ggml_cpu_has_matmul_int8(void) { #endif } +int ggml_cpu_has_sve(void) { #if defined(__ARM_FEATURE_SVE) -static void (*_ggml_gemv_q4_0_q8_0)(const int, int, int, float * restrict, const void * restrict, const void * restrict, int, int) = &ggml_gemv_q4_0_q8_0_blocked8_sve; -#elif defined(__ARM_NEON) -static void (*_ggml_gemv_q4_0_q8_0)(const int, int, int, float * restrict, const void * restrict, const void * restrict, int, int) = &ggml_gemv_q4_0_q8_0_blocked8_neon; -#endif - -#if defined(__ARM_FEATURE_SVE) -static void (*_ggml_gemv_q8_0_q8_0)(const int, int, int, float * restrict, const void * restrict, const void * restrict, int, int) = &ggml_gemv_q8_0_q8_0_blocked8_sve; -#elif defined(__ARM_NEON) -static void (*_ggml_gemv_q8_0_q8_0)(const int, int, int, float * restrict, const void * restrict, const void * restrict, int, int) = &ggml_gemv_q8_0_q8_0_blocked8_neon; + return 1; +#else + return 0; #endif - -#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) -static void ggml_gemv_q4_0_q8_0(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { - _ggml_gemv_q4_0_q8_0(n, output_channels, input_width, s, vx, vy, ith, nth); -} - -static void ggml_gemv_q8_0_q8_0(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { - _ggml_gemv_q8_0_q8_0(n, output_channels, input_width, s, vx, vy, ith, nth); } -#endif //////////////////////////////////////////////////////////////////////////////// diff --git a/include/llama.h b/include/llama.h index bb4b05ba63671..bd108ec699c75 100644 --- a/include/llama.h +++ b/include/llama.h @@ -162,6 +162,7 @@ extern 
"C" { LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64 = 33, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama.cpp b/src/llama.cpp index 7aecda2f594e5..ff76310542170 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3783,6 +3783,7 @@ struct llama_model_loader { case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; + case GGML_TYPE_Q4_0_AARCH64: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); @@ -4359,32 +4360,6 @@ struct llama_model_loader { } } -#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) - if ((cur->type == GGML_TYPE_Q4_0) && (cur->ne[1] % 4 == 0)) { - cur->weight_rearranged = true; -#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) - rearrange_q4_0_weights_for_gemv(cur); // rearrange weights for Arm Neon/SVE GEMV kernels -#endif -#if defined(__ARM_FEATURE_MATMUL_INT8) - rearrange_q4_0_weights_for_gemm(cur); // rearrange weights for GEMM MMLA kernels -#endif - } - else if ((cur->type == GGML_TYPE_Q8_0) && (cur->ne[1] % 4 == 0)) { - cur->weight_rearranged = true; -#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) - rearrange_q8_0_weights_for_gemv(cur); // rearrange weights for Arm Neon/SVE GEMV kernels -#endif -#if defined(__ARM_FEATURE_MATMUL_INT8) - rearrange_q8_0_weights_for_gemm(cur); // rearrange weights for GEMM MMLA kernels -#endif - } - else { - cur->weight_rearranged = false; - } -#else - cur->weight_rearranged = false; -#endif - size_done += n_size; } @@ -4502,6 +4477,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; + case LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64: return "Q4_0_AARCH64"; default: return "unknown, may not work"; } @@ -17787,6 +17763,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ3_S; } + else if (new_type == GGML_TYPE_Q4_0_AARCH64) { + new_type = GGML_TYPE_Q4_0; + } } } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { @@ -18099,6 +18078,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; + case LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64: default_type = GGML_TYPE_Q4_0_AARCH64; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -18409,6 +18389,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s f32_data = (float *) f32_conv_buf.data(); } + if (new_type == GGML_TYPE_Q4_0_AARCH64) { + if ((ggml_cpu_has_neon() == 0) && (ggml_cpu_has_sve() == 0)) new_type = GGML_TYPE_Q4_0; + if ((nelements / tensor->ne[0]) % 
4 != 0) new_type = GGML_TYPE_Q4_0; + if (nthread > 1) nthread = 1; + } + LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type)); fflush(stdout); @@ -21702,6 +21688,7 @@ const char * llama_print_system_info(void) { #else s += "LLAMAFILE = 0 | "; #endif + s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | "; return s.c_str(); } From 81215ff43a2f52fe1655f56c8597d7fb00bcaf9e Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Tue, 23 Apr 2024 07:36:22 +0000 Subject: [PATCH 03/28] Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions --- ggml/src/ggml-common.h | 24 +++++ ggml/src/ggml-quants.c | 59 +---------- ggml/src/ggml-quants.h | 226 ----------------------------------------- 3 files changed, 25 insertions(+), 284 deletions(-) diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index c74060cc4b991..fafd5fa7ae000 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -199,6 +199,30 @@ typedef struct { } block_q8_1; static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding"); +typedef struct { + ggml_half d[4]; // deltas for 4 q4_0 blocks + uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks +} block_q4_0x4; +static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding"); + +typedef struct { + ggml_half d[8]; // deltas for 8 q4_0 blocks + uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks +} block_q4_0x8; +static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding"); + +typedef struct { + ggml_half d[4]; // deltas for 4 q8_0 blocks + int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks +} block_q8_0x4; +static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding"); + +typedef struct { + ggml_half d[8]; // deltas for 8 q8_0 blocks + int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks +} block_q8_0x8; +static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding"); + // // Super-block quantization structures // diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index f774810375211..2004ae356691d 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -700,64 +700,6 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) { quantize_row_q4_0_reference(x, y, k); } -void quantize_row_q4_0_aarch64(const float * src, void * dst, int n, int k) { - int nrows_interleaved, blocklen_per_row; - typedef block_q4_0x8 block_q4_0xn; - typedef block_q4_0xn (*make_block_q4_0xn_t)(const block_q4_0 *, unsigned int, unsigned int); - make_block_q4_0xn_t make_block_q4_0xn = make_block_q4_0x8; - - if (ggml_cpu_has_sve() && (svcntw() == 8)) { - nrows_interleaved = 8; - blocklen_per_row = 8; - typedef block_q4_0x8 block_q4_0xn; - make_block_q4_0xn = make_block_q4_0x8; - } - else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { - nrows_interleaved = 4; - blocklen_per_row = 8; - typedef block_q4_0x4 block_q4_0xn; - make_block_q4_0xn = make_block_q4_0x4; - } - else if (ggml_cpu_has_neon()) { - nrows_interleaved = 4; - blocklen_per_row = 4; - typedef block_q4_0x4 block_q4_0xn; - make_block_q4_0xn = make_block_q4_0x4; - } - else { - assert(false); - } - - assert(k % QK4_0 == 0); - const int nb = k / QK4_0; - - block_q4_0xn * out_ptr_B = (block_q4_0xn *) malloc(sizeof(block_q4_0xn) * nb); - block_q4_0xn * out_ptr_B_start 
= out_ptr_B; - - for (int b = 0; b < n; b += nrows_interleaved * k) { - const block_q4_0 * in_ptrs[nrows_interleaved]; - - for (int i = 0; i < nrows_interleaved; i++ ) { - in_ptrs[i] = (block_q4_0 *) dst + (b + i * k) / QK4_0; - quantize_row_q4_0_reference(src + b + i * k, in_ptrs[i], k); - } - - for (int64_t x = 0; x < nb; x++) { - *out_ptr_B = make_block_q4_0xn(in_ptrs, blocklen_per_row, 0x88); - out_ptr_B++; - - for (int i = 0; i < nrows_interleaved; i++) { - in_ptrs[i]++; - } - } - out_ptr_B = out_ptr_B_start; - memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, sizeof(block_q4_0xn) * nb); - } - if (out_ptr_B_start) free(out_ptr_B_start); - - return (n / QK4_0 * sizeof(block_q4_0)); -} - void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int64_t k) { const int qk = QK4_1; @@ -14835,6 +14777,7 @@ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) assert(k % QK_K == 0); block_iq2_s * restrict y = vy; quantize_row_iq2_s_reference(x, y, k); +} // Routines to create the blocked formats // Note input is array of pointers. diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 61b8ce421ee3b..ccc255d19ac99 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -8,232 +8,6 @@ // GGML internal header -#include -#include - -#define QK4_0 32 -typedef struct { - ggml_fp16_t d; // delta - uint8_t qs[QK4_0 / 2]; // nibbles / quants -} block_q4_0; -static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); - -#define QK4_1 32 -typedef struct { - ggml_fp16_t d; // delta - ggml_fp16_t m; // min - uint8_t qs[QK4_1 / 2]; // nibbles / quants -} block_q4_1; -static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding"); - -#define QK5_0 32 -typedef struct { - ggml_fp16_t d; // delta - uint8_t qh[4]; // 5-th bit of quants - uint8_t qs[QK5_0 / 2]; // nibbles / quants -} block_q5_0; -static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); - -#define QK5_1 32 -typedef struct { - ggml_fp16_t d; // delta - ggml_fp16_t m; // min - uint8_t qh[4]; // 5-th bit of quants - uint8_t qs[QK5_1 / 2]; // nibbles / quants -} block_q5_1; -static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); - -#define QK8_0 32 -typedef struct { - ggml_fp16_t d; // delta - int8_t qs[QK8_0]; // quants -} block_q8_0; -static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); - -#define QK8_1 32 -typedef struct { - float d; // delta - float s; // d * sum(qs[i]) - int8_t qs[QK8_1]; // quants -} block_q8_1; -static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding"); - -typedef struct { - ggml_fp16_t d[4]; // deltas for 4 q4_0 blocks - uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks -} block_q4_0x4; -static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_fp16_t) + QK4_0 * 2, "wrong q4_0x4 block size/padding"); - -typedef struct { - ggml_fp16_t d[8]; // deltas for 8 q4_0 blocks - uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks -} block_q4_0x8; -static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_fp16_t) + QK4_0 * 4, "wrong q4_0x8 block size/padding"); - -typedef struct { - ggml_fp16_t d[4]; // deltas for 4 q8_0 blocks - int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks -} block_q8_0x4; -static_assert(sizeof(block_q8_0x4) == 4 * 
sizeof(ggml_fp16_t) + QK8_0 * 4, "wrong q8_0x4 block size/padding"); - -typedef struct { - ggml_fp16_t d[8]; // deltas for 8 q8_0 blocks - int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks -} block_q8_0x8; -static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_fp16_t) + QK8_0 * 8, "wrong q8_0x8 block size/padding"); - -// -// Super-block quantization structures -// - -// Super-block size -#ifdef GGML_QKK_64 -#define QK_K 64 -#define K_SCALE_SIZE 4 -#else -#define QK_K 256 -#define K_SCALE_SIZE 12 -#endif - -// 2-bit quantization -// weight is represented as x = a * q + b -// 16 blocks of 16 elements each -// Effectively 2.625 bits per weight -typedef struct { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - ggml_fp16_t d; // super-block scale for quantized scales - ggml_fp16_t dmin; // super-block scale for quantized mins -} block_q2_K; -static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); - -// 3-bit quantization -// weight is represented as x = a * q -// 16 blocks of 16 elements each -// Effectively 3.4375 bits per weight -#ifdef GGML_QKK_64 -typedef struct { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits - uint8_t scales[2]; - ggml_fp16_t d; // super-block scale -} block_q3_K; -static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding"); -#else -typedef struct { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits - uint8_t scales[12]; // scales, quantized with 6 bits - ggml_fp16_t d; // super-block scale -} block_q3_K; -static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding"); -#endif - -// 4-bit quantization -// 8 blocks of 32 elements each -// weight is represented as x = a * q + b -// Effectively 4.5 bits per weight -#ifdef GGML_QKK_64 -typedef struct { - ggml_fp16_t d[2]; // super-block scales/mins - uint8_t scales[2]; // 4-bit block scales/mins - uint8_t qs[QK_K/2]; // 4--bit quants -} block_q4_K; -static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding"); -#else -typedef struct { - ggml_fp16_t d; // super-block scale for quantized scales - ggml_fp16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -} block_q4_K; -static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding"); -#endif - -// 5-bit quantization -// 8 blocks of 32 elements each -// weight is represented as x = a * q + b -// Effectively 5.5 bits per weight -#ifdef GGML_QKK_64 -typedef struct { - ggml_fp16_t d; // super-block scale - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -} block_q5_K; -static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding"); -#else -typedef struct { - ggml_fp16_t d; // super-block scale for quantized scales - ggml_fp16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -} block_q5_K; -static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong 
q5_K block size/padding"); -#endif - -// 6-bit quantization -// weight is represented as x = a * q -// 16 blocks of 16 elements each -// Effectively 6.5625 bits per weight -typedef struct { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - ggml_fp16_t d; // super-block scale -} block_q6_K; -static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding"); - -// This is only used for intermediate quantization and dot products -typedef struct { - float d; // delta - int8_t qs[QK_K]; // quants - int16_t bsums[QK_K/16]; // sum of quants in groups of 16 -} block_q8_K; -static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding"); - -// (Almost) "true" 2-bit quantization. -// Due to the need to use blocks as per ggml design, it ends up using -// 2.0625 bpw because of the 16-bit scale for each block of 256. -typedef struct { - ggml_fp16_t d; - uint16_t qs[QK_K/8]; -} block_iq2_xxs; -static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding"); - -// 2.3125 bpw quants -typedef struct { - ggml_fp16_t d; - uint16_t qs[QK_K/8]; - uint8_t scales[QK_K/32]; -} block_iq2_xs; -static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); - -// (Almost) "true" 3-bit quantization. -// Due to the need to use blocks as per ggml design, it ends up using -// 3.0625 bpw because of the 16-bit scale for each block of 256. -typedef struct { - ggml_fp16_t d; - uint8_t qs[3*QK_K/8]; -} block_iq3_xxs; -static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); - -typedef struct { - ggml_fp16_t d; - uint8_t qs[QK_K/8]; - uint8_t scales[QK_K/16]; -} block_iq1_s; -static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding"); - -// Non-linear quants -#define QK4_NL 32 -typedef struct { - ggml_fp16_t d; - uint8_t qs[QK4_NL/2]; -} block_iq4_nl; -static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding"); - #ifdef __cplusplus extern "C" { #endif From 6c8d8266b116686f289b0097ce47b029ea0d37a6 Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Thu, 25 Apr 2024 03:57:15 +0000 Subject: [PATCH 04/28] Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions --- ggml/src/ggml-quants.c | 64 +++++++++++++++++++++++++----------------- ggml/src/ggml.c | 2 +- 2 files changed, 39 insertions(+), 27 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 2004ae356691d..868784cc63fbd 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3309,41 +3309,37 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr size_t quantize_q4_0_aarch64(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { if (!quant_weights) { - //quantize_row_q4_0_reference(src, dst, (int64_t)nrow*n_per_row); - //return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row); - int nrows_interleaved, blocklen_per_row; - typedef block_q4_0x8 block_q4_0xn; - typedef block_q4_0xn (*make_block_q4_0xn_t)(const block_q4_0 *, unsigned int, unsigned int); - make_block_q4_0xn_t make_block_q4_0xn = 
make_block_q4_0x8; - if (ggml_cpu_has_sve() && (svcntw() == 8)) { +#if defined(__ARM_FEATURE_SVE) + if (svcntw() == 8) { nrows_interleaved = 8; blocklen_per_row = 8; - typedef block_q4_0x8 block_q4_0xn; - make_block_q4_0xn = make_block_q4_0x8; } else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { nrows_interleaved = 4; blocklen_per_row = 8; - typedef block_q4_0x4 block_q4_0xn; - make_block_q4_0xn = make_block_q4_0x4; - } - else if (ggml_cpu_has_neon()) { - nrows_interleaved = 4; - blocklen_per_row = 4; - typedef block_q4_0x4 block_q4_0xn; - make_block_q4_0xn = make_block_q4_0x4; - } - else { - assert(false); } +#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + nrows_interleaved = 4; + blocklen_per_row = 8; +#elif defined(__ARM_NEON) + nrows_interleaved = 4; + blocklen_per_row = 4; +#endif assert(n_per_row % QK4_0 == 0); const int nb = n_per_row / QK4_0; - block_q4_0xn * out_ptr_B = (block_q4_0xn *) malloc(sizeof(block_q4_0xn) * nb); - block_q4_0xn * out_ptr_B_start = out_ptr_B; + void * out_ptr_B, * out_ptr_B_start; + if (nrows_interleaved == 8) { + out_ptr_B = (block_q4_0x8 *) malloc(sizeof(block_q4_0x8) * nb); + out_ptr_B_start = out_ptr_B; + } + else if (nrows_interleaved == 4) { + out_ptr_B = (block_q4_0x4 *) malloc(sizeof(block_q4_0x4) * nb); + out_ptr_B_start = out_ptr_B; + } for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) { const block_q4_0 * in_ptrs[nrows_interleaved]; @@ -3354,18 +3350,26 @@ size_t quantize_q4_0_aarch64(const float * restrict src, void * restrict dst, in } for (int64_t x = 0; x < nb; x++) { - *out_ptr_B = make_block_q4_0xn(in_ptrs, blocklen_per_row, 0x88); - out_ptr_B++; + if (nrows_interleaved == 8) { + *(block_q4_0x8 *) out_ptr_B = make_block_q4_0x8(in_ptrs, blocklen_per_row, 0x88); + out_ptr_B = (block_q4_0x8 *) out_ptr_B + 1; + } + else if (nrows_interleaved == 4) { + *(block_q4_0x4 *) out_ptr_B = make_block_q4_0x4(in_ptrs, blocklen_per_row, 0x88); + out_ptr_B = (block_q4_0x4 *) out_ptr_B + 1; + } for (int i = 0; i < nrows_interleaved; i++) { in_ptrs[i]++; } } out_ptr_B = out_ptr_B_start; - memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, sizeof(block_q4_0xn) * nb); + if (nrows_interleaved == 8) memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, sizeof(block_q4_0x8) * nb); + else if (nrows_interleaved == 4) memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, sizeof(block_q4_0x4) * nb); } if (out_ptr_B_start) free(out_ptr_B_start); - return (nrow * n_per_row / QK4_0 * sizeof(block_q4_0)); + + return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0)); } size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row); char * qrow = (char *)dst; @@ -15179,6 +15183,10 @@ void ggml_gemv_q4_0_q8_0_blocked8_sve(const int n, int output_channels, int inpu void ggml_gemv_q4_0_q8_0_aarch64_sve256(size_t depth, size_t output_channels, size_t height, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { #if defined(__ARM_FEATURE_SVE) + if (svcntw() != 8) { + if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) ggml_gemv_q4_0_q8_0_aarch64_neon(depth, output_channels, height, s, vx, vy, ith, nth); + return; + } int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)8); int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)8); size_t width = xend - x0; @@ -15657,6 +15665,10 @@ void ggml_gemm_q4_0_q8_0(const int n, int rows, int output_channels, int input_w void ggml_gemm_q4_0_q8_0_aarch64_sve256(size_t depth, size_t output_channels, size_t height, 
float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) + if (svcntw() != 8) { + if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) ggml_gemm_q4_0_q8_0_aarch64_neon(depth, output_channels, height, s, vx, vy, ith, nth); + return; + } int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)8); int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)8); size_t width = xend - x0; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index ddeda43364dda..ced8a1a606289 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -12377,7 +12377,7 @@ UseGgmlGemm2:; for (int row_iter = ((ne11 / 8) * 8) + ((ne11 - rows_processed) / 4 * 4); row_iter < ne11; row_iter++) { gemv(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); } - } + } else if ((ggml_n_dims(src0) == 2) && (ne11 >= 4) && (type == GGML_TYPE_Q4_0_AARCH64)) { // use batch-sized 4 GEMM kernel for (int row_iter = 0; row_iter < ne11 / 4; row_iter++) { From 43e12974ede70534a53299dcaf326b2eeb1d0195 Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Mon, 29 Apr 2024 05:51:07 +0000 Subject: [PATCH 05/28] Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions --- Package.swift | 1 + build.zig | 173 +++ ggml-aarch64.cpp | 2099 +++++++++++++++++++++++++++++++++++ ggml-aarch64.h | 42 + ggml/include/ggml.h | 8 +- ggml/src/ggml-quants.c | 2349 ---------------------------------------- ggml/src/ggml-quants.h | 23 - ggml/src/ggml.c | 27 +- 8 files changed, 2334 insertions(+), 2388 deletions(-) create mode 100644 build.zig create mode 100644 ggml-aarch64.cpp create mode 100644 ggml-aarch64.h diff --git a/Package.swift b/Package.swift index 77fed86df3105..c357751dd3196 100644 --- a/Package.swift +++ b/Package.swift @@ -10,6 +10,7 @@ var sources = [ "ggml/src/ggml-alloc.c", "ggml/src/ggml-backend.c", "ggml/src/ggml-quants.c", + "ggml/src/ggml-aarch64.cpp", ] var resources: [Resource] = [] diff --git a/build.zig b/build.zig new file mode 100644 index 0000000000000..804634f2a023b --- /dev/null +++ b/build.zig @@ -0,0 +1,173 @@ +// Compatible with Zig Version 0.11.0 +const std = @import("std"); +const ArrayList = std.ArrayList; +const Compile = std.Build.Step.Compile; +const ConfigHeader = std.Build.Step.ConfigHeader; +const Mode = std.builtin.Mode; +const CrossTarget = std.zig.CrossTarget; + +const Maker = struct { + builder: *std.build.Builder, + target: CrossTarget, + optimize: Mode, + enable_lto: bool, + + include_dirs: ArrayList([]const u8), + cflags: ArrayList([]const u8), + cxxflags: ArrayList([]const u8), + objs: ArrayList(*Compile), + + fn addInclude(m: *Maker, dir: []const u8) !void { + try m.include_dirs.append(dir); + } + fn addProjectInclude(m: *Maker, path: []const []const u8) !void { + try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path)); + } + fn addCFlag(m: *Maker, flag: []const u8) !void { + try m.cflags.append(flag); + } + fn addCxxFlag(m: *Maker, flag: []const u8) !void { + try m.cxxflags.append(flag); + } + fn addFlag(m: *Maker, flag: []const u8) !void { + try m.addCFlag(flag); + try m.addCxxFlag(flag); + } + + fn init(builder: *std.build.Builder) !Maker { + const target = builder.standardTargetOptions(.{}); + const zig_version = 
@import("builtin").zig_version_string; + const commit_hash = try std.ChildProcess.exec( + .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } }, + ); + try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt( + \\int LLAMA_BUILD_NUMBER = {}; + \\char const *LLAMA_COMMIT = "{s}"; + \\char const *LLAMA_COMPILER = "Zig {s}"; + \\char const *LLAMA_BUILD_TARGET = "{s}"; + \\ + , .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) })); + var m = Maker{ + .builder = builder, + .target = target, + .optimize = builder.standardOptimizeOption(.{}), + .enable_lto = false, + .include_dirs = ArrayList([]const u8).init(builder.allocator), + .cflags = ArrayList([]const u8).init(builder.allocator), + .cxxflags = ArrayList([]const u8).init(builder.allocator), + .objs = ArrayList(*Compile).init(builder.allocator), + }; + + try m.addCFlag("-std=c11"); + try m.addCxxFlag("-std=c++11"); + try m.addProjectInclude(&.{}); + try m.addProjectInclude(&.{"common"}); + return m; + } + + fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile { + const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize }); + if (o.target.getAbi() != .msvc) + o.defineCMacro("_GNU_SOURCE", null); + + if (std.mem.endsWith(u8, src, ".c")) { + o.addCSourceFiles(&.{src}, m.cflags.items); + o.linkLibC(); + } else { + o.addCSourceFiles(&.{src}, m.cxxflags.items); + if (o.target.getAbi() == .msvc) { + o.linkLibC(); // need winsdk + crt + } else { + // linkLibCpp already add (libc++ + libunwind + libc) + o.linkLibCpp(); + } + } + for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i }); + o.want_lto = m.enable_lto; + return o; + } + + fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile { + const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize }); + e.addCSourceFiles(&.{src}, m.cxxflags.items); + for (deps) |d| e.addObject(d); + for (m.objs.items) |o| e.addObject(o); + for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i }); + + // https://github.com/ziglang/zig/issues/15448 + if (e.target.getAbi() == .msvc) { + e.linkLibC(); // need winsdk + crt + } else { + // linkLibCpp already add (libc++ + libunwind + libc) + e.linkLibCpp(); + } + m.builder.installArtifact(e); + e.want_lto = m.enable_lto; + return e; + } +}; + +pub fn build(b: *std.build.Builder) !void { + var make = try Maker.init(b); + make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false; + + const ggml = make.obj("ggml", "ggml.c"); + const sgemm = make.obj("sgemm", "sgemm.cpp"); + const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c"); + const ggml_backend = make.obj("ggml-backend", "ggml-backend.c"); + const ggml_quants = make.obj("ggml-quants", "ggml-quants.c"); + const unicode = make.obj("unicode", "unicode.cpp"); + const unicode_data = make.obj("unicode-data", "unicode-data.cpp"); + const llama = make.obj("llama", "llama.cpp"); + const buildinfo = make.obj("common", "common/build-info.cpp"); + const common = make.obj("common", "common/common.cpp"); + const console = make.obj("console", "common/console.cpp"); + const sampling = make.obj("sampling", "common/sampling.cpp"); + const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp"); + const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp"); + const train = make.obj("train", 
"common/train.cpp"); + const clip = make.obj("clip", "examples/llava/clip.cpp"); + const llava = make.obj("llava", "examples/llava/llava.cpp"); + const ggml_aarch64 = make.obj("ggml-aarch64", "ggml-aarch64.cpp"); + + _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, ggml_aarch64, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser }); + _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, ggml_aarch64, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo }); + _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, ggml_aarch64, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo }); + _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, ggml_aarch64, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo }); + _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, ggml_aarch64, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train }); + _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, ggml_aarch64, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train }); + + const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, ggml_aarch64, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava }); + if (server.target.isWindows()) { + server.linkSystemLibrary("ws2_32"); + } + + const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" }; + for (server_assets) |asset| { + const input_path = b.fmt("examples/server/public/{s}", .{asset}); + const output_path = b.fmt("examples/server/{s}.hpp", .{asset}); + + // Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`: + + const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize)); + defer b.allocator.free(input); + + var buf = std.ArrayList(u8).init(b.allocator); + defer buf.deinit(); + + for (input) |byte| { + try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte}); + } + + var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_"); + defer b.allocator.free(name); + std.mem.replaceScalar(u8, name, '.', '_'); + + try std.fs.cwd().writeFile(output_path, b.fmt( + "unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n", + .{ name, buf.items, name, input.len }, + )); + + std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path }); + } +} diff --git a/ggml-aarch64.cpp b/ggml-aarch64.cpp new file mode 100644 index 0000000000000..8dedc7e52701a --- /dev/null +++ b/ggml-aarch64.cpp @@ -0,0 +1,2099 @@ +// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd. 
+#define GGML_COMMON_IMPL_C
+#include "ggml-common.h"
+
+#include "ggml-quants.h"
+#include "ggml-impl.h"
+
+#define GGML_COMMON_IMPL_C
+#include "ggml-common.h"
+
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+#include <stdlib.h> // for qsort
+#include <stdio.h>  // for GGML_ASSERT
+
+#include "ggml-aarch64.h"
+
+#define UNUSED GGML_UNUSED
+
+size_t quantize_q4_0_aarch64(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    if (!quant_weights) {
+        int nrows_interleaved = 1;
+        int blocklen_per_row;
+
+#if defined(__ARM_FEATURE_SVE)
+        if (svcntw() == 8) {
+            nrows_interleaved = 8;
+            blocklen_per_row = 8;
+        }
+        else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+            nrows_interleaved = 4;
+            blocklen_per_row = 8;
+        }
+#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+        nrows_interleaved = 4;
+        blocklen_per_row = 8;
+#elif defined(__ARM_NEON)
+        nrows_interleaved = 4;
+        blocklen_per_row = 4;
+#endif
+
+        assert(n_per_row % QK4_0 == 0);
+        const int nb = n_per_row / QK4_0;
+
+        void * out_ptr_B = NULL;
+        void * out_ptr_B_start = NULL;
+        if (nrows_interleaved == 8) {
+            out_ptr_B = (block_q4_0x8 *) malloc(sizeof(block_q4_0x8) * nb);
+            out_ptr_B_start = out_ptr_B;
+        }
+        else if (nrows_interleaved == 4) {
+            out_ptr_B = (block_q4_0x4 *) malloc(sizeof(block_q4_0x4) * nb);
+            out_ptr_B_start = out_ptr_B;
+        }
+
+        for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) {
+            block_q4_0 ** in_ptrs = new block_q4_0 * [nrows_interleaved];
+
+            for (int i = 0; i < nrows_interleaved; i++ ) {
+                in_ptrs[i] = (block_q4_0 *) dst + (b + i * n_per_row) / QK4_0;
+                quantize_row_q4_0_reference(src + b + i * n_per_row, (block_q4_0 *) in_ptrs[i], n_per_row);
+            }
+
+            for (int64_t x = 0; x < nb; x++) {
+                if (nrows_interleaved == 8) {
+                    *(block_q4_0x8 *) out_ptr_B = make_block_q4_0x8(in_ptrs, blocklen_per_row, 0x88);
+                    out_ptr_B = (block_q4_0x8 *) out_ptr_B + 1;
+                }
+                else if (nrows_interleaved == 4) {
+                    *(block_q4_0x4 *) out_ptr_B = make_block_q4_0x4(in_ptrs, blocklen_per_row, 0x88);
+                    out_ptr_B = (block_q4_0x4 *) out_ptr_B + 1;
+                }
+
+                for (int i = 0; i < nrows_interleaved; i++) {
+                    in_ptrs[i]++;
+                }
+            }
+            delete [] in_ptrs;
+            out_ptr_B = out_ptr_B_start;
+            if (nrows_interleaved == 8) memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, sizeof(block_q4_0x8) * nb);
+            else if (nrows_interleaved == 4) memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, sizeof(block_q4_0x4) * nb);
+        }
+        if (out_ptr_B_start) free(out_ptr_B_start);
+
+        return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0));
+    }
+    else {
+        assert(false);
+        return 0;
+    }
+}
+
+void quantize_row_q8_0_aarch64(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k, int nrows_interleaved, int blocklen_per_row) {
+    assert(QK8_0 == 32);
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
+
+#if defined(__ARM_NEON)
+    float * id = new float[nrows_interleaved];
+    auto srcv = new float32x4_t[nrows_interleaved][8];
+
+    for (int i = 0; i < nb; i++) {
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];
+
+        for (int row_iter = 0; row_iter < nrows_interleaved; row_iter++) {
+            for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j);
+            for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]);
+
+            for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]);
+            for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]);
+            for (int j
= 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ? 1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + } + + if (blocklen_per_row == 8) { + for (int j = 0; j < 4; j++) { + float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]); + int32x4_t vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 3] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 4] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 5] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 6] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 7] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[1][2 * j], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 8] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 9] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 10] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 11] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 12] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 13] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 14] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 15] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[2][2 * j], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 16] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 17] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 18] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 19] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[2][2 * j + 1], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 20] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 21] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 22] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 23] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[3][2 * j], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 24] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 25] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 26] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 27] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[3][2 * j + 1], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 28] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 29] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 30] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3); + } + } + else if (blocklen_per_row == 4) { + for (int j = 0; j < 8; j++) { + float32x4_t v = vmulq_n_f32(srcv[0][j], id[0]); + int32x4_t vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 3] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[1][j], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 4] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 5] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 6] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 7] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[2][j], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 8] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 9] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 10] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 11] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[3][j], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 12] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 13] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 14] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3); + } + } 
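+        // blocklen_per_row is chosen by the caller to match the interleave
+        // width the consuming GEMM/GEMV kernel expects (8 for the SVE/i8mm
+        // paths, 4 for the plain NEON dot-product path).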
+ } + delete [] id; + delete [] srcv; +#endif +} + +// Routines to create the blocked formats +// Note input is array of pointers. +// The exact interleaving format needed is different for GEMM (using SMMLA) +// and GEMV (using SDOT) cases. For GEMM, we interleave 8 pairs of values +// at a time (with the two nibbles separated at runtime to give 2x2x8 +// matrices). For GEMV, we need to interleave 4 pairs of values instead. +block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len, unsigned int xor_mask) { + block_q4_0x4 out; + + for (int i = 0; i < 4; i++) { + out.d[i] = in[i]->d; + } + + for (int i = 0; i < QK4_0 * 2; i++) { + // We are interleaving 4 rows in blocks of 8, making a total of 32 + // output bytes per block (2 MMLA input vectors). This repeats + // until we have processed the whole block. + // + // Per the comment above, for GEMV cases a similar process is used + // but with blocks of 4 instead, giving a single DOT input vector. + // + // In the case of q4, we add on 128 to convert the top nibble from + // "bias offset" form to pure sign form (this saves a subtract when + // we unpack it). + int src_offset = (i / (4 * block_len)) * block_len; + int src_id = (i % (4 * block_len)) / block_len; + src_offset += (i % block_len); + + out.qs[i] = in[src_id]->qs[src_offset] ^ xor_mask; + } + + return out; +} + +// 8-block version - see comments in code above +block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len, unsigned int xor_mask) { + block_q4_0x8 out; + + for (int i = 0; i < 8; i++) { + out.d[i] = in[i]->d; + } + + for (int i = 0; i < QK4_0 * 4; i++) { + int src_offset = (i / (8 * block_len)) * block_len; + int src_id = (i % (8 * block_len)) / block_len; + src_offset += (i % block_len); + + out.qs[i] = in[src_id]->qs[src_offset] ^ xor_mask; + } + + return out; +} + +block_q8_0x4 make_block_q8_0x4(const block_q8_0 * const in[4], unsigned int block_len) { + block_q8_0x4 out; + + for (int i = 0; i < 4; i++) { + out.d[i] = in[i]->d; + } + + for (int i = 0; i < QK8_0 * 4; i++) { + int src_offset = (i / (4 * block_len)) * block_len; + int src_id = (i % (4 * block_len)) / block_len; + src_offset += (i % block_len); + + out.qs[i] = in[src_id]->qs[src_offset]; + } + + return out; +} + +// 8-block version - see comments in code above +block_q8_0x8 make_block_q8_0x8(const block_q8_0 * const in[8], unsigned int block_len) { + block_q8_0x8 out; + + for (int i = 0; i < 8; i++) { + out.d[i] = in[i]->d; + } + + for (int i = 0; i < QK8_0 * 8; i++) { + int src_offset = (i / (8 * block_len)) * block_len; + int src_id = (i % (8 * block_len)) / block_len; + src_offset += (i % block_len); + + out.qs[i] = in[src_id]->qs[src_offset]; + } + + return out; +} + +inline int64_t roundup(const int64_t a, const int64_t b) { + int64_t rem = a % b; + + if (rem) { + return a + b - rem; + } else { + return a; + } +} + +void ggml_gemv_q4_0_q8_0_aarch64_sve256(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { +#if defined(__ARM_FEATURE_SVE) + if (svcntw() != 8) { + if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) ggml_gemv_q4_0_q8_0_aarch64_neon(n, s, vx, vy, nr, nc, ith, nth); + return; + } + int64_t x0 = roundup((ith * nc) / nth, (int64_t)8); + int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)8); + size_t width = xend - x0; + + int64_t nb = n / QK4_0; + const void * b_ptr = (const void *)((const block_q4_0x8 *) vx + ((x0 / 8) * nb)); + const void * a_ptr = vy; 
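+    // Each thread owns the output columns [x0, xend), rounded up to the
+    // 8-column interleave width above; b_ptr points at the first block_q4_0x8
+    // column group for this thread, a_ptr at the shared q8_0 activations.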
+ float * res_ptr = s + x0; + + assert(n % 32 == 0); + assert(width % 8 == 0); + + size_t num_blocks = n / 32; + + __asm__ __volatile__( + "ptrue p0.b\n" + "add %x[b_ptr], %x[b_ptr], #0x10\n" + "1:" // Column loop + "add x22, %x[a_ptr], #0x2\n" + "mov z31.b, #0x0\n" + "mov x21, %x[num_blocks]\n" + "2:" // Block loop + "ld1b { z30.b }, p0/Z, [%x[b_ptr]]\n" + "ld1b { z29.b }, p0/Z, [%x[b_ptr], #1, MUL VL]\n" + "mov z28.s, #0x0\n" + "mov z27.s, #0x0\n" + "ld1rd { z26.d }, p0/Z, [x22]\n" + "ld1b { z25.b }, p0/Z, [%x[b_ptr], #2, MUL VL]\n" + "sub x20, x22, #0x2\n" + "sub x21, x21, #0x1\n" + "ld1b { z24.b }, p0/Z, [%x[b_ptr], #3, MUL VL]\n" + "ld1rd { z23.d }, p0/Z, [x22, #8]\n" + "lsl z22.b, z30.b, #0x4\n" + "lsl z16.b, z29.b, #0x4\n" + "and z30.b, z30.b, #0xf0\n" + "and z29.b, z29.b, #0xf0\n" + "ld1rd { z21.d }, p0/Z, [x22, #16]\n" + "ld1rd { z20.d }, p0/Z, [x22, #24]\n" + "lsl z19.b, z25.b, #0x4\n" + "and z25.b, z25.b, #0xf0\n" + "ld1rh { z17.h }, p0/Z, [x20]\n" + "ld1h { z18.s }, p0/Z, [%x[b_ptr], #-1, MUL VL]\n" + "sdot z28.s, z22.b, z26.b\n" + "sdot z27.s, z16.b, z26.b\n" + "lsl z16.b, z24.b, #0x4\n" + "add x22, x22, #0x22\n" + "and z24.b, z24.b, #0xf0\n" + "add %x[b_ptr], %x[b_ptr], #0x90\n" + "fcvt z17.s, p0/m, z17.h\n" + "fcvt z18.s, p0/m, z18.h\n" + "sdot z28.s, z19.b, z23.b\n" + "sdot z27.s, z16.b, z23.b\n" + "fmul z18.s, z18.s, z17.s\n" + "sdot z28.s, z30.b, z21.b\n" + "sdot z27.s, z29.b, z21.b\n" + "sdot z28.s, z25.b, z20.b\n" + "sdot z27.s, z24.b, z20.b\n" + "uzp1 z17.s, z28.s, z27.s\n" + "uzp2 z16.s, z28.s, z27.s\n" + "add z17.s, z17.s, z16.s\n" + "asr z17.s, z17.s, #0x4\n" + "scvtf z17.s, p0/m, z17.s\n" + "fmla z31.s, p0/M, z17.s, z18.s\n" + "cbnz x21, 2b\n" + "sub %x[width], %x[width], #0x8\n" + "st1w { z31.s }, p0, [%x[res_ptr]]\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "cbnz %x[width], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) + : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) + : "memory", "p0", "x20", "x21", "x22", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +#endif +} + +void ggml_gemv_q4_0_q8_0_aarch64_neon(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { + UNUSED(nr); +#if defined(__ARM_NEON) + int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); + int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); + size_t width = xend - x0; + + int64_t nb = n / QK4_0; + const void * b_ptr = (const void *)((const block_q4_0x4 *) vx + ((x0 / 4) * nb)); + const void * a_ptr = vy; + float * res_ptr = s + x0; + + assert(n % 32 == 0); + assert(width % 4 == 0); + + size_t num_blocks = n / 32; + + __asm__ __volatile__( + "movi v2.16b, #0x4\n" + "movi v1.16b, #0xf0\n" + "add %x[b_ptr], %x[b_ptr], #0x8\n" + "1:" // Column loop + "add x23, %x[a_ptr], #0x2\n" + "movi v0.16b, #0x0\n" + "mov x22, %x[num_blocks]\n" + "2:" // Block loop + "ldr q31, [%x[b_ptr], #0x0]\n" + "ldr q30, [%x[b_ptr], #0x10]\n" + "mov x21, x23\n" + "movi v29.4s, #0x0\n" + "ldr q28, [%x[b_ptr], #0x20]\n" + "ldr q27, [%x[b_ptr], #0x30]\n" + "movi v26.4s, #0x0\n" + "sub x20, x23, #0x2\n" + "ld1r { v25.8h }, [x20]\n" + "ldr q24, [%x[b_ptr], #-0x8]\n" + "sub x22, x22, #0x1\n" + "add x23, x23, #0x22\n" + "ld1r { v23.2d }, [x21], #0x8\n" + "sshl v22.16b, v31.16b, v2.16b\n" + "sshl v16.16b, v30.16b, v2.16b\n" + "add %x[b_ptr], %x[b_ptr], #0x48\n" + "ld1r { v21.2d }, [x21], #0x8\n" + "sshl v20.16b, v28.16b, v2.16b\n" + "sshl 
v19.16b, v27.16b, v2.16b\n" + "ld1r { v18.2d }, [x21], #0x8\n" + "ld1r { v17.2d }, [x21], #0x8\n" + "and v31.16b, v31.16b, v1.16b\n" + "and v30.16b, v30.16b, v1.16b\n" + ".inst 0x4e9796dd // sdot v29.4s, v22.16b, v23.16b\n" + ".inst 0x4e97961a // sdot v26.4s, v16.16b, v23.16b\n" + "and v28.16b, v28.16b, v1.16b\n" + "and v27.16b, v27.16b, v1.16b\n" + "fcvtl v25.4s, v25.4h\n" + "fcvtl v16.4s, v24.4h\n" + ".inst 0x4e95969d // sdot v29.4s, v20.16b, v21.16b\n" + ".inst 0x4e95967a // sdot v26.4s, v19.16b, v21.16b\n" + "fmul v16.4s, v16.4s, v25.4s\n" + ".inst 0x4e9297fd // sdot v29.4s, v31.16b, v18.16b\n" + ".inst 0x4e9297da // sdot v26.4s, v30.16b, v18.16b\n" + ".inst 0x4e91979d // sdot v29.4s, v28.16b, v17.16b\n" + ".inst 0x4e91977a // sdot v26.4s, v27.16b, v17.16b\n" + "addp v29.4s, v29.4s, v26.4s\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "fmla v0.4s, v29.4s, v16.4s\n" + "cbnz x22, 2b\n" + "sub %x[width], %x[width], #0x4\n" + "str q0, [%x[res_ptr], #0x0]\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "cbnz %x[width], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) + : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) + : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23" + ); +#endif +} + +void ggml_gemv_q4_0_q8_0_aarch64_neon_noi8mm(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { + UNUSED(nr); +#if defined(__ARM_NEON) + int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); + int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); + size_t width = xend - x0; + + int64_t nb = n / QK4_0; + const void * b_ptr = (const void *)((const block_q4_0x4 *) vx + ((x0 / 4) * nb)); + const void * a_ptr = vy; + float * res_ptr = s + x0; + + assert(n % 32 == 0); + assert(width % 4 == 0); + + size_t num_blocks = n / 32; + + __asm__ __volatile__( + "movi v31.16b, #0x4\n" + "movi v30.16b, #0xf0\n" + "add %x[b_ptr], %x[b_ptr], #0x8\n" + "1:" // Column loop + "add x22, %x[a_ptr], #0x2\n" + "movi v29.16b, #0x0\n" + "mov x21, %x[num_blocks]\n" + "2:" // Block loop + "ldr q28, [%x[b_ptr], #0x0]\n" + "ldr q27, [x22, #0x0]\n" + "movi v26.4s, #0x0\n" + "sub x20, x22, #0x2\n" + "ldr q25, [x22, #0x10]\n" + "ldr q24, [%x[b_ptr], #0x10]\n" + "sub x21, x21, #0x1\n" + "add x22, x22, #0x22\n" + "ldr q23, [%x[b_ptr], #0x20]\n" + "ldr q22, [%x[b_ptr], #0x30]\n" + "ld1r { v21.8h }, [x20]\n" + "ldr q20, [%x[b_ptr], #-0x8]\n" + "sshl v16.16b, v28.16b, v31.16b\n" + "and v28.16b, v28.16b, v30.16b\n" + "sshl v19.16b, v24.16b, v31.16b\n" + "and v24.16b, v24.16b, v30.16b\n" + "add %x[b_ptr], %x[b_ptr], #0x48\n" + "sshl v18.16b, v23.16b, v31.16b\n" + "and v23.16b, v23.16b, v30.16b\n" + ".inst 0x4f9be21a // sdot v26.4s, v16.16b, v27.4b[0]\n" + "sshl v17.16b, v22.16b, v31.16b\n" + "and v22.16b, v22.16b, v30.16b\n" + "fcvtl v21.4s, v21.4h\n" + "fcvtl v16.4s, v20.4h\n" + ".inst 0x4f99e39a // sdot v26.4s, v28.16b, v25.4b[0]\n" + "fmul v16.4s, v16.4s, v21.4s\n" + ".inst 0x4fbbe27a // sdot v26.4s, v19.16b, v27.4b[1]\n" + ".inst 0x4fb9e31a // sdot v26.4s, v24.16b, v25.4b[1]\n" + ".inst 0x4f9bea5a // sdot v26.4s, v18.16b, v27.4b[2]\n" + ".inst 0x4f99eafa // sdot v26.4s, v23.16b, v25.4b[2]\n" + ".inst 0x4fbbea3a // sdot v26.4s, v17.16b, v27.4b[3]\n" + ".inst 0x4fb9eada // sdot v26.4s, v22.16b, v25.4b[3]\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "fmla v29.4s, v26.4s, v16.4s\n" + "cbnz x21, 2b\n" + "sub %x[width], %x[width], 
#0x4\n" + "str q29, [%x[res_ptr], #0x0]\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "cbnz %x[width], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) + : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) + : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22" + ); +#endif +} + +void ggml_gemv_q8_0_q8_0_aarch64_sve256(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { +#if defined(__ARM_FEATURE_SVE) + int64_t x0 = roundup((ith * nc) / nth, (int64_t)8); + int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)8); + + int64_t nb = n / QK8_0; + int64_t a_nb = n / QK8_0; + + const svbool_t ptrue = svptrue_b8(); + + const block_q8_0x8 * b_ptr_start = (const block_q8_0x8 *) vx; + const block_q8_0 * a_ptr_start = (const block_q8_0 *) vy; + + for (int64_t y = 0; y < nr; y++) { + for (int64_t x = x0 / 8; x < xend / 8; x++) { + // Pointers to LHS blocks + const block_q8_0 * a_ptr = a_ptr_start + (y * a_nb); + // Pointers to RHS blocks + const block_q8_0x8 * b_ptr = b_ptr_start + (x * nb); + + // Master FP accumulator + svfloat32_t acc_row = svdup_f32(0.0f); + + for (int64_t b = 0; b < nb; b++) { + // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) + const svint8_t rhs_vec_0_0_0 = svld1_s8(ptrue, b_ptr[b].qs); + const svint8_t rhs_vec_0_1_0 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 1); + const svint8_t rhs_vec_0_2_0 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 2); + const svint8_t rhs_vec_0_3_0 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 3); + const svint8_t rhs_vec_0_0_1 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 4); + const svint8_t rhs_vec_0_1_1 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 5); + const svint8_t rhs_vec_0_2_1 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 6); + const svint8_t rhs_vec_0_3_1 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 7); + + // Scale values + const svfloat16_t col_scale_f16 = svreinterpret_f16_u32(svld1uh_u32(ptrue, (const uint16_t *) b_ptr[b].d)); + const svfloat32_t col_scale_f32 = svcvt_f32_f16_x(ptrue, col_scale_f16); + + const svfloat16_t row_scale_f16 = svdup_f16(a_ptr[b].d); + const svfloat32_t row_scale_f32 = svcvt_f32_f16_x(ptrue, row_scale_f16); + + const svint8_t lhs_vec_0 = svld1rq_s8(ptrue, a_ptr[b].qs); + const svint8_t lhs_vec_1 = svld1rq_s8(ptrue, a_ptr[b].qs + 16); + + svint32_t iacc = svdup_s32(0); + + iacc = svdot_lane(iacc, rhs_vec_0_0_0, lhs_vec_0, 0); + iacc = svdot_lane(iacc, rhs_vec_0_0_1, lhs_vec_1, 0); + + iacc = svdot_lane(iacc, rhs_vec_0_1_0, lhs_vec_0, 1); + iacc = svdot_lane(iacc, rhs_vec_0_1_1, lhs_vec_1, 1); + + iacc = svdot_lane(iacc, rhs_vec_0_2_0, lhs_vec_0, 2); + iacc = svdot_lane(iacc, rhs_vec_0_2_1, lhs_vec_1, 2); + + iacc = svdot_lane(iacc, rhs_vec_0_3_0, lhs_vec_0, 3); + iacc = svdot_lane(iacc, rhs_vec_0_3_1, lhs_vec_1, 3); + + acc_row = svmla_x(ptrue, acc_row, svcvt_f32_s32_x(ptrue, iacc), svmul_x(ptrue, col_scale_f32, row_scale_f32)); + } + + svst1(ptrue, s + (y * nc + x * 8), acc_row); + } + } +#endif +} + +void ggml_gemv_q8_0_q8_0_aarch64_neon(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { +#if defined(__ARM_NEON) + int64_t x0 = roundup((ith * nc) / nth, (int64_t)8); + int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)8); + + int64_t nb = n / QK8_0; + int64_t a_nb = n / QK8_0; + + const block_q8_0x8 * b_ptr_start = (const block_q8_0x8 *) vx; + const block_q8_0 * a_ptr_start = 
(const block_q8_0 *) vy; + + for (int64_t y = 0; y < nr; y++) { + for (int64_t x = x0 / 8; x < xend / 8; x++) { + // Pointers to LHS blocks + const block_q8_0 * a_ptr = a_ptr_start + (y * a_nb); + // Pointers to RHS blocks + const block_q8_0x8 * b_ptr = b_ptr_start + (x * nb); + // Master FP accumulator + float32x4_t acc_row[2]; + acc_row[0] = acc_row[1] = vdupq_n_f32(0.0f); + + for (int64_t b = 0; b < nb; b++) { + // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) + const int8x16_t rhs_vec_0_0_0 = vld1q_s8(b_ptr[b].qs); + const int8x16_t rhs_vec_1_0_0 = vld1q_s8(b_ptr[b].qs + 16); + const int8x16_t rhs_vec_0_1_0 = vld1q_s8(b_ptr[b].qs + 32); + const int8x16_t rhs_vec_1_1_0 = vld1q_s8(b_ptr[b].qs + 48); + const int8x16_t rhs_vec_0_2_0 = vld1q_s8(b_ptr[b].qs + 64); + const int8x16_t rhs_vec_1_2_0 = vld1q_s8(b_ptr[b].qs + 80); + const int8x16_t rhs_vec_0_3_0 = vld1q_s8(b_ptr[b].qs + 96); + const int8x16_t rhs_vec_1_3_0 = vld1q_s8(b_ptr[b].qs + 112); + const int8x16_t rhs_vec_0_0_1 = vld1q_s8(b_ptr[b].qs + 128); + const int8x16_t rhs_vec_1_0_1 = vld1q_s8(b_ptr[b].qs + 144); + const int8x16_t rhs_vec_0_1_1 = vld1q_s8(b_ptr[b].qs + 160); + const int8x16_t rhs_vec_1_1_1 = vld1q_s8(b_ptr[b].qs + 176); + const int8x16_t rhs_vec_0_2_1 = vld1q_s8(b_ptr[b].qs + 192); + const int8x16_t rhs_vec_1_2_1 = vld1q_s8(b_ptr[b].qs + 208); + const int8x16_t rhs_vec_0_3_1 = vld1q_s8(b_ptr[b].qs + 224); + const int8x16_t rhs_vec_1_3_1 = vld1q_s8(b_ptr[b].qs + 240); + + // Scale values - assemble the four row/column scales into a (64-bit) vector, then expand to FP32 + const float16x8_t col_scale_f16 = vld1q_f16((const ggml_fp16_internal_t *)(b_ptr[b].d)); + const float32x4_t col_scale_f32_0 = vcvt_f32_f16(vget_low_f16(col_scale_f16)); + const float32x4_t col_scale_f32_1 = vcvt_f32_f16(vget_high_f16(col_scale_f16)); + + const float16x4_t row_scale_f16 = vld1_dup_f16((const ggml_fp16_internal_t *)(&(a_ptr[b].d))); + const float32x4_t row_scale_f32 = vcvt_f32_f16(row_scale_f16); + + const int8x16_t lhs_vec_0 = vld1q_s8(a_ptr[b].qs); + const int8x16_t lhs_vec_1 = vld1q_s8(a_ptr[b].qs + 16); + + int32x4_t iacc0 = vdupq_n_s32(0); + int32x4_t iacc1 = vdupq_n_s32(0); + + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_0_0, lhs_vec_0, 0); + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_0_1, lhs_vec_1, 0); + + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_0_0, lhs_vec_0, 0); + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_0_1, lhs_vec_1, 0); + + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_1_0, lhs_vec_0, 1); + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_1_1, lhs_vec_1, 1); + + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_1_0, lhs_vec_0, 1); + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_1_1, lhs_vec_1, 1); + + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_2_0, lhs_vec_0, 2); + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_2_1, lhs_vec_1, 2); + + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_2_0, lhs_vec_0, 2); + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_2_1, lhs_vec_1, 2); + + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_3_0, lhs_vec_0, 3); + iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_3_1, lhs_vec_1, 3); + + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_3_0, lhs_vec_0, 3); + iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_3_1, lhs_vec_1, 3); + + acc_row[0] = vfmaq_f32(acc_row[0], vcvtq_f32_s32(iacc0), vmulq_f32(col_scale_f32_0, row_scale_f32)); + acc_row[1] = vfmaq_f32(acc_row[1], vcvtq_f32_s32(iacc1), vmulq_f32(col_scale_f32_1, row_scale_f32)); + } + + vst1q_f32(s + (y * nc + x * 8), acc_row[0]); + vst1q_f32(s + (y * nc + x * 8 + 4), acc_row[1]); + } + } +#endif +} + +void 
ggml_gemm_q4_0_q8_0_aarch64_sve256(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) + if (svcntw() != 8) { + if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) ggml_gemm_q4_0_q8_0_aarch64_neon(n, s, vx, vy, nr, nc, ith, nth); + return; + } + int64_t x0 = roundup((ith * nc) / nth, (int64_t)8); + int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)8); + size_t width = xend - x0; + + int64_t nb = n / QK4_0; + const void * b_ptr = (const void *)((const block_q4_0x8 *) vx + ((x0 / 8) * nb)); + const void * a_ptr = vy; + float * res_ptr = s + x0; + size_t res_stride = nc * sizeof(float); + + assert(n % 32 == 0); + assert(width % 8 == 0); + + size_t num_blocks = n / 32; + + __asm__ __volatile__( + "mov x20, #0x4\n" + "mov x13, %x[nr]\n" + "mov z28.s, #-0x4\n" + "mov x12, #0x88\n" + "ptrue p1.b\n" + "whilelt p0.s, XZR, x20\n" + "cmp x13, #0x10\n" + "mul x12, %x[num_blocks], x12\n" + "blt 4f\n" + "1:" // Row loop + "add x11, %x[b_ptr], #0x10\n" + "mov x10, %x[width]\n" + "add x9, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x28, %x[a_ptr], #0x8\n" + "mov z24.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov x27, %x[num_blocks]\n" + "add x26, x28, x12\n" + "mov z12.b, #0x0\n" + "mov z0.b, #0x0\n" + "add x25, x26, x12\n" + "mov z13.b, #0x0\n" + "mov z1.b, #0x0\n" + "add x24, x25, x12\n" + "mov z20.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z8.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z10.b, #0x0\n" + "3:" // Block loop + "ld1b { z30.b }, p1/Z, [x11]\n" + "ld1b { z21.b }, p1/Z, [x11, #1, MUL VL]\n" + "mov z18.s, #0x0\n" + "mov z7.s, #0x0\n" + "ld1rqb { z3.b }, p1/Z, [x28]\n" + "ld1rqb { z5.b }, p1/Z, [x28, #16]\n" + "mov z9.s, #0x0\n" + "mov z22.s, #0x0\n" + "ld1b { z4.b }, p1/Z, [x11, #2, MUL VL]\n" + "ld1b { z17.b }, p1/Z, [x11, #3, MUL VL]\n" + "sub x20, x11, #0x10\n" + "sub x23, x28, #0x8\n" + "lsl z31.b, z30.b, #0x4\n" + "lsl z6.b, z21.b, #0x4\n" + "ld1h { z23.s }, p1/Z, [x20]\n" + "sub x22, x26, #0x8\n" + "and z30.b, z30.b, #0xf0\n" + "and z21.b, z21.b, #0xf0\n" + "sub x21, x25, #0x8\n" + "sub x20, x24, #0x8\n" + "lsl z14.b, z4.b, #0x4\n" + "lsl z2.b, z17.b, #0x4\n" + "subs x27, x27, #0x1\n" + "add x11, x11, #0x90\n" + ".inst 0x451f9872 // smmla z18.s, z3.b, z31.b\n" + ".inst 0x45069867 // smmla z7.s, z3.b, z6.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #32]\n" + "and z4.b, z4.b, #0xf0\n" + ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" + ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #48]\n" + "and z17.b, z17.b, #0xf0\n" + "fcvt z23.s, p1/m, z23.h\n" + ".inst 0x450e9872 // smmla z18.s, z3.b, z14.b\n" + ".inst 0x45029867 // smmla z7.s, z3.b, z2.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #64]\n" + ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" + ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #80]\n" + "fscale z23.s, p1/m, z23.s, z28.s\n" + ".inst 0x451e9872 // smmla z18.s, z3.b, z30.b\n" + ".inst 0x45159867 // smmla z7.s, z3.b, z21.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #96]\n" + ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" + ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #112]\n" + "add x28, x28, #0x88\n" + ".inst 0x45049872 // smmla z18.s, z3.b, z4.b\n" + ".inst 0x45119867 // smmla z7.s, z3.b, z17.b\n" + "ld1h { z3.s }, p0/Z, [x23]\n" 
+ ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" + ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" + "fcvt z3.s, p1/m, z3.h\n" + "uzp1 z5.d, z18.d, z7.d\n" + "uzp2 z18.d, z18.d, z7.d\n" + "mov z3.q, z3.q[0]\n" + "uzp1 z7.d, z9.d, z22.d\n" + "uzp2 z22.d, z9.d, z22.d\n" + "fmul z9.s, z23.s, z3.s[0]\n" + "scvtf z5.s, p1/m, z5.s\n" + "scvtf z18.s, p1/m, z18.s\n" + "scvtf z7.s, p1/m, z7.s\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z24.s, p1/M, z5.s, z9.s\n" + "ld1rqb { z5.b }, p1/Z, [x26]\n" + "fmul z9.s, z23.s, z3.s[1]\n" + "fmla z15.s, p1/M, z18.s, z9.s\n" + "ld1rqb { z18.b }, p1/Z, [x26, #16]\n" + "fmul z9.s, z23.s, z3.s[2]\n" + "fmul z3.s, z23.s, z3.s[3]\n" + "fmla z12.s, p1/M, z7.s, z9.s\n" + "mov z9.s, #0x0\n" + "ld1h { z7.s }, p0/Z, [x22]\n" + ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" + "fmla z0.s, p1/M, z22.s, z3.s\n" + "mov z22.s, #0x0\n" + "ld1h { z3.s }, p0/Z, [x21]\n" + ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #32]\n" + "fcvt z7.s, p1/m, z7.h\n" + "fcvt z3.s, p1/m, z3.h\n" + ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" + ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #64]\n" + "mov z7.q, z7.q[0]\n" + "mov z3.q, z3.q[0]\n" + ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" + ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #96]\n" + ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" + ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" + "uzp1 z5.d, z9.d, z22.d\n" + "scvtf z5.s, p1/m, z5.s\n" + "uzp2 z22.d, z9.d, z22.d\n" + "fmul z9.s, z23.s, z7.s[0]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z13.s, p1/M, z5.s, z9.s\n" + "ld1rqb { z9.b }, p1/Z, [x25]\n" + "fmul z5.s, z23.s, z7.s[1]\n" + "fmla z1.s, p1/M, z22.s, z5.s\n" + "mov z5.s, #0x0\n" + "mov z22.s, #0x0\n" + ".inst 0x451f9a45 // smmla z5.s, z18.b, z31.b\n" + ".inst 0x45069a56 // smmla z22.s, z18.b, z6.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #48]\n" + ".inst 0x450e9a45 // smmla z5.s, z18.b, z14.b\n" + ".inst 0x45029a56 // smmla z22.s, z18.b, z2.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #80]\n" + ".inst 0x451e9a45 // smmla z5.s, z18.b, z30.b\n" + ".inst 0x45159a56 // smmla z22.s, z18.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #112]\n" + "add x26, x26, #0x88\n" + ".inst 0x45049a45 // smmla z5.s, z18.b, z4.b\n" + ".inst 0x45119a56 // smmla z22.s, z18.b, z17.b\n" + "uzp1 z18.d, z5.d, z22.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp2 z22.d, z5.d, z22.d\n" + "fmul z5.s, z23.s, z7.s[2]\n" + "fmul z7.s, z23.s, z7.s[3]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z20.s, p1/M, z18.s, z5.s\n" + "ld1rqb { z18.b }, p1/Z, [x25, #16]\n" + "ld1h { z5.s }, p0/Z, [x20]\n" + "fcvt z5.s, p1/m, z5.h\n" + "fmla z25.s, p1/M, z22.s, z7.s\n" + "mov z22.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9936 // smmla z22.s, z9.b, z31.b\n" + ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #32]\n" + "mov z5.q, z5.q[0]\n" + ".inst 0x450e9936 // smmla z22.s, z9.b, z14.b\n" + ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #64]\n" + ".inst 0x451e9936 // smmla z22.s, z9.b, z30.b\n" + ".inst 0x45159927 // smmla z7.s, z9.b, z21.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #96]\n" + ".inst 0x45049936 // smmla z22.s, z9.b, z4.b\n" + ".inst 0x45119927 // smmla z7.s, z9.b, z17.b\n" + "uzp1 z9.d, z22.d, z7.d\n" + "scvtf z9.s, p1/m, z9.s\n" + "uzp2 z22.d, z22.d, z7.d\n" + "fmul z7.s, z23.s, z3.s[0]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z11.s, p1/M, z9.s, z7.s\n" + "ld1rqb { z9.b }, p1/Z, [x24]\n" + "fmul z7.s, z23.s, 
z3.s[1]\n" + "fmla z16.s, p1/M, z22.s, z7.s\n" + "mov z22.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9a56 // smmla z22.s, z18.b, z31.b\n" + ".inst 0x45069a47 // smmla z7.s, z18.b, z6.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #48]\n" + ".inst 0x450e9a56 // smmla z22.s, z18.b, z14.b\n" + ".inst 0x45029a47 // smmla z7.s, z18.b, z2.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #80]\n" + ".inst 0x451e9a56 // smmla z22.s, z18.b, z30.b\n" + ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #112]\n" + "add x25, x25, #0x88\n" + ".inst 0x45049a56 // smmla z22.s, z18.b, z4.b\n" + ".inst 0x45119a47 // smmla z7.s, z18.b, z17.b\n" + "uzp1 z18.d, z22.d, z7.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp2 z7.d, z22.d, z7.d\n" + "fmul z22.s, z23.s, z3.s[2]\n" + "fmul z3.s, z23.s, z3.s[3]\n" + "scvtf z7.s, p1/m, z7.s\n" + "fmla z19.s, p1/M, z18.s, z22.s\n" + "ld1rqb { z18.b }, p1/Z, [x24, #16]\n" + "fmul z22.s, z23.s, z5.s[0]\n" + "fmla z26.s, p1/M, z7.s, z3.s\n" + "mov z3.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9923 // smmla z3.s, z9.b, z31.b\n" + ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" + "ld1rqb { z9.b }, p1/Z, [x24, #32]\n" + ".inst 0x450e9923 // smmla z3.s, z9.b, z14.b\n" + ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" + "mov z9.s, #0x0\n" + ".inst 0x451f9a49 // smmla z9.s, z18.b, z31.b\n" + "mov z31.s, #0x0\n" + ".inst 0x45069a5f // smmla z31.s, z18.b, z6.b\n" + "ld1rqb { z6.b }, p1/Z, [x24, #48]\n" + "ld1rqb { z18.b }, p1/Z, [x24, #64]\n" + ".inst 0x450e98c9 // smmla z9.s, z6.b, z14.b\n" + "fmul z14.s, z23.s, z5.s[1]\n" + ".inst 0x450298df // smmla z31.s, z6.b, z2.b\n" + "ld1rqb { z6.b }, p1/Z, [x24, #80]\n" + "fmul z2.s, z23.s, z5.s[2]\n" + "fmul z23.s, z23.s, z5.s[3]\n" + ".inst 0x451e9a43 // smmla z3.s, z18.b, z30.b\n" + ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x24, #96]\n" + ".inst 0x451e98c9 // smmla z9.s, z6.b, z30.b\n" + ".inst 0x451598df // smmla z31.s, z6.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x24, #112]\n" + "add x24, x24, #0x88\n" + ".inst 0x450498a3 // smmla z3.s, z5.b, z4.b\n" + ".inst 0x451198a7 // smmla z7.s, z5.b, z17.b\n" + ".inst 0x45049a49 // smmla z9.s, z18.b, z4.b\n" + ".inst 0x45119a5f // smmla z31.s, z18.b, z17.b\n" + "uzp1 z18.d, z3.d, z7.d\n" + "uzp2 z5.d, z3.d, z7.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp1 z6.d, z9.d, z31.d\n" + "uzp2 z9.d, z9.d, z31.d\n" + "scvtf z5.s, p1/m, z5.s\n" + "fmla z8.s, p1/M, z18.s, z22.s\n" + "scvtf z6.s, p1/m, z6.s\n" + "scvtf z9.s, p1/m, z9.s\n" + "fmla z29.s, p1/M, z5.s, z14.s\n" + "fmla z27.s, p1/M, z6.s, z2.s\n" + "fmla z10.s, p1/M, z9.s, z23.s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x10, x10, #0x8\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "st1w { z24.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z15.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z12.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z0.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z13.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z1.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z20.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z25.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z11.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z16.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z19.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z26.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z8.s }, p1, [x20]\n" + 
"add x20, x20, %x[res_stride]\n" + "st1w { z29.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z27.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z10.s }, p1, [x20]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x13, x13, #0x10\n" + "cmp x13, #0x10\n" + "mov %x[res_ptr], x9\n" + "madd %x[a_ptr], x20, x12, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x13, 9f\n" + "5:" // Row tail: Row loop + "add x25, %x[b_ptr], #0x10\n" + "mov x24, %x[width]\n" + "add x23, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "mov z24.b, #0x0\n" + "mov z15.b, #0x0\n" + "add x28, %x[a_ptr], #0x8\n" + "mov x22, %x[num_blocks]\n" + "mov z12.b, #0x0\n" + "mov z0.b, #0x0\n" + "7:" // Row tail: Block loop + "ld1b { z3.b }, p1/Z, [x25]\n" + "ld1b { z6.b }, p1/Z, [x25, #1, MUL VL]\n" + "mov z2.s, #0x0\n" + "mov z25.s, #0x0\n" + "ld1rqb { z26.b }, p1/Z, [x28]\n" + "ld1rqb { z21.b }, p1/Z, [x28, #16]\n" + "mov z27.s, #0x0\n" + "mov z19.s, #0x0\n" + "ld1b { z29.b }, p1/Z, [x25, #2, MUL VL]\n" + "ld1b { z16.b }, p1/Z, [x25, #3, MUL VL]\n" + "sub x21, x25, #0x10\n" + "sub x20, x28, #0x8\n" + "lsl z20.b, z3.b, #0x4\n" + "lsl z4.b, z6.b, #0x4\n" + "ld1rqb { z10.b }, p1/Z, [x28, #32]\n" + "ld1rqb { z23.b }, p1/Z, [x28, #48]\n" + "and z3.b, z3.b, #0xf0\n" + "and z6.b, z6.b, #0xf0\n" + "ld1rqb { z11.b }, p1/Z, [x28, #64]\n" + "ld1rqb { z7.b }, p1/Z, [x28, #80]\n" + "lsl z8.b, z29.b, #0x4\n" + "lsl z14.b, z16.b, #0x4\n" + "ld1rqb { z18.b }, p1/Z, [x28, #96]\n" + "ld1rqb { z30.b }, p1/Z, [x28, #112]\n" + ".inst 0x45149b42 // smmla z2.s, z26.b, z20.b\n" + ".inst 0x45049b59 // smmla z25.s, z26.b, z4.b\n" + "and z29.b, z29.b, #0xf0\n" + "ld1h { z17.s }, p1/Z, [x21]\n" + ".inst 0x45149abb // smmla z27.s, z21.b, z20.b\n" + ".inst 0x45049ab3 // smmla z19.s, z21.b, z4.b\n" + "and z16.b, z16.b, #0xf0\n" + "ld1h { z4.s }, p0/Z, [x20]\n" + "subs x22, x22, #0x1\n" + "add x28, x28, #0x88\n" + "fcvt z17.s, p1/m, z17.h\n" + "add x25, x25, #0x90\n" + ".inst 0x45089942 // smmla z2.s, z10.b, z8.b\n" + ".inst 0x450e9959 // smmla z25.s, z10.b, z14.b\n" + "fcvt z4.s, p1/m, z4.h\n" + ".inst 0x45089afb // smmla z27.s, z23.b, z8.b\n" + ".inst 0x450e9af3 // smmla z19.s, z23.b, z14.b\n" + "fscale z17.s, p1/m, z17.s, z28.s\n" + "mov z4.q, z4.q[0]\n" + ".inst 0x45039962 // smmla z2.s, z11.b, z3.b\n" + ".inst 0x45069979 // smmla z25.s, z11.b, z6.b\n" + "fmul z23.s, z17.s, z4.s[0]\n" + "fmul z9.s, z17.s, z4.s[1]\n" + "fmul z21.s, z17.s, z4.s[2]\n" + "fmul z4.s, z17.s, z4.s[3]\n" + ".inst 0x450398fb // smmla z27.s, z7.b, z3.b\n" + ".inst 0x450698f3 // smmla z19.s, z7.b, z6.b\n" + ".inst 0x451d9a42 // smmla z2.s, z18.b, z29.b\n" + ".inst 0x45109a59 // smmla z25.s, z18.b, z16.b\n" + ".inst 0x451d9bdb // smmla z27.s, z30.b, z29.b\n" + ".inst 0x45109bd3 // smmla z19.s, z30.b, z16.b\n" + "uzp1 z31.d, z2.d, z25.d\n" + "uzp2 z13.d, z2.d, z25.d\n" + "scvtf z31.s, p1/m, z31.s\n" + "uzp1 z17.d, z27.d, z19.d\n" + "uzp2 z18.d, z27.d, z19.d\n" + "scvtf z13.s, p1/m, z13.s\n" + "fmla z24.s, p1/M, z31.s, z23.s\n" + "scvtf z17.s, p1/m, z17.s\n" + "scvtf z18.s, p1/m, z18.s\n" + "fmla z15.s, p1/M, z13.s, z9.s\n" + "fmla z12.s, p1/M, z17.s, z21.s\n" + "fmla z0.s, p1/M, z18.s, z4.s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x13, #0x1\n" + "st1w { z24.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x13, #0x2\n" + "st1w { z15.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x13, #0x3\n" + "st1w { z12.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" 
+ "st1w { z0.s }, p1, [x20]\n" + "8:" // Row tail: Accumulator store skip + "subs x24, x24, #0x8\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "bne 6b\n" + "subs x13, x13, #0x4\n" + "add %x[a_ptr], %x[a_ptr], x12\n" + "mov %x[res_ptr], x23\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) + : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +#endif +} + +void ggml_gemm_q4_0_q8_0_aarch64_neon(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); + int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); + size_t width = xend - x0; + + int64_t nb = n / QK4_0; + const void * b_ptr = (const void *)((const block_q4_0x4 *) vx + ((x0 / 4) * nb)); + const void * a_ptr = vy; + float * res_ptr = s + x0; + size_t res_stride = nc * sizeof(float); + + assert(n % 32 == 0); + assert(width % 4 == 0); + + size_t num_blocks = n / 32; + + __asm__ __volatile__( + "mov x10, %x[nr]\n" + "mov x9, #0x88\n" + "cmp x10, #0x10\n" + "mul x9, %x[num_blocks], x9\n" + "blt 4f\n" + "1:" // Row loop + "add x28, %x[b_ptr], #0x8\n" + "mov x27, %x[width]\n" + "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x25, %x[a_ptr], #0x8\n" + "movi v2.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "mov x24, %x[num_blocks]\n" + "add x23, x25, x9\n" + "movi v12.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "add x22, x23, x9\n" + "movi v11.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "add x21, x22, x9\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v5.16b, #0x0\n" + "movi v7.16b, #0x0\n" + "movi v4.16b, #0x0\n" + "movi v6.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "3:" // Block loop + "ldr q21, [x28, #0x0]\n" + "ldr q16, [x28, #0x10]\n" + "movi v1.16b, #0x4\n" + "movi v19.4s, #0x0\n" + "ldr q27, [x25, #0x0]\n" + "ldr q15, [x25, #0x10]\n" + "movi v26.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "ldr q29, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" + "movi v17.4s, #0x0\n" + "movi v0.16b, #0xf0\n" + "ldr d20, [x25, #-0x8]\n" + "ldr d9, [x23, #-0x8]\n" + "sshl v8.16b, v21.16b, v1.16b\n" + "sshl v31.16b, v16.16b, v1.16b\n" + "and v21.16b, v21.16b, v0.16b\n" + "and v16.16b, v16.16b, v0.16b\n" + "sub x20, x28, #0x8\n" + "subs x24, x24, #0x1\n" + "add x28, x28, #0x48\n" + ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n" + ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n" + "ldr q27, [x25, #0x20]\n" + ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n" + ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n" + "sshl v15.16b, v29.16b, v1.16b\n" + "sshl v1.16b, v3.16b, v1.16b\n" + "and v29.16b, v29.16b, v0.16b\n" + "and v3.16b, v3.16b, v0.16b\n" + "ldr q0, [x25, #0x30]\n" + "fcvtl v20.4s, v20.4h\n" + ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, v15.16b\n" + "fcvtl v9.4s, v9.4h\n" + ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n" + "ldr q27, [x25, #0x40]\n" + ".inst 0x4e8fa412 // smmla v18.4s, 
v0.16b, v15.16b\n" + ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n" + "ldr q0, [x25, #0x50]\n" + ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n" + ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n" + "ldr q27, [x25, #0x60]\n" + ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n" + ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n" + "ldr q0, [x25, #0x70]\n" + "add x25, x25, #0x88\n" + ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n" + ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n" + "ldr d27, [x20, #0x0]\n" + ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n" + ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n" + "fcvtl v27.4s, v27.4h\n" + "uzp1 v0.2d, v19.2d, v26.2d\n" + "uzp2 v26.2d, v19.2d, v26.2d\n" + "fmul v19.4s, v27.4s, v20.s[0]\n" + "scvtf v0.4s, v0.4s, #0x4\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "fmla v2.4s, v0.4s, v19.4s\n" + "ldr q19, [x23, #0x0]\n" + "uzp1 v0.2d, v18.2d, v17.2d\n" + "uzp2 v18.2d, v18.2d, v17.2d\n" + "fmul v17.4s, v27.4s, v20.s[1]\n" + "scvtf v0.4s, v0.4s, #0x4\n" + "scvtf v18.4s, v18.4s, #0x4\n" + "fmla v10.4s, v26.4s, v17.4s\n" + "ldr q17, [x23, #0x10]\n" + "fmul v26.4s, v27.4s, v20.s[2]\n" + "fmul v20.4s, v27.4s, v20.s[3]\n" + "fmla v12.4s, v0.4s, v26.4s\n" + "ldr d0, [x22, #-0x8]\n" + "ldr d26, [x21, #-0x8]\n" + "fcvtl v0.4s, v0.4h\n" + "fmla v28.4s, v18.4s, v20.4s\n" + "movi v20.4s, #0x0\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" + ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" + "ldr q19, [x23, #0x20]\n" + "fcvtl v26.4s, v26.4h\n" + ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" + ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" + "ldr q19, [x23, #0x40]\n" + ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" + ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" + "ldr q19, [x23, #0x60]\n" + ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n" + ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n" + "uzp1 v19.2d, v20.2d, v18.2d\n" + "scvtf v19.4s, v19.4s, #0x4\n" + "uzp2 v20.2d, v20.2d, v18.2d\n" + "fmul v18.4s, v27.4s, v9.s[0]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v11.4s, v19.4s, v18.4s\n" + "ldr q18, [x22, #0x0]\n" + "fmul v19.4s, v27.4s, v9.s[1]\n" + "fmla v13.4s, v20.4s, v19.4s\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n" + ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n" + "ldr q17, [x23, #0x30]\n" + ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n" + ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n" + "ldr q17, [x23, #0x50]\n" + ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n" + ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n" + "ldr q17, [x23, #0x70]\n" + "add x23, x23, #0x88\n" + ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n" + ".inst 0x4e83a634 // smmla v20.4s, v17.16b, v3.16b\n" + "uzp1 v17.2d, v19.2d, v20.2d\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "uzp2 v20.2d, v19.2d, v20.2d\n" + "fmul v19.4s, v27.4s, v9.s[2]\n" + "fmul v9.4s, v27.4s, v9.s[3]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v22.4s, v17.4s, v19.4s\n" + "ldr q17, [x22, #0x10]\n" + "movi v19.4s, #0x0\n" + ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n" + "fmla v23.4s, v20.4s, v9.4s\n" + "movi v20.4s, #0x0\n" + "movi v9.4s, #0x0\n" + ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n" + "ldr q18, [x22, #0x20]\n" + ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" + ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n" + ".inst 0x4e81a654 // smmla v20.4s, v18.16b, 
v1.16b\n" + "ldr q18, [x22, #0x40]\n" + ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n" + ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n" + "ldr q18, [x22, #0x60]\n" + ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n" + ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n" + "ldr q17, [x22, #0x30]\n" + ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" + ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n" + "ldr q17, [x22, #0x50]\n" + ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n" + ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n" + "ldr q17, [x22, #0x70]\n" + "add x22, x22, #0x88\n" + ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n" + ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n" + "uzp1 v17.2d, v19.2d, v20.2d\n" + "uzp2 v20.2d, v19.2d, v20.2d\n" + "fmul v19.4s, v27.4s, v0.s[0]\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v25.4s, v17.4s, v19.4s\n" + "ldr q19, [x21, #0x0]\n" + "fmul v17.4s, v27.4s, v0.s[1]\n" + "fmla v5.4s, v20.4s, v17.4s\n" + "ldr q17, [x21, #0x10]\n" + "uzp1 v20.2d, v9.2d, v18.2d\n" + "uzp2 v9.2d, v9.2d, v18.2d\n" + "fmul v18.4s, v27.4s, v0.s[2]\n" + "fmul v0.4s, v27.4s, v0.s[3]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "scvtf v9.4s, v9.4s, #0x4\n" + "fmla v7.4s, v20.4s, v18.4s\n" + "movi v20.4s, #0x0\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" + ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" + "ldr q19, [x21, #0x20]\n" + "fmla v4.4s, v9.4s, v0.4s\n" + "movi v9.4s, #0x0\n" + "movi v0.4s, #0x0\n" + ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" + "fmul v8.4s, v27.4s, v26.s[0]\n" + ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n" + "ldr q17, [x21, #0x30]\n" + ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" + "fmul v31.4s, v27.4s, v26.s[1]\n" + ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" + "ldr q19, [x21, #0x40]\n" + ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" + "fmul v15.4s, v27.4s, v26.s[2]\n" + "fmul v27.4s, v27.4s, v26.s[3]\n" + ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n" + "ldr q1, [x21, #0x50]\n" + ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" + ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" + "ldr q26, [x21, #0x60]\n" + ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n" + ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n" + "ldr q21, [x21, #0x70]\n" + "add x21, x21, #0x88\n" + ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n" + ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n" + ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n" + ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n" + "uzp1 v29.2d, v20.2d, v18.2d\n" + "uzp2 v21.2d, v20.2d, v18.2d\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "uzp1 v18.2d, v9.2d, v0.2d\n" + "uzp2 v16.2d, v9.2d, v0.2d\n" + "scvtf v21.4s, v21.4s, #0x4\n" + "fmla v6.4s, v29.4s, v8.4s\n" + "scvtf v18.4s, v18.4s, #0x4\n" + "scvtf v16.4s, v16.4s, #0x4\n" + "fmla v30.4s, v21.4s, v31.4s\n" + "fmla v24.4s, v18.4s, v15.4s\n" + "fmla v14.4s, v16.4s, v27.4s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x27, x27, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "str q2, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q10, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q12, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q28, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q11, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + 
"str q13, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q22, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q23, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q25, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q5, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q7, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q4, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q6, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q30, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q24, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q14, [x20, #0x0]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x10, x10, #0x10\n" + "cmp x10, #0x10\n" + "mov %x[res_ptr], x26\n" + "madd %x[a_ptr], x20, x9, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x10, 9f\n" + "5:" // Row tail: Row loop + "add x24, %x[b_ptr], #0x8\n" + "mov x23, %x[width]\n" + "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "movi v2.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "add x25, %x[a_ptr], #0x8\n" + "mov x21, %x[num_blocks]\n" + "movi v12.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "7:" // Row tail: Block loop + "ldr q6, [x24, #0x0]\n" + "ldr q5, [x24, #0x10]\n" + "movi v17.16b, #0x4\n" + "movi v8.4s, #0x0\n" + "ldr q4, [x25, #0x0]\n" + "ldr q13, [x25, #0x10]\n" + "movi v27.4s, #0x0\n" + "movi v0.4s, #0x0\n" + "ldr q31, [x24, #0x20]\n" + "ldr q14, [x24, #0x30]\n" + "movi v29.4s, #0x0\n" + "movi v22.16b, #0xf0\n" + "ldr q11, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "sshl v21.16b, v6.16b, v17.16b\n" + "sshl v16.16b, v5.16b, v17.16b\n" + "ldr q20, [x25, #0x40]\n" + "ldr q26, [x25, #0x50]\n" + "and v6.16b, v6.16b, v22.16b\n" + "and v5.16b, v5.16b, v22.16b\n" + "ldr q25, [x25, #0x60]\n" + "ldr q3, [x25, #0x70]\n" + "sshl v19.16b, v31.16b, v17.16b\n" + "sshl v18.16b, v14.16b, v17.16b\n" + "ldr d17, [x25, #-0x8]\n" + ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n" + ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n" + "and v31.16b, v31.16b, v22.16b\n" + ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n" + ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n" + "and v14.16b, v14.16b, v22.16b\n" + "sub x20, x24, #0x8\n" + "ldr d16, [x20, #0x0]\n" + "subs x21, x21, #0x1\n" + "add x25, x25, #0x88\n" + "fcvtl v17.4s, v17.4h\n" + "add x24, x24, #0x48\n" + ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n" + ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n" + ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n" + ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n" + "fcvtl v16.4s, v16.4h\n" + ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n" + ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n" + "fmul v23.4s, v16.4s, v17.s[0]\n" + "fmul v21.4s, v16.4s, v17.s[1]\n" + "fmul v1.4s, v16.4s, v17.s[2]\n" + "fmul v20.4s, v16.4s, v17.s[3]\n" + ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n" + ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n" + ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n" + ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n" + ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n" + ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n" + "uzp1 v19.2d, v8.2d, v27.2d\n" + "uzp2 v18.2d, v8.2d, v27.2d\n" + "scvtf v19.4s, v19.4s, #0x4\n" + "uzp1 v17.2d, v0.2d, v29.2d\n" + "uzp2 v16.2d, v0.2d, v29.2d\n" + "scvtf v18.4s, v18.4s, #0x4\n" + "fmla v2.4s, v19.4s, v23.4s\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "scvtf v16.4s, v16.4s, #0x4\n" + "fmla v10.4s, v18.4s, v21.4s\n" + "fmla 
v12.4s, v17.4s, v1.4s\n" + "fmla v28.4s, v16.4s, v20.4s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x10, #0x1\n" + "str q2, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x2\n" + "str q10, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x3\n" + "str q12, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "str q28, [x20, #0x0]\n" + "8:" // Row tail: Accumulator store skip + "subs x23, x23, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "bne 6b\n" + "subs x10, x10, #0x4\n" + "add %x[a_ptr], %x[a_ptr], x9\n" + "mov %x[res_ptr], x22\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +#endif +} + +void ggml_gemm_q4_0_q8_0_aarch64_neon_noi8mm(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { +#if defined(__ARM_NEON) + int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); + int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); + size_t width = xend - x0; + + int64_t nb = n / QK4_0; + const void * b_ptr = (const void *)((const block_q4_0x4 *) vx + ((x0/4) * nb)); + const void * a_ptr = vy; + float * res_ptr = s + x0; + size_t res_stride = nc * sizeof(float); + + assert(n % 32 == 0); + assert(width % 4 == 0); + + size_t num_blocks = n / 32; + + __asm__ __volatile__( + "mov x10, %x[nr]\n" + "mov x9, #0x88\n" + "cmp x10, #0x10\n" + "mul x9, %x[num_blocks], x9\n" + "blt 4f\n" + "1:" // Row loop + "add x28, %x[b_ptr], #0x8\n" + "mov x27, %x[width]\n" + "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x25, %x[a_ptr], #0x8\n" + "movi v15.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "mov x24, %x[num_blocks]\n" + "add x23, x25, x9\n" + "movi v18.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "add x22, x23, x9\n" + "movi v11.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "add x21, x22, x9\n" + "movi v23.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v7.16b, #0x0\n" + "movi v0.16b, #0x0\n" + "movi v4.16b, #0x0\n" + "movi v5.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v8.16b, #0x0\n" + "movi v1.16b, #0x0\n" + "3:" // Block loop + "ldr q3, [x28, #0x0]\n" + "ldr q31, [x25, #0x0]\n" + "movi v28.16b, #0x4\n" + "movi v10.4s, #0x0\n" + "ldr q22, [x28, #0x10]\n" + "ldr q6, [x25, #0x10]\n" + "movi v29.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "ldr q27, [x28, #0x20]\n" + "ldr q30, [x28, #0x30]\n" + "movi v20.4s, #0x0\n" + "movi v24.16b, #0xf0\n" + "ldr d2, [x25, #-0x8]\n" + "ldr d26, [x23, #-0x8]\n" + "sshl v12.16b, v3.16b, v28.16b\n" + "sub x20, x28, #0x8\n" + "ldr d17, [x20, #0x0]\n" + "and v3.16b, v3.16b, v24.16b\n" + "subs x24, x24, #0x1\n" + "add x28, x28, #0x48\n" + ".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n" + ".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n" + ".inst 0x4f9fe989 // sdot v9.4s, v12.16b, v31.4b[2]\n" + ".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n" + "sshl v31.16b, v22.16b, v28.16b\n" + "and v22.16b, v22.16b, v24.16b\n" + "fcvtl v17.4s, v17.4h\n" + "fcvtl v2.4s, v2.4h\n" + "fcvtl v26.4s, 
v26.4h\n" + ".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n" + ".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n" + ".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n" + ".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n" + "sshl v6.16b, v27.16b, v28.16b\n" + "sshl v28.16b, v30.16b, v28.16b\n" + "and v27.16b, v27.16b, v24.16b\n" + "and v30.16b, v30.16b, v24.16b\n" + "ldr q24, [x25, #0x20]\n" + ".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n" + ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n" + ".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n" + ".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x30]\n" + ".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n" + ".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n" + ".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n" + ".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x40]\n" + ".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n" + ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n" + ".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n" + ".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x50]\n" + ".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n" + ".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n" + ".inst 0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n" + ".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x60]\n" + ".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n" + ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n" + ".inst 0x4f98eb69 // sdot v9.4s, v27.16b, v24.4b[2]\n" + ".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x70]\n" + "add x25, x25, #0x88\n" + ".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n" + ".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n" + ".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n" + ".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n" + "fmul v24.4s, v17.4s, v2.s[0]\n" + "scvtf v10.4s, v10.4s, #0x4\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "scvtf v9.4s, v9.4s, #0x4\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v15.4s, v10.4s, v24.4s\n" + "ldr q24, [x23, #0x0]\n" + "fmul v10.4s, v17.4s, v2.s[1]\n" + "fmla v19.4s, v29.4s, v10.4s\n" + "ldr q10, [x23, #0x10]\n" + "fmul v29.4s, v17.4s, v2.s[2]\n" + "fmul v2.4s, v17.4s, v2.s[3]\n" + "fmla v18.4s, v9.4s, v29.4s\n" + "movi v9.4s, #0x0\n" + "movi v29.4s, #0x0\n" + ".inst 0x4f98e189 // sdot v9.4s, v12.16b, v24.4b[0]\n" + ".inst 0x4fb8e19d // sdot v29.4s, v12.16b, v24.4b[1]\n" + "fmla v14.4s, v20.4s, v2.4s\n" + "movi v20.4s, #0x0\n" + "movi v2.4s, #0x0\n" + ".inst 0x4f98e994 // sdot v20.4s, v12.16b, v24.4b[2]\n" + ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n" + "ldr q24, [x23, #0x20]\n" + ".inst 0x4f8ae3e9 // sdot v9.4s, v31.16b, v10.4b[0]\n" + ".inst 0x4faae3fd // sdot v29.4s, v31.16b, v10.4b[1]\n" + ".inst 0x4f8aebf4 // sdot v20.4s, v31.16b, v10.4b[2]\n" + ".inst 0x4faaebe2 // sdot v2.4s, v31.16b, v10.4b[3]\n" + "ldr q10, [x23, #0x30]\n" + ".inst 0x4f98e0c9 // sdot v9.4s, v6.16b, v24.4b[0]\n" + ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n" + ".inst 0x4f98e8d4 // sdot v20.4s, v6.16b, v24.4b[2]\n" + ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n" + "ldr q24, [x23, #0x40]\n" + ".inst 0x4f8ae389 // sdot v9.4s, v28.16b, v10.4b[0]\n" + ".inst 0x4faae39d // sdot v29.4s, v28.16b, v10.4b[1]\n" + ".inst 0x4f8aeb94 // sdot v20.4s, v28.16b, v10.4b[2]\n" + ".inst 0x4faaeb82 // sdot v2.4s, v28.16b, v10.4b[3]\n" + "ldr q10, [x23, #0x50]\n" + ".inst 0x4f98e069 
// sdot v9.4s, v3.16b, v24.4b[0]\n" + ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n" + ".inst 0x4f98e874 // sdot v20.4s, v3.16b, v24.4b[2]\n" + ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n" + "ldr q24, [x23, #0x60]\n" + ".inst 0x4f8ae2c9 // sdot v9.4s, v22.16b, v10.4b[0]\n" + ".inst 0x4faae2dd // sdot v29.4s, v22.16b, v10.4b[1]\n" + ".inst 0x4f8aead4 // sdot v20.4s, v22.16b, v10.4b[2]\n" + ".inst 0x4faaeac2 // sdot v2.4s, v22.16b, v10.4b[3]\n" + "ldr q10, [x23, #0x70]\n" + "add x23, x23, #0x88\n" + ".inst 0x4f98e369 // sdot v9.4s, v27.16b, v24.4b[0]\n" + ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n" + ".inst 0x4f98eb74 // sdot v20.4s, v27.16b, v24.4b[2]\n" + ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x0]\n" + ".inst 0x4f8ae3c9 // sdot v9.4s, v30.16b, v10.4b[0]\n" + ".inst 0x4faae3dd // sdot v29.4s, v30.16b, v10.4b[1]\n" + ".inst 0x4f8aebd4 // sdot v20.4s, v30.16b, v10.4b[2]\n" + ".inst 0x4faaebc2 // sdot v2.4s, v30.16b, v10.4b[3]\n" + "fmul v10.4s, v17.4s, v26.s[0]\n" + "scvtf v9.4s, v9.4s, #0x4\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "scvtf v2.4s, v2.4s, #0x4\n" + "fmla v11.4s, v9.4s, v10.4s\n" + "ldr q9, [x22, #0x10]\n" + "fmul v10.4s, v17.4s, v26.s[1]\n" + "fmla v13.4s, v29.4s, v10.4s\n" + "ldr d29, [x22, #-0x8]\n" + "fmul v10.4s, v17.4s, v26.s[2]\n" + "fmul v26.4s, v17.4s, v26.s[3]\n" + "fcvtl v29.4s, v29.4h\n" + "fmla v23.4s, v20.4s, v10.4s\n" + "movi v20.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "fmla v16.4s, v2.4s, v26.4s\n" + "movi v26.4s, #0x0\n" + "movi v2.4s, #0x0\n" + ".inst 0x4f98e194 // sdot v20.4s, v12.16b, v24.4b[0]\n" + ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n" + ".inst 0x4f98e99a // sdot v26.4s, v12.16b, v24.4b[2]\n" + ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x20]\n" + ".inst 0x4f89e3f4 // sdot v20.4s, v31.16b, v9.4b[0]\n" + ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n" + ".inst 0x4f89ebfa // sdot v26.4s, v31.16b, v9.4b[2]\n" + ".inst 0x4fa9ebe2 // sdot v2.4s, v31.16b, v9.4b[3]\n" + "ldr q9, [x22, #0x30]\n" + ".inst 0x4f98e0d4 // sdot v20.4s, v6.16b, v24.4b[0]\n" + ".inst 0x4fb8e0ca // sdot v10.4s, v6.16b, v24.4b[1]\n" + ".inst 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n" + ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x40]\n" + ".inst 0x4f89e394 // sdot v20.4s, v28.16b, v9.4b[0]\n" + ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n" + ".inst 0x4f89eb9a // sdot v26.4s, v28.16b, v9.4b[2]\n" + ".inst 0x4fa9eb82 // sdot v2.4s, v28.16b, v9.4b[3]\n" + "ldr q9, [x22, #0x50]\n" + ".inst 0x4f98e074 // sdot v20.4s, v3.16b, v24.4b[0]\n" + ".inst 0x4fb8e06a // sdot v10.4s, v3.16b, v24.4b[1]\n" + ".inst 0x4f98e87a // sdot v26.4s, v3.16b, v24.4b[2]\n" + ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x60]\n" + ".inst 0x4f89e2d4 // sdot v20.4s, v22.16b, v9.4b[0]\n" + ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n" + ".inst 0x4f89eada // sdot v26.4s, v22.16b, v9.4b[2]\n" + ".inst 0x4fa9eac2 // sdot v2.4s, v22.16b, v9.4b[3]\n" + "ldr q9, [x22, #0x70]\n" + "add x22, x22, #0x88\n" + ".inst 0x4f98e374 // sdot v20.4s, v27.16b, v24.4b[0]\n" + ".inst 0x4fb8e36a // sdot v10.4s, v27.16b, v24.4b[1]\n" + ".inst 0x4f98eb7a // sdot v26.4s, v27.16b, v24.4b[2]\n" + ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n" + "ldr q24, [x21, #0x0]\n" + ".inst 0x4f89e3d4 // sdot v20.4s, v30.16b, v9.4b[0]\n" + ".inst 0x4fa9e3ca // sdot v10.4s, v30.16b, v9.4b[1]\n" + ".inst 0x4f89ebda // sdot v26.4s, 
v30.16b, v9.4b[2]\n" + ".inst 0x4fa9ebc2 // sdot v2.4s, v30.16b, v9.4b[3]\n" + "fmul v9.4s, v17.4s, v29.s[0]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "scvtf v10.4s, v10.4s, #0x4\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "scvtf v2.4s, v2.4s, #0x4\n" + "fmla v25.4s, v20.4s, v9.4s\n" + "ldr q9, [x21, #0x10]\n" + "fmul v20.4s, v17.4s, v29.s[1]\n" + "fmla v7.4s, v10.4s, v20.4s\n" + "ldr d20, [x21, #-0x8]\n" + "fmul v10.4s, v17.4s, v29.s[2]\n" + "fmul v29.4s, v17.4s, v29.s[3]\n" + "fcvtl v20.4s, v20.4h\n" + "fmla v0.4s, v26.4s, v10.4s\n" + "movi v26.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "fmla v4.4s, v2.4s, v29.4s\n" + "movi v2.4s, #0x0\n" + "movi v29.4s, #0x0\n" + ".inst 0x4f98e19a // sdot v26.4s, v12.16b, v24.4b[0]\n" + ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n" + ".inst 0x4f98e982 // sdot v2.4s, v12.16b, v24.4b[2]\n" + ".inst 0x4fb8e99d // sdot v29.4s, v12.16b, v24.4b[3]\n" + "ldr q12, [x21, #0x20]\n" + "fmul v24.4s, v17.4s, v20.s[0]\n" + ".inst 0x4f89e3fa // sdot v26.4s, v31.16b, v9.4b[0]\n" + ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n" + ".inst 0x4f89ebe2 // sdot v2.4s, v31.16b, v9.4b[2]\n" + ".inst 0x4fa9ebfd // sdot v29.4s, v31.16b, v9.4b[3]\n" + "ldr q9, [x21, #0x30]\n" + "fmul v31.4s, v17.4s, v20.s[1]\n" + ".inst 0x4f8ce0da // sdot v26.4s, v6.16b, v12.4b[0]\n" + ".inst 0x4face0ca // sdot v10.4s, v6.16b, v12.4b[1]\n" + ".inst 0x4f8ce8c2 // sdot v2.4s, v6.16b, v12.4b[2]\n" + ".inst 0x4face8dd // sdot v29.4s, v6.16b, v12.4b[3]\n" + "ldr q12, [x21, #0x40]\n" + "fmul v6.4s, v17.4s, v20.s[2]\n" + "fmul v20.4s, v17.4s, v20.s[3]\n" + ".inst 0x4f89e39a // sdot v26.4s, v28.16b, v9.4b[0]\n" + ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n" + ".inst 0x4f89eb82 // sdot v2.4s, v28.16b, v9.4b[2]\n" + ".inst 0x4fa9eb9d // sdot v29.4s, v28.16b, v9.4b[3]\n" + "ldr q9, [x21, #0x50]\n" + ".inst 0x4f8ce07a // sdot v26.4s, v3.16b, v12.4b[0]\n" + ".inst 0x4face06a // sdot v10.4s, v3.16b, v12.4b[1]\n" + ".inst 0x4f8ce862 // sdot v2.4s, v3.16b, v12.4b[2]\n" + ".inst 0x4face87d // sdot v29.4s, v3.16b, v12.4b[3]\n" + "ldr q12, [x21, #0x60]\n" + ".inst 0x4f89e2da // sdot v26.4s, v22.16b, v9.4b[0]\n" + ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n" + ".inst 0x4f89eac2 // sdot v2.4s, v22.16b, v9.4b[2]\n" + ".inst 0x4fa9eadd // sdot v29.4s, v22.16b, v9.4b[3]\n" + "ldr q17, [x21, #0x70]\n" + "add x21, x21, #0x88\n" + ".inst 0x4f8ce37a // sdot v26.4s, v27.16b, v12.4b[0]\n" + ".inst 0x4face36a // sdot v10.4s, v27.16b, v12.4b[1]\n" + ".inst 0x4f8ceb62 // sdot v2.4s, v27.16b, v12.4b[2]\n" + ".inst 0x4faceb7d // sdot v29.4s, v27.16b, v12.4b[3]\n" + ".inst 0x4f91e3da // sdot v26.4s, v30.16b, v17.4b[0]\n" + ".inst 0x4fb1e3ca // sdot v10.4s, v30.16b, v17.4b[1]\n" + ".inst 0x4f91ebc2 // sdot v2.4s, v30.16b, v17.4b[2]\n" + ".inst 0x4fb1ebdd // sdot v29.4s, v30.16b, v17.4b[3]\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "scvtf v10.4s, v10.4s, #0x4\n" + "fmla v5.4s, v26.4s, v24.4s\n" + "scvtf v2.4s, v2.4s, #0x4\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "fmla v21.4s, v10.4s, v31.4s\n" + "fmla v8.4s, v2.4s, v6.4s\n" + "fmla v1.4s, v29.4s, v20.4s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x27, x27, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "str q15, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q19, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q18, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q14, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q11, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q13, [x20, #0x0]\n" + "add x20, x20, 
%x[res_stride]\n" + "str q23, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q16, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q25, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q7, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q0, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q4, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q5, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q21, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q8, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q1, [x20, #0x0]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x10, x10, #0x10\n" + "cmp x10, #0x10\n" + "mov %x[res_ptr], x26\n" + "madd %x[a_ptr], x20, x9, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x10, 9f\n" + "5:" // Row tail: Row loop + "add x24, %x[b_ptr], #0x8\n" + "mov x23, %x[width]\n" + "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "movi v15.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "add x25, %x[a_ptr], #0x8\n" + "mov x21, %x[num_blocks]\n" + "movi v18.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "7:" // Row tail: Block loop + "ldr q7, [x24, #0x0]\n" + "ldr q5, [x25, #0x0]\n" + "movi v9.16b, #0x4\n" + "movi v4.4s, #0x0\n" + "ldr q3, [x24, #0x10]\n" + "ldr q2, [x25, #0x10]\n" + "movi v1.4s, #0x0\n" + "movi v0.4s, #0x0\n" + "ldr q13, [x24, #0x20]\n" + "ldr q31, [x25, #0x20]\n" + "movi v30.4s, #0x0\n" + "movi v29.16b, #0xf0\n" + "ldr q28, [x24, #0x30]\n" + "ldr q27, [x25, #0x30]\n" + "sshl v20.16b, v7.16b, v9.16b\n" + "sub x20, x24, #0x8\n" + "ldr q26, [x25, #0x40]\n" + "ldr q25, [x25, #0x50]\n" + "sshl v17.16b, v3.16b, v9.16b\n" + "and v7.16b, v7.16b, v29.16b\n" + "ldr q24, [x25, #0x60]\n" + "ldr q16, [x25, #0x70]\n" + "sshl v22.16b, v13.16b, v9.16b\n" + "and v3.16b, v3.16b, v29.16b\n" + "ldr d21, [x20, #0x0]\n" + "ldr d12, [x25, #-0x8]\n" + ".inst 0x4f85e284 // sdot v4.4s, v20.16b, v5.4b[0]\n" + ".inst 0x4fa5e281 // sdot v1.4s, v20.16b, v5.4b[1]\n" + ".inst 0x4f85ea80 // sdot v0.4s, v20.16b, v5.4b[2]\n" + ".inst 0x4fa5ea9e // sdot v30.4s, v20.16b, v5.4b[3]\n" + "sshl v9.16b, v28.16b, v9.16b\n" + "subs x21, x21, #0x1\n" + "and v13.16b, v13.16b, v29.16b\n" + "and v28.16b, v28.16b, v29.16b\n" + "add x25, x25, #0x88\n" + "add x24, x24, #0x48\n" + "fcvtl v21.4s, v21.4h\n" + "fcvtl v12.4s, v12.4h\n" + ".inst 0x4f82e224 // sdot v4.4s, v17.16b, v2.4b[0]\n" + ".inst 0x4fa2e221 // sdot v1.4s, v17.16b, v2.4b[1]\n" + ".inst 0x4f82ea20 // sdot v0.4s, v17.16b, v2.4b[2]\n" + ".inst 0x4fa2ea3e // sdot v30.4s, v17.16b, v2.4b[3]\n" + "fmul v11.4s, v21.4s, v12.s[0]\n" + "fmul v23.4s, v21.4s, v12.s[1]\n" + "fmul v17.4s, v21.4s, v12.s[2]\n" + ".inst 0x4f9fe2c4 // sdot v4.4s, v22.16b, v31.4b[0]\n" + "fmul v6.4s, v21.4s, v12.s[3]\n" + ".inst 0x4fbfe2c1 // sdot v1.4s, v22.16b, v31.4b[1]\n" + ".inst 0x4f9feac0 // sdot v0.4s, v22.16b, v31.4b[2]\n" + ".inst 0x4fbfeade // sdot v30.4s, v22.16b, v31.4b[3]\n" + ".inst 0x4f9be124 // sdot v4.4s, v9.16b, v27.4b[0]\n" + ".inst 0x4fbbe121 // sdot v1.4s, v9.16b, v27.4b[1]\n" + ".inst 0x4f9be920 // sdot v0.4s, v9.16b, v27.4b[2]\n" + ".inst 0x4fbbe93e // sdot v30.4s, v9.16b, v27.4b[3]\n" + ".inst 0x4f9ae0e4 // sdot v4.4s, v7.16b, v26.4b[0]\n" + ".inst 0x4fbae0e1 // sdot v1.4s, v7.16b, v26.4b[1]\n" + ".inst 0x4f9ae8e0 // sdot v0.4s, v7.16b, v26.4b[2]\n" + ".inst 0x4fbae8fe // sdot v30.4s, v7.16b, v26.4b[3]\n" + ".inst 0x4f99e064 // sdot v4.4s, v3.16b, v25.4b[0]\n" + ".inst 0x4fb9e061 // sdot v1.4s, v3.16b, v25.4b[1]\n" + ".inst 0x4f99e860 // sdot v0.4s, v3.16b, 
v25.4b[2]\n" + ".inst 0x4fb9e87e // sdot v30.4s, v3.16b, v25.4b[3]\n" + ".inst 0x4f98e1a4 // sdot v4.4s, v13.16b, v24.4b[0]\n" + ".inst 0x4fb8e1a1 // sdot v1.4s, v13.16b, v24.4b[1]\n" + ".inst 0x4f98e9a0 // sdot v0.4s, v13.16b, v24.4b[2]\n" + ".inst 0x4fb8e9be // sdot v30.4s, v13.16b, v24.4b[3]\n" + ".inst 0x4f90e384 // sdot v4.4s, v28.16b, v16.4b[0]\n" + ".inst 0x4fb0e381 // sdot v1.4s, v28.16b, v16.4b[1]\n" + ".inst 0x4f90eb80 // sdot v0.4s, v28.16b, v16.4b[2]\n" + ".inst 0x4fb0eb9e // sdot v30.4s, v28.16b, v16.4b[3]\n" + "scvtf v4.4s, v4.4s, #0x4\n" + "scvtf v1.4s, v1.4s, #0x4\n" + "scvtf v0.4s, v0.4s, #0x4\n" + "fmla v15.4s, v4.4s, v11.4s\n" + "scvtf v30.4s, v30.4s, #0x4\n" + "fmla v19.4s, v1.4s, v23.4s\n" + "fmla v18.4s, v0.4s, v17.4s\n" + "fmla v14.4s, v30.4s, v6.4s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x10, #0x1\n" + "str q15, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x2\n" + "str q19, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x3\n" + "str q18, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "str q14, [x20, #0x0]\n" + "8:" // Row tail: Accumulator store skip + "subs x23, x23, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "bne 6b\n" + "subs x10, x10, #0x4\n" + "add %x[a_ptr], %x[a_ptr], x9\n" + "mov %x[res_ptr], x22\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +#endif +} + +void ggml_gemm_q8_0_q8_0_aarch64(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { +#if defined(__ARM_FEATURE_MATMUL_INT8) + int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); + int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); + + int64_t nb = n / QK8_0; + int64_t a_nb = n / QK8_0; + + const block_q8_0x4 * b_ptr_start = (const block_q8_0x4 *) vx; + const block_q8_0x4 * a_ptr_start = (const block_q8_0x4 *) vy; + + for (int64_t y = 0; y < nr / 4; y += nr / 4) { + for (int64_t x = x0 / 4; x < xend / 4; x++) { + const block_q8_0x4 * a_ptrs[nr / 4]; + + a_ptrs[0] = a_ptr_start + (y * a_nb); + for (int i = 0; i < (nr / 4) - 1; i++) { + a_ptrs[i + 1] = a_ptrs[i] + a_nb; + } + + const block_q8_0x4 * b_ptr = b_ptr_start + (x * nb); + + // Master FP accumulators + float32x4_t acc_rows[nr]; + for (int i = 0; i < nr; i++) { + acc_rows[i] = vdupq_n_f32(0.0f); + } + + for (int64_t b = 0; b < nb; b++) { + // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) + const int8x16_t rhs_mat_01_0 = vld1q_s8(b_ptr[b].qs); + const int8x16_t rhs_mat_23_0 = vld1q_s8(b_ptr[b].qs + 16); + const int8x16_t rhs_mat_01_1 = vld1q_s8(b_ptr[b].qs + 32); + const int8x16_t rhs_mat_23_1 = vld1q_s8(b_ptr[b].qs + 48); + const int8x16_t rhs_mat_01_2 = vld1q_s8(b_ptr[b].qs + 64); + const int8x16_t rhs_mat_23_2 = vld1q_s8(b_ptr[b].qs + 80); + const int8x16_t rhs_mat_01_3 = vld1q_s8(b_ptr[b].qs + 96); + const int8x16_t rhs_mat_23_3 = vld1q_s8(b_ptr[b].qs + 112); + + // Scale values - assemble the four row/column scales 
into a (64-bit) vector, then expand to FP32 + const float16x4_t col_scale_f16 = vld1_f16((const ggml_fp16_internal_t *)(b_ptr[b].d)); + const float32x4_t col_scale_f32 = vcvt_f32_f16(col_scale_f16); + + // Process LHS in pairs of rows + for (int rp = 0; rp < nr / 4; rp++) { + const int8x16_t lhs_mat_01_0 = vld1q_s8(a_ptrs[rp][b].qs); + const int8x16_t lhs_mat_23_0 = vld1q_s8(a_ptrs[rp][b].qs + 16); + const int8x16_t lhs_mat_01_1 = vld1q_s8(a_ptrs[rp][b].qs + 32); + const int8x16_t lhs_mat_23_1 = vld1q_s8(a_ptrs[rp][b].qs + 48); + + const int8x16_t lhs_mat_01_2 = vld1q_s8(a_ptrs[rp][b].qs + 64); + const int8x16_t lhs_mat_23_2 = vld1q_s8(a_ptrs[rp][b].qs + 80); + const int8x16_t lhs_mat_01_3 = vld1q_s8(a_ptrs[rp][b].qs + 96); + const int8x16_t lhs_mat_23_3 = vld1q_s8(a_ptrs[rp][b].qs + 112); + + // Do the MMLAs into 2x2 matrices + const int32x4_t iacc_mat_00 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_01_0), lhs_mat_01_1, rhs_mat_01_1), lhs_mat_01_2, rhs_mat_01_2), lhs_mat_01_3, rhs_mat_01_3); + const int32x4_t iacc_mat_01 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_23_0), lhs_mat_01_1, rhs_mat_23_1), lhs_mat_01_2, rhs_mat_23_2), lhs_mat_01_3, rhs_mat_23_3); + const int32x4_t iacc_mat_10 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_23_0, rhs_mat_01_0), lhs_mat_23_1, rhs_mat_01_1), lhs_mat_23_2, rhs_mat_01_2), lhs_mat_23_3, rhs_mat_01_3); + const int32x4_t iacc_mat_11 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_23_0, rhs_mat_23_0), lhs_mat_23_1, rhs_mat_23_1), lhs_mat_23_2, rhs_mat_23_2), lhs_mat_23_3, rhs_mat_23_3); + + // Straighten out to make 4 row vectors + const int32x4_t iacc_row_0 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); + const int32x4_t iacc_row_1 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); + const int32x4_t iacc_row_2 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_10), vreinterpretq_u64_s32(iacc_mat_11))); + const int32x4_t iacc_row_3 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_10), vreinterpretq_u64_s32(iacc_mat_11))); + + const float16x4_t row_scale_f16 = vld1_f16((const ggml_fp16_internal_t *)(a_ptrs[rp][b].d)); + const float32x4_t row_scale_f32 = vcvt_f32_f16(row_scale_f16); + + acc_rows[rp * 4] = vfmaq_f32(acc_rows[rp * 4], vcvtq_f32_s32(iacc_row_0), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 0)); + acc_rows[rp * 4 + 1] = vfmaq_f32(acc_rows[rp * 4 + 1], vcvtq_f32_s32(iacc_row_1), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 1)); + acc_rows[rp * 4 + 2] = vfmaq_f32(acc_rows[rp * 4 + 2], vcvtq_f32_s32(iacc_row_2), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 2)); + acc_rows[rp * 4 + 3] = vfmaq_f32(acc_rows[rp * 4 + 3], vcvtq_f32_s32(iacc_row_3), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 3)); + } + } + + for (int i = 0; i < nr; i++) { + vst1q_f32(s + ((y * 4 + i) * nc + x * 4), acc_rows[i]); + } + } + } +#endif +} diff --git a/ggml-aarch64.h b/ggml-aarch64.h new file mode 100644 index 0000000000000..bff5b7b80c88b --- /dev/null +++ b/ggml-aarch64.h @@ -0,0 +1,42 @@ +// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd. 
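For reference, each SMMLA/SDOT accumulation in the ggml_gemm_q8_0_q8_0_aarch64 kernel above computes, per 32-value block, the same quantity as the plain scalar q8_0 dot product sketched below; the interleaved kernel simply evaluates 16 of these at once (4 LHS rows against 4 RHS columns from the block_q8_0x4 layout) and folds the two scales in via the vmulq_laneq_f32(col_scale_f32, row_scale_f32, lane) step. This is only an illustrative sketch, not part of the patch: the helper name is hypothetical and it assumes the standard block_q8_0 layout (one fp16 scale d plus QK8_0 int8 values).

// Scalar reference for one (LHS block, RHS block) pair of the GEMM above.
static float q8_0_block_dot_ref(const block_q8_0 * a, const block_q8_0 * b) {
    int32_t sumi = 0;
    for (int i = 0; i < QK8_0; i++) {
        sumi += (int32_t) a->qs[i] * (int32_t) b->qs[i];   // integer dot product (the SDOT/SMMLA part)
    }
    // apply the two per-block fp16 scales, as the lane-wise col_scale * row_scale multiply does
    return GGML_FP16_TO_FP32(a->d) * GGML_FP16_TO_FP32(b->d) * (float) sumi;
}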
+#pragma once + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" + +#include "ggml.h" + +// GGML internal header + +#ifdef __cplusplus +extern "C" { +#endif + +// Quantization +void quantize_row_q8_0_aarch64(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k, int nrows_interleaved, int blocklen_per_row); + +// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") +size_t quantize_q4_0_aarch64(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + +block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len, unsigned int xor_mask); +block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len, unsigned int xor_mask); +block_q8_0x4 make_block_q8_0x4(const block_q8_0 * const in[4], unsigned int block_len); +block_q8_0x8 make_block_q8_0x8(const block_q8_0 * const in[8], unsigned int block_len); + +// GEMV +void ggml_gemv_q4_0_q8_0_aarch64_sve256 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); +void ggml_gemv_q4_0_q8_0_aarch64_neon (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); +void ggml_gemv_q4_0_q8_0_aarch64_neon_noi8mm(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); +void ggml_gemv_q8_0_q8_0_aarch64_sve256 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); +void ggml_gemv_q8_0_q8_0_aarch64_neon (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); + +// GEMM +void ggml_gemm_q4_0_q8_0_aarch64_sve256 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); +void ggml_gemm_q4_0_q8_0_aarch64_neon (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); +void ggml_gemm_q4_0_q8_0_aarch64_neon_noi8mm(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); +void ggml_gemm_q8_0_q8_0_aarch64 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); + +#ifdef __cplusplus +} +#endif + diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index bea898c32bdb6..8037e21a1a1b5 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2410,9 +2410,11 @@ extern "C" { typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, const void * GGML_RESTRICT y, size_t by, int nrc); - typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k, int n, int b); - typedef void (*ggml_gemv_t) (size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); - typedef void (*ggml_gemm_t) (size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); + typedef void 
(*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k, int n, int b); + typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, + int nr, int nc, int ith, int nth); + typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, + int nr, int nc, int ith, int nth); typedef struct { const char * type_name; diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 868784cc63fbd..64aae855873fc 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3307,80 +3307,6 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr return nrow * row_size; } -size_t quantize_q4_0_aarch64(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { - if (!quant_weights) { - int nrows_interleaved, blocklen_per_row; - -#if defined(__ARM_FEATURE_SVE) - if (svcntw() == 8) { - nrows_interleaved = 8; - blocklen_per_row = 8; - } - else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { - nrows_interleaved = 4; - blocklen_per_row = 8; - } -#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) - nrows_interleaved = 4; - blocklen_per_row = 8; -#elif defined(__ARM_NEON) - nrows_interleaved = 4; - blocklen_per_row = 4; -#endif - - assert(n_per_row % QK4_0 == 0); - const int nb = n_per_row / QK4_0; - - void * out_ptr_B, * out_ptr_B_start; - if (nrows_interleaved == 8) { - out_ptr_B = (block_q4_0x8 *) malloc(sizeof(block_q4_0x8) * nb); - out_ptr_B_start = out_ptr_B; - } - else if (nrows_interleaved == 4) { - out_ptr_B = (block_q4_0x4 *) malloc(sizeof(block_q4_0x4) * nb); - out_ptr_B_start = out_ptr_B; - } - - for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) { - const block_q4_0 * in_ptrs[nrows_interleaved]; - - for (int i = 0; i < nrows_interleaved; i++ ) { - in_ptrs[i] = (block_q4_0 *) dst + (b + i * n_per_row) / QK4_0; - quantize_row_q4_0_reference(src + b + i * n_per_row, in_ptrs[i], n_per_row); - } - - for (int64_t x = 0; x < nb; x++) { - if (nrows_interleaved == 8) { - *(block_q4_0x8 *) out_ptr_B = make_block_q4_0x8(in_ptrs, blocklen_per_row, 0x88); - out_ptr_B = (block_q4_0x8 *) out_ptr_B + 1; - } - else if (nrows_interleaved == 4) { - *(block_q4_0x4 *) out_ptr_B = make_block_q4_0x4(in_ptrs, blocklen_per_row, 0x88); - out_ptr_B = (block_q4_0x4 *) out_ptr_B + 1; - } - - for (int i = 0; i < nrows_interleaved; i++) { - in_ptrs[i]++; - } - } - out_ptr_B = out_ptr_B_start; - if (nrows_interleaved == 8) memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, sizeof(block_q4_0x8) * nb); - else if (nrows_interleaved == 4) memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, sizeof(block_q4_0x4) * nb); - } - if (out_ptr_B_start) free(out_ptr_B_start); - - return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0)); - } - size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row); - char * qrow = (char *)dst; - for (int64_t row = 0; row < nrow; ++row) { - quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights); - src += n_per_row; - qrow += row_size; - } - return nrow * row_size; -} - // ====================== "True" 2-bit (de)-quantization void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) { @@ -14783,2281 +14709,6 @@ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) quantize_row_iq2_s_reference(x, y, k); } -// Routines to 
create the blocked formats -// Note input is array of pointers. -// The exact interleaving format needed is different for GEMM (using SMMLA) -// and GEMV (using SDOT) cases. For GEMM, we interleave 8 pairs of values -// at a time (with the two nibbles separated at runtime to give 2x2x8 -// matrices). For GEMV, we need to interleave 4 pairs of values instead. -block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len, unsigned int xor_mask) { - block_q4_0x4 out; - - for (int i = 0; i < 4; i++) { - out.d[i] = in[i]->d; - } - - for (int i = 0; i < QK4_0 * 2; i++) { - // We are interleaving 4 rows in blocks of 8, making a total of 32 - // output bytes per block (2 MMLA input vectors). This repeats - // until we have processed the whole block. - // - // Per the comment above, for GEMV cases a similar process is used - // but with blocks of 4 instead, giving a single DOT input vector. - // - // In the case of q4, we add on 128 to convert the top nibble from - // "bias offset" form to pure sign form (this saves a subtract when - // we unpack it). - int src_offset = (i / (4 * block_len)) * block_len; - int src_id = (i % (4 * block_len)) / block_len; - src_offset += (i % block_len); - - out.qs[i] = in[src_id]->qs[src_offset] ^ xor_mask; - } - - return out; -} - -// 8-block version - see comments in code above -block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len, unsigned int xor_mask) { - block_q4_0x8 out; - - for (int i = 0; i < 8; i++) { - out.d[i] = in[i]->d; - } - - for (int i = 0; i < QK4_0 * 4; i++) { - int src_offset = (i / (8 * block_len)) * block_len; - int src_id = (i % (8 * block_len)) / block_len; - src_offset += (i % block_len); - - out.qs[i] = in[src_id]->qs[src_offset] ^ xor_mask; - } - - return out; -} - -block_q8_0x4 make_block_q8_0x4(const block_q8_0 * const in[4], unsigned int block_len) { - block_q8_0x4 out; - - for (int i = 0; i < 4; i++) { - out.d[i] = in[i]->d; - } - - for (int i = 0; i < QK8_0 * 4; i++) { - int src_offset = (i / (4 * block_len)) * block_len; - int src_id = (i % (4 * block_len)) / block_len; - src_offset += (i % block_len); - - out.qs[i] = in[src_id]->qs[src_offset]; - } - - return out; -} - -// 8-block version - see comments in code above -block_q8_0x8 make_block_q8_0x8(const block_q8_0 * const in[8], unsigned int block_len) { - block_q8_0x8 out; - - for (int i = 0; i < 8; i++) { - out.d[i] = in[i]->d; - } - - for (int i = 0; i < QK8_0 * 8; i++) { - int src_offset = (i / (8 * block_len)) * block_len; - int src_id = (i % (8 * block_len)) / block_len; - src_offset += (i % block_len); - - out.qs[i] = in[src_id]->qs[src_offset]; - } - - return out; -} - -void quantize_row_q8_0_aarch64(const float * restrict x, void * restrict vy, int k, int nrows_interleaved, int blocklen_per_row) { - assert(QK8_0 == 32); - assert(k % QK8_0 == 0); - const int nb = k / QK8_0; - - block_q8_0x4 * restrict y = vy; - -#if defined(__ARM_NEON) - for (int i = 0; i < nb; i++) { - float32x4_t srcv[nrows_interleaved][8]; - float32x4_t asrcv[8]; - float32x4_t amaxv[8]; - float id[nrows_interleaved]; - - for (int row_iter = 0; row_iter < nrows_interleaved; row_iter++) { - for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j); - for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]); - - for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]); - for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]); - for (int j = 0; j < 
1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]); - - const float amax = vmaxvq_f32(amaxv[0]); - - const float d = amax / ((1 << 7) - 1); - id[row_iter] = d ? 1.0f / d : 0.0f; - - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); - } - - if (blocklen_per_row == 8) { - for (int j = 0; j < 4; j++) { - float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]); - int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 3] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 4] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 5] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 6] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 7] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[1][2 * j], id[1]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 8] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 9] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 10] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 11] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 12] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 13] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 14] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 15] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[2][2 * j], id[2]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 16] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 17] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 18] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 19] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[2][2 * j + 1], id[2]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 20] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 21] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 22] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 23] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[3][2 * j], id[3]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 24] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 25] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 26] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 27] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[3][2 * j + 1], id[3]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 28] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 29] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 30] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3); - } - } - else if (blocklen_per_row == 4) { - for (int j = 0; j < 8; j++) { - float32x4_t v = vmulq_n_f32(srcv[0][j], id[0]); - int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 3] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[1][j], id[1]); - vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 4] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 5] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 6] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 7] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[2][j], id[2]); - vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 8] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 9] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 10] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 11] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[3][j], id[3]); - vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 12] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 13] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 14] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3); - } - } - } 
-#endif -} - -inline int64_t roundup(const int64_t a, const int64_t b) { - int64_t rem = a % b; - - if (rem) { - return a + b - rem; - } else { - return a; - } -} - -void ggml_gemv_q4_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { -#if defined(__ARM_NEON) - int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)8); - int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)8); - - int64_t nb = n / QK4_0; - int64_t a_nb = n / QK8_0; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - const int8x16_t s8b = vdupq_n_s8(0x8); - - const block_q4_0x8 * b_ptr_start = vx; - const block_q8_0 * a_ptr_start = vy; - - for (int64_t y = 0; y < input_width; y++) { - for (int64_t x = x0 / 8; x < xend / 8; x++) { - // Pointers to LHS blocks - const block_q8_0 * a_ptr = a_ptr_start + (y * a_nb); - // Pointers to RHS blocks - const block_q4_0x8 * b_ptr = b_ptr_start + (x * nb); - // Master FP accumulator - float32x4_t acc_row[2]; - acc_row[0] = acc_row[1] = vdupq_n_f32(0.0f); - - for (int64_t b = 0; b < nb; b++) { - // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) - const uint8x16_t rhs_raw_vec_0_0 = vld1q_u8(b_ptr[b].qs); - const uint8x16_t rhs_raw_vec_1_0 = vld1q_u8(b_ptr[b].qs + 16); - const uint8x16_t rhs_raw_vec_0_1 = vld1q_u8(b_ptr[b].qs + 32); - const uint8x16_t rhs_raw_vec_1_1 = vld1q_u8(b_ptr[b].qs + 48); - const uint8x16_t rhs_raw_vec_0_2 = vld1q_u8(b_ptr[b].qs + 64); - const uint8x16_t rhs_raw_vec_1_2 = vld1q_u8(b_ptr[b].qs + 80); - const uint8x16_t rhs_raw_vec_0_3 = vld1q_u8(b_ptr[b].qs + 96); - const uint8x16_t rhs_raw_vec_1_3 = vld1q_u8(b_ptr[b].qs + 112); - - const int8x16_t rhs_vec_0_0_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_vec_0_0, m4b)), s8b); - const int8x16_t rhs_vec_0_1_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_vec_0_1, m4b)), s8b); - const int8x16_t rhs_vec_0_2_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_vec_0_2, m4b)), s8b); - const int8x16_t rhs_vec_0_3_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_vec_0_3, m4b)), s8b); - const int8x16_t rhs_vec_1_0_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_vec_1_0, m4b)), s8b); - const int8x16_t rhs_vec_1_1_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_vec_1_1, m4b)), s8b); - const int8x16_t rhs_vec_1_2_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_vec_1_2, m4b)), s8b); - const int8x16_t rhs_vec_1_3_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_vec_1_3, m4b)), s8b); - - const int8x16_t rhs_vec_0_0_1 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_vec_0_0), 4); - const int8x16_t rhs_vec_0_1_1 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_vec_0_1), 4); - const int8x16_t rhs_vec_0_2_1 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_vec_0_2), 4); - const int8x16_t rhs_vec_0_3_1 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_vec_0_3), 4); - const int8x16_t rhs_vec_1_0_1 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_vec_1_0), 4); - const int8x16_t rhs_vec_1_1_1 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_vec_1_1), 4); - const int8x16_t rhs_vec_1_2_1 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_vec_1_2), 4); - const int8x16_t rhs_vec_1_3_1 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_vec_1_3), 4); - - // Scale values - assemble the four row/column scales into a (64-bit) vector, then expand to FP32 - const float16x8_t col_scale_f16 = vld1q_f16(b_ptr[b].d); - const float32x4_t col_scale_f32_0 = vcvt_f32_f16(vget_low_f16(col_scale_f16)); - const float32x4_t col_scale_f32_1 = 
vcvt_f32_f16(vget_high_f16(col_scale_f16)); - - const float16x4_t row_scale_f16 = vld1_dup_f16(&(a_ptr[b].d)); - const float32x4_t row_scale_f32 = vcvt_f32_f16(row_scale_f16); - - const int8x16_t lhs_vec_0 = vld1q_s8(a_ptr[b].qs); - const int8x16_t lhs_vec_1 = vld1q_s8(a_ptr[b].qs + 16); - - int32x4_t iacc0 = vdupq_n_s32(0); - int32x4_t iacc1 = vdupq_n_s32(0); - - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_0_0, lhs_vec_0, 0); - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_0_1, lhs_vec_1, 0); - - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_0_0, lhs_vec_0, 0); - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_0_1, lhs_vec_1, 0); - - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_1_0, lhs_vec_0, 1); - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_1_1, lhs_vec_1, 1); - - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_1_0, lhs_vec_0, 1); - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_1_1, lhs_vec_1, 1); - - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_2_0, lhs_vec_0, 2); - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_2_1, lhs_vec_1, 2); - - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_2_0, lhs_vec_0, 2); - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_2_1, lhs_vec_1, 2); - - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_3_0, lhs_vec_0, 3); - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_3_1, lhs_vec_1, 3); - - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_3_0, lhs_vec_0, 3); - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_3_1, lhs_vec_1, 3); - - acc_row[0] = vfmaq_f32(acc_row[0], vcvtq_f32_s32(iacc0), vmulq_f32(col_scale_f32_0, row_scale_f32)); - acc_row[1] = vfmaq_f32(acc_row[1], vcvtq_f32_s32(iacc1), vmulq_f32(col_scale_f32_1, row_scale_f32)); - } - - vst1q_f32(s + (y * output_channels + x * 8), acc_row[0]); - vst1q_f32(s + (y * output_channels + x * 8 + 4), acc_row[1]); - } - } -#endif -} - -void ggml_gemv_q4_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { -#if defined(__ARM_FEATURE_SVE) - int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)8); - int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)8); - - int64_t nb = n / QK4_0; - int64_t a_nb = n / QK8_0; - - const svuint8_t m4b = svdup_u8(0x0F); - const svint8_t s8b = svdup_s8(0x8); - - const svbool_t ptrue = svptrue_b8(); - - const block_q4_0x8 * b_ptr_start = vx; - const block_q8_0 * a_ptr_start = vy; - - for (int64_t y = 0; y < input_width; y++) { - for (int64_t x = x0 / 8; x < xend / 8; x++) { - // Pointers to LHS blocks - const block_q8_0 * a_ptr = a_ptr_start + (y * a_nb); - // Pointers to RHS blocks - const block_q4_0x8 * b_ptr = b_ptr_start + (x * nb); - - // Master FP accumulator - svfloat32_t acc_row = svdup_f32(0.0f); - - for (int64_t b = 0; b < nb; b++) { - // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) - const svuint8_t rhs_raw_vec_0_0 = svld1_u8(ptrue, b_ptr[b].qs); - const svuint8_t rhs_raw_vec_0_1 = svld1_vnum_u8(ptrue, b_ptr[b].qs, 1); - const svuint8_t rhs_raw_vec_0_2 = svld1_vnum_u8(ptrue, b_ptr[b].qs, 2); - const svuint8_t rhs_raw_vec_0_3 = svld1_vnum_u8(ptrue, b_ptr[b].qs, 3); - - const svint8_t rhs_vec_0_0_1 = svasr_n_s8_x(ptrue, svreinterpret_s8_u8(rhs_raw_vec_0_0), 4); - const svint8_t rhs_vec_0_1_1 = svasr_n_s8_x(ptrue, svreinterpret_s8_u8(rhs_raw_vec_0_1), 4); - const svint8_t rhs_vec_0_2_1 = svasr_n_s8_x(ptrue, svreinterpret_s8_u8(rhs_raw_vec_0_2), 4); - const svint8_t rhs_vec_0_3_1 = svasr_n_s8_x(ptrue, svreinterpret_s8_u8(rhs_raw_vec_0_3), 4); - - const svint8_t rhs_vec_0_0_0 = svsub_s8_x(ptrue, 
svreinterpret_s8_u8(svand_u8_x(ptrue, rhs_raw_vec_0_0, m4b)), s8b); - const svint8_t rhs_vec_0_1_0 = svsub_s8_x(ptrue, svreinterpret_s8_u8(svand_u8_x(ptrue, rhs_raw_vec_0_1, m4b)), s8b); - const svint8_t rhs_vec_0_2_0 = svsub_s8_x(ptrue, svreinterpret_s8_u8(svand_u8_x(ptrue, rhs_raw_vec_0_2, m4b)), s8b); - const svint8_t rhs_vec_0_3_0 = svsub_s8_x(ptrue, svreinterpret_s8_u8(svand_u8_x(ptrue, rhs_raw_vec_0_3, m4b)), s8b); - - // Scale values - const svfloat16_t col_scale_f16 = svreinterpret_f16_u32(svld1uh_u32(ptrue, (const uint16_t *) b_ptr[b].d)); - const svfloat32_t col_scale_f32 = svcvt_f32_f16_x(ptrue, col_scale_f16); - - const svfloat16_t row_scale_f16 = svdup_f16(a_ptr[b].d); - const svfloat32_t row_scale_f32 = svcvt_f32_f16_x(ptrue, row_scale_f16); - - const svint8_t lhs_vec_0 = svld1rq_s8(ptrue, a_ptr[b].qs); - const svint8_t lhs_vec_1 = svld1rq_s8(ptrue, a_ptr[b].qs + 16); - - svint32_t iacc = svdup_s32(0); - - iacc = svdot_lane(iacc, rhs_vec_0_0_0, lhs_vec_0, 0); - iacc = svdot_lane(iacc, rhs_vec_0_0_1, lhs_vec_1, 0); - - iacc = svdot_lane(iacc, rhs_vec_0_1_0, lhs_vec_0, 1); - iacc = svdot_lane(iacc, rhs_vec_0_1_1, lhs_vec_1, 1); - - iacc = svdot_lane(iacc, rhs_vec_0_2_0, lhs_vec_0, 2); - iacc = svdot_lane(iacc, rhs_vec_0_2_1, lhs_vec_1, 2); - - iacc = svdot_lane(iacc, rhs_vec_0_3_0, lhs_vec_0, 3); - iacc = svdot_lane(iacc, rhs_vec_0_3_1, lhs_vec_1, 3); - - acc_row = svmla_x(ptrue, acc_row, svcvt_f32_s32_x(ptrue, iacc), svmul_x(ptrue, col_scale_f32, row_scale_f32)); - } - - svst1(ptrue, s + (y * output_channels + x * 8), acc_row); - } - } -#endif -} - -void ggml_gemv_q4_0_q8_0_aarch64_sve256(size_t depth, size_t output_channels, size_t height, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { -#if defined(__ARM_FEATURE_SVE) - if (svcntw() != 8) { - if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) ggml_gemv_q4_0_q8_0_aarch64_neon(depth, output_channels, height, s, vx, vy, ith, nth); - return; - } - int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)8); - int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)8); - size_t width = xend - x0; - - int64_t nb = depth / QK4_0; - const void * b_ptr = (void *)((block_q4_0x8 *) vx + ((x0 / 8) * nb)); - const void * a_ptr = vy; - float * res_ptr = s + x0; - - assert(depth % 32 == 0); - assert(width % 8 == 0); - - size_t num_blocks = depth / 32; - - __asm__ __volatile__( - "ptrue p0.b\n" - "add %x[b_ptr], %x[b_ptr], #0x10\n" - "1:" // Column loop - "add x22, %x[a_ptr], #0x2\n" - "mov z31.b, #0x0\n" - "mov x21, %x[num_blocks]\n" - "2:" // Block loop - "ld1b { z30.b }, p0/Z, [%x[b_ptr]]\n" - "ld1b { z29.b }, p0/Z, [%x[b_ptr], #1, MUL VL]\n" - "mov z28.s, #0x0\n" - "mov z27.s, #0x0\n" - "ld1rd { z26.d }, p0/Z, [x22]\n" - "ld1b { z25.b }, p0/Z, [%x[b_ptr], #2, MUL VL]\n" - "sub x20, x22, #0x2\n" - "sub x21, x21, #0x1\n" - "ld1b { z24.b }, p0/Z, [%x[b_ptr], #3, MUL VL]\n" - "ld1rd { z23.d }, p0/Z, [x22, #8]\n" - "lsl z22.b, z30.b, #0x4\n" - "lsl z16.b, z29.b, #0x4\n" - "and z30.b, z30.b, #0xf0\n" - "and z29.b, z29.b, #0xf0\n" - "ld1rd { z21.d }, p0/Z, [x22, #16]\n" - "ld1rd { z20.d }, p0/Z, [x22, #24]\n" - "lsl z19.b, z25.b, #0x4\n" - "and z25.b, z25.b, #0xf0\n" - "ld1rh { z17.h }, p0/Z, [x20]\n" - "ld1h { z18.s }, p0/Z, [%x[b_ptr], #-1, MUL VL]\n" - "sdot z28.s, z22.b, z26.b\n" - "sdot z27.s, z16.b, z26.b\n" - "lsl z16.b, z24.b, #0x4\n" - "add x22, x22, #0x22\n" - "and z24.b, z24.b, #0xf0\n" - "add %x[b_ptr], %x[b_ptr], #0x90\n" - "fcvt z17.s, p0/m, z17.h\n" - "fcvt z18.s, 
p0/m, z18.h\n" - "sdot z28.s, z19.b, z23.b\n" - "sdot z27.s, z16.b, z23.b\n" - "fmul z18.s, z18.s, z17.s\n" - "sdot z28.s, z30.b, z21.b\n" - "sdot z27.s, z29.b, z21.b\n" - "sdot z28.s, z25.b, z20.b\n" - "sdot z27.s, z24.b, z20.b\n" - "uzp1 z17.s, z28.s, z27.s\n" - "uzp2 z16.s, z28.s, z27.s\n" - "add z17.s, z17.s, z16.s\n" - "asr z17.s, z17.s, #0x4\n" - "scvtf z17.s, p0/m, z17.s\n" - "fmla z31.s, p0/M, z17.s, z18.s\n" - "cbnz x21, 2b\n" - "sub %x[width], %x[width], #0x8\n" - "st1w { z31.s }, p0, [%x[res_ptr]]\n" - "add %x[res_ptr], %x[res_ptr], #0x20\n" - "cbnz %x[width], 1b\n" - : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) - : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) - : "memory", "p0", "x20", "x21", "x22", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" - ); -#endif -} - -void ggml_gemv_q4_0_q8_0_aarch64_neon(size_t depth, size_t output_channels, size_t height, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { -#if defined(__ARM_NEON) - int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); - int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); - size_t width = xend - x0; - - int64_t nb = depth / QK4_0; - const void * b_ptr = (void *)((block_q4_0x4 *) vx + ((x0 / 4) * nb)); - const void * a_ptr = vy; - float * res_ptr = s + x0; - - assert(depth % 32 == 0); - assert(width % 4 == 0); - - size_t num_blocks = depth / 32; - - __asm__ __volatile__( - "movi v2.16b, #0x4\n" - "movi v1.16b, #0xf0\n" - "add %x[b_ptr], %x[b_ptr], #0x8\n" - "1:" // Column loop - "add x23, %x[a_ptr], #0x2\n" - "movi v0.16b, #0x0\n" - "mov x22, %x[num_blocks]\n" - "2:" // Block loop - "ldr q31, [%x[b_ptr], #0x0]\n" - "ldr q30, [%x[b_ptr], #0x10]\n" - "mov x21, x23\n" - "movi v29.4s, #0x0\n" - "ldr q28, [%x[b_ptr], #0x20]\n" - "ldr q27, [%x[b_ptr], #0x30]\n" - "movi v26.4s, #0x0\n" - "sub x20, x23, #0x2\n" - "ld1r { v25.8h }, [x20]\n" - "ldr q24, [%x[b_ptr], #-0x8]\n" - "sub x22, x22, #0x1\n" - "add x23, x23, #0x22\n" - "ld1r { v23.2d }, [x21], #0x8\n" - "sshl v22.16b, v31.16b, v2.16b\n" - "sshl v16.16b, v30.16b, v2.16b\n" - "add %x[b_ptr], %x[b_ptr], #0x48\n" - "ld1r { v21.2d }, [x21], #0x8\n" - "sshl v20.16b, v28.16b, v2.16b\n" - "sshl v19.16b, v27.16b, v2.16b\n" - "ld1r { v18.2d }, [x21], #0x8\n" - "ld1r { v17.2d }, [x21], #0x8\n" - "and v31.16b, v31.16b, v1.16b\n" - "and v30.16b, v30.16b, v1.16b\n" - ".inst 0x4e9796dd // sdot v29.4s, v22.16b, v23.16b\n" - ".inst 0x4e97961a // sdot v26.4s, v16.16b, v23.16b\n" - "and v28.16b, v28.16b, v1.16b\n" - "and v27.16b, v27.16b, v1.16b\n" - "fcvtl v25.4s, v25.4h\n" - "fcvtl v16.4s, v24.4h\n" - ".inst 0x4e95969d // sdot v29.4s, v20.16b, v21.16b\n" - ".inst 0x4e95967a // sdot v26.4s, v19.16b, v21.16b\n" - "fmul v16.4s, v16.4s, v25.4s\n" - ".inst 0x4e9297fd // sdot v29.4s, v31.16b, v18.16b\n" - ".inst 0x4e9297da // sdot v26.4s, v30.16b, v18.16b\n" - ".inst 0x4e91979d // sdot v29.4s, v28.16b, v17.16b\n" - ".inst 0x4e91977a // sdot v26.4s, v27.16b, v17.16b\n" - "addp v29.4s, v29.4s, v26.4s\n" - "scvtf v29.4s, v29.4s, #0x4\n" - "fmla v0.4s, v29.4s, v16.4s\n" - "cbnz x22, 2b\n" - "sub %x[width], %x[width], #0x4\n" - "str q0, [%x[res_ptr], #0x0]\n" - "add %x[res_ptr], %x[res_ptr], #0x10\n" - "cbnz %x[width], 1b\n" - : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) - : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) - : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", 
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23" - ); -#endif -} - -void ggml_gemv_q4_0_q8_0_aarch64_neon_noi8mm(size_t depth, size_t output_channels, size_t height, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { -#if defined(__ARM_NEON) - int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); - int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); - size_t width = xend - x0; - - int64_t nb = depth / QK4_0; - const void * b_ptr = (void *)((block_q4_0x4 *) vx + ((x0 / 4) * nb)); - const void * a_ptr = vy; - float * res_ptr = s + x0; - - assert(depth % 32 == 0); - assert(width % 4 == 0); - - size_t num_blocks = depth / 32; - - __asm__ __volatile__( - "movi v31.16b, #0x4\n" - "movi v30.16b, #0xf0\n" - "add %x[b_ptr], %x[b_ptr], #0x8\n" - "1:" // Column loop - "add x22, %x[a_ptr], #0x2\n" - "movi v29.16b, #0x0\n" - "mov x21, %x[num_blocks]\n" - "2:" // Block loop - "ldr q28, [%x[b_ptr], #0x0]\n" - "ldr q27, [x22, #0x0]\n" - "movi v26.4s, #0x0\n" - "sub x20, x22, #0x2\n" - "ldr q25, [x22, #0x10]\n" - "ldr q24, [%x[b_ptr], #0x10]\n" - "sub x21, x21, #0x1\n" - "add x22, x22, #0x22\n" - "ldr q23, [%x[b_ptr], #0x20]\n" - "ldr q22, [%x[b_ptr], #0x30]\n" - "ld1r { v21.8h }, [x20]\n" - "ldr q20, [%x[b_ptr], #-0x8]\n" - "sshl v16.16b, v28.16b, v31.16b\n" - "and v28.16b, v28.16b, v30.16b\n" - "sshl v19.16b, v24.16b, v31.16b\n" - "and v24.16b, v24.16b, v30.16b\n" - "add %x[b_ptr], %x[b_ptr], #0x48\n" - "sshl v18.16b, v23.16b, v31.16b\n" - "and v23.16b, v23.16b, v30.16b\n" - ".inst 0x4f9be21a // sdot v26.4s, v16.16b, v27.4b[0]\n" - "sshl v17.16b, v22.16b, v31.16b\n" - "and v22.16b, v22.16b, v30.16b\n" - "fcvtl v21.4s, v21.4h\n" - "fcvtl v16.4s, v20.4h\n" - ".inst 0x4f99e39a // sdot v26.4s, v28.16b, v25.4b[0]\n" - "fmul v16.4s, v16.4s, v21.4s\n" - ".inst 0x4fbbe27a // sdot v26.4s, v19.16b, v27.4b[1]\n" - ".inst 0x4fb9e31a // sdot v26.4s, v24.16b, v25.4b[1]\n" - ".inst 0x4f9bea5a // sdot v26.4s, v18.16b, v27.4b[2]\n" - ".inst 0x4f99eafa // sdot v26.4s, v23.16b, v25.4b[2]\n" - ".inst 0x4fbbea3a // sdot v26.4s, v17.16b, v27.4b[3]\n" - ".inst 0x4fb9eada // sdot v26.4s, v22.16b, v25.4b[3]\n" - "scvtf v26.4s, v26.4s, #0x4\n" - "fmla v29.4s, v26.4s, v16.4s\n" - "cbnz x21, 2b\n" - "sub %x[width], %x[width], #0x4\n" - "str q29, [%x[res_ptr], #0x0]\n" - "add %x[res_ptr], %x[res_ptr], #0x10\n" - "cbnz %x[width], 1b\n" - : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) - : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) - : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22" - ); -#endif -} - -void ggml_gemv_q8_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { -#if defined(__ARM_NEON) - int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)8); - int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)8); - - int64_t nb = n / QK8_0; - int64_t a_nb = n / QK8_0; - - const block_q8_0x8 * b_ptr_start = vx; - const block_q8_0 * a_ptr_start = vy; - - for (int64_t y = 0; y < input_width; y++) { - for (int64_t x = x0 / 8; x < xend / 8; x++) { - // Pointers to LHS blocks - const block_q8_0 * a_ptr = a_ptr_start + (y * a_nb); - // Pointers to RHS blocks - const block_q8_0x8 * b_ptr = b_ptr_start + (x * nb); - // Master FP accumulator - float32x4_t acc_row[2]; - 
acc_row[0] = acc_row[1] = vdupq_n_f32(0.0f); - - for (int64_t b = 0; b < nb; b++) { - // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) - const int8x16_t rhs_vec_0_0_0 = vld1q_s8(b_ptr[b].qs); - const int8x16_t rhs_vec_1_0_0 = vld1q_s8(b_ptr[b].qs + 16); - const int8x16_t rhs_vec_0_1_0 = vld1q_s8(b_ptr[b].qs + 32); - const int8x16_t rhs_vec_1_1_0 = vld1q_s8(b_ptr[b].qs + 48); - const int8x16_t rhs_vec_0_2_0 = vld1q_s8(b_ptr[b].qs + 64); - const int8x16_t rhs_vec_1_2_0 = vld1q_s8(b_ptr[b].qs + 80); - const int8x16_t rhs_vec_0_3_0 = vld1q_s8(b_ptr[b].qs + 96); - const int8x16_t rhs_vec_1_3_0 = vld1q_s8(b_ptr[b].qs + 112); - const int8x16_t rhs_vec_0_0_1 = vld1q_s8(b_ptr[b].qs + 128); - const int8x16_t rhs_vec_1_0_1 = vld1q_s8(b_ptr[b].qs + 144); - const int8x16_t rhs_vec_0_1_1 = vld1q_s8(b_ptr[b].qs + 160); - const int8x16_t rhs_vec_1_1_1 = vld1q_s8(b_ptr[b].qs + 176); - const int8x16_t rhs_vec_0_2_1 = vld1q_s8(b_ptr[b].qs + 192); - const int8x16_t rhs_vec_1_2_1 = vld1q_s8(b_ptr[b].qs + 208); - const int8x16_t rhs_vec_0_3_1 = vld1q_s8(b_ptr[b].qs + 224); - const int8x16_t rhs_vec_1_3_1 = vld1q_s8(b_ptr[b].qs + 240); - - // Scale values - assemble the four row/column scales into a (64-bit) vector, then expand to FP32 - const float16x8_t col_scale_f16 = vld1q_f16(b_ptr[b].d); - const float32x4_t col_scale_f32_0 = vcvt_f32_f16(vget_low_f16(col_scale_f16)); - const float32x4_t col_scale_f32_1 = vcvt_f32_f16(vget_high_f16(col_scale_f16)); - - const float16x4_t row_scale_f16 = vld1_dup_f16(&(a_ptr[b].d)); - const float32x4_t row_scale_f32 = vcvt_f32_f16(row_scale_f16); - - const int8x16_t lhs_vec_0 = vld1q_s8(a_ptr[b].qs); - const int8x16_t lhs_vec_1 = vld1q_s8(a_ptr[b].qs + 16); - - int32x4_t iacc0 = vdupq_n_s32(0); - int32x4_t iacc1 = vdupq_n_s32(0); - - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_0_0, lhs_vec_0, 0); - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_0_1, lhs_vec_1, 0); - - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_0_0, lhs_vec_0, 0); - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_0_1, lhs_vec_1, 0); - - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_1_0, lhs_vec_0, 1); - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_1_1, lhs_vec_1, 1); - - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_1_0, lhs_vec_0, 1); - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_1_1, lhs_vec_1, 1); - - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_2_0, lhs_vec_0, 2); - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_2_1, lhs_vec_1, 2); - - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_2_0, lhs_vec_0, 2); - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_2_1, lhs_vec_1, 2); - - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_3_0, lhs_vec_0, 3); - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_3_1, lhs_vec_1, 3); - - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_3_0, lhs_vec_0, 3); - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_3_1, lhs_vec_1, 3); - - acc_row[0] = vfmaq_f32(acc_row[0], vcvtq_f32_s32(iacc0), vmulq_f32(col_scale_f32_0, row_scale_f32)); - acc_row[1] = vfmaq_f32(acc_row[1], vcvtq_f32_s32(iacc1), vmulq_f32(col_scale_f32_1, row_scale_f32)); - } - - vst1q_f32(s + (y * output_channels + x * 8), acc_row[0]); - vst1q_f32(s + (y * output_channels + x * 8 + 4), acc_row[1]); - } - } -#endif -} - -void ggml_gemv_q8_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { -#if defined(__ARM_FEATURE_SVE) - int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)8); - int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)8); - - int64_t nb = n / 
QK8_0; - int64_t a_nb = n / QK8_0; - - const svbool_t ptrue = svptrue_b8(); - - const block_q8_0x8 * b_ptr_start = vx; - const block_q8_0 * a_ptr_start = vy; - - for (int64_t y = 0; y < input_width; y++) { - for (int64_t x = x0 / 8; x < xend / 8; x++) { - // Pointers to LHS blocks - const block_q8_0 * a_ptr = a_ptr_start + (y * a_nb); - // Pointers to RHS blocks - const block_q8_0x8 * b_ptr = b_ptr_start + (x * nb); - - // Master FP accumulator - svfloat32_t acc_row = svdup_f32(0.0f); - - for (int64_t b = 0; b < nb; b++) { - // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) - const svint8_t rhs_vec_0_0_0 = svld1_s8(ptrue, b_ptr[b].qs); - const svint8_t rhs_vec_0_1_0 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 1); - const svint8_t rhs_vec_0_2_0 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 2); - const svint8_t rhs_vec_0_3_0 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 3); - const svint8_t rhs_vec_0_0_1 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 4); - const svint8_t rhs_vec_0_1_1 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 5); - const svint8_t rhs_vec_0_2_1 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 6); - const svint8_t rhs_vec_0_3_1 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 7); - - // Scale values - const svfloat16_t col_scale_f16 = svreinterpret_f16_u32(svld1uh_u32(ptrue, (const uint16_t *) b_ptr[b].d)); - const svfloat32_t col_scale_f32 = svcvt_f32_f16_x(ptrue, col_scale_f16); - - const svfloat16_t row_scale_f16 = svdup_f16(a_ptr[b].d); - const svfloat32_t row_scale_f32 = svcvt_f32_f16_x(ptrue, row_scale_f16); - - const svint8_t lhs_vec_0 = svld1rq_s8(ptrue, a_ptr[b].qs); - const svint8_t lhs_vec_1 = svld1rq_s8(ptrue, a_ptr[b].qs + 16); - - svint32_t iacc = svdup_s32(0); - - iacc = svdot_lane(iacc, rhs_vec_0_0_0, lhs_vec_0, 0); - iacc = svdot_lane(iacc, rhs_vec_0_0_1, lhs_vec_1, 0); - - iacc = svdot_lane(iacc, rhs_vec_0_1_0, lhs_vec_0, 1); - iacc = svdot_lane(iacc, rhs_vec_0_1_1, lhs_vec_1, 1); - - iacc = svdot_lane(iacc, rhs_vec_0_2_0, lhs_vec_0, 2); - iacc = svdot_lane(iacc, rhs_vec_0_2_1, lhs_vec_1, 2); - - iacc = svdot_lane(iacc, rhs_vec_0_3_0, lhs_vec_0, 3); - iacc = svdot_lane(iacc, rhs_vec_0_3_1, lhs_vec_1, 3); - - acc_row = svmla_x(ptrue, acc_row, svcvt_f32_s32_x(ptrue, iacc), svmul_x(ptrue, col_scale_f32, row_scale_f32)); - } - - svst1(ptrue, s + (y * output_channels + x * 8), acc_row); - } - } -#endif -} - -void ggml_gemm_q4_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { -#if defined(__ARM_FEATURE_MATMUL_INT8) - int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); - int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); - - int64_t nb = n / QK4_0; - int64_t a_nb = n / QK8_0; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - const int8x16_t s8b = vdupq_n_s8(0x8); - - const block_q4_0x4 * b_ptr_start = vx; - const block_q8_0x4 * a_ptr_start = vy; - - for (int64_t y = 0; y < input_width / 4; y += rows / 4) { - for (int64_t x = x0 / 4; x < xend / 4; x++) { - const block_q8_0x4 * a_ptrs[rows / 4]; - - a_ptrs[0] = a_ptr_start + (y * a_nb); - for (int i = 0; i < (rows / 4) - 1; i++) { - a_ptrs[i + 1] = a_ptrs[i] + a_nb; - } - - const block_q4_0x4 * b_ptr = b_ptr_start + (x * nb); - - // Master FP accumulators - float32x4_t acc_rows[rows]; - for (int i = 0; i < rows; i++) { - acc_rows[i] = vdupq_n_f32(0.0f); - } - - for (int64_t b = 0; b < nb; b++) { - // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) - const uint8x16_t rhs_raw_mat_01_0 = vld1q_u8(b_ptr[b].qs); - const 
uint8x16_t rhs_raw_mat_23_0 = vld1q_u8(b_ptr[b].qs + 16); - const uint8x16_t rhs_raw_mat_01_1 = vld1q_u8(b_ptr[b].qs + 32); - const uint8x16_t rhs_raw_mat_23_1 = vld1q_u8(b_ptr[b].qs + 48); - - // 4-bit -> 8-bit - const int8x16_t rhs_mat_01_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_01_0, m4b)), s8b); - const int8x16_t rhs_mat_23_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_23_0, m4b)), s8b); - const int8x16_t rhs_mat_01_1 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_01_1, m4b)), s8b); - const int8x16_t rhs_mat_23_1 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_23_1, m4b)), s8b); - const int8x16_t rhs_mat_01_2 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_01_0), 4); - const int8x16_t rhs_mat_23_2 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_23_0), 4); - const int8x16_t rhs_mat_01_3 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_01_1), 4); - const int8x16_t rhs_mat_23_3 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_23_1), 4); - - // Scale values - assemble the four row/column scales into a (64-bit) vector, then expand to FP32 - const float16x4_t col_scale_f16 = vld1_f16(b_ptr[b].d); - const float32x4_t col_scale_f32 = vcvt_f32_f16(col_scale_f16); - - // Process LHS in pairs of rows - for (int rp = 0; rp < rows / 4; rp++) { - const int8x16_t lhs_mat_01_0 = vld1q_s8(a_ptrs[rp][b].qs); - const int8x16_t lhs_mat_23_0 = vld1q_s8(a_ptrs[rp][b].qs + 16); - const int8x16_t lhs_mat_01_1 = vld1q_s8(a_ptrs[rp][b].qs + 32); - const int8x16_t lhs_mat_23_1 = vld1q_s8(a_ptrs[rp][b].qs + 48); - - const int8x16_t lhs_mat_01_2 = vld1q_s8(a_ptrs[rp][b].qs + 64); - const int8x16_t lhs_mat_23_2 = vld1q_s8(a_ptrs[rp][b].qs + 80); - const int8x16_t lhs_mat_01_3 = vld1q_s8(a_ptrs[rp][b].qs + 96); - const int8x16_t lhs_mat_23_3 = vld1q_s8(a_ptrs[rp][b].qs + 112); - - // Do the MMLAs into 2x2 matrices - const int32x4_t iacc_mat_00 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_01_0), lhs_mat_01_1, rhs_mat_01_1), lhs_mat_01_2, rhs_mat_01_2), lhs_mat_01_3, rhs_mat_01_3); - const int32x4_t iacc_mat_01 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_23_0), lhs_mat_01_1, rhs_mat_23_1), lhs_mat_01_2, rhs_mat_23_2), lhs_mat_01_3, rhs_mat_23_3); - const int32x4_t iacc_mat_10 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_23_0, rhs_mat_01_0), lhs_mat_23_1, rhs_mat_01_1), lhs_mat_23_2, rhs_mat_01_2), lhs_mat_23_3, rhs_mat_01_3); - const int32x4_t iacc_mat_11 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_23_0, rhs_mat_23_0), lhs_mat_23_1, rhs_mat_23_1), lhs_mat_23_2, rhs_mat_23_2), lhs_mat_23_3, rhs_mat_23_3); - - // Straighten out to make 4 row vectors - const int32x4_t iacc_row_0 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); - const int32x4_t iacc_row_1 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); - const int32x4_t iacc_row_2 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_10), vreinterpretq_u64_s32(iacc_mat_11))); - const int32x4_t iacc_row_3 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_10), vreinterpretq_u64_s32(iacc_mat_11))); - - const float16x4_t row_scale_f16 = vld1_f16(a_ptrs[rp][b].d); - const float32x4_t row_scale_f32 = vcvt_f32_f16(row_scale_f16); - - acc_rows[rp * 4] = vfmaq_f32(acc_rows[rp * 4], vcvtq_f32_s32(iacc_row_0), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 0)); - acc_rows[rp * 4 + 1] = 
vfmaq_f32(acc_rows[rp * 4 + 1], vcvtq_f32_s32(iacc_row_1), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 1)); - acc_rows[rp * 4 + 2] = vfmaq_f32(acc_rows[rp * 4 + 2], vcvtq_f32_s32(iacc_row_2), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 2)); - acc_rows[rp * 4 + 3] = vfmaq_f32(acc_rows[rp * 4 + 3], vcvtq_f32_s32(iacc_row_3), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 3)); - } - } - - for (int i = 0; i < rows; i++) { - vst1q_f32(s + ((y * 4 + i) * output_channels + x * 4), acc_rows[i]); - } - } - } -#endif -} - -void ggml_gemm_q4_0_q8_0_aarch64_sve256(size_t depth, size_t output_channels, size_t height, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { -#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) - if (svcntw() != 8) { - if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) ggml_gemm_q4_0_q8_0_aarch64_neon(depth, output_channels, height, s, vx, vy, ith, nth); - return; - } - int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)8); - int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)8); - size_t width = xend - x0; - - int64_t nb = depth / QK4_0; - const void * b_ptr = (void *)((block_q4_0x8 *) vx + ((x0 / 8) * nb)); - const void * a_ptr = vy; - float * res_ptr = s + x0; - size_t res_stride = output_channels * sizeof(float); - - assert(depth % 32 == 0); - assert(width % 8 == 0); - - size_t num_blocks = depth / 32; - - __asm__ __volatile__( - "mov x20, #0x4\n" - "mov x13, %x[height]\n" - "mov z28.s, #-0x4\n" - "mov x12, #0x88\n" - "ptrue p1.b\n" - "whilelt p0.s, XZR, x20\n" - "cmp x13, #0x10\n" - "mul x12, %x[num_blocks], x12\n" - "blt 4f\n" - "1:" // Row loop - "add x11, %x[b_ptr], #0x10\n" - "mov x10, %x[width]\n" - "add x9, %x[res_ptr], %x[res_stride], LSL #4\n" - "2:" // Column loop - "add x28, %x[a_ptr], #0x8\n" - "mov z24.b, #0x0\n" - "mov z15.b, #0x0\n" - "mov x27, %x[num_blocks]\n" - "add x26, x28, x12\n" - "mov z12.b, #0x0\n" - "mov z0.b, #0x0\n" - "add x25, x26, x12\n" - "mov z13.b, #0x0\n" - "mov z1.b, #0x0\n" - "add x24, x25, x12\n" - "mov z20.b, #0x0\n" - "mov z25.b, #0x0\n" - "mov z11.b, #0x0\n" - "mov z16.b, #0x0\n" - "mov z19.b, #0x0\n" - "mov z26.b, #0x0\n" - "mov z8.b, #0x0\n" - "mov z29.b, #0x0\n" - "mov z27.b, #0x0\n" - "mov z10.b, #0x0\n" - "3:" // Block loop - "ld1b { z30.b }, p1/Z, [x11]\n" - "ld1b { z21.b }, p1/Z, [x11, #1, MUL VL]\n" - "mov z18.s, #0x0\n" - "mov z7.s, #0x0\n" - "ld1rqb { z3.b }, p1/Z, [x28]\n" - "ld1rqb { z5.b }, p1/Z, [x28, #16]\n" - "mov z9.s, #0x0\n" - "mov z22.s, #0x0\n" - "ld1b { z4.b }, p1/Z, [x11, #2, MUL VL]\n" - "ld1b { z17.b }, p1/Z, [x11, #3, MUL VL]\n" - "sub x20, x11, #0x10\n" - "sub x23, x28, #0x8\n" - "lsl z31.b, z30.b, #0x4\n" - "lsl z6.b, z21.b, #0x4\n" - "ld1h { z23.s }, p1/Z, [x20]\n" - "sub x22, x26, #0x8\n" - "and z30.b, z30.b, #0xf0\n" - "and z21.b, z21.b, #0xf0\n" - "sub x21, x25, #0x8\n" - "sub x20, x24, #0x8\n" - "lsl z14.b, z4.b, #0x4\n" - "lsl z2.b, z17.b, #0x4\n" - "subs x27, x27, #0x1\n" - "add x11, x11, #0x90\n" - ".inst 0x451f9872 // smmla z18.s, z3.b, z31.b\n" - ".inst 0x45069867 // smmla z7.s, z3.b, z6.b\n" - "ld1rqb { z3.b }, p1/Z, [x28, #32]\n" - "and z4.b, z4.b, #0xf0\n" - ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" - ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" - "ld1rqb { z5.b }, p1/Z, [x28, #48]\n" - "and z17.b, z17.b, #0xf0\n" - "fcvt z23.s, p1/m, z23.h\n" - ".inst 0x450e9872 // smmla z18.s, z3.b, z14.b\n" - ".inst 0x45029867 // smmla z7.s, z3.b, z2.b\n" - "ld1rqb { z3.b }, p1/Z, [x28, #64]\n" - ".inst 
0x450e98a9 // smmla z9.s, z5.b, z14.b\n" - ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" - "ld1rqb { z5.b }, p1/Z, [x28, #80]\n" - "fscale z23.s, p1/m, z23.s, z28.s\n" - ".inst 0x451e9872 // smmla z18.s, z3.b, z30.b\n" - ".inst 0x45159867 // smmla z7.s, z3.b, z21.b\n" - "ld1rqb { z3.b }, p1/Z, [x28, #96]\n" - ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" - ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" - "ld1rqb { z5.b }, p1/Z, [x28, #112]\n" - "add x28, x28, #0x88\n" - ".inst 0x45049872 // smmla z18.s, z3.b, z4.b\n" - ".inst 0x45119867 // smmla z7.s, z3.b, z17.b\n" - "ld1h { z3.s }, p0/Z, [x23]\n" - ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" - ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" - "fcvt z3.s, p1/m, z3.h\n" - "uzp1 z5.d, z18.d, z7.d\n" - "uzp2 z18.d, z18.d, z7.d\n" - "mov z3.q, z3.q[0]\n" - "uzp1 z7.d, z9.d, z22.d\n" - "uzp2 z22.d, z9.d, z22.d\n" - "fmul z9.s, z23.s, z3.s[0]\n" - "scvtf z5.s, p1/m, z5.s\n" - "scvtf z18.s, p1/m, z18.s\n" - "scvtf z7.s, p1/m, z7.s\n" - "scvtf z22.s, p1/m, z22.s\n" - "fmla z24.s, p1/M, z5.s, z9.s\n" - "ld1rqb { z5.b }, p1/Z, [x26]\n" - "fmul z9.s, z23.s, z3.s[1]\n" - "fmla z15.s, p1/M, z18.s, z9.s\n" - "ld1rqb { z18.b }, p1/Z, [x26, #16]\n" - "fmul z9.s, z23.s, z3.s[2]\n" - "fmul z3.s, z23.s, z3.s[3]\n" - "fmla z12.s, p1/M, z7.s, z9.s\n" - "mov z9.s, #0x0\n" - "ld1h { z7.s }, p0/Z, [x22]\n" - ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" - "fmla z0.s, p1/M, z22.s, z3.s\n" - "mov z22.s, #0x0\n" - "ld1h { z3.s }, p0/Z, [x21]\n" - ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" - "ld1rqb { z5.b }, p1/Z, [x26, #32]\n" - "fcvt z7.s, p1/m, z7.h\n" - "fcvt z3.s, p1/m, z3.h\n" - ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" - ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" - "ld1rqb { z5.b }, p1/Z, [x26, #64]\n" - "mov z7.q, z7.q[0]\n" - "mov z3.q, z3.q[0]\n" - ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" - ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" - "ld1rqb { z5.b }, p1/Z, [x26, #96]\n" - ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" - ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" - "uzp1 z5.d, z9.d, z22.d\n" - "scvtf z5.s, p1/m, z5.s\n" - "uzp2 z22.d, z9.d, z22.d\n" - "fmul z9.s, z23.s, z7.s[0]\n" - "scvtf z22.s, p1/m, z22.s\n" - "fmla z13.s, p1/M, z5.s, z9.s\n" - "ld1rqb { z9.b }, p1/Z, [x25]\n" - "fmul z5.s, z23.s, z7.s[1]\n" - "fmla z1.s, p1/M, z22.s, z5.s\n" - "mov z5.s, #0x0\n" - "mov z22.s, #0x0\n" - ".inst 0x451f9a45 // smmla z5.s, z18.b, z31.b\n" - ".inst 0x45069a56 // smmla z22.s, z18.b, z6.b\n" - "ld1rqb { z18.b }, p1/Z, [x26, #48]\n" - ".inst 0x450e9a45 // smmla z5.s, z18.b, z14.b\n" - ".inst 0x45029a56 // smmla z22.s, z18.b, z2.b\n" - "ld1rqb { z18.b }, p1/Z, [x26, #80]\n" - ".inst 0x451e9a45 // smmla z5.s, z18.b, z30.b\n" - ".inst 0x45159a56 // smmla z22.s, z18.b, z21.b\n" - "ld1rqb { z18.b }, p1/Z, [x26, #112]\n" - "add x26, x26, #0x88\n" - ".inst 0x45049a45 // smmla z5.s, z18.b, z4.b\n" - ".inst 0x45119a56 // smmla z22.s, z18.b, z17.b\n" - "uzp1 z18.d, z5.d, z22.d\n" - "scvtf z18.s, p1/m, z18.s\n" - "uzp2 z22.d, z5.d, z22.d\n" - "fmul z5.s, z23.s, z7.s[2]\n" - "fmul z7.s, z23.s, z7.s[3]\n" - "scvtf z22.s, p1/m, z22.s\n" - "fmla z20.s, p1/M, z18.s, z5.s\n" - "ld1rqb { z18.b }, p1/Z, [x25, #16]\n" - "ld1h { z5.s }, p0/Z, [x20]\n" - "fcvt z5.s, p1/m, z5.h\n" - "fmla z25.s, p1/M, z22.s, z7.s\n" - "mov z22.s, #0x0\n" - "mov z7.s, #0x0\n" - ".inst 0x451f9936 // smmla z22.s, z9.b, z31.b\n" - ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" - "ld1rqb { z9.b }, p1/Z, [x25, #32]\n" - "mov z5.q, z5.q[0]\n" - ".inst 
0x450e9936 // smmla z22.s, z9.b, z14.b\n" - ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" - "ld1rqb { z9.b }, p1/Z, [x25, #64]\n" - ".inst 0x451e9936 // smmla z22.s, z9.b, z30.b\n" - ".inst 0x45159927 // smmla z7.s, z9.b, z21.b\n" - "ld1rqb { z9.b }, p1/Z, [x25, #96]\n" - ".inst 0x45049936 // smmla z22.s, z9.b, z4.b\n" - ".inst 0x45119927 // smmla z7.s, z9.b, z17.b\n" - "uzp1 z9.d, z22.d, z7.d\n" - "scvtf z9.s, p1/m, z9.s\n" - "uzp2 z22.d, z22.d, z7.d\n" - "fmul z7.s, z23.s, z3.s[0]\n" - "scvtf z22.s, p1/m, z22.s\n" - "fmla z11.s, p1/M, z9.s, z7.s\n" - "ld1rqb { z9.b }, p1/Z, [x24]\n" - "fmul z7.s, z23.s, z3.s[1]\n" - "fmla z16.s, p1/M, z22.s, z7.s\n" - "mov z22.s, #0x0\n" - "mov z7.s, #0x0\n" - ".inst 0x451f9a56 // smmla z22.s, z18.b, z31.b\n" - ".inst 0x45069a47 // smmla z7.s, z18.b, z6.b\n" - "ld1rqb { z18.b }, p1/Z, [x25, #48]\n" - ".inst 0x450e9a56 // smmla z22.s, z18.b, z14.b\n" - ".inst 0x45029a47 // smmla z7.s, z18.b, z2.b\n" - "ld1rqb { z18.b }, p1/Z, [x25, #80]\n" - ".inst 0x451e9a56 // smmla z22.s, z18.b, z30.b\n" - ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" - "ld1rqb { z18.b }, p1/Z, [x25, #112]\n" - "add x25, x25, #0x88\n" - ".inst 0x45049a56 // smmla z22.s, z18.b, z4.b\n" - ".inst 0x45119a47 // smmla z7.s, z18.b, z17.b\n" - "uzp1 z18.d, z22.d, z7.d\n" - "scvtf z18.s, p1/m, z18.s\n" - "uzp2 z7.d, z22.d, z7.d\n" - "fmul z22.s, z23.s, z3.s[2]\n" - "fmul z3.s, z23.s, z3.s[3]\n" - "scvtf z7.s, p1/m, z7.s\n" - "fmla z19.s, p1/M, z18.s, z22.s\n" - "ld1rqb { z18.b }, p1/Z, [x24, #16]\n" - "fmul z22.s, z23.s, z5.s[0]\n" - "fmla z26.s, p1/M, z7.s, z3.s\n" - "mov z3.s, #0x0\n" - "mov z7.s, #0x0\n" - ".inst 0x451f9923 // smmla z3.s, z9.b, z31.b\n" - ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" - "ld1rqb { z9.b }, p1/Z, [x24, #32]\n" - ".inst 0x450e9923 // smmla z3.s, z9.b, z14.b\n" - ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" - "mov z9.s, #0x0\n" - ".inst 0x451f9a49 // smmla z9.s, z18.b, z31.b\n" - "mov z31.s, #0x0\n" - ".inst 0x45069a5f // smmla z31.s, z18.b, z6.b\n" - "ld1rqb { z6.b }, p1/Z, [x24, #48]\n" - "ld1rqb { z18.b }, p1/Z, [x24, #64]\n" - ".inst 0x450e98c9 // smmla z9.s, z6.b, z14.b\n" - "fmul z14.s, z23.s, z5.s[1]\n" - ".inst 0x450298df // smmla z31.s, z6.b, z2.b\n" - "ld1rqb { z6.b }, p1/Z, [x24, #80]\n" - "fmul z2.s, z23.s, z5.s[2]\n" - "fmul z23.s, z23.s, z5.s[3]\n" - ".inst 0x451e9a43 // smmla z3.s, z18.b, z30.b\n" - ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" - "ld1rqb { z5.b }, p1/Z, [x24, #96]\n" - ".inst 0x451e98c9 // smmla z9.s, z6.b, z30.b\n" - ".inst 0x451598df // smmla z31.s, z6.b, z21.b\n" - "ld1rqb { z18.b }, p1/Z, [x24, #112]\n" - "add x24, x24, #0x88\n" - ".inst 0x450498a3 // smmla z3.s, z5.b, z4.b\n" - ".inst 0x451198a7 // smmla z7.s, z5.b, z17.b\n" - ".inst 0x45049a49 // smmla z9.s, z18.b, z4.b\n" - ".inst 0x45119a5f // smmla z31.s, z18.b, z17.b\n" - "uzp1 z18.d, z3.d, z7.d\n" - "uzp2 z5.d, z3.d, z7.d\n" - "scvtf z18.s, p1/m, z18.s\n" - "uzp1 z6.d, z9.d, z31.d\n" - "uzp2 z9.d, z9.d, z31.d\n" - "scvtf z5.s, p1/m, z5.s\n" - "fmla z8.s, p1/M, z18.s, z22.s\n" - "scvtf z6.s, p1/m, z6.s\n" - "scvtf z9.s, p1/m, z9.s\n" - "fmla z29.s, p1/M, z5.s, z14.s\n" - "fmla z27.s, p1/M, z6.s, z2.s\n" - "fmla z10.s, p1/M, z9.s, z23.s\n" - "bgt 3b\n" - "mov x20, %x[res_ptr]\n" - "subs x10, x10, #0x8\n" - "add %x[res_ptr], %x[res_ptr], #0x20\n" - "st1w { z24.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z15.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z12.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z0.s }, 
p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z13.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z1.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z20.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z25.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z11.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z16.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z19.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z26.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z8.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z29.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z27.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z10.s }, p1, [x20]\n" - "bne 2b\n" - "mov x20, #0x4\n" - "sub x13, x13, #0x10\n" - "cmp x13, #0x10\n" - "mov %x[res_ptr], x9\n" - "madd %x[a_ptr], x20, x12, %x[a_ptr]\n" - "bge 1b\n" - "4:" // Row loop skip - "cbz x13, 9f\n" - "5:" // Row tail: Row loop - "add x25, %x[b_ptr], #0x10\n" - "mov x24, %x[width]\n" - "add x23, %x[res_ptr], %x[res_stride], LSL #2\n" - "6:" // Row tail: Column loop - "mov z24.b, #0x0\n" - "mov z15.b, #0x0\n" - "add x28, %x[a_ptr], #0x8\n" - "mov x22, %x[num_blocks]\n" - "mov z12.b, #0x0\n" - "mov z0.b, #0x0\n" - "7:" // Row tail: Block loop - "ld1b { z3.b }, p1/Z, [x25]\n" - "ld1b { z6.b }, p1/Z, [x25, #1, MUL VL]\n" - "mov z2.s, #0x0\n" - "mov z25.s, #0x0\n" - "ld1rqb { z26.b }, p1/Z, [x28]\n" - "ld1rqb { z21.b }, p1/Z, [x28, #16]\n" - "mov z27.s, #0x0\n" - "mov z19.s, #0x0\n" - "ld1b { z29.b }, p1/Z, [x25, #2, MUL VL]\n" - "ld1b { z16.b }, p1/Z, [x25, #3, MUL VL]\n" - "sub x21, x25, #0x10\n" - "sub x20, x28, #0x8\n" - "lsl z20.b, z3.b, #0x4\n" - "lsl z4.b, z6.b, #0x4\n" - "ld1rqb { z10.b }, p1/Z, [x28, #32]\n" - "ld1rqb { z23.b }, p1/Z, [x28, #48]\n" - "and z3.b, z3.b, #0xf0\n" - "and z6.b, z6.b, #0xf0\n" - "ld1rqb { z11.b }, p1/Z, [x28, #64]\n" - "ld1rqb { z7.b }, p1/Z, [x28, #80]\n" - "lsl z8.b, z29.b, #0x4\n" - "lsl z14.b, z16.b, #0x4\n" - "ld1rqb { z18.b }, p1/Z, [x28, #96]\n" - "ld1rqb { z30.b }, p1/Z, [x28, #112]\n" - ".inst 0x45149b42 // smmla z2.s, z26.b, z20.b\n" - ".inst 0x45049b59 // smmla z25.s, z26.b, z4.b\n" - "and z29.b, z29.b, #0xf0\n" - "ld1h { z17.s }, p1/Z, [x21]\n" - ".inst 0x45149abb // smmla z27.s, z21.b, z20.b\n" - ".inst 0x45049ab3 // smmla z19.s, z21.b, z4.b\n" - "and z16.b, z16.b, #0xf0\n" - "ld1h { z4.s }, p0/Z, [x20]\n" - "subs x22, x22, #0x1\n" - "add x28, x28, #0x88\n" - "fcvt z17.s, p1/m, z17.h\n" - "add x25, x25, #0x90\n" - ".inst 0x45089942 // smmla z2.s, z10.b, z8.b\n" - ".inst 0x450e9959 // smmla z25.s, z10.b, z14.b\n" - "fcvt z4.s, p1/m, z4.h\n" - ".inst 0x45089afb // smmla z27.s, z23.b, z8.b\n" - ".inst 0x450e9af3 // smmla z19.s, z23.b, z14.b\n" - "fscale z17.s, p1/m, z17.s, z28.s\n" - "mov z4.q, z4.q[0]\n" - ".inst 0x45039962 // smmla z2.s, z11.b, z3.b\n" - ".inst 0x45069979 // smmla z25.s, z11.b, z6.b\n" - "fmul z23.s, z17.s, z4.s[0]\n" - "fmul z9.s, z17.s, z4.s[1]\n" - "fmul z21.s, z17.s, z4.s[2]\n" - "fmul z4.s, z17.s, z4.s[3]\n" - ".inst 0x450398fb // smmla z27.s, z7.b, z3.b\n" - ".inst 0x450698f3 // smmla z19.s, z7.b, z6.b\n" - ".inst 0x451d9a42 // smmla z2.s, z18.b, z29.b\n" - ".inst 0x45109a59 // smmla z25.s, z18.b, z16.b\n" - ".inst 0x451d9bdb // smmla z27.s, z30.b, z29.b\n" - ".inst 0x45109bd3 // smmla z19.s, z30.b, z16.b\n" - "uzp1 z31.d, z2.d, z25.d\n" - "uzp2 z13.d, z2.d, z25.d\n" - "scvtf z31.s, p1/m, z31.s\n" - "uzp1 z17.d, z27.d, 
z19.d\n" - "uzp2 z18.d, z27.d, z19.d\n" - "scvtf z13.s, p1/m, z13.s\n" - "fmla z24.s, p1/M, z31.s, z23.s\n" - "scvtf z17.s, p1/m, z17.s\n" - "scvtf z18.s, p1/m, z18.s\n" - "fmla z15.s, p1/M, z13.s, z9.s\n" - "fmla z12.s, p1/M, z17.s, z21.s\n" - "fmla z0.s, p1/M, z18.s, z4.s\n" - "bgt 7b\n" - "mov x20, %x[res_ptr]\n" - "cmp x13, #0x1\n" - "st1w { z24.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x13, #0x2\n" - "st1w { z15.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x13, #0x3\n" - "st1w { z12.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "st1w { z0.s }, p1, [x20]\n" - "8:" // Row tail: Accumulator store skip - "subs x24, x24, #0x8\n" - "add %x[res_ptr], %x[res_ptr], #0x20\n" - "bne 6b\n" - "subs x13, x13, #0x4\n" - "add %x[a_ptr], %x[a_ptr], x12\n" - "mov %x[res_ptr], x23\n" - "bgt 5b\n" - "9:" // Row tail: Row loop skip - : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) - : [b_ptr] "r" (b_ptr), [height] "r" (height), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) - : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" - ); -#endif -} - -void ggml_gemm_q4_0_q8_0_aarch64_neon(size_t depth, size_t output_channels, size_t height, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { -#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) - int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); - int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); - size_t width = xend - x0; - - int64_t nb = depth / QK4_0; - const void * b_ptr = (void *)((block_q4_0x4 *) vx + ((x0 / 4) * nb)); - const void * a_ptr = vy; - float * res_ptr = s + x0; - size_t res_stride = output_channels * sizeof(float); - - assert(depth % 32 == 0); - assert(width % 4 == 0); - - size_t num_blocks = depth / 32; - - __asm__ __volatile__( - "mov x10, %x[height]\n" - "mov x9, #0x88\n" - "cmp x10, #0x10\n" - "mul x9, %x[num_blocks], x9\n" - "blt 4f\n" - "1:" // Row loop - "add x28, %x[b_ptr], #0x8\n" - "mov x27, %x[width]\n" - "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" - "2:" // Column loop - "add x25, %x[a_ptr], #0x8\n" - "movi v2.16b, #0x0\n" - "movi v10.16b, #0x0\n" - "mov x24, %x[num_blocks]\n" - "add x23, x25, x9\n" - "movi v12.16b, #0x0\n" - "movi v28.16b, #0x0\n" - "add x22, x23, x9\n" - "movi v11.16b, #0x0\n" - "movi v13.16b, #0x0\n" - "add x21, x22, x9\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v5.16b, #0x0\n" - "movi v7.16b, #0x0\n" - "movi v4.16b, #0x0\n" - "movi v6.16b, #0x0\n" - "movi v30.16b, #0x0\n" - "movi v24.16b, #0x0\n" - "movi v14.16b, #0x0\n" - "3:" // Block loop - "ldr q21, [x28, #0x0]\n" - "ldr q16, [x28, #0x10]\n" - "movi v1.16b, #0x4\n" - "movi v19.4s, #0x0\n" - "ldr q27, [x25, #0x0]\n" - "ldr q15, [x25, #0x10]\n" - "movi v26.4s, #0x0\n" - "movi v18.4s, #0x0\n" - "ldr q29, [x28, #0x20]\n" - "ldr q3, [x28, #0x30]\n" - "movi v17.4s, #0x0\n" - "movi v0.16b, #0xf0\n" - "ldr d20, [x25, #-0x8]\n" - "ldr d9, [x23, #-0x8]\n" - "sshl v8.16b, v21.16b, v1.16b\n" - "sshl v31.16b, v16.16b, v1.16b\n" - "and v21.16b, v21.16b, v0.16b\n" - "and v16.16b, v16.16b, v0.16b\n" - "sub x20, x28, #0x8\n" - "subs x24, 
x24, #0x1\n" - "add x28, x28, #0x48\n" - ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n" - ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n" - "ldr q27, [x25, #0x20]\n" - ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n" - ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n" - "sshl v15.16b, v29.16b, v1.16b\n" - "sshl v1.16b, v3.16b, v1.16b\n" - "and v29.16b, v29.16b, v0.16b\n" - "and v3.16b, v3.16b, v0.16b\n" - "ldr q0, [x25, #0x30]\n" - "fcvtl v20.4s, v20.4h\n" - ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, v15.16b\n" - "fcvtl v9.4s, v9.4h\n" - ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n" - "ldr q27, [x25, #0x40]\n" - ".inst 0x4e8fa412 // smmla v18.4s, v0.16b, v15.16b\n" - ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n" - "ldr q0, [x25, #0x50]\n" - ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n" - ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n" - "ldr q27, [x25, #0x60]\n" - ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n" - ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n" - "ldr q0, [x25, #0x70]\n" - "add x25, x25, #0x88\n" - ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n" - ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n" - "ldr d27, [x20, #0x0]\n" - ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n" - ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n" - "fcvtl v27.4s, v27.4h\n" - "uzp1 v0.2d, v19.2d, v26.2d\n" - "uzp2 v26.2d, v19.2d, v26.2d\n" - "fmul v19.4s, v27.4s, v20.s[0]\n" - "scvtf v0.4s, v0.4s, #0x4\n" - "scvtf v26.4s, v26.4s, #0x4\n" - "fmla v2.4s, v0.4s, v19.4s\n" - "ldr q19, [x23, #0x0]\n" - "uzp1 v0.2d, v18.2d, v17.2d\n" - "uzp2 v18.2d, v18.2d, v17.2d\n" - "fmul v17.4s, v27.4s, v20.s[1]\n" - "scvtf v0.4s, v0.4s, #0x4\n" - "scvtf v18.4s, v18.4s, #0x4\n" - "fmla v10.4s, v26.4s, v17.4s\n" - "ldr q17, [x23, #0x10]\n" - "fmul v26.4s, v27.4s, v20.s[2]\n" - "fmul v20.4s, v27.4s, v20.s[3]\n" - "fmla v12.4s, v0.4s, v26.4s\n" - "ldr d0, [x22, #-0x8]\n" - "ldr d26, [x21, #-0x8]\n" - "fcvtl v0.4s, v0.4h\n" - "fmla v28.4s, v18.4s, v20.4s\n" - "movi v20.4s, #0x0\n" - "movi v18.4s, #0x0\n" - ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" - ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" - "ldr q19, [x23, #0x20]\n" - "fcvtl v26.4s, v26.4h\n" - ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" - ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" - "ldr q19, [x23, #0x40]\n" - ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" - ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" - "ldr q19, [x23, #0x60]\n" - ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n" - ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n" - "uzp1 v19.2d, v20.2d, v18.2d\n" - "scvtf v19.4s, v19.4s, #0x4\n" - "uzp2 v20.2d, v20.2d, v18.2d\n" - "fmul v18.4s, v27.4s, v9.s[0]\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "fmla v11.4s, v19.4s, v18.4s\n" - "ldr q18, [x22, #0x0]\n" - "fmul v19.4s, v27.4s, v9.s[1]\n" - "fmla v13.4s, v20.4s, v19.4s\n" - "movi v19.4s, #0x0\n" - "movi v20.4s, #0x0\n" - ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n" - ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n" - "ldr q17, [x23, #0x30]\n" - ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n" - ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n" - "ldr q17, [x23, #0x50]\n" - ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n" - ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n" - "ldr q17, [x23, #0x70]\n" - "add x23, x23, #0x88\n" - ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n" - ".inst 0x4e83a634 // smmla v20.4s, 
v17.16b, v3.16b\n" - "uzp1 v17.2d, v19.2d, v20.2d\n" - "scvtf v17.4s, v17.4s, #0x4\n" - "uzp2 v20.2d, v19.2d, v20.2d\n" - "fmul v19.4s, v27.4s, v9.s[2]\n" - "fmul v9.4s, v27.4s, v9.s[3]\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "fmla v22.4s, v17.4s, v19.4s\n" - "ldr q17, [x22, #0x10]\n" - "movi v19.4s, #0x0\n" - ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n" - "fmla v23.4s, v20.4s, v9.4s\n" - "movi v20.4s, #0x0\n" - "movi v9.4s, #0x0\n" - ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n" - "ldr q18, [x22, #0x20]\n" - ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" - ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n" - ".inst 0x4e81a654 // smmla v20.4s, v18.16b, v1.16b\n" - "ldr q18, [x22, #0x40]\n" - ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n" - ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n" - "ldr q18, [x22, #0x60]\n" - ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n" - ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n" - "movi v18.4s, #0x0\n" - ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n" - "ldr q17, [x22, #0x30]\n" - ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" - ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n" - "ldr q17, [x22, #0x50]\n" - ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n" - ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n" - "ldr q17, [x22, #0x70]\n" - "add x22, x22, #0x88\n" - ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n" - ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n" - "uzp1 v17.2d, v19.2d, v20.2d\n" - "uzp2 v20.2d, v19.2d, v20.2d\n" - "fmul v19.4s, v27.4s, v0.s[0]\n" - "scvtf v17.4s, v17.4s, #0x4\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "fmla v25.4s, v17.4s, v19.4s\n" - "ldr q19, [x21, #0x0]\n" - "fmul v17.4s, v27.4s, v0.s[1]\n" - "fmla v5.4s, v20.4s, v17.4s\n" - "ldr q17, [x21, #0x10]\n" - "uzp1 v20.2d, v9.2d, v18.2d\n" - "uzp2 v9.2d, v9.2d, v18.2d\n" - "fmul v18.4s, v27.4s, v0.s[2]\n" - "fmul v0.4s, v27.4s, v0.s[3]\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "scvtf v9.4s, v9.4s, #0x4\n" - "fmla v7.4s, v20.4s, v18.4s\n" - "movi v20.4s, #0x0\n" - "movi v18.4s, #0x0\n" - ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" - ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" - "ldr q19, [x21, #0x20]\n" - "fmla v4.4s, v9.4s, v0.4s\n" - "movi v9.4s, #0x0\n" - "movi v0.4s, #0x0\n" - ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" - "fmul v8.4s, v27.4s, v26.s[0]\n" - ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n" - "ldr q17, [x21, #0x30]\n" - ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" - "fmul v31.4s, v27.4s, v26.s[1]\n" - ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" - "ldr q19, [x21, #0x40]\n" - ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" - "fmul v15.4s, v27.4s, v26.s[2]\n" - "fmul v27.4s, v27.4s, v26.s[3]\n" - ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n" - "ldr q1, [x21, #0x50]\n" - ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" - ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" - "ldr q26, [x21, #0x60]\n" - ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n" - ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n" - "ldr q21, [x21, #0x70]\n" - "add x21, x21, #0x88\n" - ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n" - ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n" - ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n" - ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n" - "uzp1 v29.2d, v20.2d, v18.2d\n" - "uzp2 v21.2d, v20.2d, v18.2d\n" - "scvtf v29.4s, v29.4s, #0x4\n" - "uzp1 v18.2d, v9.2d, v0.2d\n" 
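-        // Illustrative aside (not part of the patch): each SMMLA above multiplies a 2x8 block of
-        // int8 LHS rows with a 2x8 block of int8 RHS columns (transposed internally) and accumulates
-        // a 2x2 tile of int32 results, so a pair of tiles covers two rows by four output columns.
-        // The uzp1/uzp2 pairs then regroup the two tiles into per-row vectors, exactly as the
-        // intrinsics version of this kernel does with vmmlaq_s32 and vtrn1q/vtrn2q on 64-bit lanes:
-        //
-        //     int32x4_t tile_00 = vmmlaq_s32(vdupq_n_s32(0), lhs_rows01, rhs_cols01);
-        //     int32x4_t tile_01 = vmmlaq_s32(vdupq_n_s32(0), lhs_rows01, rhs_cols23);
-        //     int32x4_t row0 = vreinterpretq_s32_u64(
-        //         vtrn1q_u64(vreinterpretq_u64_s32(tile_00), vreinterpretq_u64_s32(tile_01)));
-        //     int32x4_t row1 = vreinterpretq_s32_u64(
-        //         vtrn2q_u64(vreinterpretq_u64_s32(tile_00), vreinterpretq_u64_s32(tile_01)));
-        //
-        // (lhs_rows01, rhs_cols01 and rhs_cols23 are placeholder names for the interleaved
-        // int8x16_t operands, not identifiers from the patch.)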
- "uzp2 v16.2d, v9.2d, v0.2d\n" - "scvtf v21.4s, v21.4s, #0x4\n" - "fmla v6.4s, v29.4s, v8.4s\n" - "scvtf v18.4s, v18.4s, #0x4\n" - "scvtf v16.4s, v16.4s, #0x4\n" - "fmla v30.4s, v21.4s, v31.4s\n" - "fmla v24.4s, v18.4s, v15.4s\n" - "fmla v14.4s, v16.4s, v27.4s\n" - "bgt 3b\n" - "mov x20, %x[res_ptr]\n" - "subs x27, x27, #0x4\n" - "add %x[res_ptr], %x[res_ptr], #0x10\n" - "str q2, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q10, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q12, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q28, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q11, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q13, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q22, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q23, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q25, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q5, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q7, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q4, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q6, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q30, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q24, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q14, [x20, #0x0]\n" - "bne 2b\n" - "mov x20, #0x4\n" - "sub x10, x10, #0x10\n" - "cmp x10, #0x10\n" - "mov %x[res_ptr], x26\n" - "madd %x[a_ptr], x20, x9, %x[a_ptr]\n" - "bge 1b\n" - "4:" // Row loop skip - "cbz x10, 9f\n" - "5:" // Row tail: Row loop - "add x24, %x[b_ptr], #0x8\n" - "mov x23, %x[width]\n" - "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" - "6:" // Row tail: Column loop - "movi v2.16b, #0x0\n" - "movi v10.16b, #0x0\n" - "add x25, %x[a_ptr], #0x8\n" - "mov x21, %x[num_blocks]\n" - "movi v12.16b, #0x0\n" - "movi v28.16b, #0x0\n" - "7:" // Row tail: Block loop - "ldr q6, [x24, #0x0]\n" - "ldr q5, [x24, #0x10]\n" - "movi v17.16b, #0x4\n" - "movi v8.4s, #0x0\n" - "ldr q4, [x25, #0x0]\n" - "ldr q13, [x25, #0x10]\n" - "movi v27.4s, #0x0\n" - "movi v0.4s, #0x0\n" - "ldr q31, [x24, #0x20]\n" - "ldr q14, [x24, #0x30]\n" - "movi v29.4s, #0x0\n" - "movi v22.16b, #0xf0\n" - "ldr q11, [x25, #0x20]\n" - "ldr q23, [x25, #0x30]\n" - "sshl v21.16b, v6.16b, v17.16b\n" - "sshl v16.16b, v5.16b, v17.16b\n" - "ldr q20, [x25, #0x40]\n" - "ldr q26, [x25, #0x50]\n" - "and v6.16b, v6.16b, v22.16b\n" - "and v5.16b, v5.16b, v22.16b\n" - "ldr q25, [x25, #0x60]\n" - "ldr q3, [x25, #0x70]\n" - "sshl v19.16b, v31.16b, v17.16b\n" - "sshl v18.16b, v14.16b, v17.16b\n" - "ldr d17, [x25, #-0x8]\n" - ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n" - ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n" - "and v31.16b, v31.16b, v22.16b\n" - ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n" - ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n" - "and v14.16b, v14.16b, v22.16b\n" - "sub x20, x24, #0x8\n" - "ldr d16, [x20, #0x0]\n" - "subs x21, x21, #0x1\n" - "add x25, x25, #0x88\n" - "fcvtl v17.4s, v17.4h\n" - "add x24, x24, #0x48\n" - ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n" - ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n" - ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n" - ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n" - "fcvtl v16.4s, v16.4h\n" - ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n" - ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n" - "fmul v23.4s, v16.4s, v17.s[0]\n" - "fmul v21.4s, v16.4s, v17.s[1]\n" - "fmul v1.4s, v16.4s, v17.s[2]\n" - "fmul v20.4s, v16.4s, 
v17.s[3]\n" - ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n" - ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n" - ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n" - ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n" - ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n" - ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n" - "uzp1 v19.2d, v8.2d, v27.2d\n" - "uzp2 v18.2d, v8.2d, v27.2d\n" - "scvtf v19.4s, v19.4s, #0x4\n" - "uzp1 v17.2d, v0.2d, v29.2d\n" - "uzp2 v16.2d, v0.2d, v29.2d\n" - "scvtf v18.4s, v18.4s, #0x4\n" - "fmla v2.4s, v19.4s, v23.4s\n" - "scvtf v17.4s, v17.4s, #0x4\n" - "scvtf v16.4s, v16.4s, #0x4\n" - "fmla v10.4s, v18.4s, v21.4s\n" - "fmla v12.4s, v17.4s, v1.4s\n" - "fmla v28.4s, v16.4s, v20.4s\n" - "bgt 7b\n" - "mov x20, %x[res_ptr]\n" - "cmp x10, #0x1\n" - "str q2, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x10, #0x2\n" - "str q10, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x10, #0x3\n" - "str q12, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "str q28, [x20, #0x0]\n" - "8:" // Row tail: Accumulator store skip - "subs x23, x23, #0x4\n" - "add %x[res_ptr], %x[res_ptr], #0x10\n" - "bne 6b\n" - "subs x10, x10, #0x4\n" - "add %x[a_ptr], %x[a_ptr], x9\n" - "mov %x[res_ptr], x22\n" - "bgt 5b\n" - "9:" // Row tail: Row loop skip - : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) - : [b_ptr] "r" (b_ptr), [height] "r" (height), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" - ); -#endif -} - -void ggml_gemm_q4_0_q8_0_aarch64_neon_noi8mm(size_t depth, size_t output_channels, size_t height, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { -#if defined(__ARM_NEON) - int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); - int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); - size_t width = xend - x0; - - int64_t nb = depth / QK4_0; - void * b_ptr = (void *)((block_q4_0x4 *) vx + ((x0/4) * nb)); - void * a_ptr = vy; - float * res_ptr = s + x0; - size_t res_stride = output_channels * sizeof(float); - - assert(depth % 32 == 0); - assert(width % 4 == 0); - - size_t num_blocks = depth / 32; - - __asm__ __volatile__( - "mov x10, %x[height]\n" - "mov x9, #0x88\n" - "cmp x10, #0x10\n" - "mul x9, %x[num_blocks], x9\n" - "blt 4f\n" - "1:" // Row loop - "add x28, %x[b_ptr], #0x8\n" - "mov x27, %x[width]\n" - "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" - "2:" // Column loop - "add x25, %x[a_ptr], #0x8\n" - "movi v15.16b, #0x0\n" - "movi v19.16b, #0x0\n" - "mov x24, %x[num_blocks]\n" - "add x23, x25, x9\n" - "movi v18.16b, #0x0\n" - "movi v14.16b, #0x0\n" - "add x22, x23, x9\n" - "movi v11.16b, #0x0\n" - "movi v13.16b, #0x0\n" - "add x21, x22, x9\n" - "movi v23.16b, #0x0\n" - "movi v16.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v7.16b, #0x0\n" - "movi v0.16b, #0x0\n" - "movi v4.16b, #0x0\n" - "movi v5.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "movi v8.16b, #0x0\n" - "movi v1.16b, #0x0\n" - "3:" // Block loop - "ldr q3, [x28, #0x0]\n" - "ldr q31, [x25, #0x0]\n" - "movi v28.16b, #0x4\n" - "movi v10.4s, #0x0\n" - "ldr q22, [x28, #0x10]\n" - "ldr q6, [x25, #0x10]\n" - "movi v29.4s, #0x0\n" 
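-        // Illustrative aside (not part of the patch): the sshl-by-4 / and-0xf0 pairs in these block
-        // loops unpack the q4_0 nibbles without any add or subtract. Assuming the rearranged weights
-        // already hold both nibbles in two's-complement form (the xor_mask applied when the blocks
-        // were interleaved), each half comes out as its value scaled by 16, e.g. with intrinsics
-        // (placeholder names, not identifiers from the patch):
-        //
-        //     const int8x16_t q  = vreinterpretq_s8_u8(vld1q_u8(qs));       // 32 packed 4-bit weights
-        //     const int8x16_t lo = vshlq_n_s8(q, 4);                        // low nibbles  -> value * 16
-        //     const int8x16_t hi = vandq_s8(q, vdupq_n_s8((int8_t) 0xf0));  // high nibbles -> value * 16
-        //
-        // The spare factor of 16 is removed later in one step, when the integer accumulators are
-        // converted to float with four fractional bits ("scvtf ..., #0x4", i.e. vcvtq_n_f32_s32(acc, 4)).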
- "movi v9.4s, #0x0\n" - "ldr q27, [x28, #0x20]\n" - "ldr q30, [x28, #0x30]\n" - "movi v20.4s, #0x0\n" - "movi v24.16b, #0xf0\n" - "ldr d2, [x25, #-0x8]\n" - "ldr d26, [x23, #-0x8]\n" - "sshl v12.16b, v3.16b, v28.16b\n" - "sub x20, x28, #0x8\n" - "ldr d17, [x20, #0x0]\n" - "and v3.16b, v3.16b, v24.16b\n" - "subs x24, x24, #0x1\n" - "add x28, x28, #0x48\n" - ".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n" - ".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n" - ".inst 0x4f9fe989 // sdot v9.4s, v12.16b, v31.4b[2]\n" - ".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n" - "sshl v31.16b, v22.16b, v28.16b\n" - "and v22.16b, v22.16b, v24.16b\n" - "fcvtl v17.4s, v17.4h\n" - "fcvtl v2.4s, v2.4h\n" - "fcvtl v26.4s, v26.4h\n" - ".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n" - ".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n" - ".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n" - ".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n" - "sshl v6.16b, v27.16b, v28.16b\n" - "sshl v28.16b, v30.16b, v28.16b\n" - "and v27.16b, v27.16b, v24.16b\n" - "and v30.16b, v30.16b, v24.16b\n" - "ldr q24, [x25, #0x20]\n" - ".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n" - ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n" - ".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n" - ".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n" - "ldr q24, [x25, #0x30]\n" - ".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n" - ".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n" - ".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n" - ".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, v24.4b[3]\n" - "ldr q24, [x25, #0x40]\n" - ".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n" - ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n" - ".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n" - ".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n" - "ldr q24, [x25, #0x50]\n" - ".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n" - ".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n" - ".inst 0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n" - ".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n" - "ldr q24, [x25, #0x60]\n" - ".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n" - ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n" - ".inst 0x4f98eb69 // sdot v9.4s, v27.16b, v24.4b[2]\n" - ".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n" - "ldr q24, [x25, #0x70]\n" - "add x25, x25, #0x88\n" - ".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n" - ".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n" - ".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n" - ".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n" - "fmul v24.4s, v17.4s, v2.s[0]\n" - "scvtf v10.4s, v10.4s, #0x4\n" - "scvtf v29.4s, v29.4s, #0x4\n" - "scvtf v9.4s, v9.4s, #0x4\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "fmla v15.4s, v10.4s, v24.4s\n" - "ldr q24, [x23, #0x0]\n" - "fmul v10.4s, v17.4s, v2.s[1]\n" - "fmla v19.4s, v29.4s, v10.4s\n" - "ldr q10, [x23, #0x10]\n" - "fmul v29.4s, v17.4s, v2.s[2]\n" - "fmul v2.4s, v17.4s, v2.s[3]\n" - "fmla v18.4s, v9.4s, v29.4s\n" - "movi v9.4s, #0x0\n" - "movi v29.4s, #0x0\n" - ".inst 0x4f98e189 // sdot v9.4s, v12.16b, v24.4b[0]\n" - ".inst 0x4fb8e19d // sdot v29.4s, v12.16b, v24.4b[1]\n" - "fmla v14.4s, v20.4s, v2.4s\n" - "movi v20.4s, #0x0\n" - "movi v2.4s, #0x0\n" - ".inst 0x4f98e994 // sdot v20.4s, v12.16b, v24.4b[2]\n" - ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n" - "ldr q24, [x23, #0x20]\n" - ".inst 0x4f8ae3e9 // sdot v9.4s, v31.16b, v10.4b[0]\n" - 
".inst 0x4faae3fd // sdot v29.4s, v31.16b, v10.4b[1]\n" - ".inst 0x4f8aebf4 // sdot v20.4s, v31.16b, v10.4b[2]\n" - ".inst 0x4faaebe2 // sdot v2.4s, v31.16b, v10.4b[3]\n" - "ldr q10, [x23, #0x30]\n" - ".inst 0x4f98e0c9 // sdot v9.4s, v6.16b, v24.4b[0]\n" - ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n" - ".inst 0x4f98e8d4 // sdot v20.4s, v6.16b, v24.4b[2]\n" - ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n" - "ldr q24, [x23, #0x40]\n" - ".inst 0x4f8ae389 // sdot v9.4s, v28.16b, v10.4b[0]\n" - ".inst 0x4faae39d // sdot v29.4s, v28.16b, v10.4b[1]\n" - ".inst 0x4f8aeb94 // sdot v20.4s, v28.16b, v10.4b[2]\n" - ".inst 0x4faaeb82 // sdot v2.4s, v28.16b, v10.4b[3]\n" - "ldr q10, [x23, #0x50]\n" - ".inst 0x4f98e069 // sdot v9.4s, v3.16b, v24.4b[0]\n" - ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n" - ".inst 0x4f98e874 // sdot v20.4s, v3.16b, v24.4b[2]\n" - ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n" - "ldr q24, [x23, #0x60]\n" - ".inst 0x4f8ae2c9 // sdot v9.4s, v22.16b, v10.4b[0]\n" - ".inst 0x4faae2dd // sdot v29.4s, v22.16b, v10.4b[1]\n" - ".inst 0x4f8aead4 // sdot v20.4s, v22.16b, v10.4b[2]\n" - ".inst 0x4faaeac2 // sdot v2.4s, v22.16b, v10.4b[3]\n" - "ldr q10, [x23, #0x70]\n" - "add x23, x23, #0x88\n" - ".inst 0x4f98e369 // sdot v9.4s, v27.16b, v24.4b[0]\n" - ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n" - ".inst 0x4f98eb74 // sdot v20.4s, v27.16b, v24.4b[2]\n" - ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n" - "ldr q24, [x22, #0x0]\n" - ".inst 0x4f8ae3c9 // sdot v9.4s, v30.16b, v10.4b[0]\n" - ".inst 0x4faae3dd // sdot v29.4s, v30.16b, v10.4b[1]\n" - ".inst 0x4f8aebd4 // sdot v20.4s, v30.16b, v10.4b[2]\n" - ".inst 0x4faaebc2 // sdot v2.4s, v30.16b, v10.4b[3]\n" - "fmul v10.4s, v17.4s, v26.s[0]\n" - "scvtf v9.4s, v9.4s, #0x4\n" - "scvtf v29.4s, v29.4s, #0x4\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "scvtf v2.4s, v2.4s, #0x4\n" - "fmla v11.4s, v9.4s, v10.4s\n" - "ldr q9, [x22, #0x10]\n" - "fmul v10.4s, v17.4s, v26.s[1]\n" - "fmla v13.4s, v29.4s, v10.4s\n" - "ldr d29, [x22, #-0x8]\n" - "fmul v10.4s, v17.4s, v26.s[2]\n" - "fmul v26.4s, v17.4s, v26.s[3]\n" - "fcvtl v29.4s, v29.4h\n" - "fmla v23.4s, v20.4s, v10.4s\n" - "movi v20.4s, #0x0\n" - "movi v10.4s, #0x0\n" - "fmla v16.4s, v2.4s, v26.4s\n" - "movi v26.4s, #0x0\n" - "movi v2.4s, #0x0\n" - ".inst 0x4f98e194 // sdot v20.4s, v12.16b, v24.4b[0]\n" - ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n" - ".inst 0x4f98e99a // sdot v26.4s, v12.16b, v24.4b[2]\n" - ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n" - "ldr q24, [x22, #0x20]\n" - ".inst 0x4f89e3f4 // sdot v20.4s, v31.16b, v9.4b[0]\n" - ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n" - ".inst 0x4f89ebfa // sdot v26.4s, v31.16b, v9.4b[2]\n" - ".inst 0x4fa9ebe2 // sdot v2.4s, v31.16b, v9.4b[3]\n" - "ldr q9, [x22, #0x30]\n" - ".inst 0x4f98e0d4 // sdot v20.4s, v6.16b, v24.4b[0]\n" - ".inst 0x4fb8e0ca // sdot v10.4s, v6.16b, v24.4b[1]\n" - ".inst 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n" - ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n" - "ldr q24, [x22, #0x40]\n" - ".inst 0x4f89e394 // sdot v20.4s, v28.16b, v9.4b[0]\n" - ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n" - ".inst 0x4f89eb9a // sdot v26.4s, v28.16b, v9.4b[2]\n" - ".inst 0x4fa9eb82 // sdot v2.4s, v28.16b, v9.4b[3]\n" - "ldr q9, [x22, #0x50]\n" - ".inst 0x4f98e074 // sdot v20.4s, v3.16b, v24.4b[0]\n" - ".inst 0x4fb8e06a // sdot v10.4s, v3.16b, v24.4b[1]\n" - ".inst 0x4f98e87a // sdot v26.4s, v3.16b, v24.4b[2]\n" - ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, 
v24.4b[3]\n" - "ldr q24, [x22, #0x60]\n" - ".inst 0x4f89e2d4 // sdot v20.4s, v22.16b, v9.4b[0]\n" - ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n" - ".inst 0x4f89eada // sdot v26.4s, v22.16b, v9.4b[2]\n" - ".inst 0x4fa9eac2 // sdot v2.4s, v22.16b, v9.4b[3]\n" - "ldr q9, [x22, #0x70]\n" - "add x22, x22, #0x88\n" - ".inst 0x4f98e374 // sdot v20.4s, v27.16b, v24.4b[0]\n" - ".inst 0x4fb8e36a // sdot v10.4s, v27.16b, v24.4b[1]\n" - ".inst 0x4f98eb7a // sdot v26.4s, v27.16b, v24.4b[2]\n" - ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n" - "ldr q24, [x21, #0x0]\n" - ".inst 0x4f89e3d4 // sdot v20.4s, v30.16b, v9.4b[0]\n" - ".inst 0x4fa9e3ca // sdot v10.4s, v30.16b, v9.4b[1]\n" - ".inst 0x4f89ebda // sdot v26.4s, v30.16b, v9.4b[2]\n" - ".inst 0x4fa9ebc2 // sdot v2.4s, v30.16b, v9.4b[3]\n" - "fmul v9.4s, v17.4s, v29.s[0]\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "scvtf v10.4s, v10.4s, #0x4\n" - "scvtf v26.4s, v26.4s, #0x4\n" - "scvtf v2.4s, v2.4s, #0x4\n" - "fmla v25.4s, v20.4s, v9.4s\n" - "ldr q9, [x21, #0x10]\n" - "fmul v20.4s, v17.4s, v29.s[1]\n" - "fmla v7.4s, v10.4s, v20.4s\n" - "ldr d20, [x21, #-0x8]\n" - "fmul v10.4s, v17.4s, v29.s[2]\n" - "fmul v29.4s, v17.4s, v29.s[3]\n" - "fcvtl v20.4s, v20.4h\n" - "fmla v0.4s, v26.4s, v10.4s\n" - "movi v26.4s, #0x0\n" - "movi v10.4s, #0x0\n" - "fmla v4.4s, v2.4s, v29.4s\n" - "movi v2.4s, #0x0\n" - "movi v29.4s, #0x0\n" - ".inst 0x4f98e19a // sdot v26.4s, v12.16b, v24.4b[0]\n" - ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n" - ".inst 0x4f98e982 // sdot v2.4s, v12.16b, v24.4b[2]\n" - ".inst 0x4fb8e99d // sdot v29.4s, v12.16b, v24.4b[3]\n" - "ldr q12, [x21, #0x20]\n" - "fmul v24.4s, v17.4s, v20.s[0]\n" - ".inst 0x4f89e3fa // sdot v26.4s, v31.16b, v9.4b[0]\n" - ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n" - ".inst 0x4f89ebe2 // sdot v2.4s, v31.16b, v9.4b[2]\n" - ".inst 0x4fa9ebfd // sdot v29.4s, v31.16b, v9.4b[3]\n" - "ldr q9, [x21, #0x30]\n" - "fmul v31.4s, v17.4s, v20.s[1]\n" - ".inst 0x4f8ce0da // sdot v26.4s, v6.16b, v12.4b[0]\n" - ".inst 0x4face0ca // sdot v10.4s, v6.16b, v12.4b[1]\n" - ".inst 0x4f8ce8c2 // sdot v2.4s, v6.16b, v12.4b[2]\n" - ".inst 0x4face8dd // sdot v29.4s, v6.16b, v12.4b[3]\n" - "ldr q12, [x21, #0x40]\n" - "fmul v6.4s, v17.4s, v20.s[2]\n" - "fmul v20.4s, v17.4s, v20.s[3]\n" - ".inst 0x4f89e39a // sdot v26.4s, v28.16b, v9.4b[0]\n" - ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n" - ".inst 0x4f89eb82 // sdot v2.4s, v28.16b, v9.4b[2]\n" - ".inst 0x4fa9eb9d // sdot v29.4s, v28.16b, v9.4b[3]\n" - "ldr q9, [x21, #0x50]\n" - ".inst 0x4f8ce07a // sdot v26.4s, v3.16b, v12.4b[0]\n" - ".inst 0x4face06a // sdot v10.4s, v3.16b, v12.4b[1]\n" - ".inst 0x4f8ce862 // sdot v2.4s, v3.16b, v12.4b[2]\n" - ".inst 0x4face87d // sdot v29.4s, v3.16b, v12.4b[3]\n" - "ldr q12, [x21, #0x60]\n" - ".inst 0x4f89e2da // sdot v26.4s, v22.16b, v9.4b[0]\n" - ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n" - ".inst 0x4f89eac2 // sdot v2.4s, v22.16b, v9.4b[2]\n" - ".inst 0x4fa9eadd // sdot v29.4s, v22.16b, v9.4b[3]\n" - "ldr q17, [x21, #0x70]\n" - "add x21, x21, #0x88\n" - ".inst 0x4f8ce37a // sdot v26.4s, v27.16b, v12.4b[0]\n" - ".inst 0x4face36a // sdot v10.4s, v27.16b, v12.4b[1]\n" - ".inst 0x4f8ceb62 // sdot v2.4s, v27.16b, v12.4b[2]\n" - ".inst 0x4faceb7d // sdot v29.4s, v27.16b, v12.4b[3]\n" - ".inst 0x4f91e3da // sdot v26.4s, v30.16b, v17.4b[0]\n" - ".inst 0x4fb1e3ca // sdot v10.4s, v30.16b, v17.4b[1]\n" - ".inst 0x4f91ebc2 // sdot v2.4s, v30.16b, v17.4b[2]\n" - ".inst 0x4fb1ebdd // sdot v29.4s, v30.16b, v17.4b[3]\n" - 
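-        // Illustrative aside (not part of the patch): once a block's sdot chain is finished, each
-        // int32 accumulator holds raw dot products for four output columns. The scvtf/fmul/fmla
-        // sequences below turn them into the final float contribution: convert with four fractional
-        // bits (undoing the x16 nibble scaling), multiply by the combined per-block scale, and
-        // accumulate. Roughly, with placeholder names:
-        //
-        //     float32x4_t sums  = vcvtq_n_f32_s32(iacc, 4);          // iacc / 16
-        //     float32x4_t scale = vmulq_laneq_f32(col_d, row_d, 0);  // d_rhs[0..3] * d_lhs[row 0]
-        //     acc_row           = vfmaq_f32(acc_row, sums, scale);
-        //
-        // col_d holds the four fp16 column scales widened to fp32, row_d the four row scales, and
-        // the lane index selects which LHS row's scale applies.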
"scvtf v26.4s, v26.4s, #0x4\n" - "scvtf v10.4s, v10.4s, #0x4\n" - "fmla v5.4s, v26.4s, v24.4s\n" - "scvtf v2.4s, v2.4s, #0x4\n" - "scvtf v29.4s, v29.4s, #0x4\n" - "fmla v21.4s, v10.4s, v31.4s\n" - "fmla v8.4s, v2.4s, v6.4s\n" - "fmla v1.4s, v29.4s, v20.4s\n" - "bgt 3b\n" - "mov x20, %x[res_ptr]\n" - "subs x27, x27, #0x4\n" - "add %x[res_ptr], %x[res_ptr], #0x10\n" - "str q15, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q19, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q18, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q14, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q11, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q13, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q23, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q16, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q25, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q7, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q0, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q4, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q5, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q21, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q8, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q1, [x20, #0x0]\n" - "bne 2b\n" - "mov x20, #0x4\n" - "sub x10, x10, #0x10\n" - "cmp x10, #0x10\n" - "mov %x[res_ptr], x26\n" - "madd %x[a_ptr], x20, x9, %x[a_ptr]\n" - "bge 1b\n" - "4:" // Row loop skip - "cbz x10, 9f\n" - "5:" // Row tail: Row loop - "add x24, %x[b_ptr], #0x8\n" - "mov x23, %x[width]\n" - "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" - "6:" // Row tail: Column loop - "movi v15.16b, #0x0\n" - "movi v19.16b, #0x0\n" - "add x25, %x[a_ptr], #0x8\n" - "mov x21, %x[num_blocks]\n" - "movi v18.16b, #0x0\n" - "movi v14.16b, #0x0\n" - "7:" // Row tail: Block loop - "ldr q7, [x24, #0x0]\n" - "ldr q5, [x25, #0x0]\n" - "movi v9.16b, #0x4\n" - "movi v4.4s, #0x0\n" - "ldr q3, [x24, #0x10]\n" - "ldr q2, [x25, #0x10]\n" - "movi v1.4s, #0x0\n" - "movi v0.4s, #0x0\n" - "ldr q13, [x24, #0x20]\n" - "ldr q31, [x25, #0x20]\n" - "movi v30.4s, #0x0\n" - "movi v29.16b, #0xf0\n" - "ldr q28, [x24, #0x30]\n" - "ldr q27, [x25, #0x30]\n" - "sshl v20.16b, v7.16b, v9.16b\n" - "sub x20, x24, #0x8\n" - "ldr q26, [x25, #0x40]\n" - "ldr q25, [x25, #0x50]\n" - "sshl v17.16b, v3.16b, v9.16b\n" - "and v7.16b, v7.16b, v29.16b\n" - "ldr q24, [x25, #0x60]\n" - "ldr q16, [x25, #0x70]\n" - "sshl v22.16b, v13.16b, v9.16b\n" - "and v3.16b, v3.16b, v29.16b\n" - "ldr d21, [x20, #0x0]\n" - "ldr d12, [x25, #-0x8]\n" - ".inst 0x4f85e284 // sdot v4.4s, v20.16b, v5.4b[0]\n" - ".inst 0x4fa5e281 // sdot v1.4s, v20.16b, v5.4b[1]\n" - ".inst 0x4f85ea80 // sdot v0.4s, v20.16b, v5.4b[2]\n" - ".inst 0x4fa5ea9e // sdot v30.4s, v20.16b, v5.4b[3]\n" - "sshl v9.16b, v28.16b, v9.16b\n" - "subs x21, x21, #0x1\n" - "and v13.16b, v13.16b, v29.16b\n" - "and v28.16b, v28.16b, v29.16b\n" - "add x25, x25, #0x88\n" - "add x24, x24, #0x48\n" - "fcvtl v21.4s, v21.4h\n" - "fcvtl v12.4s, v12.4h\n" - ".inst 0x4f82e224 // sdot v4.4s, v17.16b, v2.4b[0]\n" - ".inst 0x4fa2e221 // sdot v1.4s, v17.16b, v2.4b[1]\n" - ".inst 0x4f82ea20 // sdot v0.4s, v17.16b, v2.4b[2]\n" - ".inst 0x4fa2ea3e // sdot v30.4s, v17.16b, v2.4b[3]\n" - "fmul v11.4s, v21.4s, v12.s[0]\n" - "fmul v23.4s, v21.4s, v12.s[1]\n" - "fmul v17.4s, v21.4s, v12.s[2]\n" - ".inst 0x4f9fe2c4 // sdot v4.4s, v22.16b, v31.4b[0]\n" - "fmul v6.4s, v21.4s, v12.s[3]\n" - ".inst 0x4fbfe2c1 // sdot v1.4s, v22.16b, v31.4b[1]\n" 
- ".inst 0x4f9feac0 // sdot v0.4s, v22.16b, v31.4b[2]\n" - ".inst 0x4fbfeade // sdot v30.4s, v22.16b, v31.4b[3]\n" - ".inst 0x4f9be124 // sdot v4.4s, v9.16b, v27.4b[0]\n" - ".inst 0x4fbbe121 // sdot v1.4s, v9.16b, v27.4b[1]\n" - ".inst 0x4f9be920 // sdot v0.4s, v9.16b, v27.4b[2]\n" - ".inst 0x4fbbe93e // sdot v30.4s, v9.16b, v27.4b[3]\n" - ".inst 0x4f9ae0e4 // sdot v4.4s, v7.16b, v26.4b[0]\n" - ".inst 0x4fbae0e1 // sdot v1.4s, v7.16b, v26.4b[1]\n" - ".inst 0x4f9ae8e0 // sdot v0.4s, v7.16b, v26.4b[2]\n" - ".inst 0x4fbae8fe // sdot v30.4s, v7.16b, v26.4b[3]\n" - ".inst 0x4f99e064 // sdot v4.4s, v3.16b, v25.4b[0]\n" - ".inst 0x4fb9e061 // sdot v1.4s, v3.16b, v25.4b[1]\n" - ".inst 0x4f99e860 // sdot v0.4s, v3.16b, v25.4b[2]\n" - ".inst 0x4fb9e87e // sdot v30.4s, v3.16b, v25.4b[3]\n" - ".inst 0x4f98e1a4 // sdot v4.4s, v13.16b, v24.4b[0]\n" - ".inst 0x4fb8e1a1 // sdot v1.4s, v13.16b, v24.4b[1]\n" - ".inst 0x4f98e9a0 // sdot v0.4s, v13.16b, v24.4b[2]\n" - ".inst 0x4fb8e9be // sdot v30.4s, v13.16b, v24.4b[3]\n" - ".inst 0x4f90e384 // sdot v4.4s, v28.16b, v16.4b[0]\n" - ".inst 0x4fb0e381 // sdot v1.4s, v28.16b, v16.4b[1]\n" - ".inst 0x4f90eb80 // sdot v0.4s, v28.16b, v16.4b[2]\n" - ".inst 0x4fb0eb9e // sdot v30.4s, v28.16b, v16.4b[3]\n" - "scvtf v4.4s, v4.4s, #0x4\n" - "scvtf v1.4s, v1.4s, #0x4\n" - "scvtf v0.4s, v0.4s, #0x4\n" - "fmla v15.4s, v4.4s, v11.4s\n" - "scvtf v30.4s, v30.4s, #0x4\n" - "fmla v19.4s, v1.4s, v23.4s\n" - "fmla v18.4s, v0.4s, v17.4s\n" - "fmla v14.4s, v30.4s, v6.4s\n" - "bgt 7b\n" - "mov x20, %x[res_ptr]\n" - "cmp x10, #0x1\n" - "str q15, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x10, #0x2\n" - "str q19, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x10, #0x3\n" - "str q18, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "str q14, [x20, #0x0]\n" - "8:" // Row tail: Accumulator store skip - "subs x23, x23, #0x4\n" - "add %x[res_ptr], %x[res_ptr], #0x10\n" - "bne 6b\n" - "subs x10, x10, #0x4\n" - "add %x[a_ptr], %x[a_ptr], x9\n" - "mov %x[res_ptr], x22\n" - "bgt 5b\n" - "9:" // Row tail: Row loop skip - : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) - : [b_ptr] "r" (b_ptr), [height] "r" (height), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" - ); -#endif -} - -void ggml_gemm_q8_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { -#if defined(__ARM_FEATURE_MATMUL_INT8) - int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); - int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); - - int64_t nb = n / QK8_0; - int64_t a_nb = n / QK8_0; - - const block_q8_0x4 * b_ptr_start = vx; - const block_q8_0x4 * a_ptr_start = vy; - - for (int64_t y = 0; y < input_width / 4; y += rows / 4) { - for (int64_t x = x0 / 4; x < xend / 4; x++) { - const block_q8_0x4 * a_ptrs[rows / 4]; - - a_ptrs[0] = a_ptr_start + (y * a_nb); - for (int i = 0; i < (rows / 4) - 1; i++) { - a_ptrs[i + 1] = a_ptrs[i] + a_nb; - } - - const block_q8_0x4 * b_ptr = b_ptr_start + (x * nb); - - // Master FP accumulators - float32x4_t acc_rows[rows]; - for (int i = 0; i < rows; 
i++) { - acc_rows[i] = vdupq_n_f32(0.0f); - } - - for (int64_t b = 0; b < nb; b++) { - // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) - const int8x16_t rhs_mat_01_0 = vld1q_s8(b_ptr[b].qs); - const int8x16_t rhs_mat_23_0 = vld1q_s8(b_ptr[b].qs + 16); - const int8x16_t rhs_mat_01_1 = vld1q_s8(b_ptr[b].qs + 32); - const int8x16_t rhs_mat_23_1 = vld1q_s8(b_ptr[b].qs + 48); - const int8x16_t rhs_mat_01_2 = vld1q_s8(b_ptr[b].qs + 64); - const int8x16_t rhs_mat_23_2 = vld1q_s8(b_ptr[b].qs + 80); - const int8x16_t rhs_mat_01_3 = vld1q_s8(b_ptr[b].qs + 96); - const int8x16_t rhs_mat_23_3 = vld1q_s8(b_ptr[b].qs + 112); - - // Scale values - assemble the four row/column scales into a (64-bit) vector, then expand to FP32 - const float16x4_t col_scale_f16 = vld1_f16(b_ptr[b].d); - const float32x4_t col_scale_f32 = vcvt_f32_f16(col_scale_f16); - - // Process LHS in pairs of rows - for (int rp = 0; rp < rows / 4; rp++) { - const int8x16_t lhs_mat_01_0 = vld1q_s8(a_ptrs[rp][b].qs); - const int8x16_t lhs_mat_23_0 = vld1q_s8(a_ptrs[rp][b].qs + 16); - const int8x16_t lhs_mat_01_1 = vld1q_s8(a_ptrs[rp][b].qs + 32); - const int8x16_t lhs_mat_23_1 = vld1q_s8(a_ptrs[rp][b].qs + 48); - - const int8x16_t lhs_mat_01_2 = vld1q_s8(a_ptrs[rp][b].qs + 64); - const int8x16_t lhs_mat_23_2 = vld1q_s8(a_ptrs[rp][b].qs + 80); - const int8x16_t lhs_mat_01_3 = vld1q_s8(a_ptrs[rp][b].qs + 96); - const int8x16_t lhs_mat_23_3 = vld1q_s8(a_ptrs[rp][b].qs + 112); - - // Do the MMLAs into 2x2 matrices - const int32x4_t iacc_mat_00 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_01_0), lhs_mat_01_1, rhs_mat_01_1), lhs_mat_01_2, rhs_mat_01_2), lhs_mat_01_3, rhs_mat_01_3); - const int32x4_t iacc_mat_01 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_23_0), lhs_mat_01_1, rhs_mat_23_1), lhs_mat_01_2, rhs_mat_23_2), lhs_mat_01_3, rhs_mat_23_3); - const int32x4_t iacc_mat_10 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_23_0, rhs_mat_01_0), lhs_mat_23_1, rhs_mat_01_1), lhs_mat_23_2, rhs_mat_01_2), lhs_mat_23_3, rhs_mat_01_3); - const int32x4_t iacc_mat_11 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_23_0, rhs_mat_23_0), lhs_mat_23_1, rhs_mat_23_1), lhs_mat_23_2, rhs_mat_23_2), lhs_mat_23_3, rhs_mat_23_3); - - // Straighten out to make 4 row vectors - const int32x4_t iacc_row_0 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); - const int32x4_t iacc_row_1 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); - const int32x4_t iacc_row_2 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_10), vreinterpretq_u64_s32(iacc_mat_11))); - const int32x4_t iacc_row_3 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_10), vreinterpretq_u64_s32(iacc_mat_11))); - - const float16x4_t row_scale_f16 = vld1_f16(a_ptrs[rp][b].d); - const float32x4_t row_scale_f32 = vcvt_f32_f16(row_scale_f16); - - acc_rows[rp * 4] = vfmaq_f32(acc_rows[rp * 4], vcvtq_f32_s32(iacc_row_0), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 0)); - acc_rows[rp * 4 + 1] = vfmaq_f32(acc_rows[rp * 4 + 1], vcvtq_f32_s32(iacc_row_1), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 1)); - acc_rows[rp * 4 + 2] = vfmaq_f32(acc_rows[rp * 4 + 2], vcvtq_f32_s32(iacc_row_2), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 2)); - acc_rows[rp * 4 + 3] = vfmaq_f32(acc_rows[rp * 4 + 3], vcvtq_f32_s32(iacc_row_3), 
vmulq_laneq_f32(col_scale_f32, row_scale_f32, 3)); - } - } - - for (int i = 0; i < rows; i++) { - vst1q_f32(s + ((y * 4 + i) * output_channels + x * 4), acc_rows[i]); - } - } - } -#endif -} - static bool validate_float(float f, size_t i) { if (isinf(f)) { fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i); diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index ccc255d19ac99..34ea02189b873 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -122,35 +122,12 @@ size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q4_0_aarch64(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); void iq2xs_init_impl(enum ggml_type type); void iq2xs_free_impl(enum ggml_type type); void iq3xs_init_impl(int grid_size); void iq3xs_free_impl(int grid_size); -block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len, unsigned int xor_mask); -block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len, unsigned int xor_mask); -block_q8_0x4 make_block_q8_0x4(const block_q8_0 * const in[4], unsigned int block_len); -block_q8_0x8 make_block_q8_0x8(const block_q8_0 * const in[8], unsigned int block_len); -void quantize_row_q8_0_aarch64(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k, int nrows_interleaved, int blocklen_per_row); - -// GEMV -void ggml_gemv_q4_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); -void ggml_gemv_q4_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); -void ggml_gemv_q4_0_q8_0_aarch64_sve256(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); -void ggml_gemv_q4_0_q8_0_aarch64_neon(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); -void ggml_gemv_q4_0_q8_0_aarch64_neon_noi8mm(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); -void ggml_gemv_q8_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); -void ggml_gemv_q8_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); - -// GEMM -void ggml_gemm_q4_0_q8_0(const int n, int rows, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); -void ggml_gemm_q4_0_q8_0_aarch64_sve256(size_t depth, size_t output_channels, size_t 
height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); -void ggml_gemm_q4_0_q8_0_aarch64_neon(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); -void ggml_gemm_q4_0_q8_0_aarch64_neon_noi8mm(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); -void ggml_gemm_q8_0_q8_0(const int n, int rows, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); - #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index ced8a1a606289..84568c7e7b229 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -5,6 +5,7 @@ #include "ggml-impl.h" #include "ggml-quants.h" #include "ggml.h" +#include "ggml-aarch64.h" #if defined(_MSC_VER) || defined(__MINGW32__) @@ -12345,46 +12346,46 @@ UseGgmlGemm2:; // printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1); if ((ggml_n_dims(src0) == 2) && (ne11 == 1) && (type == GGML_TYPE_Q4_0_AARCH64)) { - gemv(ne00, ne01, 1, (float *)((char *) dst->data), (const char *) src0->data, (const char *) wdata, ith, nth); // use Arm Neon/SVE GEMV kernels + gemv(ne00, (float *)((char *) dst->data), (const char *) src0->data, (const char *) wdata, 1, ne01, ith, nth); } else if ((ggml_n_dims(src0) == 2) && (ne11 >= 16) && (type == GGML_TYPE_Q4_0_AARCH64)) { - // use batch-sized 16, 8, and 4 GEMM kernels + // use nrows-sized 16, 8, and 4 GEMM kernels for (int row_iter = 0; row_iter < ne11 / 16; row_iter++) { - gemm(ne00, ne01, 16, (float *)((char *) dst->data + (row_iter * 16 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 16) * row_size : (row_iter * 16 * nb11)), ith, nth); + gemm(ne00, (float *)((char *) dst->data + (row_iter * 16 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 16) * row_size : (row_iter * 16 * nb11)), 16, ne01, ith, nth); } int rows_processed = (ne11 / 16) * 16; for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 8; row_iter++) { - gemm(ne00, ne01, 8, (float *)((char *) dst->data + ((rows_processed + row_iter * 8) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 8) * row_size : ((rows_processed + row_iter * 8) * nb11)), ith, nth); + gemm(ne00, (float *)((char *) dst->data + ((rows_processed + row_iter * 8) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 8) * row_size : ((rows_processed + row_iter * 8) * nb11)), 8, ne01, ith, nth); } rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8; for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { - gemm(ne00, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? 
(rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth); + gemm(ne00, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), 4, ne01, ith, nth); } rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4; for (int row_iter = rows_processed; row_iter < ne11; row_iter++) { - gemv(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); + gemv(ne00, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), 1, ne01, ith, nth); } } else if ((ggml_n_dims(src0) == 2) && (ne11 >= 8) && (type == GGML_TYPE_Q4_0_AARCH64)) { - // use batch-sized 8, and 4 GEMM kernels + // use nrows-sized 8, and 4 GEMM kernels for (int row_iter = 0; row_iter < ne11 / 8; row_iter++) { - gemm(ne00, ne01, 8, (float *)((char *) dst->data + (row_iter * 8 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 8) * row_size : (row_iter * 8 * nb11)), ith, nth); + gemm(ne00, (float *)((char *) dst->data + (row_iter * 8 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 8) * row_size : (row_iter * 8 * nb11)), 8, ne01, ith, nth); } int rows_processed = (ne11 / 8) * 8; for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { - gemm(ne00, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth); + gemm(ne00, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), 4, ne01, ith, nth); } for (int row_iter = ((ne11 / 8) * 8) + ((ne11 - rows_processed) / 4 * 4); row_iter < ne11; row_iter++) { - gemv(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); + gemv(ne00, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), 1, ne01, ith, nth); } } else if ((ggml_n_dims(src0) == 2) && (ne11 >= 4) && (type == GGML_TYPE_Q4_0_AARCH64)) { - // use batch-sized 4 GEMM kernel + // use nrows-sized 4 GEMM kernel for (int row_iter = 0; row_iter < ne11 / 4; row_iter++) { - gemm(ne00, ne01, 4, (float *)((char *) dst->data + (row_iter * 4 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 4) * row_size : (row_iter * 4 * nb11)), ith, nth); + gemm(ne00, (float *)((char *) dst->data + (row_iter * 4 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? 
(row_iter * 4) * row_size : (row_iter * 4 * nb11)), 4, ne01, ith, nth); } for (int row_iter = (ne11 / 4) * 4; row_iter < ne11; row_iter++) { - gemv(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); + gemv(ne00, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), 1, ne01, ith, nth); } } else { From 441ab6498918280fe977b48e8c82b54a3b325dae Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Mon, 29 Apr 2024 15:01:54 +0000 Subject: [PATCH 06/28] Arm AArch64: add copyright claim only to ggml-aarch64.cpp and ggml-aarch64.h files --- ggml/include/ggml.h | 1 - ggml/src/ggml-impl.h | 1 - ggml/src/ggml-quants.c | 1 - ggml/src/ggml-quants.h | 1 - ggml/src/ggml.c | 1 - src/llama.cpp | 1 - 6 files changed, 6 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 8037e21a1a1b5..1e8bb058cc290 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1,4 +1,3 @@ -// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd. #pragma once // diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 23a85229afaf2..a2c8dbec0824f 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -1,4 +1,3 @@ -// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd. #pragma once #include "ggml.h" diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 64aae855873fc..0eb52e485089f 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -1,4 +1,3 @@ -// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd. #define GGML_COMMON_IMPL_C #include "ggml-common.h" diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 34ea02189b873..30983b8728fa2 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -1,4 +1,3 @@ -// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd. #pragma once #define GGML_COMMON_DECL_C diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 84568c7e7b229..aab44842b5c93 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1,4 +1,3 @@ -// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd. #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows #define _USE_MATH_DEFINES // For M_PI on MSVC diff --git a/src/llama.cpp b/src/llama.cpp index ff76310542170..6b19d1b2a0363 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1,4 +1,3 @@ -// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd. 
#define LLAMA_API_INTERNAL #include "llama.h" From 8ee677914750ca915382125b4aa32f50651c2653 Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Wed, 1 May 2024 06:53:48 +0000 Subject: [PATCH 07/28] Arm AArch64: minor code refactoring for rebase --- ggml-aarch64.cpp | 2 +- ggml-aarch64.h | 2 +- ggml/src/ggml-quants.c | 23 +++++++++++++++++++++++ ggml/src/ggml.c | 39 ++++++--------------------------------- 4 files changed, 31 insertions(+), 35 deletions(-) diff --git a/ggml-aarch64.cpp b/ggml-aarch64.cpp index 8dedc7e52701a..82754b29ea10b 100644 --- a/ggml-aarch64.cpp +++ b/ggml-aarch64.cpp @@ -92,7 +92,7 @@ size_t quantize_q4_0_aarch64(const float * GGML_RESTRICT src, void * GGML_RESTRI } } -void quantize_row_q8_0_aarch64(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k, int nrows_interleaved, int blocklen_per_row) { +void quantize_q8_0_aarch64(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k, int nrows_interleaved, int blocklen_per_row) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; diff --git a/ggml-aarch64.h b/ggml-aarch64.h index bff5b7b80c88b..e83b0178774aa 100644 --- a/ggml-aarch64.h +++ b/ggml-aarch64.h @@ -13,7 +13,7 @@ extern "C" { #endif // Quantization -void quantize_row_q8_0_aarch64(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k, int nrows_interleaved, int blocklen_per_row); +void quantize_q8_0_aarch64(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k, int nrows_interleaved, int blocklen_per_row); // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") size_t quantize_q4_0_aarch64(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 0eb52e485089f..7320000902f01 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -14760,6 +14760,16 @@ static bool validate_fp16(ggml_fp16_t f, size_t i) { } \ } +#define VALIDATE_ROW_DATA_DVEC_F16_IMPL(type, data, nb, nr) \ + const type * q = (const type *) (data); \ + for (size_t i = 0; i < (nb); ++i) { \ + for (size_t j = 0; j < (nr); ++j) { \ + if (!validate_fp16(q[i].d[j], i)) { \ + return false; \ + } \ + } \ + } + bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) { if (type < 0 || type >= GGML_TYPE_COUNT) { fprintf(stderr, "%s: invalid type %d\n", __func__, type); @@ -14977,6 +14987,19 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte { VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb); } break; + case GGML_TYPE_Q4_0_AARCH64: + { +#if defined(__ARM_FEATURE_SVE) + if (svcntw() == 8) { + VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8); + } + else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { + VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4); + } +#elif defined(__ARM_NEON) + VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4); +#endif + } break; case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index aab44842b5c93..bfa329875d364 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -705,7 +705,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { #else .nrows = 1, #endif - .from_float_to_mat = quantize_row_q8_0_aarch64, + .from_float_to_mat = quantize_q8_0_aarch64, }, [GGML_TYPE_Q8_1] = { .type_name = "q8_1", @@ -909,16 
+909,12 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .blck_size = QK4_0, .type_size = sizeof(block_q4_0), .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q4_0, - .from_float = quantize_row_q4_0, - .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference, - .vec_dot = ggml_vec_dot_q4_0_q8_0, + .to_float = NULL, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = NULL, .vec_dot_type = GGML_TYPE_Q8_0, -#if defined (__ARM_FEATURE_MATMUL_INT8) - .nrows = 2, -#else .nrows = 1, -#endif #if defined(__ARM_FEATURE_SVE) .gemv = ggml_gemv_q4_0_q8_0_aarch64_sve256, .gemm = ggml_gemm_q4_0_q8_0_aarch64_sve256, @@ -12347,8 +12343,7 @@ UseGgmlGemm2:; if ((ggml_n_dims(src0) == 2) && (ne11 == 1) && (type == GGML_TYPE_Q4_0_AARCH64)) { gemv(ne00, (float *)((char *) dst->data), (const char *) src0->data, (const char *) wdata, 1, ne01, ith, nth); } - else if ((ggml_n_dims(src0) == 2) && (ne11 >= 16) && (type == GGML_TYPE_Q4_0_AARCH64)) { - // use nrows-sized 16, 8, and 4 GEMM kernels + else if ((ggml_n_dims(src0) == 2) && (ne11 >= 2) && (type == GGML_TYPE_Q4_0_AARCH64)) { for (int row_iter = 0; row_iter < ne11 / 16; row_iter++) { gemm(ne00, (float *)((char *) dst->data + (row_iter * 16 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 16) * row_size : (row_iter * 16 * nb11)), 16, ne01, ith, nth); } @@ -12365,28 +12360,6 @@ UseGgmlGemm2:; gemv(ne00, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), 1, ne01, ith, nth); } } - else if ((ggml_n_dims(src0) == 2) && (ne11 >= 8) && (type == GGML_TYPE_Q4_0_AARCH64)) { - // use nrows-sized 8, and 4 GEMM kernels - for (int row_iter = 0; row_iter < ne11 / 8; row_iter++) { - gemm(ne00, (float *)((char *) dst->data + (row_iter * 8 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 8) * row_size : (row_iter * 8 * nb11)), 8, ne01, ith, nth); - } - int rows_processed = (ne11 / 8) * 8; - for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { - gemm(ne00, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), 4, ne01, ith, nth); - } - for (int row_iter = ((ne11 / 8) * 8) + ((ne11 - rows_processed) / 4 * 4); row_iter < ne11; row_iter++) { - gemv(ne00, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), 1, ne01, ith, nth); - } - } - else if ((ggml_n_dims(src0) == 2) && (ne11 >= 4) && (type == GGML_TYPE_Q4_0_AARCH64)) { - // use nrows-sized 4 GEMM kernel - for (int row_iter = 0; row_iter < ne11 / 4; row_iter++) { - gemm(ne00, (float *)((char *) dst->data + (row_iter * 4 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 4) * row_size : (row_iter * 4 * nb11)), 4, ne01, ith, nth); - } - for (int row_iter = (ne11 / 4) * 4; row_iter < ne11; row_iter++) { - gemv(ne00, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? 
(row_iter)*row_size : (row_iter * nb11)), 1, ne01, ith, nth); - } - } else { // The first chunk comes from our thread_id, the rest will get auto-assigned. int current_chunk = ith; From a657246d622bef13bfc32871b8b9f869fc0f1725 Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Thu, 16 May 2024 12:15:48 +0000 Subject: [PATCH 08/28] Arm AArch64: minor code refactoring for resolving a build issue with cmake --- ggml-aarch64.cpp | 1277 +++++++++++++++++++--------------------------- ggml-aarch64.h | 11 +- ggml/src/ggml.c | 59 +-- 3 files changed, 547 insertions(+), 800 deletions(-) diff --git a/ggml-aarch64.cpp b/ggml-aarch64.cpp index 82754b29ea10b..b12cd0b28b530 100644 --- a/ggml-aarch64.cpp +++ b/ggml-aarch64.cpp @@ -1,4 +1,8 @@ // SPDX-FileCopyrightText: Copyright 2024 Arm Ltd. + +#pragma GCC diagnostic ignored "-Wpedantic" +#pragma GCC diagnostic ignored "-Wignored-attributes" + #define GGML_COMMON_IMPL_C #include "ggml-common.h" @@ -315,90 +319,94 @@ inline int64_t roundup(const int64_t a, const int64_t b) { } } -void ggml_gemv_q4_0_q8_0_aarch64_sve256(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { +void ggml_gemv_q4_0_q8_0_aarch64(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { + UNUSED(n); + UNUSED(s); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(ith); + UNUSED(nth); + #if defined(__ARM_FEATURE_SVE) - if (svcntw() != 8) { - if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) ggml_gemv_q4_0_q8_0_aarch64_neon(n, s, vx, vy, nr, nc, ith, nth); + if (svcntw() == 8) { + int64_t x0 = roundup((ith * nc) / nth, (int64_t)8); + int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)8); + size_t width = xend - x0; + + int64_t nb = n / QK4_0; + const void * b_ptr = (const void *)((const block_q4_0x8 *) vx + ((x0 / 8) * nb)); + const void * a_ptr = vy; + float * res_ptr = s + x0; + + assert(n % 32 == 0); + assert(width % 8 == 0); + + size_t num_blocks = n / 32; + + __asm__ __volatile__( + "ptrue p0.b\n" + "add %x[b_ptr], %x[b_ptr], #0x10\n" + "1:" // Column loop + "add x22, %x[a_ptr], #0x2\n" + "mov z31.b, #0x0\n" + "mov x21, %x[num_blocks]\n" + "2:" // Block loop + "ld1b { z30.b }, p0/Z, [%x[b_ptr]]\n" + "ld1b { z29.b }, p0/Z, [%x[b_ptr], #1, MUL VL]\n" + "mov z28.s, #0x0\n" + "mov z27.s, #0x0\n" + "ld1rd { z26.d }, p0/Z, [x22]\n" + "ld1b { z25.b }, p0/Z, [%x[b_ptr], #2, MUL VL]\n" + "sub x20, x22, #0x2\n" + "sub x21, x21, #0x1\n" + "ld1b { z24.b }, p0/Z, [%x[b_ptr], #3, MUL VL]\n" + "ld1rd { z23.d }, p0/Z, [x22, #8]\n" + "lsl z22.b, z30.b, #0x4\n" + "lsl z16.b, z29.b, #0x4\n" + "and z30.b, z30.b, #0xf0\n" + "and z29.b, z29.b, #0xf0\n" + "ld1rd { z21.d }, p0/Z, [x22, #16]\n" + "ld1rd { z20.d }, p0/Z, [x22, #24]\n" + "lsl z19.b, z25.b, #0x4\n" + "and z25.b, z25.b, #0xf0\n" + "ld1rh { z17.h }, p0/Z, [x20]\n" + "ld1h { z18.s }, p0/Z, [%x[b_ptr], #-1, MUL VL]\n" + "sdot z28.s, z22.b, z26.b\n" + "sdot z27.s, z16.b, z26.b\n" + "lsl z16.b, z24.b, #0x4\n" + "add x22, x22, #0x22\n" + "and z24.b, z24.b, #0xf0\n" + "add %x[b_ptr], %x[b_ptr], #0x90\n" + "fcvt z17.s, p0/m, z17.h\n" + "fcvt z18.s, p0/m, z18.h\n" + "sdot z28.s, z19.b, z23.b\n" + "sdot z27.s, z16.b, z23.b\n" + "fmul z18.s, z18.s, z17.s\n" + "sdot z28.s, z30.b, z21.b\n" + "sdot z27.s, z29.b, z21.b\n" + "sdot z28.s, z25.b, z20.b\n" + "sdot z27.s, z24.b, z20.b\n" + "uzp1 z17.s, z28.s, z27.s\n" + "uzp2 z16.s, z28.s, z27.s\n" + "add z17.s, z17.s, z16.s\n" + "asr z17.s, 
z17.s, #0x4\n" + "scvtf z17.s, p0/m, z17.s\n" + "fmla z31.s, p0/M, z17.s, z18.s\n" + "cbnz x21, 2b\n" + "sub %x[width], %x[width], #0x8\n" + "st1w { z31.s }, p0, [%x[res_ptr]]\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "cbnz %x[width], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) + : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) + : "memory", "p0", "x20", "x21", "x22", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); return; } - int64_t x0 = roundup((ith * nc) / nth, (int64_t)8); - int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)8); - size_t width = xend - x0; - - int64_t nb = n / QK4_0; - const void * b_ptr = (const void *)((const block_q4_0x8 *) vx + ((x0 / 8) * nb)); - const void * a_ptr = vy; - float * res_ptr = s + x0; - - assert(n % 32 == 0); - assert(width % 8 == 0); - - size_t num_blocks = n / 32; - - __asm__ __volatile__( - "ptrue p0.b\n" - "add %x[b_ptr], %x[b_ptr], #0x10\n" - "1:" // Column loop - "add x22, %x[a_ptr], #0x2\n" - "mov z31.b, #0x0\n" - "mov x21, %x[num_blocks]\n" - "2:" // Block loop - "ld1b { z30.b }, p0/Z, [%x[b_ptr]]\n" - "ld1b { z29.b }, p0/Z, [%x[b_ptr], #1, MUL VL]\n" - "mov z28.s, #0x0\n" - "mov z27.s, #0x0\n" - "ld1rd { z26.d }, p0/Z, [x22]\n" - "ld1b { z25.b }, p0/Z, [%x[b_ptr], #2, MUL VL]\n" - "sub x20, x22, #0x2\n" - "sub x21, x21, #0x1\n" - "ld1b { z24.b }, p0/Z, [%x[b_ptr], #3, MUL VL]\n" - "ld1rd { z23.d }, p0/Z, [x22, #8]\n" - "lsl z22.b, z30.b, #0x4\n" - "lsl z16.b, z29.b, #0x4\n" - "and z30.b, z30.b, #0xf0\n" - "and z29.b, z29.b, #0xf0\n" - "ld1rd { z21.d }, p0/Z, [x22, #16]\n" - "ld1rd { z20.d }, p0/Z, [x22, #24]\n" - "lsl z19.b, z25.b, #0x4\n" - "and z25.b, z25.b, #0xf0\n" - "ld1rh { z17.h }, p0/Z, [x20]\n" - "ld1h { z18.s }, p0/Z, [%x[b_ptr], #-1, MUL VL]\n" - "sdot z28.s, z22.b, z26.b\n" - "sdot z27.s, z16.b, z26.b\n" - "lsl z16.b, z24.b, #0x4\n" - "add x22, x22, #0x22\n" - "and z24.b, z24.b, #0xf0\n" - "add %x[b_ptr], %x[b_ptr], #0x90\n" - "fcvt z17.s, p0/m, z17.h\n" - "fcvt z18.s, p0/m, z18.h\n" - "sdot z28.s, z19.b, z23.b\n" - "sdot z27.s, z16.b, z23.b\n" - "fmul z18.s, z18.s, z17.s\n" - "sdot z28.s, z30.b, z21.b\n" - "sdot z27.s, z29.b, z21.b\n" - "sdot z28.s, z25.b, z20.b\n" - "sdot z27.s, z24.b, z20.b\n" - "uzp1 z17.s, z28.s, z27.s\n" - "uzp2 z16.s, z28.s, z27.s\n" - "add z17.s, z17.s, z16.s\n" - "asr z17.s, z17.s, #0x4\n" - "scvtf z17.s, p0/m, z17.s\n" - "fmla z31.s, p0/M, z17.s, z18.s\n" - "cbnz x21, 2b\n" - "sub %x[width], %x[width], #0x8\n" - "st1w { z31.s }, p0, [%x[res_ptr]]\n" - "add %x[res_ptr], %x[res_ptr], #0x20\n" - "cbnz %x[width], 1b\n" - : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) - : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) - : "memory", "p0", "x20", "x21", "x22", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" - ); #endif -} - -void ggml_gemv_q4_0_q8_0_aarch64_neon(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { - UNUSED(nr); -#if defined(__ARM_NEON) +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); size_t width = xend - x0; @@ -470,12 +478,7 @@ void ggml_gemv_q4_0_q8_0_aarch64_neon(int n, float * GGML_RESTRICT s, const void : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) : "memory", "v0", 
"v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23" ); -#endif -} - -void ggml_gemv_q4_0_q8_0_aarch64_neon_noi8mm(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { - UNUSED(nr); -#if defined(__ARM_NEON) +#elif defined(__ARM_NEON) int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); size_t width = xend - x0; @@ -545,589 +548,438 @@ void ggml_gemv_q4_0_q8_0_aarch64_neon_noi8mm(int n, float * GGML_RESTRICT s, con #endif } -void ggml_gemv_q8_0_q8_0_aarch64_sve256(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { -#if defined(__ARM_FEATURE_SVE) - int64_t x0 = roundup((ith * nc) / nth, (int64_t)8); - int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)8); - - int64_t nb = n / QK8_0; - int64_t a_nb = n / QK8_0; - - const svbool_t ptrue = svptrue_b8(); - - const block_q8_0x8 * b_ptr_start = (const block_q8_0x8 *) vx; - const block_q8_0 * a_ptr_start = (const block_q8_0 *) vy; - - for (int64_t y = 0; y < nr; y++) { - for (int64_t x = x0 / 8; x < xend / 8; x++) { - // Pointers to LHS blocks - const block_q8_0 * a_ptr = a_ptr_start + (y * a_nb); - // Pointers to RHS blocks - const block_q8_0x8 * b_ptr = b_ptr_start + (x * nb); - - // Master FP accumulator - svfloat32_t acc_row = svdup_f32(0.0f); - - for (int64_t b = 0; b < nb; b++) { - // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) - const svint8_t rhs_vec_0_0_0 = svld1_s8(ptrue, b_ptr[b].qs); - const svint8_t rhs_vec_0_1_0 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 1); - const svint8_t rhs_vec_0_2_0 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 2); - const svint8_t rhs_vec_0_3_0 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 3); - const svint8_t rhs_vec_0_0_1 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 4); - const svint8_t rhs_vec_0_1_1 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 5); - const svint8_t rhs_vec_0_2_1 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 6); - const svint8_t rhs_vec_0_3_1 = svld1_vnum_s8(ptrue, b_ptr[b].qs, 7); - - // Scale values - const svfloat16_t col_scale_f16 = svreinterpret_f16_u32(svld1uh_u32(ptrue, (const uint16_t *) b_ptr[b].d)); - const svfloat32_t col_scale_f32 = svcvt_f32_f16_x(ptrue, col_scale_f16); - - const svfloat16_t row_scale_f16 = svdup_f16(a_ptr[b].d); - const svfloat32_t row_scale_f32 = svcvt_f32_f16_x(ptrue, row_scale_f16); - - const svint8_t lhs_vec_0 = svld1rq_s8(ptrue, a_ptr[b].qs); - const svint8_t lhs_vec_1 = svld1rq_s8(ptrue, a_ptr[b].qs + 16); - - svint32_t iacc = svdup_s32(0); - - iacc = svdot_lane(iacc, rhs_vec_0_0_0, lhs_vec_0, 0); - iacc = svdot_lane(iacc, rhs_vec_0_0_1, lhs_vec_1, 0); - - iacc = svdot_lane(iacc, rhs_vec_0_1_0, lhs_vec_0, 1); - iacc = svdot_lane(iacc, rhs_vec_0_1_1, lhs_vec_1, 1); - - iacc = svdot_lane(iacc, rhs_vec_0_2_0, lhs_vec_0, 2); - iacc = svdot_lane(iacc, rhs_vec_0_2_1, lhs_vec_1, 2); - - iacc = svdot_lane(iacc, rhs_vec_0_3_0, lhs_vec_0, 3); - iacc = svdot_lane(iacc, rhs_vec_0_3_1, lhs_vec_1, 3); - - acc_row = svmla_x(ptrue, acc_row, svcvt_f32_s32_x(ptrue, iacc), svmul_x(ptrue, col_scale_f32, row_scale_f32)); - } - - svst1(ptrue, s + (y * nc + x * 8), acc_row); - } - } -#endif -} - -void ggml_gemv_q8_0_q8_0_aarch64_neon(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { -#if defined(__ARM_NEON) - int64_t x0 = 
roundup((ith * nc) / nth, (int64_t)8); - int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)8); - - int64_t nb = n / QK8_0; - int64_t a_nb = n / QK8_0; - - const block_q8_0x8 * b_ptr_start = (const block_q8_0x8 *) vx; - const block_q8_0 * a_ptr_start = (const block_q8_0 *) vy; - - for (int64_t y = 0; y < nr; y++) { - for (int64_t x = x0 / 8; x < xend / 8; x++) { - // Pointers to LHS blocks - const block_q8_0 * a_ptr = a_ptr_start + (y * a_nb); - // Pointers to RHS blocks - const block_q8_0x8 * b_ptr = b_ptr_start + (x * nb); - // Master FP accumulator - float32x4_t acc_row[2]; - acc_row[0] = acc_row[1] = vdupq_n_f32(0.0f); - - for (int64_t b = 0; b < nb; b++) { - // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) - const int8x16_t rhs_vec_0_0_0 = vld1q_s8(b_ptr[b].qs); - const int8x16_t rhs_vec_1_0_0 = vld1q_s8(b_ptr[b].qs + 16); - const int8x16_t rhs_vec_0_1_0 = vld1q_s8(b_ptr[b].qs + 32); - const int8x16_t rhs_vec_1_1_0 = vld1q_s8(b_ptr[b].qs + 48); - const int8x16_t rhs_vec_0_2_0 = vld1q_s8(b_ptr[b].qs + 64); - const int8x16_t rhs_vec_1_2_0 = vld1q_s8(b_ptr[b].qs + 80); - const int8x16_t rhs_vec_0_3_0 = vld1q_s8(b_ptr[b].qs + 96); - const int8x16_t rhs_vec_1_3_0 = vld1q_s8(b_ptr[b].qs + 112); - const int8x16_t rhs_vec_0_0_1 = vld1q_s8(b_ptr[b].qs + 128); - const int8x16_t rhs_vec_1_0_1 = vld1q_s8(b_ptr[b].qs + 144); - const int8x16_t rhs_vec_0_1_1 = vld1q_s8(b_ptr[b].qs + 160); - const int8x16_t rhs_vec_1_1_1 = vld1q_s8(b_ptr[b].qs + 176); - const int8x16_t rhs_vec_0_2_1 = vld1q_s8(b_ptr[b].qs + 192); - const int8x16_t rhs_vec_1_2_1 = vld1q_s8(b_ptr[b].qs + 208); - const int8x16_t rhs_vec_0_3_1 = vld1q_s8(b_ptr[b].qs + 224); - const int8x16_t rhs_vec_1_3_1 = vld1q_s8(b_ptr[b].qs + 240); - - // Scale values - assemble the four row/column scales into a (64-bit) vector, then expand to FP32 - const float16x8_t col_scale_f16 = vld1q_f16((const ggml_fp16_internal_t *)(b_ptr[b].d)); - const float32x4_t col_scale_f32_0 = vcvt_f32_f16(vget_low_f16(col_scale_f16)); - const float32x4_t col_scale_f32_1 = vcvt_f32_f16(vget_high_f16(col_scale_f16)); - - const float16x4_t row_scale_f16 = vld1_dup_f16((const ggml_fp16_internal_t *)(&(a_ptr[b].d))); - const float32x4_t row_scale_f32 = vcvt_f32_f16(row_scale_f16); - - const int8x16_t lhs_vec_0 = vld1q_s8(a_ptr[b].qs); - const int8x16_t lhs_vec_1 = vld1q_s8(a_ptr[b].qs + 16); - - int32x4_t iacc0 = vdupq_n_s32(0); - int32x4_t iacc1 = vdupq_n_s32(0); - - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_0_0, lhs_vec_0, 0); - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_0_1, lhs_vec_1, 0); - - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_0_0, lhs_vec_0, 0); - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_0_1, lhs_vec_1, 0); - - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_1_0, lhs_vec_0, 1); - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_1_1, lhs_vec_1, 1); - - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_1_0, lhs_vec_0, 1); - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_1_1, lhs_vec_1, 1); - - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_2_0, lhs_vec_0, 2); - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_2_1, lhs_vec_1, 2); - - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_2_0, lhs_vec_0, 2); - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_2_1, lhs_vec_1, 2); - - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_3_0, lhs_vec_0, 3); - iacc0 = vdotq_laneq_s32(iacc0, rhs_vec_0_3_1, lhs_vec_1, 3); - - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_3_0, lhs_vec_0, 3); - iacc1 = vdotq_laneq_s32(iacc1, rhs_vec_1_3_1, lhs_vec_1, 3); - - acc_row[0] = vfmaq_f32(acc_row[0], vcvtq_f32_s32(iacc0), 
vmulq_f32(col_scale_f32_0, row_scale_f32)); - acc_row[1] = vfmaq_f32(acc_row[1], vcvtq_f32_s32(iacc1), vmulq_f32(col_scale_f32_1, row_scale_f32)); - } - - vst1q_f32(s + (y * nc + x * 8), acc_row[0]); - vst1q_f32(s + (y * nc + x * 8 + 4), acc_row[1]); - } - } -#endif -} +void ggml_gemm_q4_0_q8_0_aarch64(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { + UNUSED(n); + UNUSED(s); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(ith); + UNUSED(nth); -void ggml_gemm_q4_0_q8_0_aarch64_sve256(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) - if (svcntw() != 8) { - if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) ggml_gemm_q4_0_q8_0_aarch64_neon(n, s, vx, vy, nr, nc, ith, nth); + if (svcntw() == 8) { + int64_t x0 = roundup((ith * nc) / nth, (int64_t)8); + int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)8); + size_t width = xend - x0; + + int64_t nb = n / QK4_0; + const void * b_ptr = (const void *)((const block_q4_0x8 *) vx + ((x0 / 8) * nb)); + const void * a_ptr = vy; + float * res_ptr = s + x0; + size_t res_stride = nc * sizeof(float); + + assert(n % 32 == 0); + assert(width % 8 == 0); + + size_t num_blocks = n / 32; + + __asm__ __volatile__( + "mov x20, #0x4\n" + "mov x13, %x[nr]\n" + "mov z28.s, #-0x4\n" + "mov x12, #0x88\n" + "ptrue p1.b\n" + "whilelt p0.s, XZR, x20\n" + "cmp x13, #0x10\n" + "mul x12, %x[num_blocks], x12\n" + "blt 4f\n" + "1:" // Row loop + "add x11, %x[b_ptr], #0x10\n" + "mov x10, %x[width]\n" + "add x9, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x28, %x[a_ptr], #0x8\n" + "mov z24.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov x27, %x[num_blocks]\n" + "add x26, x28, x12\n" + "mov z12.b, #0x0\n" + "mov z0.b, #0x0\n" + "add x25, x26, x12\n" + "mov z13.b, #0x0\n" + "mov z1.b, #0x0\n" + "add x24, x25, x12\n" + "mov z20.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z8.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z10.b, #0x0\n" + "3:" // Block loop + "ld1b { z30.b }, p1/Z, [x11]\n" + "ld1b { z21.b }, p1/Z, [x11, #1, MUL VL]\n" + "mov z18.s, #0x0\n" + "mov z7.s, #0x0\n" + "ld1rqb { z3.b }, p1/Z, [x28]\n" + "ld1rqb { z5.b }, p1/Z, [x28, #16]\n" + "mov z9.s, #0x0\n" + "mov z22.s, #0x0\n" + "ld1b { z4.b }, p1/Z, [x11, #2, MUL VL]\n" + "ld1b { z17.b }, p1/Z, [x11, #3, MUL VL]\n" + "sub x20, x11, #0x10\n" + "sub x23, x28, #0x8\n" + "lsl z31.b, z30.b, #0x4\n" + "lsl z6.b, z21.b, #0x4\n" + "ld1h { z23.s }, p1/Z, [x20]\n" + "sub x22, x26, #0x8\n" + "and z30.b, z30.b, #0xf0\n" + "and z21.b, z21.b, #0xf0\n" + "sub x21, x25, #0x8\n" + "sub x20, x24, #0x8\n" + "lsl z14.b, z4.b, #0x4\n" + "lsl z2.b, z17.b, #0x4\n" + "subs x27, x27, #0x1\n" + "add x11, x11, #0x90\n" + ".inst 0x451f9872 // smmla z18.s, z3.b, z31.b\n" + ".inst 0x45069867 // smmla z7.s, z3.b, z6.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #32]\n" + "and z4.b, z4.b, #0xf0\n" + ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" + ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #48]\n" + "and z17.b, z17.b, #0xf0\n" + "fcvt z23.s, p1/m, z23.h\n" + ".inst 0x450e9872 // smmla z18.s, z3.b, z14.b\n" + ".inst 0x45029867 // smmla z7.s, z3.b, z2.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #64]\n" + ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" + ".inst 
0x450298b6 // smmla z22.s, z5.b, z2.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #80]\n" + "fscale z23.s, p1/m, z23.s, z28.s\n" + ".inst 0x451e9872 // smmla z18.s, z3.b, z30.b\n" + ".inst 0x45159867 // smmla z7.s, z3.b, z21.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #96]\n" + ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" + ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #112]\n" + "add x28, x28, #0x88\n" + ".inst 0x45049872 // smmla z18.s, z3.b, z4.b\n" + ".inst 0x45119867 // smmla z7.s, z3.b, z17.b\n" + "ld1h { z3.s }, p0/Z, [x23]\n" + ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" + ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" + "fcvt z3.s, p1/m, z3.h\n" + "uzp1 z5.d, z18.d, z7.d\n" + "uzp2 z18.d, z18.d, z7.d\n" + "mov z3.q, z3.q[0]\n" + "uzp1 z7.d, z9.d, z22.d\n" + "uzp2 z22.d, z9.d, z22.d\n" + "fmul z9.s, z23.s, z3.s[0]\n" + "scvtf z5.s, p1/m, z5.s\n" + "scvtf z18.s, p1/m, z18.s\n" + "scvtf z7.s, p1/m, z7.s\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z24.s, p1/M, z5.s, z9.s\n" + "ld1rqb { z5.b }, p1/Z, [x26]\n" + "fmul z9.s, z23.s, z3.s[1]\n" + "fmla z15.s, p1/M, z18.s, z9.s\n" + "ld1rqb { z18.b }, p1/Z, [x26, #16]\n" + "fmul z9.s, z23.s, z3.s[2]\n" + "fmul z3.s, z23.s, z3.s[3]\n" + "fmla z12.s, p1/M, z7.s, z9.s\n" + "mov z9.s, #0x0\n" + "ld1h { z7.s }, p0/Z, [x22]\n" + ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" + "fmla z0.s, p1/M, z22.s, z3.s\n" + "mov z22.s, #0x0\n" + "ld1h { z3.s }, p0/Z, [x21]\n" + ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #32]\n" + "fcvt z7.s, p1/m, z7.h\n" + "fcvt z3.s, p1/m, z3.h\n" + ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" + ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #64]\n" + "mov z7.q, z7.q[0]\n" + "mov z3.q, z3.q[0]\n" + ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" + ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #96]\n" + ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" + ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" + "uzp1 z5.d, z9.d, z22.d\n" + "scvtf z5.s, p1/m, z5.s\n" + "uzp2 z22.d, z9.d, z22.d\n" + "fmul z9.s, z23.s, z7.s[0]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z13.s, p1/M, z5.s, z9.s\n" + "ld1rqb { z9.b }, p1/Z, [x25]\n" + "fmul z5.s, z23.s, z7.s[1]\n" + "fmla z1.s, p1/M, z22.s, z5.s\n" + "mov z5.s, #0x0\n" + "mov z22.s, #0x0\n" + ".inst 0x451f9a45 // smmla z5.s, z18.b, z31.b\n" + ".inst 0x45069a56 // smmla z22.s, z18.b, z6.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #48]\n" + ".inst 0x450e9a45 // smmla z5.s, z18.b, z14.b\n" + ".inst 0x45029a56 // smmla z22.s, z18.b, z2.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #80]\n" + ".inst 0x451e9a45 // smmla z5.s, z18.b, z30.b\n" + ".inst 0x45159a56 // smmla z22.s, z18.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #112]\n" + "add x26, x26, #0x88\n" + ".inst 0x45049a45 // smmla z5.s, z18.b, z4.b\n" + ".inst 0x45119a56 // smmla z22.s, z18.b, z17.b\n" + "uzp1 z18.d, z5.d, z22.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp2 z22.d, z5.d, z22.d\n" + "fmul z5.s, z23.s, z7.s[2]\n" + "fmul z7.s, z23.s, z7.s[3]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z20.s, p1/M, z18.s, z5.s\n" + "ld1rqb { z18.b }, p1/Z, [x25, #16]\n" + "ld1h { z5.s }, p0/Z, [x20]\n" + "fcvt z5.s, p1/m, z5.h\n" + "fmla z25.s, p1/M, z22.s, z7.s\n" + "mov z22.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9936 // smmla z22.s, z9.b, z31.b\n" + ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #32]\n" + "mov z5.q, z5.q[0]\n" + ".inst 0x450e9936 // smmla z22.s, z9.b, z14.b\n" + ".inst 
0x45029927 // smmla z7.s, z9.b, z2.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #64]\n" + ".inst 0x451e9936 // smmla z22.s, z9.b, z30.b\n" + ".inst 0x45159927 // smmla z7.s, z9.b, z21.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #96]\n" + ".inst 0x45049936 // smmla z22.s, z9.b, z4.b\n" + ".inst 0x45119927 // smmla z7.s, z9.b, z17.b\n" + "uzp1 z9.d, z22.d, z7.d\n" + "scvtf z9.s, p1/m, z9.s\n" + "uzp2 z22.d, z22.d, z7.d\n" + "fmul z7.s, z23.s, z3.s[0]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z11.s, p1/M, z9.s, z7.s\n" + "ld1rqb { z9.b }, p1/Z, [x24]\n" + "fmul z7.s, z23.s, z3.s[1]\n" + "fmla z16.s, p1/M, z22.s, z7.s\n" + "mov z22.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9a56 // smmla z22.s, z18.b, z31.b\n" + ".inst 0x45069a47 // smmla z7.s, z18.b, z6.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #48]\n" + ".inst 0x450e9a56 // smmla z22.s, z18.b, z14.b\n" + ".inst 0x45029a47 // smmla z7.s, z18.b, z2.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #80]\n" + ".inst 0x451e9a56 // smmla z22.s, z18.b, z30.b\n" + ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #112]\n" + "add x25, x25, #0x88\n" + ".inst 0x45049a56 // smmla z22.s, z18.b, z4.b\n" + ".inst 0x45119a47 // smmla z7.s, z18.b, z17.b\n" + "uzp1 z18.d, z22.d, z7.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp2 z7.d, z22.d, z7.d\n" + "fmul z22.s, z23.s, z3.s[2]\n" + "fmul z3.s, z23.s, z3.s[3]\n" + "scvtf z7.s, p1/m, z7.s\n" + "fmla z19.s, p1/M, z18.s, z22.s\n" + "ld1rqb { z18.b }, p1/Z, [x24, #16]\n" + "fmul z22.s, z23.s, z5.s[0]\n" + "fmla z26.s, p1/M, z7.s, z3.s\n" + "mov z3.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9923 // smmla z3.s, z9.b, z31.b\n" + ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" + "ld1rqb { z9.b }, p1/Z, [x24, #32]\n" + ".inst 0x450e9923 // smmla z3.s, z9.b, z14.b\n" + ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" + "mov z9.s, #0x0\n" + ".inst 0x451f9a49 // smmla z9.s, z18.b, z31.b\n" + "mov z31.s, #0x0\n" + ".inst 0x45069a5f // smmla z31.s, z18.b, z6.b\n" + "ld1rqb { z6.b }, p1/Z, [x24, #48]\n" + "ld1rqb { z18.b }, p1/Z, [x24, #64]\n" + ".inst 0x450e98c9 // smmla z9.s, z6.b, z14.b\n" + "fmul z14.s, z23.s, z5.s[1]\n" + ".inst 0x450298df // smmla z31.s, z6.b, z2.b\n" + "ld1rqb { z6.b }, p1/Z, [x24, #80]\n" + "fmul z2.s, z23.s, z5.s[2]\n" + "fmul z23.s, z23.s, z5.s[3]\n" + ".inst 0x451e9a43 // smmla z3.s, z18.b, z30.b\n" + ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x24, #96]\n" + ".inst 0x451e98c9 // smmla z9.s, z6.b, z30.b\n" + ".inst 0x451598df // smmla z31.s, z6.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x24, #112]\n" + "add x24, x24, #0x88\n" + ".inst 0x450498a3 // smmla z3.s, z5.b, z4.b\n" + ".inst 0x451198a7 // smmla z7.s, z5.b, z17.b\n" + ".inst 0x45049a49 // smmla z9.s, z18.b, z4.b\n" + ".inst 0x45119a5f // smmla z31.s, z18.b, z17.b\n" + "uzp1 z18.d, z3.d, z7.d\n" + "uzp2 z5.d, z3.d, z7.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp1 z6.d, z9.d, z31.d\n" + "uzp2 z9.d, z9.d, z31.d\n" + "scvtf z5.s, p1/m, z5.s\n" + "fmla z8.s, p1/M, z18.s, z22.s\n" + "scvtf z6.s, p1/m, z6.s\n" + "scvtf z9.s, p1/m, z9.s\n" + "fmla z29.s, p1/M, z5.s, z14.s\n" + "fmla z27.s, p1/M, z6.s, z2.s\n" + "fmla z10.s, p1/M, z9.s, z23.s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x10, x10, #0x8\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "st1w { z24.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z15.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z12.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z0.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + 
"st1w { z13.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z1.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z20.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z25.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z11.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z16.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z19.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z26.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z8.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z29.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z27.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z10.s }, p1, [x20]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x13, x13, #0x10\n" + "cmp x13, #0x10\n" + "mov %x[res_ptr], x9\n" + "madd %x[a_ptr], x20, x12, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x13, 9f\n" + "5:" // Row tail: Row loop + "add x25, %x[b_ptr], #0x10\n" + "mov x24, %x[width]\n" + "add x23, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "mov z24.b, #0x0\n" + "mov z15.b, #0x0\n" + "add x28, %x[a_ptr], #0x8\n" + "mov x22, %x[num_blocks]\n" + "mov z12.b, #0x0\n" + "mov z0.b, #0x0\n" + "7:" // Row tail: Block loop + "ld1b { z3.b }, p1/Z, [x25]\n" + "ld1b { z6.b }, p1/Z, [x25, #1, MUL VL]\n" + "mov z2.s, #0x0\n" + "mov z25.s, #0x0\n" + "ld1rqb { z26.b }, p1/Z, [x28]\n" + "ld1rqb { z21.b }, p1/Z, [x28, #16]\n" + "mov z27.s, #0x0\n" + "mov z19.s, #0x0\n" + "ld1b { z29.b }, p1/Z, [x25, #2, MUL VL]\n" + "ld1b { z16.b }, p1/Z, [x25, #3, MUL VL]\n" + "sub x21, x25, #0x10\n" + "sub x20, x28, #0x8\n" + "lsl z20.b, z3.b, #0x4\n" + "lsl z4.b, z6.b, #0x4\n" + "ld1rqb { z10.b }, p1/Z, [x28, #32]\n" + "ld1rqb { z23.b }, p1/Z, [x28, #48]\n" + "and z3.b, z3.b, #0xf0\n" + "and z6.b, z6.b, #0xf0\n" + "ld1rqb { z11.b }, p1/Z, [x28, #64]\n" + "ld1rqb { z7.b }, p1/Z, [x28, #80]\n" + "lsl z8.b, z29.b, #0x4\n" + "lsl z14.b, z16.b, #0x4\n" + "ld1rqb { z18.b }, p1/Z, [x28, #96]\n" + "ld1rqb { z30.b }, p1/Z, [x28, #112]\n" + ".inst 0x45149b42 // smmla z2.s, z26.b, z20.b\n" + ".inst 0x45049b59 // smmla z25.s, z26.b, z4.b\n" + "and z29.b, z29.b, #0xf0\n" + "ld1h { z17.s }, p1/Z, [x21]\n" + ".inst 0x45149abb // smmla z27.s, z21.b, z20.b\n" + ".inst 0x45049ab3 // smmla z19.s, z21.b, z4.b\n" + "and z16.b, z16.b, #0xf0\n" + "ld1h { z4.s }, p0/Z, [x20]\n" + "subs x22, x22, #0x1\n" + "add x28, x28, #0x88\n" + "fcvt z17.s, p1/m, z17.h\n" + "add x25, x25, #0x90\n" + ".inst 0x45089942 // smmla z2.s, z10.b, z8.b\n" + ".inst 0x450e9959 // smmla z25.s, z10.b, z14.b\n" + "fcvt z4.s, p1/m, z4.h\n" + ".inst 0x45089afb // smmla z27.s, z23.b, z8.b\n" + ".inst 0x450e9af3 // smmla z19.s, z23.b, z14.b\n" + "fscale z17.s, p1/m, z17.s, z28.s\n" + "mov z4.q, z4.q[0]\n" + ".inst 0x45039962 // smmla z2.s, z11.b, z3.b\n" + ".inst 0x45069979 // smmla z25.s, z11.b, z6.b\n" + "fmul z23.s, z17.s, z4.s[0]\n" + "fmul z9.s, z17.s, z4.s[1]\n" + "fmul z21.s, z17.s, z4.s[2]\n" + "fmul z4.s, z17.s, z4.s[3]\n" + ".inst 0x450398fb // smmla z27.s, z7.b, z3.b\n" + ".inst 0x450698f3 // smmla z19.s, z7.b, z6.b\n" + ".inst 0x451d9a42 // smmla z2.s, z18.b, z29.b\n" + ".inst 0x45109a59 // smmla z25.s, z18.b, z16.b\n" + ".inst 0x451d9bdb // smmla z27.s, z30.b, z29.b\n" + ".inst 0x45109bd3 // smmla z19.s, z30.b, z16.b\n" + "uzp1 z31.d, z2.d, z25.d\n" + "uzp2 z13.d, z2.d, z25.d\n" + "scvtf z31.s, p1/m, z31.s\n" + "uzp1 z17.d, z27.d, z19.d\n" + "uzp2 z18.d, z27.d, z19.d\n" + "scvtf 
z13.s, p1/m, z13.s\n" + "fmla z24.s, p1/M, z31.s, z23.s\n" + "scvtf z17.s, p1/m, z17.s\n" + "scvtf z18.s, p1/m, z18.s\n" + "fmla z15.s, p1/M, z13.s, z9.s\n" + "fmla z12.s, p1/M, z17.s, z21.s\n" + "fmla z0.s, p1/M, z18.s, z4.s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x13, #0x1\n" + "st1w { z24.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x13, #0x2\n" + "st1w { z15.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x13, #0x3\n" + "st1w { z12.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "st1w { z0.s }, p1, [x20]\n" + "8:" // Row tail: Accumulator store skip + "subs x24, x24, #0x8\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "bne 6b\n" + "subs x13, x13, #0x4\n" + "add %x[a_ptr], %x[a_ptr], x12\n" + "mov %x[res_ptr], x23\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) + : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); return; } - int64_t x0 = roundup((ith * nc) / nth, (int64_t)8); - int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)8); - size_t width = xend - x0; - - int64_t nb = n / QK4_0; - const void * b_ptr = (const void *)((const block_q4_0x8 *) vx + ((x0 / 8) * nb)); - const void * a_ptr = vy; - float * res_ptr = s + x0; - size_t res_stride = nc * sizeof(float); - - assert(n % 32 == 0); - assert(width % 8 == 0); - - size_t num_blocks = n / 32; - - __asm__ __volatile__( - "mov x20, #0x4\n" - "mov x13, %x[nr]\n" - "mov z28.s, #-0x4\n" - "mov x12, #0x88\n" - "ptrue p1.b\n" - "whilelt p0.s, XZR, x20\n" - "cmp x13, #0x10\n" - "mul x12, %x[num_blocks], x12\n" - "blt 4f\n" - "1:" // Row loop - "add x11, %x[b_ptr], #0x10\n" - "mov x10, %x[width]\n" - "add x9, %x[res_ptr], %x[res_stride], LSL #4\n" - "2:" // Column loop - "add x28, %x[a_ptr], #0x8\n" - "mov z24.b, #0x0\n" - "mov z15.b, #0x0\n" - "mov x27, %x[num_blocks]\n" - "add x26, x28, x12\n" - "mov z12.b, #0x0\n" - "mov z0.b, #0x0\n" - "add x25, x26, x12\n" - "mov z13.b, #0x0\n" - "mov z1.b, #0x0\n" - "add x24, x25, x12\n" - "mov z20.b, #0x0\n" - "mov z25.b, #0x0\n" - "mov z11.b, #0x0\n" - "mov z16.b, #0x0\n" - "mov z19.b, #0x0\n" - "mov z26.b, #0x0\n" - "mov z8.b, #0x0\n" - "mov z29.b, #0x0\n" - "mov z27.b, #0x0\n" - "mov z10.b, #0x0\n" - "3:" // Block loop - "ld1b { z30.b }, p1/Z, [x11]\n" - "ld1b { z21.b }, p1/Z, [x11, #1, MUL VL]\n" - "mov z18.s, #0x0\n" - "mov z7.s, #0x0\n" - "ld1rqb { z3.b }, p1/Z, [x28]\n" - "ld1rqb { z5.b }, p1/Z, [x28, #16]\n" - "mov z9.s, #0x0\n" - "mov z22.s, #0x0\n" - "ld1b { z4.b }, p1/Z, [x11, #2, MUL VL]\n" - "ld1b { z17.b }, p1/Z, [x11, #3, MUL VL]\n" - "sub x20, x11, #0x10\n" - "sub x23, x28, #0x8\n" - "lsl z31.b, z30.b, #0x4\n" - "lsl z6.b, z21.b, #0x4\n" - "ld1h { z23.s }, p1/Z, [x20]\n" - "sub x22, x26, #0x8\n" - "and z30.b, z30.b, #0xf0\n" - "and z21.b, z21.b, #0xf0\n" - "sub x21, x25, #0x8\n" - "sub x20, x24, #0x8\n" - "lsl z14.b, z4.b, #0x4\n" - "lsl z2.b, z17.b, #0x4\n" - "subs x27, x27, #0x1\n" - "add x11, x11, #0x90\n" - ".inst 0x451f9872 // smmla z18.s, z3.b, z31.b\n" - ".inst 0x45069867 // smmla z7.s, z3.b, z6.b\n" - "ld1rqb { z3.b }, p1/Z, [x28, 
#32]\n" - "and z4.b, z4.b, #0xf0\n" - ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" - ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" - "ld1rqb { z5.b }, p1/Z, [x28, #48]\n" - "and z17.b, z17.b, #0xf0\n" - "fcvt z23.s, p1/m, z23.h\n" - ".inst 0x450e9872 // smmla z18.s, z3.b, z14.b\n" - ".inst 0x45029867 // smmla z7.s, z3.b, z2.b\n" - "ld1rqb { z3.b }, p1/Z, [x28, #64]\n" - ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" - ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" - "ld1rqb { z5.b }, p1/Z, [x28, #80]\n" - "fscale z23.s, p1/m, z23.s, z28.s\n" - ".inst 0x451e9872 // smmla z18.s, z3.b, z30.b\n" - ".inst 0x45159867 // smmla z7.s, z3.b, z21.b\n" - "ld1rqb { z3.b }, p1/Z, [x28, #96]\n" - ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" - ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" - "ld1rqb { z5.b }, p1/Z, [x28, #112]\n" - "add x28, x28, #0x88\n" - ".inst 0x45049872 // smmla z18.s, z3.b, z4.b\n" - ".inst 0x45119867 // smmla z7.s, z3.b, z17.b\n" - "ld1h { z3.s }, p0/Z, [x23]\n" - ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" - ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" - "fcvt z3.s, p1/m, z3.h\n" - "uzp1 z5.d, z18.d, z7.d\n" - "uzp2 z18.d, z18.d, z7.d\n" - "mov z3.q, z3.q[0]\n" - "uzp1 z7.d, z9.d, z22.d\n" - "uzp2 z22.d, z9.d, z22.d\n" - "fmul z9.s, z23.s, z3.s[0]\n" - "scvtf z5.s, p1/m, z5.s\n" - "scvtf z18.s, p1/m, z18.s\n" - "scvtf z7.s, p1/m, z7.s\n" - "scvtf z22.s, p1/m, z22.s\n" - "fmla z24.s, p1/M, z5.s, z9.s\n" - "ld1rqb { z5.b }, p1/Z, [x26]\n" - "fmul z9.s, z23.s, z3.s[1]\n" - "fmla z15.s, p1/M, z18.s, z9.s\n" - "ld1rqb { z18.b }, p1/Z, [x26, #16]\n" - "fmul z9.s, z23.s, z3.s[2]\n" - "fmul z3.s, z23.s, z3.s[3]\n" - "fmla z12.s, p1/M, z7.s, z9.s\n" - "mov z9.s, #0x0\n" - "ld1h { z7.s }, p0/Z, [x22]\n" - ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" - "fmla z0.s, p1/M, z22.s, z3.s\n" - "mov z22.s, #0x0\n" - "ld1h { z3.s }, p0/Z, [x21]\n" - ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" - "ld1rqb { z5.b }, p1/Z, [x26, #32]\n" - "fcvt z7.s, p1/m, z7.h\n" - "fcvt z3.s, p1/m, z3.h\n" - ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" - ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" - "ld1rqb { z5.b }, p1/Z, [x26, #64]\n" - "mov z7.q, z7.q[0]\n" - "mov z3.q, z3.q[0]\n" - ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" - ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" - "ld1rqb { z5.b }, p1/Z, [x26, #96]\n" - ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" - ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" - "uzp1 z5.d, z9.d, z22.d\n" - "scvtf z5.s, p1/m, z5.s\n" - "uzp2 z22.d, z9.d, z22.d\n" - "fmul z9.s, z23.s, z7.s[0]\n" - "scvtf z22.s, p1/m, z22.s\n" - "fmla z13.s, p1/M, z5.s, z9.s\n" - "ld1rqb { z9.b }, p1/Z, [x25]\n" - "fmul z5.s, z23.s, z7.s[1]\n" - "fmla z1.s, p1/M, z22.s, z5.s\n" - "mov z5.s, #0x0\n" - "mov z22.s, #0x0\n" - ".inst 0x451f9a45 // smmla z5.s, z18.b, z31.b\n" - ".inst 0x45069a56 // smmla z22.s, z18.b, z6.b\n" - "ld1rqb { z18.b }, p1/Z, [x26, #48]\n" - ".inst 0x450e9a45 // smmla z5.s, z18.b, z14.b\n" - ".inst 0x45029a56 // smmla z22.s, z18.b, z2.b\n" - "ld1rqb { z18.b }, p1/Z, [x26, #80]\n" - ".inst 0x451e9a45 // smmla z5.s, z18.b, z30.b\n" - ".inst 0x45159a56 // smmla z22.s, z18.b, z21.b\n" - "ld1rqb { z18.b }, p1/Z, [x26, #112]\n" - "add x26, x26, #0x88\n" - ".inst 0x45049a45 // smmla z5.s, z18.b, z4.b\n" - ".inst 0x45119a56 // smmla z22.s, z18.b, z17.b\n" - "uzp1 z18.d, z5.d, z22.d\n" - "scvtf z18.s, p1/m, z18.s\n" - "uzp2 z22.d, z5.d, z22.d\n" - "fmul z5.s, z23.s, z7.s[2]\n" - "fmul z7.s, z23.s, z7.s[3]\n" - "scvtf z22.s, p1/m, z22.s\n" - "fmla 
z20.s, p1/M, z18.s, z5.s\n" - "ld1rqb { z18.b }, p1/Z, [x25, #16]\n" - "ld1h { z5.s }, p0/Z, [x20]\n" - "fcvt z5.s, p1/m, z5.h\n" - "fmla z25.s, p1/M, z22.s, z7.s\n" - "mov z22.s, #0x0\n" - "mov z7.s, #0x0\n" - ".inst 0x451f9936 // smmla z22.s, z9.b, z31.b\n" - ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" - "ld1rqb { z9.b }, p1/Z, [x25, #32]\n" - "mov z5.q, z5.q[0]\n" - ".inst 0x450e9936 // smmla z22.s, z9.b, z14.b\n" - ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" - "ld1rqb { z9.b }, p1/Z, [x25, #64]\n" - ".inst 0x451e9936 // smmla z22.s, z9.b, z30.b\n" - ".inst 0x45159927 // smmla z7.s, z9.b, z21.b\n" - "ld1rqb { z9.b }, p1/Z, [x25, #96]\n" - ".inst 0x45049936 // smmla z22.s, z9.b, z4.b\n" - ".inst 0x45119927 // smmla z7.s, z9.b, z17.b\n" - "uzp1 z9.d, z22.d, z7.d\n" - "scvtf z9.s, p1/m, z9.s\n" - "uzp2 z22.d, z22.d, z7.d\n" - "fmul z7.s, z23.s, z3.s[0]\n" - "scvtf z22.s, p1/m, z22.s\n" - "fmla z11.s, p1/M, z9.s, z7.s\n" - "ld1rqb { z9.b }, p1/Z, [x24]\n" - "fmul z7.s, z23.s, z3.s[1]\n" - "fmla z16.s, p1/M, z22.s, z7.s\n" - "mov z22.s, #0x0\n" - "mov z7.s, #0x0\n" - ".inst 0x451f9a56 // smmla z22.s, z18.b, z31.b\n" - ".inst 0x45069a47 // smmla z7.s, z18.b, z6.b\n" - "ld1rqb { z18.b }, p1/Z, [x25, #48]\n" - ".inst 0x450e9a56 // smmla z22.s, z18.b, z14.b\n" - ".inst 0x45029a47 // smmla z7.s, z18.b, z2.b\n" - "ld1rqb { z18.b }, p1/Z, [x25, #80]\n" - ".inst 0x451e9a56 // smmla z22.s, z18.b, z30.b\n" - ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" - "ld1rqb { z18.b }, p1/Z, [x25, #112]\n" - "add x25, x25, #0x88\n" - ".inst 0x45049a56 // smmla z22.s, z18.b, z4.b\n" - ".inst 0x45119a47 // smmla z7.s, z18.b, z17.b\n" - "uzp1 z18.d, z22.d, z7.d\n" - "scvtf z18.s, p1/m, z18.s\n" - "uzp2 z7.d, z22.d, z7.d\n" - "fmul z22.s, z23.s, z3.s[2]\n" - "fmul z3.s, z23.s, z3.s[3]\n" - "scvtf z7.s, p1/m, z7.s\n" - "fmla z19.s, p1/M, z18.s, z22.s\n" - "ld1rqb { z18.b }, p1/Z, [x24, #16]\n" - "fmul z22.s, z23.s, z5.s[0]\n" - "fmla z26.s, p1/M, z7.s, z3.s\n" - "mov z3.s, #0x0\n" - "mov z7.s, #0x0\n" - ".inst 0x451f9923 // smmla z3.s, z9.b, z31.b\n" - ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" - "ld1rqb { z9.b }, p1/Z, [x24, #32]\n" - ".inst 0x450e9923 // smmla z3.s, z9.b, z14.b\n" - ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" - "mov z9.s, #0x0\n" - ".inst 0x451f9a49 // smmla z9.s, z18.b, z31.b\n" - "mov z31.s, #0x0\n" - ".inst 0x45069a5f // smmla z31.s, z18.b, z6.b\n" - "ld1rqb { z6.b }, p1/Z, [x24, #48]\n" - "ld1rqb { z18.b }, p1/Z, [x24, #64]\n" - ".inst 0x450e98c9 // smmla z9.s, z6.b, z14.b\n" - "fmul z14.s, z23.s, z5.s[1]\n" - ".inst 0x450298df // smmla z31.s, z6.b, z2.b\n" - "ld1rqb { z6.b }, p1/Z, [x24, #80]\n" - "fmul z2.s, z23.s, z5.s[2]\n" - "fmul z23.s, z23.s, z5.s[3]\n" - ".inst 0x451e9a43 // smmla z3.s, z18.b, z30.b\n" - ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" - "ld1rqb { z5.b }, p1/Z, [x24, #96]\n" - ".inst 0x451e98c9 // smmla z9.s, z6.b, z30.b\n" - ".inst 0x451598df // smmla z31.s, z6.b, z21.b\n" - "ld1rqb { z18.b }, p1/Z, [x24, #112]\n" - "add x24, x24, #0x88\n" - ".inst 0x450498a3 // smmla z3.s, z5.b, z4.b\n" - ".inst 0x451198a7 // smmla z7.s, z5.b, z17.b\n" - ".inst 0x45049a49 // smmla z9.s, z18.b, z4.b\n" - ".inst 0x45119a5f // smmla z31.s, z18.b, z17.b\n" - "uzp1 z18.d, z3.d, z7.d\n" - "uzp2 z5.d, z3.d, z7.d\n" - "scvtf z18.s, p1/m, z18.s\n" - "uzp1 z6.d, z9.d, z31.d\n" - "uzp2 z9.d, z9.d, z31.d\n" - "scvtf z5.s, p1/m, z5.s\n" - "fmla z8.s, p1/M, z18.s, z22.s\n" - "scvtf z6.s, p1/m, z6.s\n" - "scvtf z9.s, p1/m, z9.s\n" - "fmla z29.s, p1/M, z5.s, z14.s\n" - "fmla z27.s, 
p1/M, z6.s, z2.s\n" - "fmla z10.s, p1/M, z9.s, z23.s\n" - "bgt 3b\n" - "mov x20, %x[res_ptr]\n" - "subs x10, x10, #0x8\n" - "add %x[res_ptr], %x[res_ptr], #0x20\n" - "st1w { z24.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z15.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z12.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z0.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z13.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z1.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z20.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z25.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z11.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z16.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z19.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z26.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z8.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z29.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z27.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z10.s }, p1, [x20]\n" - "bne 2b\n" - "mov x20, #0x4\n" - "sub x13, x13, #0x10\n" - "cmp x13, #0x10\n" - "mov %x[res_ptr], x9\n" - "madd %x[a_ptr], x20, x12, %x[a_ptr]\n" - "bge 1b\n" - "4:" // Row loop skip - "cbz x13, 9f\n" - "5:" // Row tail: Row loop - "add x25, %x[b_ptr], #0x10\n" - "mov x24, %x[width]\n" - "add x23, %x[res_ptr], %x[res_stride], LSL #2\n" - "6:" // Row tail: Column loop - "mov z24.b, #0x0\n" - "mov z15.b, #0x0\n" - "add x28, %x[a_ptr], #0x8\n" - "mov x22, %x[num_blocks]\n" - "mov z12.b, #0x0\n" - "mov z0.b, #0x0\n" - "7:" // Row tail: Block loop - "ld1b { z3.b }, p1/Z, [x25]\n" - "ld1b { z6.b }, p1/Z, [x25, #1, MUL VL]\n" - "mov z2.s, #0x0\n" - "mov z25.s, #0x0\n" - "ld1rqb { z26.b }, p1/Z, [x28]\n" - "ld1rqb { z21.b }, p1/Z, [x28, #16]\n" - "mov z27.s, #0x0\n" - "mov z19.s, #0x0\n" - "ld1b { z29.b }, p1/Z, [x25, #2, MUL VL]\n" - "ld1b { z16.b }, p1/Z, [x25, #3, MUL VL]\n" - "sub x21, x25, #0x10\n" - "sub x20, x28, #0x8\n" - "lsl z20.b, z3.b, #0x4\n" - "lsl z4.b, z6.b, #0x4\n" - "ld1rqb { z10.b }, p1/Z, [x28, #32]\n" - "ld1rqb { z23.b }, p1/Z, [x28, #48]\n" - "and z3.b, z3.b, #0xf0\n" - "and z6.b, z6.b, #0xf0\n" - "ld1rqb { z11.b }, p1/Z, [x28, #64]\n" - "ld1rqb { z7.b }, p1/Z, [x28, #80]\n" - "lsl z8.b, z29.b, #0x4\n" - "lsl z14.b, z16.b, #0x4\n" - "ld1rqb { z18.b }, p1/Z, [x28, #96]\n" - "ld1rqb { z30.b }, p1/Z, [x28, #112]\n" - ".inst 0x45149b42 // smmla z2.s, z26.b, z20.b\n" - ".inst 0x45049b59 // smmla z25.s, z26.b, z4.b\n" - "and z29.b, z29.b, #0xf0\n" - "ld1h { z17.s }, p1/Z, [x21]\n" - ".inst 0x45149abb // smmla z27.s, z21.b, z20.b\n" - ".inst 0x45049ab3 // smmla z19.s, z21.b, z4.b\n" - "and z16.b, z16.b, #0xf0\n" - "ld1h { z4.s }, p0/Z, [x20]\n" - "subs x22, x22, #0x1\n" - "add x28, x28, #0x88\n" - "fcvt z17.s, p1/m, z17.h\n" - "add x25, x25, #0x90\n" - ".inst 0x45089942 // smmla z2.s, z10.b, z8.b\n" - ".inst 0x450e9959 // smmla z25.s, z10.b, z14.b\n" - "fcvt z4.s, p1/m, z4.h\n" - ".inst 0x45089afb // smmla z27.s, z23.b, z8.b\n" - ".inst 0x450e9af3 // smmla z19.s, z23.b, z14.b\n" - "fscale z17.s, p1/m, z17.s, z28.s\n" - "mov z4.q, z4.q[0]\n" - ".inst 0x45039962 // smmla z2.s, z11.b, z3.b\n" - ".inst 0x45069979 // smmla z25.s, z11.b, z6.b\n" - "fmul z23.s, z17.s, z4.s[0]\n" - "fmul z9.s, z17.s, z4.s[1]\n" - "fmul z21.s, z17.s, z4.s[2]\n" - "fmul z4.s, z17.s, z4.s[3]\n" - ".inst 0x450398fb // smmla z27.s, 
z7.b, z3.b\n" - ".inst 0x450698f3 // smmla z19.s, z7.b, z6.b\n" - ".inst 0x451d9a42 // smmla z2.s, z18.b, z29.b\n" - ".inst 0x45109a59 // smmla z25.s, z18.b, z16.b\n" - ".inst 0x451d9bdb // smmla z27.s, z30.b, z29.b\n" - ".inst 0x45109bd3 // smmla z19.s, z30.b, z16.b\n" - "uzp1 z31.d, z2.d, z25.d\n" - "uzp2 z13.d, z2.d, z25.d\n" - "scvtf z31.s, p1/m, z31.s\n" - "uzp1 z17.d, z27.d, z19.d\n" - "uzp2 z18.d, z27.d, z19.d\n" - "scvtf z13.s, p1/m, z13.s\n" - "fmla z24.s, p1/M, z31.s, z23.s\n" - "scvtf z17.s, p1/m, z17.s\n" - "scvtf z18.s, p1/m, z18.s\n" - "fmla z15.s, p1/M, z13.s, z9.s\n" - "fmla z12.s, p1/M, z17.s, z21.s\n" - "fmla z0.s, p1/M, z18.s, z4.s\n" - "bgt 7b\n" - "mov x20, %x[res_ptr]\n" - "cmp x13, #0x1\n" - "st1w { z24.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x13, #0x2\n" - "st1w { z15.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x13, #0x3\n" - "st1w { z12.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "st1w { z0.s }, p1, [x20]\n" - "8:" // Row tail: Accumulator store skip - "subs x24, x24, #0x8\n" - "add %x[res_ptr], %x[res_ptr], #0x20\n" - "bne 6b\n" - "subs x13, x13, #0x4\n" - "add %x[a_ptr], %x[a_ptr], x12\n" - "mov %x[res_ptr], x23\n" - "bgt 5b\n" - "9:" // Row tail: Row loop skip - : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) - : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) - : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" - ); #endif -} - -void ggml_gemm_q4_0_q8_0_aarch64_neon(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); @@ -1534,11 +1386,7 @@ void ggml_gemm_q4_0_q8_0_aarch64_neon(int n, float * GGML_RESTRICT s, const void : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); -#endif -} - -void ggml_gemm_q4_0_q8_0_aarch64_neon_noi8mm(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { -#if defined(__ARM_NEON) +#elif defined(__ARM_NEON) int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); size_t width = xend - x0; @@ -2006,94 +1854,3 @@ void ggml_gemm_q4_0_q8_0_aarch64_neon_noi8mm(int n, float * GGML_RESTRICT s, con ); #endif } - -void ggml_gemm_q8_0_q8_0_aarch64(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { -#if defined(__ARM_FEATURE_MATMUL_INT8) - int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); - int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); - - int64_t nb = n / QK8_0; - 
int64_t a_nb = n / QK8_0; - - const block_q8_0x4 * b_ptr_start = (const block_q8_0x4 *) vx; - const block_q8_0x4 * a_ptr_start = (const block_q8_0x4 *) vy; - - for (int64_t y = 0; y < nr / 4; y += nr / 4) { - for (int64_t x = x0 / 4; x < xend / 4; x++) { - const block_q8_0x4 ** a_ptrs = new const block_q8_0x4 * [nr / 4]; - - a_ptrs[0] = a_ptr_start + (y * a_nb); - for (int i = 0; i < (nr / 4) - 1; i++) { - a_ptrs[i + 1] = a_ptrs[i] + a_nb; - } - - const block_q8_0x4 * b_ptr = b_ptr_start + (x * nb); - - // Master FP accumulators - float32x4_t * acc_rows = new float32x4_t[nr]; - for (int i = 0; i < nr; i++) { - acc_rows[i] = vdupq_n_f32(0.0f); - } - - for (int64_t b = 0; b < nb; b++) { - // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) - const int8x16_t rhs_mat_01_0 = vld1q_s8(b_ptr[b].qs); - const int8x16_t rhs_mat_23_0 = vld1q_s8(b_ptr[b].qs + 16); - const int8x16_t rhs_mat_01_1 = vld1q_s8(b_ptr[b].qs + 32); - const int8x16_t rhs_mat_23_1 = vld1q_s8(b_ptr[b].qs + 48); - const int8x16_t rhs_mat_01_2 = vld1q_s8(b_ptr[b].qs + 64); - const int8x16_t rhs_mat_23_2 = vld1q_s8(b_ptr[b].qs + 80); - const int8x16_t rhs_mat_01_3 = vld1q_s8(b_ptr[b].qs + 96); - const int8x16_t rhs_mat_23_3 = vld1q_s8(b_ptr[b].qs + 112); - - // Scale values - assemble the four row/column scales into a (64-bit) vector, then expand to FP32 - const float16x4_t col_scale_f16 = vld1_f16((const ggml_fp16_internal_t *)(b_ptr[b].d)); - const float32x4_t col_scale_f32 = vcvt_f32_f16(col_scale_f16); - - // Process LHS in pairs of rows - for (int rp = 0; rp < nr / 4; rp++) { - const int8x16_t lhs_mat_01_0 = vld1q_s8(a_ptrs[rp][b].qs); - const int8x16_t lhs_mat_23_0 = vld1q_s8(a_ptrs[rp][b].qs + 16); - const int8x16_t lhs_mat_01_1 = vld1q_s8(a_ptrs[rp][b].qs + 32); - const int8x16_t lhs_mat_23_1 = vld1q_s8(a_ptrs[rp][b].qs + 48); - - const int8x16_t lhs_mat_01_2 = vld1q_s8(a_ptrs[rp][b].qs + 64); - const int8x16_t lhs_mat_23_2 = vld1q_s8(a_ptrs[rp][b].qs + 80); - const int8x16_t lhs_mat_01_3 = vld1q_s8(a_ptrs[rp][b].qs + 96); - const int8x16_t lhs_mat_23_3 = vld1q_s8(a_ptrs[rp][b].qs + 112); - - // Do the MMLAs into 2x2 matrices - const int32x4_t iacc_mat_00 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_01_0), lhs_mat_01_1, rhs_mat_01_1), lhs_mat_01_2, rhs_mat_01_2), lhs_mat_01_3, rhs_mat_01_3); - const int32x4_t iacc_mat_01 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_23_0), lhs_mat_01_1, rhs_mat_23_1), lhs_mat_01_2, rhs_mat_23_2), lhs_mat_01_3, rhs_mat_23_3); - const int32x4_t iacc_mat_10 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_23_0, rhs_mat_01_0), lhs_mat_23_1, rhs_mat_01_1), lhs_mat_23_2, rhs_mat_01_2), lhs_mat_23_3, rhs_mat_01_3); - const int32x4_t iacc_mat_11 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_23_0, rhs_mat_23_0), lhs_mat_23_1, rhs_mat_23_1), lhs_mat_23_2, rhs_mat_23_2), lhs_mat_23_3, rhs_mat_23_3); - - // Straighten out to make 4 row vectors - const int32x4_t iacc_row_0 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); - const int32x4_t iacc_row_1 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); - const int32x4_t iacc_row_2 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_10), vreinterpretq_u64_s32(iacc_mat_11))); - const int32x4_t iacc_row_3 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_10), 
vreinterpretq_u64_s32(iacc_mat_11))); - - const float16x4_t row_scale_f16 = vld1_f16((const ggml_fp16_internal_t *)(a_ptrs[rp][b].d)); - const float32x4_t row_scale_f32 = vcvt_f32_f16(row_scale_f16); - - acc_rows[rp * 4] = vfmaq_f32(acc_rows[rp * 4], vcvtq_f32_s32(iacc_row_0), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 0)); - acc_rows[rp * 4 + 1] = vfmaq_f32(acc_rows[rp * 4 + 1], vcvtq_f32_s32(iacc_row_1), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 1)); - acc_rows[rp * 4 + 2] = vfmaq_f32(acc_rows[rp * 4 + 2], vcvtq_f32_s32(iacc_row_2), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 2)); - acc_rows[rp * 4 + 3] = vfmaq_f32(acc_rows[rp * 4 + 3], vcvtq_f32_s32(iacc_row_3), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 3)); - } - } - - for (int i = 0; i < nr; i++) { - vst1q_f32(s + ((y * 4 + i) * nc + x * 4), acc_rows[i]); - } - delete [] acc_rows; - delete [] a_ptrs; - } - } -#endif -} diff --git a/ggml-aarch64.h b/ggml-aarch64.h index e83b0178774aa..1f0767a99d103 100644 --- a/ggml-aarch64.h +++ b/ggml-aarch64.h @@ -24,17 +24,10 @@ block_q8_0x4 make_block_q8_0x4(const block_q8_0 * const in[4], unsigned int bloc block_q8_0x8 make_block_q8_0x8(const block_q8_0 * const in[8], unsigned int block_len); // GEMV -void ggml_gemv_q4_0_q8_0_aarch64_sve256 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); -void ggml_gemv_q4_0_q8_0_aarch64_neon (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); -void ggml_gemv_q4_0_q8_0_aarch64_neon_noi8mm(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); -void ggml_gemv_q8_0_q8_0_aarch64_sve256 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); -void ggml_gemv_q8_0_q8_0_aarch64_neon (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); +void ggml_gemv_q4_0_q8_0_aarch64 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); // GEMM -void ggml_gemm_q4_0_q8_0_aarch64_sve256 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); -void ggml_gemm_q4_0_q8_0_aarch64_neon (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); -void ggml_gemm_q4_0_q8_0_aarch64_neon_noi8mm(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); -void ggml_gemm_q8_0_q8_0_aarch64 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); +void ggml_gemm_q4_0_q8_0_aarch64 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); #ifdef __cplusplus } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index bfa329875d364..3a481c0a3e722 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -38,7 +38,7 @@ #include #endif -#ifdef __ARM_FEATURE_MATMUL_INT8 +#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) #undef GGML_USE_LLAMAFILE #endif @@ -915,16 +915,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = NULL, 
.vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, -#if defined(__ARM_FEATURE_SVE) - .gemv = ggml_gemv_q4_0_q8_0_aarch64_sve256, - .gemm = ggml_gemm_q4_0_q8_0_aarch64_sve256, -#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) - .gemv = ggml_gemv_q4_0_q8_0_aarch64_neon, - .gemm = ggml_gemm_q4_0_q8_0_aarch64_neon, -#elif defined(__ARM_NEON) - .gemv = ggml_gemv_q4_0_q8_0_aarch64_neon_noi8mm, - .gemm = ggml_gemm_q4_0_q8_0_aarch64_neon_noi8mm, -#endif + .gemv = ggml_gemv_q4_0_q8_0_aarch64, + .gemm = ggml_gemm_q4_0_q8_0_aarch64, } }; @@ -12242,15 +12234,15 @@ UseGgmlGemm1:; } } } - if ((type == GGML_TYPE_Q4_0_AARCH64) && (ne11 >= 4) && (ne12 == 1) && (ne13 == 1)) { + if (from_float_to_mat && gemm && (ne11 >= 4) && (ne12 == 1) && (ne13 == 1)) { for (int64_t i11 = 0; i11 < ne11 / 4; ++i11) { from_float_to_mat((float *)((char *) src1->data + i11 * 4 * nb11), (void *) wdata, ne10, 4, ggml_cpu_has_matmul_int8() ? 8 : 4); wdata += row_size * 4; } for (int64_t i11 = (ne11 / 4) * 4; i11 < ne11; ++i11) { from_float_to_vec_dot((float *)((char *) src1->data + i11 * nb11), (void *) wdata, ne10); - wdata += row_size; - } + wdata += row_size; + } } else { for (int64_t i13 = 0; i13 < ne13; ++i13) { @@ -12340,24 +12332,29 @@ UseGgmlGemm2:; //if (ith == 0) // printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1); - if ((ggml_n_dims(src0) == 2) && (ne11 == 1) && (type == GGML_TYPE_Q4_0_AARCH64)) { - gemv(ne00, (float *)((char *) dst->data), (const char *) src0->data, (const char *) wdata, 1, ne01, ith, nth); - } - else if ((ggml_n_dims(src0) == 2) && (ne11 >= 2) && (type == GGML_TYPE_Q4_0_AARCH64)) { - for (int row_iter = 0; row_iter < ne11 / 16; row_iter++) { - gemm(ne00, (float *)((char *) dst->data + (row_iter * 16 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 16) * row_size : (row_iter * 16 * nb11)), 16, ne01, ith, nth); - } - int rows_processed = (ne11 / 16) * 16; - for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 8; row_iter++) { - gemm(ne00, (float *)((char *) dst->data + ((rows_processed + row_iter * 8) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 8) * row_size : ((rows_processed + row_iter * 8) * nb11)), 8, ne01, ith, nth); - } - rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8; - for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { - gemm(ne00, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), 4, ne01, ith, nth); + if ((ggml_n_dims(src0) == 2) && gemm && gemv) { + if (ne11 == 1) gemv(ne00, (float *)((char *) dst->data), (const char *) src0->data, (const char *) wdata, 1, ne01, ith, nth); + else { + for (int row_iter = 0; row_iter < ne11 / 16; row_iter++) { + gemm(ne00, (float *)((char *) dst->data + (row_iter * 16 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? 
(row_iter * 16) * row_size : (row_iter * 16 * nb11)), 16, ne01, ith, nth); + } + int rows_processed = (ne11 / 16) * 16; + for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 8; row_iter++) { + gemm(ne00, (float *)((char *) dst->data + ((rows_processed + row_iter * 8) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 8) * row_size : ((rows_processed + row_iter * 8) * nb11)), 8, ne01, ith, nth); + } + rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8; + for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { + gemm(ne00, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), 4, ne01, ith, nth); + } + rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4; + for (int row_iter = rows_processed; row_iter < ne11; row_iter++) { + gemv(ne00, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * row_size) : (row_iter * nb11)), 1, ne01, ith, nth); + } } - rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4; - for (int row_iter = rows_processed; row_iter < ne11; row_iter++) { - gemv(ne00, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), 1, ne01, ith, nth); + } + else if ((ggml_n_dims(src0) == 2) && gemv) { + for (int row_iter = 0; row_iter < ne11; row_iter++) { + gemv(ne00, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? 
(row_iter * row_size) : (row_iter * nb11)), 1, ne01, ith, nth); } } else {

From 746b57f4c3126abe8c46c7e624e294252a0be503 Mon Sep 17 00:00:00 2001
From: Dibakar Gope
Date: Tue, 21 May 2024 08:56:45 +0000
Subject: [PATCH 09/28] Arm AArch64: minor code refactoring to split the Q4_0_AARCH64 type into three separate types: Q4_0_4_4, Q4_0_4_8, and Q4_0_8_8

---
 Package.swift | 2 +-
 build.zig | 2 +-
 examples/quantize/quantize.cpp | 4 +-
 ggml-aarch64.cpp => ggml-aarch64.c | 2670 +++++++++++++++-------------
 ggml-aarch64.h | 20 +-
 ggml/include/ggml.h | 10 +-
 ggml/src/ggml-quants.c | 17 +-
 ggml/src/ggml.c | 80 +-
 include/llama.h | 4 +-
 src/llama.cpp | 18 +-
 10 files changed, 1502 insertions(+), 1325 deletions(-)
 rename ggml-aarch64.cpp => ggml-aarch64.c (82%)

diff --git a/Package.swift b/Package.swift
index c357751dd3196..d40a48385f8c7 100644
--- a/Package.swift
+++ b/Package.swift
@@ -10,7 +10,7 @@ var sources = [
 "ggml/src/ggml-alloc.c",
 "ggml/src/ggml-backend.c",
 "ggml/src/ggml-quants.c",
- "ggml/src/ggml-aarch64.cpp",
+ "ggml/src/ggml-aarch64.c",
 ]
 var resources: [Resource] = []
diff --git a/build.zig b/build.zig
index 804634f2a023b..97fa42fdbb7c8 100644
--- a/build.zig
+++ b/build.zig
@@ -128,7 +128,7 @@ pub fn build(b: *std.build.Builder) !void {
 const train = make.obj("train", "common/train.cpp");
 const clip = make.obj("clip", "examples/llava/clip.cpp");
 const llava = make.obj("llava", "examples/llava/llava.cpp");
- const ggml_aarch64 = make.obj("ggml-aarch64", "ggml-aarch64.cpp");
+ const ggml_aarch64 = make.obj("ggml-aarch64", "ggml-aarch64.c");
 _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, ggml_aarch64, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
 _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, ggml_aarch64, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 214edb03c56b1..1578c4afb5dfa 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -46,7 +46,9 @@ static const std::vector QUANT_OPTIONS = {
 { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
 { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
 { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
- { "Q4_0_AARCH64", LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+ { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+ { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+ { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
 { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
 { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
 { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
diff --git a/ggml-aarch64.cpp b/ggml-aarch64.c
similarity index 82%
rename from ggml-aarch64.cpp
rename to ggml-aarch64.c
index b12cd0b28b530..d888031f315f8 100644
--- a/ggml-aarch64.cpp
+++ b/ggml-aarch64.c
@@ -1,8 +1,4 @@
 // SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
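// Note: the three new types encode their interleave geometry in the name,
// Q4_0_<rows>_<blocklen>. Q4_0_4_4 interleaves 4 rows in 4-byte blocks (NEON
// SDOT kernels), Q4_0_4_8 interleaves 4 rows in 8-byte blocks (NEON + i8mm
// SMMLA kernels), and Q4_0_8_8 interleaves 8 rows in 8-byte blocks (256-bit
// SVE kernels). The sketch below is illustrative only and is not part of the
// committed diff: pick_q4_0_format() is a hypothetical helper, while
// ggml_cpu_has_sve(), ggml_cpu_has_neon(), ggml_cpu_has_matmul_int8() and
// svcntw() are the feature checks already used in this file. It mirrors the
// GGML_ASSERT hints added further down.
static const char * pick_q4_0_format(void) {
#if defined(__ARM_FEATURE_SVE)
    if (ggml_cpu_has_sve() && svcntw() == 8) {
        return "Q4_0_8_8"; // 8 rows x 8-byte blocks for the 256-bit SVE path
    }
#endif
    if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
        return "Q4_0_4_8"; // 4 rows x 8-byte blocks for the NEON i8mm (SMMLA) path
    }
    if (ggml_cpu_has_neon()) {
        return "Q4_0_4_4"; // 4 rows x 4-byte blocks for the NEON SDOT path
    }
    return "Q4_0";         // assumed fallback: keep the plain, non-interleaved layout
}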
- -#pragma GCC diagnostic ignored "-Wpedantic" -#pragma GCC diagnostic ignored "-Wignored-attributes" - #define GGML_COMMON_IMPL_C #include "ggml-common.h" @@ -23,95 +19,76 @@ #define UNUSED GGML_UNUSED -size_t quantize_q4_0_aarch64(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { - if (!quant_weights) { - int nrows_interleaved = 1; - int blocklen_per_row; - -#if defined(__ARM_FEATURE_SVE) - if (svcntw() == 8) { - nrows_interleaved = 8; - blocklen_per_row = 8; - } - else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { - nrows_interleaved = 4; - blocklen_per_row = 8; - } -#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) - nrows_interleaved = 4; - blocklen_per_row = 8; -#elif defined(__ARM_NEON) - nrows_interleaved = 4; - blocklen_per_row = 4; -#endif +// Functions to create the interleaved data layout formats + +// interleave 4 block_q4_0s in blocks of block_len +// returns an interleaved block_q4_0x4 +// in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks +// first, then interleave quants from 4 block_q4_0s in blocks of block_len +// +// - in : an array of block_q4_0 pointers +// - block_len : the block_q4_0 quants bytes are interleaved in blocks of +// block_len bytes +// - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes +// from bias offset form to pure sign form (this saves subtract +// operations durin unpacking) +// +static block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len, unsigned int xor_mask) { + block_q4_0x4 out; - assert(n_per_row % QK4_0 == 0); - const int nb = n_per_row / QK4_0; + for (int i = 0; i < 4; i++) { + out.d[i] = in[i]->d; + } - void * out_ptr_B = NULL; - void * out_ptr_B_start = NULL; - if (nrows_interleaved == 8) { - out_ptr_B = (block_q4_0x8 *) malloc(sizeof(block_q4_0x8) * nb); - out_ptr_B_start = out_ptr_B; - } - else if (nrows_interleaved == 4) { - out_ptr_B = (block_q4_0x4 *) malloc(sizeof(block_q4_0x4) * nb); - out_ptr_B_start = out_ptr_B; - } + for (int i = 0; i < QK4_0 * 2; i++) { + int src_offset = (i / (4 * block_len)) * block_len; + int src_id = (i % (4 * block_len)) / block_len; + src_offset += (i % block_len); - for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) { - block_q4_0 ** in_ptrs = new block_q4_0 * [nrows_interleaved]; + out.qs[i] = in[src_id]->qs[src_offset] ^ xor_mask; + } - for (int i = 0; i < nrows_interleaved; i++ ) { - in_ptrs[i] = (block_q4_0 *) dst + (b + i * n_per_row) / QK4_0; - quantize_row_q4_0_reference(src + b + i * n_per_row, (block_q4_0 *) in_ptrs[i], n_per_row); - } + return out; +} - for (int64_t x = 0; x < nb; x++) { - if (nrows_interleaved == 8) { - *(block_q4_0x8 *) out_ptr_B = make_block_q4_0x8(in_ptrs, blocklen_per_row, 0x88); - out_ptr_B = (block_q4_0x8 *) out_ptr_B + 1; - } - else if (nrows_interleaved == 4) { - *(block_q4_0x4 *) out_ptr_B = make_block_q4_0x4(in_ptrs, blocklen_per_row, 0x88); - out_ptr_B = (block_q4_0x4 *) out_ptr_B + 1; - } - - for (int i = 0; i < nrows_interleaved; i++) { - in_ptrs[i]++; - } - } - delete [] in_ptrs; - out_ptr_B = out_ptr_B_start; - if (nrows_interleaved == 8) memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, sizeof(block_q4_0x8) * nb); - else if (nrows_interleaved == 4) memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, sizeof(block_q4_0x4) * nb); - } - if (out_ptr_B_start) free(out_ptr_B_start); +// interleave 8 block_q4_0s in blocks of block_len +// returns an interleaved 
block_q4_0x8 +// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks +// first, then interleave quants from 8 block_q4_0s in blocks of block_len +static block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len, unsigned int xor_mask) { + block_q4_0x8 out; - return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0)); + for (int i = 0; i < 8; i++) { + out.d[i] = in[i]->d; } - else { - assert(false); - return 0; + + for (int i = 0; i < QK4_0 * 4; i++) { + int src_offset = (i / (8 * block_len)) * block_len; + int src_id = (i % (8 * block_len)) / block_len; + src_offset += (i % block_len); + + out.qs[i] = in[src_id]->qs[src_offset] ^ xor_mask; } + + return out; } -void quantize_q8_0_aarch64(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k, int nrows_interleaved, int blocklen_per_row) { +void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; - block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; + block_q8_0x4 * restrict y = (block_q8_0x4 *) vy; #if defined(__ARM_NEON) - float * id = new float[nrows_interleaved]; - auto srcv = new float32x4_t[nrows_interleaved][8]; + float32x4_t srcv[4][8]; + float id[4]; for (int i = 0; i < nb; i++) { float32x4_t asrcv[8]; float32x4_t amaxv[8]; - for (int row_iter = 0; row_iter < nrows_interleaved; row_iter++) { + for (int row_iter = 0; row_iter < 4; row_iter++) { for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j); for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]); @@ -127,186 +104,201 @@ void quantize_q8_0_aarch64(const float * GGML_RESTRICT x, void * GGML_RESTRICT v y[i].d[row_iter] = GGML_FP32_TO_FP16(d); } - if (blocklen_per_row == 8) { - for (int j = 0; j < 4; j++) { - float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]); - int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 3] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 4] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 5] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 6] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 7] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[1][2 * j], id[1]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 8] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 9] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 10] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 11] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 12] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 13] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 14] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 15] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[2][2 * j], id[2]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 16] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 17] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 18] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 19] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[2][2 * j + 1], id[2]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 20] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 21] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 22] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 23] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[3][2 * j], id[3]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 24] = vgetq_lane_s32(vi, 0); 
- y[i].qs[32 * j + 25] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 26] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 27] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[3][2 * j + 1], id[3]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 28] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 29] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 30] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3); - } - } - else if (blocklen_per_row == 4) { - for (int j = 0; j < 8; j++) { - float32x4_t v = vmulq_n_f32(srcv[0][j], id[0]); - int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 3] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[1][j], id[1]); - vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 4] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 5] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 6] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 7] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[2][j], id[2]); - vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 8] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 9] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 10] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 11] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[3][j], id[3]); - vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 12] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 13] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 14] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3); - } + for (int j = 0; j < 8; j++) { + float32x4_t v = vmulq_n_f32(srcv[0][j], id[0]); + int32x4_t vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 3] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[1][j], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 4] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 5] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 6] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 7] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[2][j], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 8] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 9] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 10] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 11] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[3][j], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 12] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 13] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 14] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3); } } - delete [] id; - delete [] srcv; #endif } -// Routines to create the blocked formats -// Note input is array of pointers. -// The exact interleaving format needed is different for GEMM (using SMMLA) -// and GEMV (using SDOT) cases. For GEMM, we interleave 8 pairs of values -// at a time (with the two nibbles separated at runtime to give 2x2x8 -// matrices). For GEMV, we need to interleave 4 pairs of values instead. 
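// Note: the interleave performed by make_block_q4_0x4/x8 and
// make_block_q8_0x4/x8 is a plain byte gather: block_len quant bytes from
// row 0, then block_len bytes from row 1, and so on, wrapping back to row 0
// until every row's quant bytes (QK4_0/2 per q4_0 block, QK8_0 per q8_0
// block) are consumed. The scalar sketch below is illustrative only and is
// not part of the committed diff; interleave_rows_sketch() is a hypothetical
// helper that mirrors the src_id/src_offset arithmetic, and the q4 nibble
// conversion (the `+ 0x80` / `^ xor_mask` step in the real helpers) is
// intentionally left out.
static inline void interleave_rows_sketch(unsigned char * dst, const unsigned char * const src[],
                                          int nrows, int row_bytes, int block_len) {
    for (int i = 0; i < nrows * row_bytes; i++) {
        const int chunk       = i / (nrows * block_len);               // which block_len-wide slice
        const int row         = (i % (nrows * block_len)) / block_len; // which interleaved source row
        const int byte_in_row = chunk * block_len + (i % block_len);   // byte within that row
        dst[i] = src[row][byte_in_row];
    }
}
// For example, make_block_q4_0x4(in, /*block_len=*/4, ...) produces the same byte
// ordering as interleave_rows_sketch(out.qs, qs_ptrs, /*nrows=*/4, /*row_bytes=*/QK4_0 / 2,
// /*block_len=*/4), where qs_ptrs[i] points at in[i]->qs.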
-block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len, unsigned int xor_mask) { - block_q4_0x4 out; +void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; - for (int i = 0; i < 4; i++) { - out.d[i] = in[i]->d; - } + block_q8_0x4 * restrict y = (block_q8_0x4 *) vy; - for (int i = 0; i < QK4_0 * 2; i++) { - // We are interleaving 4 rows in blocks of 8, making a total of 32 - // output bytes per block (2 MMLA input vectors). This repeats - // until we have processed the whole block. - // - // Per the comment above, for GEMV cases a similar process is used - // but with blocks of 4 instead, giving a single DOT input vector. - // - // In the case of q4, we add on 128 to convert the top nibble from - // "bias offset" form to pure sign form (this saves a subtract when - // we unpack it). - int src_offset = (i / (4 * block_len)) * block_len; - int src_id = (i % (4 * block_len)) / block_len; - src_offset += (i % block_len); +#if defined(__ARM_NEON) + float32x4_t srcv[4][8]; + float id[4]; - out.qs[i] = in[src_id]->qs[src_offset] ^ xor_mask; - } + for (int i = 0; i < nb; i++) { + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; - return out; -} + for (int row_iter = 0; row_iter < 4; row_iter++) { + for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]); -// 8-block version - see comments in code above -block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len, unsigned int xor_mask) { - block_q4_0x8 out; + for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]); + for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]); + for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]); - for (int i = 0; i < 8; i++) { - out.d[i] = in[i]->d; - } + const float amax = vmaxvq_f32(amaxv[0]); - for (int i = 0; i < QK4_0 * 4; i++) { - int src_offset = (i / (8 * block_len)) * block_len; - int src_id = (i % (8 * block_len)) / block_len; - src_offset += (i % block_len); + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ? 
1.0f / d : 0.0f; - out.qs[i] = in[src_id]->qs[src_offset] ^ xor_mask; - } + y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + } - return out; + for (int j = 0; j < 4; j++) { + float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]); + int32x4_t vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 3] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 4] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 5] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 6] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 7] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[1][2 * j], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 8] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 9] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 10] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 11] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 12] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 13] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 14] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 15] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[2][2 * j], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 16] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 17] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 18] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 19] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[2][2 * j + 1], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 20] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 21] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 22] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 23] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[3][2 * j], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 24] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 25] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 26] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 27] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[3][2 * j + 1], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 28] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 29] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 30] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3); + } + } +#endif } -block_q8_0x4 make_block_q8_0x4(const block_q8_0 * const in[4], unsigned int block_len) { - block_q8_0x4 out; +static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blocklen_per_row) { + assert(n_per_row % QK4_0 == 0); + const int nb = n_per_row / QK4_0; - for (int i = 0; i < 4; i++) { - out.d[i] = in[i]->d; + void * out_ptr_B = NULL; + void * out_ptr_B_start = NULL; + if (nrows_interleaved == 8) { + out_ptr_B = (block_q4_0x8 *) malloc(sizeof(block_q4_0x8) * nb); + out_ptr_B_start = out_ptr_B; + } + else if (nrows_interleaved == 4) { + out_ptr_B = (block_q4_0x4 *) malloc(sizeof(block_q4_0x4) * nb); + out_ptr_B_start = out_ptr_B; } - for (int i = 0; i < QK8_0 * 4; i++) { - int src_offset = (i / (4 * block_len)) * block_len; - int src_id = (i % (4 * block_len)) / block_len; - src_offset += (i % block_len); + for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) { + block_q4_0 * in_ptrs[nrows_interleaved]; + + for (int i = 0; i < nrows_interleaved; i++ ) { + in_ptrs[i] = (block_q4_0 *) dst + (b + i * n_per_row) / QK4_0; + quantize_row_q4_0_reference(src + b + i * n_per_row, (block_q4_0 *) in_ptrs[i], n_per_row); + } + + for 
(int64_t x = 0; x < nb; x++) { + if (nrows_interleaved == 8) { + *(block_q4_0x8 *) out_ptr_B = make_block_q4_0x8((const block_q4_0 * const *) in_ptrs, blocklen_per_row, 0x88); + out_ptr_B = (block_q4_0x8 *) out_ptr_B + 1; + } + else if (nrows_interleaved == 4) { + *(block_q4_0x4 *) out_ptr_B = make_block_q4_0x4((const block_q4_0 * const *) in_ptrs, blocklen_per_row, 0x88); + out_ptr_B = (block_q4_0x4 *) out_ptr_B + 1; + } - out.qs[i] = in[src_id]->qs[src_offset]; + for (int i = 0; i < nrows_interleaved; i++) { + in_ptrs[i]++; + } + } + out_ptr_B = out_ptr_B_start; + if (nrows_interleaved == 8) memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, sizeof(block_q4_0x8) * nb); + else if (nrows_interleaved == 4) memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, sizeof(block_q4_0x4) * nb); } + if (out_ptr_B_start) free(out_ptr_B_start); - return out; + return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0)); } -// 8-block version - see comments in code above -block_q8_0x8 make_block_q8_0x8(const block_q8_0 * const in[8], unsigned int block_len) { - block_q8_0x8 out; - - for (int i = 0; i < 8; i++) { - out.d[i] = in[i]->d; +size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + if (!quant_weights) { + return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4); } + else { + assert(false); + return 0; + } +} - for (int i = 0; i < QK8_0 * 8; i++) { - int src_offset = (i / (8 * block_len)) * block_len; - int src_id = (i % (8 * block_len)) / block_len; - src_offset += (i % block_len); - - out.qs[i] = in[src_id]->qs[src_offset]; +size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + if (!quant_weights) { + return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8); + } + else { + assert(false); + return 0; } +} - return out; +size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + if (!quant_weights) { + return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8); + } + else { + assert(false); + return 0; + } } inline int64_t roundup(const int64_t a, const int64_t b) { @@ -319,7 +311,7 @@ inline int64_t roundup(const int64_t a, const int64_t b) { } } -void ggml_gemv_q4_0_q8_0_aarch64(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { +void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy, int nr, int nc, int ith, int nth) { UNUSED(n); UNUSED(s); UNUSED(vx); @@ -331,82 +323,14 @@ void ggml_gemv_q4_0_q8_0_aarch64(int n, float * GGML_RESTRICT s, const void * GG #if defined(__ARM_FEATURE_SVE) if (svcntw() == 8) { - int64_t x0 = roundup((ith * nc) / nth, (int64_t)8); - int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)8); - size_t width = xend - x0; - - int64_t nb = n / QK4_0; - const void * b_ptr = (const void *)((const block_q4_0x8 *) vx + ((x0 / 8) * nb)); - const void * a_ptr = vy; - float * res_ptr = s + x0; - - assert(n % 32 == 0); - assert(width % 8 == 0); - - size_t num_blocks = n / 32; - - __asm__ __volatile__( - "ptrue p0.b\n" - "add %x[b_ptr], %x[b_ptr], #0x10\n" - "1:" // Column loop - "add x22, %x[a_ptr], #0x2\n" - "mov z31.b, #0x0\n" - "mov x21, %x[num_blocks]\n" - "2:" // Block loop - "ld1b { z30.b }, p0/Z, [%x[b_ptr]]\n" - "ld1b { z29.b }, p0/Z, [%x[b_ptr], #1, MUL VL]\n" - "mov z28.s, 
#0x0\n" - "mov z27.s, #0x0\n" - "ld1rd { z26.d }, p0/Z, [x22]\n" - "ld1b { z25.b }, p0/Z, [%x[b_ptr], #2, MUL VL]\n" - "sub x20, x22, #0x2\n" - "sub x21, x21, #0x1\n" - "ld1b { z24.b }, p0/Z, [%x[b_ptr], #3, MUL VL]\n" - "ld1rd { z23.d }, p0/Z, [x22, #8]\n" - "lsl z22.b, z30.b, #0x4\n" - "lsl z16.b, z29.b, #0x4\n" - "and z30.b, z30.b, #0xf0\n" - "and z29.b, z29.b, #0xf0\n" - "ld1rd { z21.d }, p0/Z, [x22, #16]\n" - "ld1rd { z20.d }, p0/Z, [x22, #24]\n" - "lsl z19.b, z25.b, #0x4\n" - "and z25.b, z25.b, #0xf0\n" - "ld1rh { z17.h }, p0/Z, [x20]\n" - "ld1h { z18.s }, p0/Z, [%x[b_ptr], #-1, MUL VL]\n" - "sdot z28.s, z22.b, z26.b\n" - "sdot z27.s, z16.b, z26.b\n" - "lsl z16.b, z24.b, #0x4\n" - "add x22, x22, #0x22\n" - "and z24.b, z24.b, #0xf0\n" - "add %x[b_ptr], %x[b_ptr], #0x90\n" - "fcvt z17.s, p0/m, z17.h\n" - "fcvt z18.s, p0/m, z18.h\n" - "sdot z28.s, z19.b, z23.b\n" - "sdot z27.s, z16.b, z23.b\n" - "fmul z18.s, z18.s, z17.s\n" - "sdot z28.s, z30.b, z21.b\n" - "sdot z27.s, z29.b, z21.b\n" - "sdot z28.s, z25.b, z20.b\n" - "sdot z27.s, z24.b, z20.b\n" - "uzp1 z17.s, z28.s, z27.s\n" - "uzp2 z16.s, z28.s, z27.s\n" - "add z17.s, z17.s, z16.s\n" - "asr z17.s, z17.s, #0x4\n" - "scvtf z17.s, p0/m, z17.s\n" - "fmla z31.s, p0/M, z17.s, z18.s\n" - "cbnz x21, 2b\n" - "sub %x[width], %x[width], #0x8\n" - "st1w { z31.s }, p0, [%x[res_ptr]]\n" - "add %x[res_ptr], %x[res_ptr], #0x20\n" - "cbnz %x[width], 1b\n" - : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) - : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) - : "memory", "p0", "x20", "x21", "x22", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" - ); - return; + GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) && + "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) && + "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance"); +#elif defined(__ARM_NEON) int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); size_t width = xend - x0; @@ -422,63 +346,77 @@ void ggml_gemv_q4_0_q8_0_aarch64(int n, float * GGML_RESTRICT s, const void * GG size_t num_blocks = n / 32; __asm__ __volatile__( - "movi v2.16b, #0x4\n" - "movi v1.16b, #0xf0\n" + "movi v31.16b, #0x4\n" + "movi v30.16b, #0xf0\n" "add %x[b_ptr], %x[b_ptr], #0x8\n" "1:" // Column loop - "add x23, %x[a_ptr], #0x2\n" - "movi v0.16b, #0x0\n" - "mov x22, %x[num_blocks]\n" + "add x22, %x[a_ptr], #0x2\n" + "movi v29.16b, #0x0\n" + "mov x21, %x[num_blocks]\n" "2:" // Block loop - "ldr q31, [%x[b_ptr], #0x0]\n" - "ldr q30, [%x[b_ptr], #0x10]\n" - "mov x21, x23\n" - "movi v29.4s, #0x0\n" - "ldr q28, [%x[b_ptr], #0x20]\n" - "ldr q27, [%x[b_ptr], #0x30]\n" + "ldr q28, [%x[b_ptr], #0x0]\n" + "ldr q27, [x22, #0x0]\n" "movi v26.4s, #0x0\n" - "sub x20, x23, #0x2\n" - "ld1r { v25.8h }, [x20]\n" - "ldr q24, [%x[b_ptr], #-0x8]\n" - "sub x22, x22, #0x1\n" - "add x23, x23, #0x22\n" - "ld1r { v23.2d }, [x21], #0x8\n" - "sshl v22.16b, v31.16b, v2.16b\n" - "sshl v16.16b, v30.16b, v2.16b\n" + "sub x20, x22, #0x2\n" + "ldr q25, [x22, #0x10]\n" + "ldr q24, [%x[b_ptr], #0x10]\n" + "sub x21, x21, #0x1\n" + "add x22, x22, #0x22\n" + "ldr q23, [%x[b_ptr], #0x20]\n" + "ldr q22, [%x[b_ptr], #0x30]\n" + "ld1r { v21.8h }, [x20]\n" + "ldr q20, [%x[b_ptr], 
#-0x8]\n" + "sshl v16.16b, v28.16b, v31.16b\n" + "and v28.16b, v28.16b, v30.16b\n" + "sshl v19.16b, v24.16b, v31.16b\n" + "and v24.16b, v24.16b, v30.16b\n" "add %x[b_ptr], %x[b_ptr], #0x48\n" - "ld1r { v21.2d }, [x21], #0x8\n" - "sshl v20.16b, v28.16b, v2.16b\n" - "sshl v19.16b, v27.16b, v2.16b\n" - "ld1r { v18.2d }, [x21], #0x8\n" - "ld1r { v17.2d }, [x21], #0x8\n" - "and v31.16b, v31.16b, v1.16b\n" - "and v30.16b, v30.16b, v1.16b\n" - ".inst 0x4e9796dd // sdot v29.4s, v22.16b, v23.16b\n" - ".inst 0x4e97961a // sdot v26.4s, v16.16b, v23.16b\n" - "and v28.16b, v28.16b, v1.16b\n" - "and v27.16b, v27.16b, v1.16b\n" - "fcvtl v25.4s, v25.4h\n" - "fcvtl v16.4s, v24.4h\n" - ".inst 0x4e95969d // sdot v29.4s, v20.16b, v21.16b\n" - ".inst 0x4e95967a // sdot v26.4s, v19.16b, v21.16b\n" - "fmul v16.4s, v16.4s, v25.4s\n" - ".inst 0x4e9297fd // sdot v29.4s, v31.16b, v18.16b\n" - ".inst 0x4e9297da // sdot v26.4s, v30.16b, v18.16b\n" - ".inst 0x4e91979d // sdot v29.4s, v28.16b, v17.16b\n" - ".inst 0x4e91977a // sdot v26.4s, v27.16b, v17.16b\n" - "addp v29.4s, v29.4s, v26.4s\n" - "scvtf v29.4s, v29.4s, #0x4\n" - "fmla v0.4s, v29.4s, v16.4s\n" - "cbnz x22, 2b\n" + "sshl v18.16b, v23.16b, v31.16b\n" + "and v23.16b, v23.16b, v30.16b\n" + ".inst 0x4f9be21a // sdot v26.4s, v16.16b, v27.4b[0]\n" + "sshl v17.16b, v22.16b, v31.16b\n" + "and v22.16b, v22.16b, v30.16b\n" + "fcvtl v21.4s, v21.4h\n" + "fcvtl v16.4s, v20.4h\n" + ".inst 0x4f99e39a // sdot v26.4s, v28.16b, v25.4b[0]\n" + "fmul v16.4s, v16.4s, v21.4s\n" + ".inst 0x4fbbe27a // sdot v26.4s, v19.16b, v27.4b[1]\n" + ".inst 0x4fb9e31a // sdot v26.4s, v24.16b, v25.4b[1]\n" + ".inst 0x4f9bea5a // sdot v26.4s, v18.16b, v27.4b[2]\n" + ".inst 0x4f99eafa // sdot v26.4s, v23.16b, v25.4b[2]\n" + ".inst 0x4fbbea3a // sdot v26.4s, v17.16b, v27.4b[3]\n" + ".inst 0x4fb9eada // sdot v26.4s, v22.16b, v25.4b[3]\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "fmla v29.4s, v26.4s, v16.4s\n" + "cbnz x21, 2b\n" "sub %x[width], %x[width], #0x4\n" - "str q0, [%x[res_ptr], #0x0]\n" + "str q29, [%x[res_ptr], #0x0]\n" "add %x[res_ptr], %x[res_ptr], #0x10\n" "cbnz %x[width], 1b\n" : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) - : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23" + : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22" ); -#elif defined(__ARM_NEON) +#endif +} + +void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy, int nr, int nc, int ith, int nth) { + UNUSED(n); + UNUSED(s); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(ith); + UNUSED(nth); + +#if defined(__ARM_FEATURE_SVE) + if (svcntw() == 8) { + GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) && + "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); + } +#endif +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); size_t width = xend - x0; @@ -494,61 +432,70 @@ void ggml_gemv_q4_0_q8_0_aarch64(int n, float * GGML_RESTRICT s, const void * GG size_t num_blocks = n / 32; __asm__ __volatile__( - "movi v31.16b, #0x4\n" - "movi v30.16b, #0xf0\n" + "movi v2.16b, #0x4\n" + "movi v1.16b, #0xf0\n" "add %x[b_ptr], 
%x[b_ptr], #0x8\n" "1:" // Column loop - "add x22, %x[a_ptr], #0x2\n" - "movi v29.16b, #0x0\n" - "mov x21, %x[num_blocks]\n" + "add x23, %x[a_ptr], #0x2\n" + "movi v0.16b, #0x0\n" + "mov x22, %x[num_blocks]\n" "2:" // Block loop - "ldr q28, [%x[b_ptr], #0x0]\n" - "ldr q27, [x22, #0x0]\n" + "ldr q31, [%x[b_ptr], #0x0]\n" + "ldr q30, [%x[b_ptr], #0x10]\n" + "mov x21, x23\n" + "movi v29.4s, #0x0\n" + "ldr q28, [%x[b_ptr], #0x20]\n" + "ldr q27, [%x[b_ptr], #0x30]\n" "movi v26.4s, #0x0\n" - "sub x20, x22, #0x2\n" - "ldr q25, [x22, #0x10]\n" - "ldr q24, [%x[b_ptr], #0x10]\n" - "sub x21, x21, #0x1\n" - "add x22, x22, #0x22\n" - "ldr q23, [%x[b_ptr], #0x20]\n" - "ldr q22, [%x[b_ptr], #0x30]\n" - "ld1r { v21.8h }, [x20]\n" - "ldr q20, [%x[b_ptr], #-0x8]\n" - "sshl v16.16b, v28.16b, v31.16b\n" - "and v28.16b, v28.16b, v30.16b\n" - "sshl v19.16b, v24.16b, v31.16b\n" - "and v24.16b, v24.16b, v30.16b\n" + "sub x20, x23, #0x2\n" + "ld1r { v25.8h }, [x20]\n" + "ldr q24, [%x[b_ptr], #-0x8]\n" + "sub x22, x22, #0x1\n" + "add x23, x23, #0x22\n" + "ld1r { v23.2d }, [x21], #0x8\n" + "sshl v22.16b, v31.16b, v2.16b\n" + "sshl v16.16b, v30.16b, v2.16b\n" "add %x[b_ptr], %x[b_ptr], #0x48\n" - "sshl v18.16b, v23.16b, v31.16b\n" - "and v23.16b, v23.16b, v30.16b\n" - ".inst 0x4f9be21a // sdot v26.4s, v16.16b, v27.4b[0]\n" - "sshl v17.16b, v22.16b, v31.16b\n" - "and v22.16b, v22.16b, v30.16b\n" - "fcvtl v21.4s, v21.4h\n" - "fcvtl v16.4s, v20.4h\n" - ".inst 0x4f99e39a // sdot v26.4s, v28.16b, v25.4b[0]\n" - "fmul v16.4s, v16.4s, v21.4s\n" - ".inst 0x4fbbe27a // sdot v26.4s, v19.16b, v27.4b[1]\n" - ".inst 0x4fb9e31a // sdot v26.4s, v24.16b, v25.4b[1]\n" - ".inst 0x4f9bea5a // sdot v26.4s, v18.16b, v27.4b[2]\n" - ".inst 0x4f99eafa // sdot v26.4s, v23.16b, v25.4b[2]\n" - ".inst 0x4fbbea3a // sdot v26.4s, v17.16b, v27.4b[3]\n" - ".inst 0x4fb9eada // sdot v26.4s, v22.16b, v25.4b[3]\n" - "scvtf v26.4s, v26.4s, #0x4\n" - "fmla v29.4s, v26.4s, v16.4s\n" - "cbnz x21, 2b\n" + "ld1r { v21.2d }, [x21], #0x8\n" + "sshl v20.16b, v28.16b, v2.16b\n" + "sshl v19.16b, v27.16b, v2.16b\n" + "ld1r { v18.2d }, [x21], #0x8\n" + "ld1r { v17.2d }, [x21], #0x8\n" + "and v31.16b, v31.16b, v1.16b\n" + "and v30.16b, v30.16b, v1.16b\n" + ".inst 0x4e9796dd // sdot v29.4s, v22.16b, v23.16b\n" + ".inst 0x4e97961a // sdot v26.4s, v16.16b, v23.16b\n" + "and v28.16b, v28.16b, v1.16b\n" + "and v27.16b, v27.16b, v1.16b\n" + "fcvtl v25.4s, v25.4h\n" + "fcvtl v16.4s, v24.4h\n" + ".inst 0x4e95969d // sdot v29.4s, v20.16b, v21.16b\n" + ".inst 0x4e95967a // sdot v26.4s, v19.16b, v21.16b\n" + "fmul v16.4s, v16.4s, v25.4s\n" + ".inst 0x4e9297fd // sdot v29.4s, v31.16b, v18.16b\n" + ".inst 0x4e9297da // sdot v26.4s, v30.16b, v18.16b\n" + ".inst 0x4e91979d // sdot v29.4s, v28.16b, v17.16b\n" + ".inst 0x4e91977a // sdot v26.4s, v27.16b, v17.16b\n" + "addp v29.4s, v29.4s, v26.4s\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "fmla v0.4s, v29.4s, v16.4s\n" + "cbnz x22, 2b\n" "sub %x[width], %x[width], #0x4\n" - "str q29, [%x[res_ptr], #0x0]\n" + "str q0, [%x[res_ptr], #0x0]\n" "add %x[res_ptr], %x[res_ptr], #0x10\n" "cbnz %x[width], 1b\n" : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) - : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22" + : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23" 
); +#elif defined(__ARM_NEON) + GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && + "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " + "performance"); #endif } -void ggml_gemm_q4_0_q8_0_aarch64(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth) { +void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy, int nr, int nc, int ith, int nth) { UNUSED(n); UNUSED(s); UNUSED(vx); @@ -558,7 +505,7 @@ void ggml_gemm_q4_0_q8_0_aarch64(int n, float * GGML_RESTRICT s, const void * GG UNUSED(ith); UNUSED(nth); -#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) +#if defined(__ARM_FEATURE_SVE) if (svcntw() == 8) { int64_t x0 = roundup((ith * nc) / nth, (int64_t)8); int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)8); @@ -568,7 +515,6 @@ void ggml_gemm_q4_0_q8_0_aarch64(int n, float * GGML_RESTRICT s, const void * GG const void * b_ptr = (const void *)((const block_q4_0x8 *) vx + ((x0 / 8) * nb)); const void * a_ptr = vy; float * res_ptr = s + x0; - size_t res_stride = nc * sizeof(float); assert(n % 32 == 0); assert(width % 8 == 0); @@ -576,417 +522,112 @@ void ggml_gemm_q4_0_q8_0_aarch64(int n, float * GGML_RESTRICT s, const void * GG size_t num_blocks = n / 32; __asm__ __volatile__( - "mov x20, #0x4\n" - "mov x13, %x[nr]\n" - "mov z28.s, #-0x4\n" - "mov x12, #0x88\n" - "ptrue p1.b\n" - "whilelt p0.s, XZR, x20\n" - "cmp x13, #0x10\n" - "mul x12, %x[num_blocks], x12\n" - "blt 4f\n" - "1:" // Row loop - "add x11, %x[b_ptr], #0x10\n" - "mov x10, %x[width]\n" - "add x9, %x[res_ptr], %x[res_stride], LSL #4\n" - "2:" // Column loop - "add x28, %x[a_ptr], #0x8\n" - "mov z24.b, #0x0\n" - "mov z15.b, #0x0\n" - "mov x27, %x[num_blocks]\n" - "add x26, x28, x12\n" - "mov z12.b, #0x0\n" - "mov z0.b, #0x0\n" - "add x25, x26, x12\n" - "mov z13.b, #0x0\n" - "mov z1.b, #0x0\n" - "add x24, x25, x12\n" - "mov z20.b, #0x0\n" - "mov z25.b, #0x0\n" - "mov z11.b, #0x0\n" - "mov z16.b, #0x0\n" - "mov z19.b, #0x0\n" - "mov z26.b, #0x0\n" - "mov z8.b, #0x0\n" - "mov z29.b, #0x0\n" - "mov z27.b, #0x0\n" - "mov z10.b, #0x0\n" - "3:" // Block loop - "ld1b { z30.b }, p1/Z, [x11]\n" - "ld1b { z21.b }, p1/Z, [x11, #1, MUL VL]\n" - "mov z18.s, #0x0\n" - "mov z7.s, #0x0\n" - "ld1rqb { z3.b }, p1/Z, [x28]\n" - "ld1rqb { z5.b }, p1/Z, [x28, #16]\n" - "mov z9.s, #0x0\n" - "mov z22.s, #0x0\n" - "ld1b { z4.b }, p1/Z, [x11, #2, MUL VL]\n" - "ld1b { z17.b }, p1/Z, [x11, #3, MUL VL]\n" - "sub x20, x11, #0x10\n" - "sub x23, x28, #0x8\n" - "lsl z31.b, z30.b, #0x4\n" - "lsl z6.b, z21.b, #0x4\n" - "ld1h { z23.s }, p1/Z, [x20]\n" - "sub x22, x26, #0x8\n" + "ptrue p0.b\n" + "add %x[b_ptr], %x[b_ptr], #0x10\n" + "1:" // Column loop + "add x22, %x[a_ptr], #0x2\n" + "mov z31.b, #0x0\n" + "mov x21, %x[num_blocks]\n" + "2:" // Block loop + "ld1b { z30.b }, p0/Z, [%x[b_ptr]]\n" + "ld1b { z29.b }, p0/Z, [%x[b_ptr], #1, MUL VL]\n" + "mov z28.s, #0x0\n" + "mov z27.s, #0x0\n" + "ld1rd { z26.d }, p0/Z, [x22]\n" + "ld1b { z25.b }, p0/Z, [%x[b_ptr], #2, MUL VL]\n" + "sub x20, x22, #0x2\n" + "sub x21, x21, #0x1\n" + "ld1b { z24.b }, p0/Z, [%x[b_ptr], #3, MUL VL]\n" + "ld1rd { z23.d }, p0/Z, [x22, #8]\n" + "lsl z22.b, z30.b, #0x4\n" + "lsl z16.b, z29.b, #0x4\n" "and z30.b, z30.b, #0xf0\n" - "and z21.b, z21.b, #0xf0\n" - "sub x21, x25, #0x8\n" - "sub x20, x24, #0x8\n" - "lsl z14.b, z4.b, #0x4\n" - "lsl z2.b, z17.b, #0x4\n" - "subs x27, 
x27, #0x1\n" - "add x11, x11, #0x90\n" - ".inst 0x451f9872 // smmla z18.s, z3.b, z31.b\n" - ".inst 0x45069867 // smmla z7.s, z3.b, z6.b\n" - "ld1rqb { z3.b }, p1/Z, [x28, #32]\n" - "and z4.b, z4.b, #0xf0\n" - ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" - ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" - "ld1rqb { z5.b }, p1/Z, [x28, #48]\n" - "and z17.b, z17.b, #0xf0\n" - "fcvt z23.s, p1/m, z23.h\n" - ".inst 0x450e9872 // smmla z18.s, z3.b, z14.b\n" - ".inst 0x45029867 // smmla z7.s, z3.b, z2.b\n" - "ld1rqb { z3.b }, p1/Z, [x28, #64]\n" - ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" - ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" - "ld1rqb { z5.b }, p1/Z, [x28, #80]\n" - "fscale z23.s, p1/m, z23.s, z28.s\n" - ".inst 0x451e9872 // smmla z18.s, z3.b, z30.b\n" - ".inst 0x45159867 // smmla z7.s, z3.b, z21.b\n" - "ld1rqb { z3.b }, p1/Z, [x28, #96]\n" - ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" - ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" - "ld1rqb { z5.b }, p1/Z, [x28, #112]\n" - "add x28, x28, #0x88\n" - ".inst 0x45049872 // smmla z18.s, z3.b, z4.b\n" - ".inst 0x45119867 // smmla z7.s, z3.b, z17.b\n" - "ld1h { z3.s }, p0/Z, [x23]\n" - ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" - ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" - "fcvt z3.s, p1/m, z3.h\n" - "uzp1 z5.d, z18.d, z7.d\n" - "uzp2 z18.d, z18.d, z7.d\n" - "mov z3.q, z3.q[0]\n" - "uzp1 z7.d, z9.d, z22.d\n" - "uzp2 z22.d, z9.d, z22.d\n" - "fmul z9.s, z23.s, z3.s[0]\n" - "scvtf z5.s, p1/m, z5.s\n" - "scvtf z18.s, p1/m, z18.s\n" - "scvtf z7.s, p1/m, z7.s\n" - "scvtf z22.s, p1/m, z22.s\n" - "fmla z24.s, p1/M, z5.s, z9.s\n" - "ld1rqb { z5.b }, p1/Z, [x26]\n" - "fmul z9.s, z23.s, z3.s[1]\n" - "fmla z15.s, p1/M, z18.s, z9.s\n" - "ld1rqb { z18.b }, p1/Z, [x26, #16]\n" - "fmul z9.s, z23.s, z3.s[2]\n" - "fmul z3.s, z23.s, z3.s[3]\n" - "fmla z12.s, p1/M, z7.s, z9.s\n" - "mov z9.s, #0x0\n" - "ld1h { z7.s }, p0/Z, [x22]\n" - ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" - "fmla z0.s, p1/M, z22.s, z3.s\n" - "mov z22.s, #0x0\n" - "ld1h { z3.s }, p0/Z, [x21]\n" - ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" - "ld1rqb { z5.b }, p1/Z, [x26, #32]\n" - "fcvt z7.s, p1/m, z7.h\n" - "fcvt z3.s, p1/m, z3.h\n" - ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" - ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" - "ld1rqb { z5.b }, p1/Z, [x26, #64]\n" - "mov z7.q, z7.q[0]\n" - "mov z3.q, z3.q[0]\n" - ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" - ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" - "ld1rqb { z5.b }, p1/Z, [x26, #96]\n" - ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" - ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" - "uzp1 z5.d, z9.d, z22.d\n" - "scvtf z5.s, p1/m, z5.s\n" - "uzp2 z22.d, z9.d, z22.d\n" - "fmul z9.s, z23.s, z7.s[0]\n" - "scvtf z22.s, p1/m, z22.s\n" - "fmla z13.s, p1/M, z5.s, z9.s\n" - "ld1rqb { z9.b }, p1/Z, [x25]\n" - "fmul z5.s, z23.s, z7.s[1]\n" - "fmla z1.s, p1/M, z22.s, z5.s\n" - "mov z5.s, #0x0\n" - "mov z22.s, #0x0\n" - ".inst 0x451f9a45 // smmla z5.s, z18.b, z31.b\n" - ".inst 0x45069a56 // smmla z22.s, z18.b, z6.b\n" - "ld1rqb { z18.b }, p1/Z, [x26, #48]\n" - ".inst 0x450e9a45 // smmla z5.s, z18.b, z14.b\n" - ".inst 0x45029a56 // smmla z22.s, z18.b, z2.b\n" - "ld1rqb { z18.b }, p1/Z, [x26, #80]\n" - ".inst 0x451e9a45 // smmla z5.s, z18.b, z30.b\n" - ".inst 0x45159a56 // smmla z22.s, z18.b, z21.b\n" - "ld1rqb { z18.b }, p1/Z, [x26, #112]\n" - "add x26, x26, #0x88\n" - ".inst 0x45049a45 // smmla z5.s, z18.b, z4.b\n" - ".inst 0x45119a56 // smmla z22.s, z18.b, z17.b\n" - "uzp1 z18.d, z5.d, 
z22.d\n" - "scvtf z18.s, p1/m, z18.s\n" - "uzp2 z22.d, z5.d, z22.d\n" - "fmul z5.s, z23.s, z7.s[2]\n" - "fmul z7.s, z23.s, z7.s[3]\n" - "scvtf z22.s, p1/m, z22.s\n" - "fmla z20.s, p1/M, z18.s, z5.s\n" - "ld1rqb { z18.b }, p1/Z, [x25, #16]\n" - "ld1h { z5.s }, p0/Z, [x20]\n" - "fcvt z5.s, p1/m, z5.h\n" - "fmla z25.s, p1/M, z22.s, z7.s\n" - "mov z22.s, #0x0\n" - "mov z7.s, #0x0\n" - ".inst 0x451f9936 // smmla z22.s, z9.b, z31.b\n" - ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" - "ld1rqb { z9.b }, p1/Z, [x25, #32]\n" - "mov z5.q, z5.q[0]\n" - ".inst 0x450e9936 // smmla z22.s, z9.b, z14.b\n" - ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" - "ld1rqb { z9.b }, p1/Z, [x25, #64]\n" - ".inst 0x451e9936 // smmla z22.s, z9.b, z30.b\n" - ".inst 0x45159927 // smmla z7.s, z9.b, z21.b\n" - "ld1rqb { z9.b }, p1/Z, [x25, #96]\n" - ".inst 0x45049936 // smmla z22.s, z9.b, z4.b\n" - ".inst 0x45119927 // smmla z7.s, z9.b, z17.b\n" - "uzp1 z9.d, z22.d, z7.d\n" - "scvtf z9.s, p1/m, z9.s\n" - "uzp2 z22.d, z22.d, z7.d\n" - "fmul z7.s, z23.s, z3.s[0]\n" - "scvtf z22.s, p1/m, z22.s\n" - "fmla z11.s, p1/M, z9.s, z7.s\n" - "ld1rqb { z9.b }, p1/Z, [x24]\n" - "fmul z7.s, z23.s, z3.s[1]\n" - "fmla z16.s, p1/M, z22.s, z7.s\n" - "mov z22.s, #0x0\n" - "mov z7.s, #0x0\n" - ".inst 0x451f9a56 // smmla z22.s, z18.b, z31.b\n" - ".inst 0x45069a47 // smmla z7.s, z18.b, z6.b\n" - "ld1rqb { z18.b }, p1/Z, [x25, #48]\n" - ".inst 0x450e9a56 // smmla z22.s, z18.b, z14.b\n" - ".inst 0x45029a47 // smmla z7.s, z18.b, z2.b\n" - "ld1rqb { z18.b }, p1/Z, [x25, #80]\n" - ".inst 0x451e9a56 // smmla z22.s, z18.b, z30.b\n" - ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" - "ld1rqb { z18.b }, p1/Z, [x25, #112]\n" - "add x25, x25, #0x88\n" - ".inst 0x45049a56 // smmla z22.s, z18.b, z4.b\n" - ".inst 0x45119a47 // smmla z7.s, z18.b, z17.b\n" - "uzp1 z18.d, z22.d, z7.d\n" - "scvtf z18.s, p1/m, z18.s\n" - "uzp2 z7.d, z22.d, z7.d\n" - "fmul z22.s, z23.s, z3.s[2]\n" - "fmul z3.s, z23.s, z3.s[3]\n" - "scvtf z7.s, p1/m, z7.s\n" - "fmla z19.s, p1/M, z18.s, z22.s\n" - "ld1rqb { z18.b }, p1/Z, [x24, #16]\n" - "fmul z22.s, z23.s, z5.s[0]\n" - "fmla z26.s, p1/M, z7.s, z3.s\n" - "mov z3.s, #0x0\n" - "mov z7.s, #0x0\n" - ".inst 0x451f9923 // smmla z3.s, z9.b, z31.b\n" - ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" - "ld1rqb { z9.b }, p1/Z, [x24, #32]\n" - ".inst 0x450e9923 // smmla z3.s, z9.b, z14.b\n" - ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" - "mov z9.s, #0x0\n" - ".inst 0x451f9a49 // smmla z9.s, z18.b, z31.b\n" - "mov z31.s, #0x0\n" - ".inst 0x45069a5f // smmla z31.s, z18.b, z6.b\n" - "ld1rqb { z6.b }, p1/Z, [x24, #48]\n" - "ld1rqb { z18.b }, p1/Z, [x24, #64]\n" - ".inst 0x450e98c9 // smmla z9.s, z6.b, z14.b\n" - "fmul z14.s, z23.s, z5.s[1]\n" - ".inst 0x450298df // smmla z31.s, z6.b, z2.b\n" - "ld1rqb { z6.b }, p1/Z, [x24, #80]\n" - "fmul z2.s, z23.s, z5.s[2]\n" - "fmul z23.s, z23.s, z5.s[3]\n" - ".inst 0x451e9a43 // smmla z3.s, z18.b, z30.b\n" - ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" - "ld1rqb { z5.b }, p1/Z, [x24, #96]\n" - ".inst 0x451e98c9 // smmla z9.s, z6.b, z30.b\n" - ".inst 0x451598df // smmla z31.s, z6.b, z21.b\n" - "ld1rqb { z18.b }, p1/Z, [x24, #112]\n" - "add x24, x24, #0x88\n" - ".inst 0x450498a3 // smmla z3.s, z5.b, z4.b\n" - ".inst 0x451198a7 // smmla z7.s, z5.b, z17.b\n" - ".inst 0x45049a49 // smmla z9.s, z18.b, z4.b\n" - ".inst 0x45119a5f // smmla z31.s, z18.b, z17.b\n" - "uzp1 z18.d, z3.d, z7.d\n" - "uzp2 z5.d, z3.d, z7.d\n" - "scvtf z18.s, p1/m, z18.s\n" - "uzp1 z6.d, z9.d, z31.d\n" - "uzp2 z9.d, z9.d, z31.d\n" - 
"scvtf z5.s, p1/m, z5.s\n" - "fmla z8.s, p1/M, z18.s, z22.s\n" - "scvtf z6.s, p1/m, z6.s\n" - "scvtf z9.s, p1/m, z9.s\n" - "fmla z29.s, p1/M, z5.s, z14.s\n" - "fmla z27.s, p1/M, z6.s, z2.s\n" - "fmla z10.s, p1/M, z9.s, z23.s\n" - "bgt 3b\n" - "mov x20, %x[res_ptr]\n" - "subs x10, x10, #0x8\n" - "add %x[res_ptr], %x[res_ptr], #0x20\n" - "st1w { z24.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z15.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z12.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z0.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z13.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z1.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z20.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z25.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z11.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z16.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z19.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z26.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z8.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z29.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z27.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z10.s }, p1, [x20]\n" - "bne 2b\n" - "mov x20, #0x4\n" - "sub x13, x13, #0x10\n" - "cmp x13, #0x10\n" - "mov %x[res_ptr], x9\n" - "madd %x[a_ptr], x20, x12, %x[a_ptr]\n" - "bge 1b\n" - "4:" // Row loop skip - "cbz x13, 9f\n" - "5:" // Row tail: Row loop - "add x25, %x[b_ptr], #0x10\n" - "mov x24, %x[width]\n" - "add x23, %x[res_ptr], %x[res_stride], LSL #2\n" - "6:" // Row tail: Column loop - "mov z24.b, #0x0\n" - "mov z15.b, #0x0\n" - "add x28, %x[a_ptr], #0x8\n" - "mov x22, %x[num_blocks]\n" - "mov z12.b, #0x0\n" - "mov z0.b, #0x0\n" - "7:" // Row tail: Block loop - "ld1b { z3.b }, p1/Z, [x25]\n" - "ld1b { z6.b }, p1/Z, [x25, #1, MUL VL]\n" - "mov z2.s, #0x0\n" - "mov z25.s, #0x0\n" - "ld1rqb { z26.b }, p1/Z, [x28]\n" - "ld1rqb { z21.b }, p1/Z, [x28, #16]\n" - "mov z27.s, #0x0\n" - "mov z19.s, #0x0\n" - "ld1b { z29.b }, p1/Z, [x25, #2, MUL VL]\n" - "ld1b { z16.b }, p1/Z, [x25, #3, MUL VL]\n" - "sub x21, x25, #0x10\n" - "sub x20, x28, #0x8\n" - "lsl z20.b, z3.b, #0x4\n" - "lsl z4.b, z6.b, #0x4\n" - "ld1rqb { z10.b }, p1/Z, [x28, #32]\n" - "ld1rqb { z23.b }, p1/Z, [x28, #48]\n" - "and z3.b, z3.b, #0xf0\n" - "and z6.b, z6.b, #0xf0\n" - "ld1rqb { z11.b }, p1/Z, [x28, #64]\n" - "ld1rqb { z7.b }, p1/Z, [x28, #80]\n" - "lsl z8.b, z29.b, #0x4\n" - "lsl z14.b, z16.b, #0x4\n" - "ld1rqb { z18.b }, p1/Z, [x28, #96]\n" - "ld1rqb { z30.b }, p1/Z, [x28, #112]\n" - ".inst 0x45149b42 // smmla z2.s, z26.b, z20.b\n" - ".inst 0x45049b59 // smmla z25.s, z26.b, z4.b\n" "and z29.b, z29.b, #0xf0\n" - "ld1h { z17.s }, p1/Z, [x21]\n" - ".inst 0x45149abb // smmla z27.s, z21.b, z20.b\n" - ".inst 0x45049ab3 // smmla z19.s, z21.b, z4.b\n" - "and z16.b, z16.b, #0xf0\n" - "ld1h { z4.s }, p0/Z, [x20]\n" - "subs x22, x22, #0x1\n" - "add x28, x28, #0x88\n" - "fcvt z17.s, p1/m, z17.h\n" - "add x25, x25, #0x90\n" - ".inst 0x45089942 // smmla z2.s, z10.b, z8.b\n" - ".inst 0x450e9959 // smmla z25.s, z10.b, z14.b\n" - "fcvt z4.s, p1/m, z4.h\n" - ".inst 0x45089afb // smmla z27.s, z23.b, z8.b\n" - ".inst 0x450e9af3 // smmla z19.s, z23.b, z14.b\n" - "fscale z17.s, p1/m, z17.s, z28.s\n" - "mov z4.q, z4.q[0]\n" - ".inst 0x45039962 // smmla z2.s, z11.b, z3.b\n" - ".inst 0x45069979 // smmla z25.s, z11.b, z6.b\n" 
- "fmul z23.s, z17.s, z4.s[0]\n" - "fmul z9.s, z17.s, z4.s[1]\n" - "fmul z21.s, z17.s, z4.s[2]\n" - "fmul z4.s, z17.s, z4.s[3]\n" - ".inst 0x450398fb // smmla z27.s, z7.b, z3.b\n" - ".inst 0x450698f3 // smmla z19.s, z7.b, z6.b\n" - ".inst 0x451d9a42 // smmla z2.s, z18.b, z29.b\n" - ".inst 0x45109a59 // smmla z25.s, z18.b, z16.b\n" - ".inst 0x451d9bdb // smmla z27.s, z30.b, z29.b\n" - ".inst 0x45109bd3 // smmla z19.s, z30.b, z16.b\n" - "uzp1 z31.d, z2.d, z25.d\n" - "uzp2 z13.d, z2.d, z25.d\n" - "scvtf z31.s, p1/m, z31.s\n" - "uzp1 z17.d, z27.d, z19.d\n" - "uzp2 z18.d, z27.d, z19.d\n" - "scvtf z13.s, p1/m, z13.s\n" - "fmla z24.s, p1/M, z31.s, z23.s\n" - "scvtf z17.s, p1/m, z17.s\n" - "scvtf z18.s, p1/m, z18.s\n" - "fmla z15.s, p1/M, z13.s, z9.s\n" - "fmla z12.s, p1/M, z17.s, z21.s\n" - "fmla z0.s, p1/M, z18.s, z4.s\n" - "bgt 7b\n" - "mov x20, %x[res_ptr]\n" - "cmp x13, #0x1\n" - "st1w { z24.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x13, #0x2\n" - "st1w { z15.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x13, #0x3\n" - "st1w { z12.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "st1w { z0.s }, p1, [x20]\n" - "8:" // Row tail: Accumulator store skip - "subs x24, x24, #0x8\n" + "ld1rd { z21.d }, p0/Z, [x22, #16]\n" + "ld1rd { z20.d }, p0/Z, [x22, #24]\n" + "lsl z19.b, z25.b, #0x4\n" + "and z25.b, z25.b, #0xf0\n" + "ld1rh { z17.h }, p0/Z, [x20]\n" + "ld1h { z18.s }, p0/Z, [%x[b_ptr], #-1, MUL VL]\n" + "sdot z28.s, z22.b, z26.b\n" + "sdot z27.s, z16.b, z26.b\n" + "lsl z16.b, z24.b, #0x4\n" + "add x22, x22, #0x22\n" + "and z24.b, z24.b, #0xf0\n" + "add %x[b_ptr], %x[b_ptr], #0x90\n" + "fcvt z17.s, p0/m, z17.h\n" + "fcvt z18.s, p0/m, z18.h\n" + "sdot z28.s, z19.b, z23.b\n" + "sdot z27.s, z16.b, z23.b\n" + "fmul z18.s, z18.s, z17.s\n" + "sdot z28.s, z30.b, z21.b\n" + "sdot z27.s, z29.b, z21.b\n" + "sdot z28.s, z25.b, z20.b\n" + "sdot z27.s, z24.b, z20.b\n" + "uzp1 z17.s, z28.s, z27.s\n" + "uzp2 z16.s, z28.s, z27.s\n" + "add z17.s, z17.s, z16.s\n" + "asr z17.s, z17.s, #0x4\n" + "scvtf z17.s, p0/m, z17.s\n" + "fmla z31.s, p0/M, z17.s, z18.s\n" + "cbnz x21, 2b\n" + "sub %x[width], %x[width], #0x8\n" + "st1w { z31.s }, p0, [%x[res_ptr]]\n" "add %x[res_ptr], %x[res_ptr], #0x20\n" - "bne 6b\n" - "subs x13, x13, #0x4\n" - "add %x[a_ptr], %x[a_ptr], x12\n" - "mov %x[res_ptr], x23\n" - "bgt 5b\n" - "9:" // Row tail: Row loop skip - : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) - : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) - : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + "cbnz %x[width], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) + : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) + : "memory", "p0", "x20", "x21", "x22", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); return; } + else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { + GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) && + "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal " + "performance"); + } + else if 
(ggml_cpu_has_neon()) { + GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) && + "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 " + "quantization format for optimal performance"); + } +#endif +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + GGML_ASSERT(ggml_cpu_has_sve() && + "__ARM_FEATURE_SVE not defined, use the Q4_0_4_8 quantization format for optimal performance"); +#elif defined(__ARM_NEON) + GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && + "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " + "performance"); +#endif +} + +void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy, int nr, int nc, int ith, int nth) { + UNUSED(n); + UNUSED(s); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(ith); + UNUSED(nth); + +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) + if (svcntw() == 8) { + GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) && + "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); + } #endif #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) && + "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance"); +#elif defined(__ARM_NEON) int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); size_t width = xend - x0; int64_t nb = n / QK4_0; - const void * b_ptr = (const void *)((const block_q4_0x4 *) vx + ((x0 / 4) * nb)); + const void * b_ptr = (const void *)((const block_q4_0x4 *) vx + ((x0/4) * nb)); const void * a_ptr = vy; float * res_ptr = s + x0; size_t res_stride = nc * sizeof(float); @@ -1008,514 +649,108 @@ void ggml_gemm_q4_0_q8_0_aarch64(int n, float * GGML_RESTRICT s, const void * GG "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" "2:" // Column loop "add x25, %x[a_ptr], #0x8\n" - "movi v2.16b, #0x0\n" - "movi v10.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v19.16b, #0x0\n" "mov x24, %x[num_blocks]\n" "add x23, x25, x9\n" - "movi v12.16b, #0x0\n" - "movi v28.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v14.16b, #0x0\n" "add x22, x23, x9\n" "movi v11.16b, #0x0\n" "movi v13.16b, #0x0\n" "add x21, x22, x9\n" - "movi v22.16b, #0x0\n" "movi v23.16b, #0x0\n" + "movi v16.16b, #0x0\n" "movi v25.16b, #0x0\n" - "movi v5.16b, #0x0\n" "movi v7.16b, #0x0\n" + "movi v0.16b, #0x0\n" "movi v4.16b, #0x0\n" - "movi v6.16b, #0x0\n" - "movi v30.16b, #0x0\n" - "movi v24.16b, #0x0\n" - "movi v14.16b, #0x0\n" + "movi v5.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v8.16b, #0x0\n" + "movi v1.16b, #0x0\n" "3:" // Block loop - "ldr q21, [x28, #0x0]\n" - "ldr q16, [x28, #0x10]\n" - "movi v1.16b, #0x4\n" - "movi v19.4s, #0x0\n" - "ldr q27, [x25, #0x0]\n" - "ldr q15, [x25, #0x10]\n" - "movi v26.4s, #0x0\n" - "movi v18.4s, #0x0\n" - "ldr q29, [x28, #0x20]\n" - "ldr q3, [x28, #0x30]\n" - "movi v17.4s, #0x0\n" - "movi v0.16b, #0xf0\n" - "ldr d20, [x25, #-0x8]\n" - "ldr d9, [x23, #-0x8]\n" - "sshl v8.16b, v21.16b, v1.16b\n" - "sshl v31.16b, v16.16b, v1.16b\n" - "and v21.16b, v21.16b, v0.16b\n" - "and v16.16b, v16.16b, v0.16b\n" + "ldr q3, [x28, #0x0]\n" + "ldr q31, [x25, #0x0]\n" + "movi v28.16b, #0x4\n" + "movi v10.4s, #0x0\n" + "ldr q22, [x28, #0x10]\n" + "ldr q6, [x25, #0x10]\n" + "movi 
v29.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "ldr q27, [x28, #0x20]\n" + "ldr q30, [x28, #0x30]\n" + "movi v20.4s, #0x0\n" + "movi v24.16b, #0xf0\n" + "ldr d2, [x25, #-0x8]\n" + "ldr d26, [x23, #-0x8]\n" + "sshl v12.16b, v3.16b, v28.16b\n" "sub x20, x28, #0x8\n" + "ldr d17, [x20, #0x0]\n" + "and v3.16b, v3.16b, v24.16b\n" "subs x24, x24, #0x1\n" "add x28, x28, #0x48\n" - ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n" - ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n" - "ldr q27, [x25, #0x20]\n" - ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n" - ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n" - "sshl v15.16b, v29.16b, v1.16b\n" - "sshl v1.16b, v3.16b, v1.16b\n" - "and v29.16b, v29.16b, v0.16b\n" - "and v3.16b, v3.16b, v0.16b\n" - "ldr q0, [x25, #0x30]\n" - "fcvtl v20.4s, v20.4h\n" - ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, v15.16b\n" - "fcvtl v9.4s, v9.4h\n" - ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n" - "ldr q27, [x25, #0x40]\n" - ".inst 0x4e8fa412 // smmla v18.4s, v0.16b, v15.16b\n" - ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n" - "ldr q0, [x25, #0x50]\n" - ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n" - ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n" - "ldr q27, [x25, #0x60]\n" - ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n" - ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n" - "ldr q0, [x25, #0x70]\n" - "add x25, x25, #0x88\n" - ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n" - ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n" - "ldr d27, [x20, #0x0]\n" - ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n" - ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n" - "fcvtl v27.4s, v27.4h\n" - "uzp1 v0.2d, v19.2d, v26.2d\n" - "uzp2 v26.2d, v19.2d, v26.2d\n" - "fmul v19.4s, v27.4s, v20.s[0]\n" - "scvtf v0.4s, v0.4s, #0x4\n" - "scvtf v26.4s, v26.4s, #0x4\n" - "fmla v2.4s, v0.4s, v19.4s\n" - "ldr q19, [x23, #0x0]\n" - "uzp1 v0.2d, v18.2d, v17.2d\n" - "uzp2 v18.2d, v18.2d, v17.2d\n" - "fmul v17.4s, v27.4s, v20.s[1]\n" - "scvtf v0.4s, v0.4s, #0x4\n" - "scvtf v18.4s, v18.4s, #0x4\n" - "fmla v10.4s, v26.4s, v17.4s\n" - "ldr q17, [x23, #0x10]\n" - "fmul v26.4s, v27.4s, v20.s[2]\n" - "fmul v20.4s, v27.4s, v20.s[3]\n" - "fmla v12.4s, v0.4s, v26.4s\n" - "ldr d0, [x22, #-0x8]\n" - "ldr d26, [x21, #-0x8]\n" - "fcvtl v0.4s, v0.4h\n" - "fmla v28.4s, v18.4s, v20.4s\n" - "movi v20.4s, #0x0\n" - "movi v18.4s, #0x0\n" - ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" - ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" - "ldr q19, [x23, #0x20]\n" + ".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n" + ".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n" + ".inst 0x4f9fe989 // sdot v9.4s, v12.16b, v31.4b[2]\n" + ".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n" + "sshl v31.16b, v22.16b, v28.16b\n" + "and v22.16b, v22.16b, v24.16b\n" + "fcvtl v17.4s, v17.4h\n" + "fcvtl v2.4s, v2.4h\n" "fcvtl v26.4s, v26.4h\n" - ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" - ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" - "ldr q19, [x23, #0x40]\n" - ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" - ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" - "ldr q19, [x23, #0x60]\n" - ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n" - ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n" - "uzp1 v19.2d, v20.2d, v18.2d\n" - "scvtf v19.4s, v19.4s, #0x4\n" - "uzp2 v20.2d, v20.2d, v18.2d\n" - "fmul v18.4s, v27.4s, v9.s[0]\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "fmla v11.4s, v19.4s, v18.4s\n" - "ldr 
q18, [x22, #0x0]\n" - "fmul v19.4s, v27.4s, v9.s[1]\n" - "fmla v13.4s, v20.4s, v19.4s\n" - "movi v19.4s, #0x0\n" - "movi v20.4s, #0x0\n" - ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n" - ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n" - "ldr q17, [x23, #0x30]\n" - ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n" - ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n" - "ldr q17, [x23, #0x50]\n" - ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n" - ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n" - "ldr q17, [x23, #0x70]\n" - "add x23, x23, #0x88\n" - ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n" - ".inst 0x4e83a634 // smmla v20.4s, v17.16b, v3.16b\n" - "uzp1 v17.2d, v19.2d, v20.2d\n" - "scvtf v17.4s, v17.4s, #0x4\n" - "uzp2 v20.2d, v19.2d, v20.2d\n" - "fmul v19.4s, v27.4s, v9.s[2]\n" - "fmul v9.4s, v27.4s, v9.s[3]\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "fmla v22.4s, v17.4s, v19.4s\n" - "ldr q17, [x22, #0x10]\n" - "movi v19.4s, #0x0\n" - ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n" - "fmla v23.4s, v20.4s, v9.4s\n" - "movi v20.4s, #0x0\n" - "movi v9.4s, #0x0\n" - ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n" - "ldr q18, [x22, #0x20]\n" - ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" - ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n" - ".inst 0x4e81a654 // smmla v20.4s, v18.16b, v1.16b\n" - "ldr q18, [x22, #0x40]\n" - ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n" - ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n" - "ldr q18, [x22, #0x60]\n" - ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n" - ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n" - "movi v18.4s, #0x0\n" - ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n" - "ldr q17, [x22, #0x30]\n" - ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" - ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n" - "ldr q17, [x22, #0x50]\n" - ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n" - ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n" - "ldr q17, [x22, #0x70]\n" - "add x22, x22, #0x88\n" - ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n" - ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n" - "uzp1 v17.2d, v19.2d, v20.2d\n" - "uzp2 v20.2d, v19.2d, v20.2d\n" - "fmul v19.4s, v27.4s, v0.s[0]\n" - "scvtf v17.4s, v17.4s, #0x4\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "fmla v25.4s, v17.4s, v19.4s\n" - "ldr q19, [x21, #0x0]\n" - "fmul v17.4s, v27.4s, v0.s[1]\n" - "fmla v5.4s, v20.4s, v17.4s\n" - "ldr q17, [x21, #0x10]\n" - "uzp1 v20.2d, v9.2d, v18.2d\n" - "uzp2 v9.2d, v9.2d, v18.2d\n" - "fmul v18.4s, v27.4s, v0.s[2]\n" - "fmul v0.4s, v27.4s, v0.s[3]\n" - "scvtf v20.4s, v20.4s, #0x4\n" + ".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n" + ".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n" + ".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n" + ".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n" + "sshl v6.16b, v27.16b, v28.16b\n" + "sshl v28.16b, v30.16b, v28.16b\n" + "and v27.16b, v27.16b, v24.16b\n" + "and v30.16b, v30.16b, v24.16b\n" + "ldr q24, [x25, #0x20]\n" + ".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n" + ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n" + ".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n" + ".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x30]\n" + ".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n" + ".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n" + ".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n" + ".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, 
v24.4b[3]\n" + "ldr q24, [x25, #0x40]\n" + ".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n" + ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n" + ".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n" + ".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x50]\n" + ".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n" + ".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n" + ".inst 0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n" + ".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x60]\n" + ".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n" + ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n" + ".inst 0x4f98eb69 // sdot v9.4s, v27.16b, v24.4b[2]\n" + ".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x70]\n" + "add x25, x25, #0x88\n" + ".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n" + ".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n" + ".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n" + ".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n" + "fmul v24.4s, v17.4s, v2.s[0]\n" + "scvtf v10.4s, v10.4s, #0x4\n" + "scvtf v29.4s, v29.4s, #0x4\n" "scvtf v9.4s, v9.4s, #0x4\n" - "fmla v7.4s, v20.4s, v18.4s\n" - "movi v20.4s, #0x0\n" - "movi v18.4s, #0x0\n" - ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" - ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" - "ldr q19, [x21, #0x20]\n" - "fmla v4.4s, v9.4s, v0.4s\n" - "movi v9.4s, #0x0\n" - "movi v0.4s, #0x0\n" - ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" - "fmul v8.4s, v27.4s, v26.s[0]\n" - ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n" - "ldr q17, [x21, #0x30]\n" - ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" - "fmul v31.4s, v27.4s, v26.s[1]\n" - ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" - "ldr q19, [x21, #0x40]\n" - ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" - "fmul v15.4s, v27.4s, v26.s[2]\n" - "fmul v27.4s, v27.4s, v26.s[3]\n" - ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n" - "ldr q1, [x21, #0x50]\n" - ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" - ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" - "ldr q26, [x21, #0x60]\n" - ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n" - ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n" - "ldr q21, [x21, #0x70]\n" - "add x21, x21, #0x88\n" - ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n" - ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n" - ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n" - ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n" - "uzp1 v29.2d, v20.2d, v18.2d\n" - "uzp2 v21.2d, v20.2d, v18.2d\n" - "scvtf v29.4s, v29.4s, #0x4\n" - "uzp1 v18.2d, v9.2d, v0.2d\n" - "uzp2 v16.2d, v9.2d, v0.2d\n" - "scvtf v21.4s, v21.4s, #0x4\n" - "fmla v6.4s, v29.4s, v8.4s\n" - "scvtf v18.4s, v18.4s, #0x4\n" - "scvtf v16.4s, v16.4s, #0x4\n" - "fmla v30.4s, v21.4s, v31.4s\n" - "fmla v24.4s, v18.4s, v15.4s\n" - "fmla v14.4s, v16.4s, v27.4s\n" - "bgt 3b\n" - "mov x20, %x[res_ptr]\n" - "subs x27, x27, #0x4\n" - "add %x[res_ptr], %x[res_ptr], #0x10\n" - "str q2, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q10, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q12, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q28, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q11, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q13, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q22, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q23, [x20, 
#0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q25, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q5, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q7, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q4, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q6, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q30, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q24, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q14, [x20, #0x0]\n" - "bne 2b\n" - "mov x20, #0x4\n" - "sub x10, x10, #0x10\n" - "cmp x10, #0x10\n" - "mov %x[res_ptr], x26\n" - "madd %x[a_ptr], x20, x9, %x[a_ptr]\n" - "bge 1b\n" - "4:" // Row loop skip - "cbz x10, 9f\n" - "5:" // Row tail: Row loop - "add x24, %x[b_ptr], #0x8\n" - "mov x23, %x[width]\n" - "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" - "6:" // Row tail: Column loop - "movi v2.16b, #0x0\n" - "movi v10.16b, #0x0\n" - "add x25, %x[a_ptr], #0x8\n" - "mov x21, %x[num_blocks]\n" - "movi v12.16b, #0x0\n" - "movi v28.16b, #0x0\n" - "7:" // Row tail: Block loop - "ldr q6, [x24, #0x0]\n" - "ldr q5, [x24, #0x10]\n" - "movi v17.16b, #0x4\n" - "movi v8.4s, #0x0\n" - "ldr q4, [x25, #0x0]\n" - "ldr q13, [x25, #0x10]\n" - "movi v27.4s, #0x0\n" - "movi v0.4s, #0x0\n" - "ldr q31, [x24, #0x20]\n" - "ldr q14, [x24, #0x30]\n" - "movi v29.4s, #0x0\n" - "movi v22.16b, #0xf0\n" - "ldr q11, [x25, #0x20]\n" - "ldr q23, [x25, #0x30]\n" - "sshl v21.16b, v6.16b, v17.16b\n" - "sshl v16.16b, v5.16b, v17.16b\n" - "ldr q20, [x25, #0x40]\n" - "ldr q26, [x25, #0x50]\n" - "and v6.16b, v6.16b, v22.16b\n" - "and v5.16b, v5.16b, v22.16b\n" - "ldr q25, [x25, #0x60]\n" - "ldr q3, [x25, #0x70]\n" - "sshl v19.16b, v31.16b, v17.16b\n" - "sshl v18.16b, v14.16b, v17.16b\n" - "ldr d17, [x25, #-0x8]\n" - ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n" - ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n" - "and v31.16b, v31.16b, v22.16b\n" - ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n" - ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n" - "and v14.16b, v14.16b, v22.16b\n" - "sub x20, x24, #0x8\n" - "ldr d16, [x20, #0x0]\n" - "subs x21, x21, #0x1\n" - "add x25, x25, #0x88\n" - "fcvtl v17.4s, v17.4h\n" - "add x24, x24, #0x48\n" - ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n" - ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n" - ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n" - ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n" - "fcvtl v16.4s, v16.4h\n" - ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n" - ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n" - "fmul v23.4s, v16.4s, v17.s[0]\n" - "fmul v21.4s, v16.4s, v17.s[1]\n" - "fmul v1.4s, v16.4s, v17.s[2]\n" - "fmul v20.4s, v16.4s, v17.s[3]\n" - ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n" - ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n" - ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n" - ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n" - ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n" - ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n" - "uzp1 v19.2d, v8.2d, v27.2d\n" - "uzp2 v18.2d, v8.2d, v27.2d\n" - "scvtf v19.4s, v19.4s, #0x4\n" - "uzp1 v17.2d, v0.2d, v29.2d\n" - "uzp2 v16.2d, v0.2d, v29.2d\n" - "scvtf v18.4s, v18.4s, #0x4\n" - "fmla v2.4s, v19.4s, v23.4s\n" - "scvtf v17.4s, v17.4s, #0x4\n" - "scvtf v16.4s, v16.4s, #0x4\n" - "fmla v10.4s, v18.4s, v21.4s\n" - "fmla v12.4s, v17.4s, v1.4s\n" - "fmla v28.4s, v16.4s, v20.4s\n" - "bgt 7b\n" - "mov x20, %x[res_ptr]\n" - "cmp x10, #0x1\n" - "str q2, [x20, 
#0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x10, #0x2\n" - "str q10, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x10, #0x3\n" - "str q12, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "str q28, [x20, #0x0]\n" - "8:" // Row tail: Accumulator store skip - "subs x23, x23, #0x4\n" - "add %x[res_ptr], %x[res_ptr], #0x10\n" - "bne 6b\n" - "subs x10, x10, #0x4\n" - "add %x[a_ptr], %x[a_ptr], x9\n" - "mov %x[res_ptr], x22\n" - "bgt 5b\n" - "9:" // Row tail: Row loop skip - : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) - : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" - ); -#elif defined(__ARM_NEON) - int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); - int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); - size_t width = xend - x0; - - int64_t nb = n / QK4_0; - const void * b_ptr = (const void *)((const block_q4_0x4 *) vx + ((x0/4) * nb)); - const void * a_ptr = vy; - float * res_ptr = s + x0; - size_t res_stride = nc * sizeof(float); - - assert(n % 32 == 0); - assert(width % 4 == 0); - - size_t num_blocks = n / 32; - - __asm__ __volatile__( - "mov x10, %x[nr]\n" - "mov x9, #0x88\n" - "cmp x10, #0x10\n" - "mul x9, %x[num_blocks], x9\n" - "blt 4f\n" - "1:" // Row loop - "add x28, %x[b_ptr], #0x8\n" - "mov x27, %x[width]\n" - "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" - "2:" // Column loop - "add x25, %x[a_ptr], #0x8\n" - "movi v15.16b, #0x0\n" - "movi v19.16b, #0x0\n" - "mov x24, %x[num_blocks]\n" - "add x23, x25, x9\n" - "movi v18.16b, #0x0\n" - "movi v14.16b, #0x0\n" - "add x22, x23, x9\n" - "movi v11.16b, #0x0\n" - "movi v13.16b, #0x0\n" - "add x21, x22, x9\n" - "movi v23.16b, #0x0\n" - "movi v16.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v7.16b, #0x0\n" - "movi v0.16b, #0x0\n" - "movi v4.16b, #0x0\n" - "movi v5.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "movi v8.16b, #0x0\n" - "movi v1.16b, #0x0\n" - "3:" // Block loop - "ldr q3, [x28, #0x0]\n" - "ldr q31, [x25, #0x0]\n" - "movi v28.16b, #0x4\n" - "movi v10.4s, #0x0\n" - "ldr q22, [x28, #0x10]\n" - "ldr q6, [x25, #0x10]\n" - "movi v29.4s, #0x0\n" - "movi v9.4s, #0x0\n" - "ldr q27, [x28, #0x20]\n" - "ldr q30, [x28, #0x30]\n" - "movi v20.4s, #0x0\n" - "movi v24.16b, #0xf0\n" - "ldr d2, [x25, #-0x8]\n" - "ldr d26, [x23, #-0x8]\n" - "sshl v12.16b, v3.16b, v28.16b\n" - "sub x20, x28, #0x8\n" - "ldr d17, [x20, #0x0]\n" - "and v3.16b, v3.16b, v24.16b\n" - "subs x24, x24, #0x1\n" - "add x28, x28, #0x48\n" - ".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n" - ".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n" - ".inst 0x4f9fe989 // sdot v9.4s, v12.16b, v31.4b[2]\n" - ".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n" - "sshl v31.16b, v22.16b, v28.16b\n" - "and v22.16b, v22.16b, v24.16b\n" - "fcvtl v17.4s, v17.4h\n" - "fcvtl v2.4s, v2.4h\n" - "fcvtl v26.4s, v26.4h\n" - ".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n" - ".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n" - ".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n" - ".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n" - "sshl v6.16b, v27.16b, v28.16b\n" - "sshl v28.16b, v30.16b, v28.16b\n" - "and v27.16b, 
v27.16b, v24.16b\n" - "and v30.16b, v30.16b, v24.16b\n" - "ldr q24, [x25, #0x20]\n" - ".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n" - ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n" - ".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n" - ".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n" - "ldr q24, [x25, #0x30]\n" - ".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n" - ".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n" - ".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n" - ".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, v24.4b[3]\n" - "ldr q24, [x25, #0x40]\n" - ".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n" - ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n" - ".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n" - ".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n" - "ldr q24, [x25, #0x50]\n" - ".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n" - ".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n" - ".inst 0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n" - ".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n" - "ldr q24, [x25, #0x60]\n" - ".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n" - ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n" - ".inst 0x4f98eb69 // sdot v9.4s, v27.16b, v24.4b[2]\n" - ".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n" - "ldr q24, [x25, #0x70]\n" - "add x25, x25, #0x88\n" - ".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n" - ".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n" - ".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n" - ".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n" - "fmul v24.4s, v17.4s, v2.s[0]\n" - "scvtf v10.4s, v10.4s, #0x4\n" - "scvtf v29.4s, v29.4s, #0x4\n" - "scvtf v9.4s, v9.4s, #0x4\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "fmla v15.4s, v10.4s, v24.4s\n" - "ldr q24, [x23, #0x0]\n" - "fmul v10.4s, v17.4s, v2.s[1]\n" - "fmla v19.4s, v29.4s, v10.4s\n" - "ldr q10, [x23, #0x10]\n" - "fmul v29.4s, v17.4s, v2.s[2]\n" - "fmul v2.4s, v17.4s, v2.s[3]\n" - "fmla v18.4s, v9.4s, v29.4s\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v15.4s, v10.4s, v24.4s\n" + "ldr q24, [x23, #0x0]\n" + "fmul v10.4s, v17.4s, v2.s[1]\n" + "fmla v19.4s, v29.4s, v10.4s\n" + "ldr q10, [x23, #0x10]\n" + "fmul v29.4s, v17.4s, v2.s[2]\n" + "fmul v2.4s, v17.4s, v2.s[3]\n" + "fmla v18.4s, v9.4s, v29.4s\n" "movi v9.4s, #0x0\n" "movi v29.4s, #0x0\n" ".inst 0x4f98e189 // sdot v9.4s, v12.16b, v24.4b[0]\n" @@ -1854,3 +1089,884 @@ void ggml_gemm_q4_0_q8_0_aarch64(int n, float * GGML_RESTRICT s, const void * GG ); #endif } + +void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy, int nr, int nc, int ith, int nth) { + UNUSED(n); + UNUSED(s); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(ith); + UNUSED(nth); + +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) + if (svcntw() == 8) { + GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) && + "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); + } +#endif +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); + int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); + size_t width = xend - x0; + + int64_t nb = n / QK4_0; + const void * b_ptr = (const void *)((const block_q4_0x4 *) vx + ((x0 / 4) * nb)); + const void * a_ptr = vy; + float * res_ptr = s + x0; + size_t res_stride = nc * sizeof(float); + + assert(n % 32 == 0); + assert(width % 4 == 0); + + 
size_t num_blocks = n / 32; + + __asm__ __volatile__( + "mov x10, %x[nr]\n" + "mov x9, #0x88\n" + "cmp x10, #0x10\n" + "mul x9, %x[num_blocks], x9\n" + "blt 4f\n" + "1:" // Row loop + "add x28, %x[b_ptr], #0x8\n" + "mov x27, %x[width]\n" + "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x25, %x[a_ptr], #0x8\n" + "movi v2.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "mov x24, %x[num_blocks]\n" + "add x23, x25, x9\n" + "movi v12.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "add x22, x23, x9\n" + "movi v11.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "add x21, x22, x9\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v5.16b, #0x0\n" + "movi v7.16b, #0x0\n" + "movi v4.16b, #0x0\n" + "movi v6.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "3:" // Block loop + "ldr q21, [x28, #0x0]\n" + "ldr q16, [x28, #0x10]\n" + "movi v1.16b, #0x4\n" + "movi v19.4s, #0x0\n" + "ldr q27, [x25, #0x0]\n" + "ldr q15, [x25, #0x10]\n" + "movi v26.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "ldr q29, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" + "movi v17.4s, #0x0\n" + "movi v0.16b, #0xf0\n" + "ldr d20, [x25, #-0x8]\n" + "ldr d9, [x23, #-0x8]\n" + "sshl v8.16b, v21.16b, v1.16b\n" + "sshl v31.16b, v16.16b, v1.16b\n" + "and v21.16b, v21.16b, v0.16b\n" + "and v16.16b, v16.16b, v0.16b\n" + "sub x20, x28, #0x8\n" + "subs x24, x24, #0x1\n" + "add x28, x28, #0x48\n" + ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n" + ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n" + "ldr q27, [x25, #0x20]\n" + ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n" + ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n" + "sshl v15.16b, v29.16b, v1.16b\n" + "sshl v1.16b, v3.16b, v1.16b\n" + "and v29.16b, v29.16b, v0.16b\n" + "and v3.16b, v3.16b, v0.16b\n" + "ldr q0, [x25, #0x30]\n" + "fcvtl v20.4s, v20.4h\n" + ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, v15.16b\n" + "fcvtl v9.4s, v9.4h\n" + ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n" + "ldr q27, [x25, #0x40]\n" + ".inst 0x4e8fa412 // smmla v18.4s, v0.16b, v15.16b\n" + ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n" + "ldr q0, [x25, #0x50]\n" + ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n" + ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n" + "ldr q27, [x25, #0x60]\n" + ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n" + ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n" + "ldr q0, [x25, #0x70]\n" + "add x25, x25, #0x88\n" + ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n" + ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n" + "ldr d27, [x20, #0x0]\n" + ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n" + ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n" + "fcvtl v27.4s, v27.4h\n" + "uzp1 v0.2d, v19.2d, v26.2d\n" + "uzp2 v26.2d, v19.2d, v26.2d\n" + "fmul v19.4s, v27.4s, v20.s[0]\n" + "scvtf v0.4s, v0.4s, #0x4\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "fmla v2.4s, v0.4s, v19.4s\n" + "ldr q19, [x23, #0x0]\n" + "uzp1 v0.2d, v18.2d, v17.2d\n" + "uzp2 v18.2d, v18.2d, v17.2d\n" + "fmul v17.4s, v27.4s, v20.s[1]\n" + "scvtf v0.4s, v0.4s, #0x4\n" + "scvtf v18.4s, v18.4s, #0x4\n" + "fmla v10.4s, v26.4s, v17.4s\n" + "ldr q17, [x23, #0x10]\n" + "fmul v26.4s, v27.4s, v20.s[2]\n" + "fmul v20.4s, v27.4s, v20.s[3]\n" + "fmla v12.4s, v0.4s, v26.4s\n" + "ldr d0, [x22, #-0x8]\n" + "ldr d26, [x21, #-0x8]\n" + "fcvtl v0.4s, v0.4h\n" + "fmla v28.4s, v18.4s, v20.4s\n" + "movi v20.4s, #0x0\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" + 
".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" + "ldr q19, [x23, #0x20]\n" + "fcvtl v26.4s, v26.4h\n" + ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" + ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" + "ldr q19, [x23, #0x40]\n" + ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" + ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" + "ldr q19, [x23, #0x60]\n" + ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n" + ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n" + "uzp1 v19.2d, v20.2d, v18.2d\n" + "scvtf v19.4s, v19.4s, #0x4\n" + "uzp2 v20.2d, v20.2d, v18.2d\n" + "fmul v18.4s, v27.4s, v9.s[0]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v11.4s, v19.4s, v18.4s\n" + "ldr q18, [x22, #0x0]\n" + "fmul v19.4s, v27.4s, v9.s[1]\n" + "fmla v13.4s, v20.4s, v19.4s\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n" + ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n" + "ldr q17, [x23, #0x30]\n" + ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n" + ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n" + "ldr q17, [x23, #0x50]\n" + ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n" + ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n" + "ldr q17, [x23, #0x70]\n" + "add x23, x23, #0x88\n" + ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n" + ".inst 0x4e83a634 // smmla v20.4s, v17.16b, v3.16b\n" + "uzp1 v17.2d, v19.2d, v20.2d\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "uzp2 v20.2d, v19.2d, v20.2d\n" + "fmul v19.4s, v27.4s, v9.s[2]\n" + "fmul v9.4s, v27.4s, v9.s[3]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v22.4s, v17.4s, v19.4s\n" + "ldr q17, [x22, #0x10]\n" + "movi v19.4s, #0x0\n" + ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n" + "fmla v23.4s, v20.4s, v9.4s\n" + "movi v20.4s, #0x0\n" + "movi v9.4s, #0x0\n" + ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n" + "ldr q18, [x22, #0x20]\n" + ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" + ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n" + ".inst 0x4e81a654 // smmla v20.4s, v18.16b, v1.16b\n" + "ldr q18, [x22, #0x40]\n" + ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n" + ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n" + "ldr q18, [x22, #0x60]\n" + ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n" + ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n" + "ldr q17, [x22, #0x30]\n" + ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" + ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n" + "ldr q17, [x22, #0x50]\n" + ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n" + ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n" + "ldr q17, [x22, #0x70]\n" + "add x22, x22, #0x88\n" + ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n" + ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n" + "uzp1 v17.2d, v19.2d, v20.2d\n" + "uzp2 v20.2d, v19.2d, v20.2d\n" + "fmul v19.4s, v27.4s, v0.s[0]\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v25.4s, v17.4s, v19.4s\n" + "ldr q19, [x21, #0x0]\n" + "fmul v17.4s, v27.4s, v0.s[1]\n" + "fmla v5.4s, v20.4s, v17.4s\n" + "ldr q17, [x21, #0x10]\n" + "uzp1 v20.2d, v9.2d, v18.2d\n" + "uzp2 v9.2d, v9.2d, v18.2d\n" + "fmul v18.4s, v27.4s, v0.s[2]\n" + "fmul v0.4s, v27.4s, v0.s[3]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "scvtf v9.4s, v9.4s, #0x4\n" + "fmla v7.4s, v20.4s, v18.4s\n" + "movi v20.4s, #0x0\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e88a674 // smmla v20.4s, 
v19.16b, v8.16b\n" + ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" + "ldr q19, [x21, #0x20]\n" + "fmla v4.4s, v9.4s, v0.4s\n" + "movi v9.4s, #0x0\n" + "movi v0.4s, #0x0\n" + ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" + "fmul v8.4s, v27.4s, v26.s[0]\n" + ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n" + "ldr q17, [x21, #0x30]\n" + ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" + "fmul v31.4s, v27.4s, v26.s[1]\n" + ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" + "ldr q19, [x21, #0x40]\n" + ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" + "fmul v15.4s, v27.4s, v26.s[2]\n" + "fmul v27.4s, v27.4s, v26.s[3]\n" + ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n" + "ldr q1, [x21, #0x50]\n" + ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" + ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" + "ldr q26, [x21, #0x60]\n" + ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n" + ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n" + "ldr q21, [x21, #0x70]\n" + "add x21, x21, #0x88\n" + ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n" + ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n" + ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n" + ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n" + "uzp1 v29.2d, v20.2d, v18.2d\n" + "uzp2 v21.2d, v20.2d, v18.2d\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "uzp1 v18.2d, v9.2d, v0.2d\n" + "uzp2 v16.2d, v9.2d, v0.2d\n" + "scvtf v21.4s, v21.4s, #0x4\n" + "fmla v6.4s, v29.4s, v8.4s\n" + "scvtf v18.4s, v18.4s, #0x4\n" + "scvtf v16.4s, v16.4s, #0x4\n" + "fmla v30.4s, v21.4s, v31.4s\n" + "fmla v24.4s, v18.4s, v15.4s\n" + "fmla v14.4s, v16.4s, v27.4s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x27, x27, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "str q2, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q10, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q12, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q28, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q11, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q13, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q22, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q23, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q25, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q5, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q7, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q4, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q6, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q30, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q24, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q14, [x20, #0x0]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x10, x10, #0x10\n" + "cmp x10, #0x10\n" + "mov %x[res_ptr], x26\n" + "madd %x[a_ptr], x20, x9, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x10, 9f\n" + "5:" // Row tail: Row loop + "add x24, %x[b_ptr], #0x8\n" + "mov x23, %x[width]\n" + "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "movi v2.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "add x25, %x[a_ptr], #0x8\n" + "mov x21, %x[num_blocks]\n" + "movi v12.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "7:" // Row tail: Block loop + "ldr q6, [x24, #0x0]\n" + "ldr q5, [x24, #0x10]\n" + "movi v17.16b, #0x4\n" + "movi v8.4s, #0x0\n" + "ldr q4, [x25, #0x0]\n" + "ldr q13, [x25, #0x10]\n" + "movi v27.4s, #0x0\n" + "movi v0.4s, #0x0\n" + "ldr q31, [x24, #0x20]\n" + "ldr q14, [x24, #0x30]\n" + 
"movi v29.4s, #0x0\n" + "movi v22.16b, #0xf0\n" + "ldr q11, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "sshl v21.16b, v6.16b, v17.16b\n" + "sshl v16.16b, v5.16b, v17.16b\n" + "ldr q20, [x25, #0x40]\n" + "ldr q26, [x25, #0x50]\n" + "and v6.16b, v6.16b, v22.16b\n" + "and v5.16b, v5.16b, v22.16b\n" + "ldr q25, [x25, #0x60]\n" + "ldr q3, [x25, #0x70]\n" + "sshl v19.16b, v31.16b, v17.16b\n" + "sshl v18.16b, v14.16b, v17.16b\n" + "ldr d17, [x25, #-0x8]\n" + ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n" + ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n" + "and v31.16b, v31.16b, v22.16b\n" + ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n" + ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n" + "and v14.16b, v14.16b, v22.16b\n" + "sub x20, x24, #0x8\n" + "ldr d16, [x20, #0x0]\n" + "subs x21, x21, #0x1\n" + "add x25, x25, #0x88\n" + "fcvtl v17.4s, v17.4h\n" + "add x24, x24, #0x48\n" + ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n" + ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n" + ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n" + ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n" + "fcvtl v16.4s, v16.4h\n" + ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n" + ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n" + "fmul v23.4s, v16.4s, v17.s[0]\n" + "fmul v21.4s, v16.4s, v17.s[1]\n" + "fmul v1.4s, v16.4s, v17.s[2]\n" + "fmul v20.4s, v16.4s, v17.s[3]\n" + ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n" + ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n" + ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n" + ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n" + ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n" + ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n" + "uzp1 v19.2d, v8.2d, v27.2d\n" + "uzp2 v18.2d, v8.2d, v27.2d\n" + "scvtf v19.4s, v19.4s, #0x4\n" + "uzp1 v17.2d, v0.2d, v29.2d\n" + "uzp2 v16.2d, v0.2d, v29.2d\n" + "scvtf v18.4s, v18.4s, #0x4\n" + "fmla v2.4s, v19.4s, v23.4s\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "scvtf v16.4s, v16.4s, #0x4\n" + "fmla v10.4s, v18.4s, v21.4s\n" + "fmla v12.4s, v17.4s, v1.4s\n" + "fmla v28.4s, v16.4s, v20.4s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x10, #0x1\n" + "str q2, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x2\n" + "str q10, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x3\n" + "str q12, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "str q28, [x20, #0x0]\n" + "8:" // Row tail: Accumulator store skip + "subs x23, x23, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "bne 6b\n" + "subs x10, x10, #0x4\n" + "add %x[a_ptr], %x[a_ptr], x9\n" + "mov %x[res_ptr], x22\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +#elif defined(__ARM_NEON) + GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && + "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " + "performance"); +#endif +} + +void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, const 
void * restrict vx, const void * restrict vy, int nr, int nc, int ith, int nth) { + UNUSED(n); + UNUSED(s); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(ith); + UNUSED(nth); + +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) + if (svcntw() == 8) { + int64_t x0 = roundup((ith * nc) / nth, (int64_t)8); + int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)8); + size_t width = xend - x0; + + int64_t nb = n / QK4_0; + const void * b_ptr = (const void *)((const block_q4_0x8 *) vx + ((x0 / 8) * nb)); + const void * a_ptr = vy; + float * res_ptr = s + x0; + size_t res_stride = nc * sizeof(float); + + assert(n % 32 == 0); + assert(width % 8 == 0); + + size_t num_blocks = n / 32; + + __asm__ __volatile__( + "mov x20, #0x4\n" + "mov x13, %x[nr]\n" + "mov z28.s, #-0x4\n" + "mov x12, #0x88\n" + "ptrue p1.b\n" + "whilelt p0.s, XZR, x20\n" + "cmp x13, #0x10\n" + "mul x12, %x[num_blocks], x12\n" + "blt 4f\n" + "1:" // Row loop + "add x11, %x[b_ptr], #0x10\n" + "mov x10, %x[width]\n" + "add x9, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x28, %x[a_ptr], #0x8\n" + "mov z24.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov x27, %x[num_blocks]\n" + "add x26, x28, x12\n" + "mov z12.b, #0x0\n" + "mov z0.b, #0x0\n" + "add x25, x26, x12\n" + "mov z13.b, #0x0\n" + "mov z1.b, #0x0\n" + "add x24, x25, x12\n" + "mov z20.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z8.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z10.b, #0x0\n" + "3:" // Block loop + "ld1b { z30.b }, p1/Z, [x11]\n" + "ld1b { z21.b }, p1/Z, [x11, #1, MUL VL]\n" + "mov z18.s, #0x0\n" + "mov z7.s, #0x0\n" + "ld1rqb { z3.b }, p1/Z, [x28]\n" + "ld1rqb { z5.b }, p1/Z, [x28, #16]\n" + "mov z9.s, #0x0\n" + "mov z22.s, #0x0\n" + "ld1b { z4.b }, p1/Z, [x11, #2, MUL VL]\n" + "ld1b { z17.b }, p1/Z, [x11, #3, MUL VL]\n" + "sub x20, x11, #0x10\n" + "sub x23, x28, #0x8\n" + "lsl z31.b, z30.b, #0x4\n" + "lsl z6.b, z21.b, #0x4\n" + "ld1h { z23.s }, p1/Z, [x20]\n" + "sub x22, x26, #0x8\n" + "and z30.b, z30.b, #0xf0\n" + "and z21.b, z21.b, #0xf0\n" + "sub x21, x25, #0x8\n" + "sub x20, x24, #0x8\n" + "lsl z14.b, z4.b, #0x4\n" + "lsl z2.b, z17.b, #0x4\n" + "subs x27, x27, #0x1\n" + "add x11, x11, #0x90\n" + ".inst 0x451f9872 // smmla z18.s, z3.b, z31.b\n" + ".inst 0x45069867 // smmla z7.s, z3.b, z6.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #32]\n" + "and z4.b, z4.b, #0xf0\n" + ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" + ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #48]\n" + "and z17.b, z17.b, #0xf0\n" + "fcvt z23.s, p1/m, z23.h\n" + ".inst 0x450e9872 // smmla z18.s, z3.b, z14.b\n" + ".inst 0x45029867 // smmla z7.s, z3.b, z2.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #64]\n" + ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" + ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #80]\n" + "fscale z23.s, p1/m, z23.s, z28.s\n" + ".inst 0x451e9872 // smmla z18.s, z3.b, z30.b\n" + ".inst 0x45159867 // smmla z7.s, z3.b, z21.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #96]\n" + ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" + ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #112]\n" + "add x28, x28, #0x88\n" + ".inst 0x45049872 // smmla z18.s, z3.b, z4.b\n" + ".inst 0x45119867 // smmla z7.s, z3.b, z17.b\n" + "ld1h { z3.s }, p0/Z, [x23]\n" + ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" + ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" + 
"fcvt z3.s, p1/m, z3.h\n" + "uzp1 z5.d, z18.d, z7.d\n" + "uzp2 z18.d, z18.d, z7.d\n" + "mov z3.q, z3.q[0]\n" + "uzp1 z7.d, z9.d, z22.d\n" + "uzp2 z22.d, z9.d, z22.d\n" + "fmul z9.s, z23.s, z3.s[0]\n" + "scvtf z5.s, p1/m, z5.s\n" + "scvtf z18.s, p1/m, z18.s\n" + "scvtf z7.s, p1/m, z7.s\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z24.s, p1/M, z5.s, z9.s\n" + "ld1rqb { z5.b }, p1/Z, [x26]\n" + "fmul z9.s, z23.s, z3.s[1]\n" + "fmla z15.s, p1/M, z18.s, z9.s\n" + "ld1rqb { z18.b }, p1/Z, [x26, #16]\n" + "fmul z9.s, z23.s, z3.s[2]\n" + "fmul z3.s, z23.s, z3.s[3]\n" + "fmla z12.s, p1/M, z7.s, z9.s\n" + "mov z9.s, #0x0\n" + "ld1h { z7.s }, p0/Z, [x22]\n" + ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" + "fmla z0.s, p1/M, z22.s, z3.s\n" + "mov z22.s, #0x0\n" + "ld1h { z3.s }, p0/Z, [x21]\n" + ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #32]\n" + "fcvt z7.s, p1/m, z7.h\n" + "fcvt z3.s, p1/m, z3.h\n" + ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" + ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #64]\n" + "mov z7.q, z7.q[0]\n" + "mov z3.q, z3.q[0]\n" + ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" + ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #96]\n" + ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" + ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" + "uzp1 z5.d, z9.d, z22.d\n" + "scvtf z5.s, p1/m, z5.s\n" + "uzp2 z22.d, z9.d, z22.d\n" + "fmul z9.s, z23.s, z7.s[0]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z13.s, p1/M, z5.s, z9.s\n" + "ld1rqb { z9.b }, p1/Z, [x25]\n" + "fmul z5.s, z23.s, z7.s[1]\n" + "fmla z1.s, p1/M, z22.s, z5.s\n" + "mov z5.s, #0x0\n" + "mov z22.s, #0x0\n" + ".inst 0x451f9a45 // smmla z5.s, z18.b, z31.b\n" + ".inst 0x45069a56 // smmla z22.s, z18.b, z6.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #48]\n" + ".inst 0x450e9a45 // smmla z5.s, z18.b, z14.b\n" + ".inst 0x45029a56 // smmla z22.s, z18.b, z2.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #80]\n" + ".inst 0x451e9a45 // smmla z5.s, z18.b, z30.b\n" + ".inst 0x45159a56 // smmla z22.s, z18.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #112]\n" + "add x26, x26, #0x88\n" + ".inst 0x45049a45 // smmla z5.s, z18.b, z4.b\n" + ".inst 0x45119a56 // smmla z22.s, z18.b, z17.b\n" + "uzp1 z18.d, z5.d, z22.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp2 z22.d, z5.d, z22.d\n" + "fmul z5.s, z23.s, z7.s[2]\n" + "fmul z7.s, z23.s, z7.s[3]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z20.s, p1/M, z18.s, z5.s\n" + "ld1rqb { z18.b }, p1/Z, [x25, #16]\n" + "ld1h { z5.s }, p0/Z, [x20]\n" + "fcvt z5.s, p1/m, z5.h\n" + "fmla z25.s, p1/M, z22.s, z7.s\n" + "mov z22.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9936 // smmla z22.s, z9.b, z31.b\n" + ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #32]\n" + "mov z5.q, z5.q[0]\n" + ".inst 0x450e9936 // smmla z22.s, z9.b, z14.b\n" + ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #64]\n" + ".inst 0x451e9936 // smmla z22.s, z9.b, z30.b\n" + ".inst 0x45159927 // smmla z7.s, z9.b, z21.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #96]\n" + ".inst 0x45049936 // smmla z22.s, z9.b, z4.b\n" + ".inst 0x45119927 // smmla z7.s, z9.b, z17.b\n" + "uzp1 z9.d, z22.d, z7.d\n" + "scvtf z9.s, p1/m, z9.s\n" + "uzp2 z22.d, z22.d, z7.d\n" + "fmul z7.s, z23.s, z3.s[0]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z11.s, p1/M, z9.s, z7.s\n" + "ld1rqb { z9.b }, p1/Z, [x24]\n" + "fmul z7.s, z23.s, z3.s[1]\n" + "fmla z16.s, p1/M, z22.s, z7.s\n" + "mov z22.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 
0x451f9a56 // smmla z22.s, z18.b, z31.b\n" + ".inst 0x45069a47 // smmla z7.s, z18.b, z6.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #48]\n" + ".inst 0x450e9a56 // smmla z22.s, z18.b, z14.b\n" + ".inst 0x45029a47 // smmla z7.s, z18.b, z2.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #80]\n" + ".inst 0x451e9a56 // smmla z22.s, z18.b, z30.b\n" + ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #112]\n" + "add x25, x25, #0x88\n" + ".inst 0x45049a56 // smmla z22.s, z18.b, z4.b\n" + ".inst 0x45119a47 // smmla z7.s, z18.b, z17.b\n" + "uzp1 z18.d, z22.d, z7.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp2 z7.d, z22.d, z7.d\n" + "fmul z22.s, z23.s, z3.s[2]\n" + "fmul z3.s, z23.s, z3.s[3]\n" + "scvtf z7.s, p1/m, z7.s\n" + "fmla z19.s, p1/M, z18.s, z22.s\n" + "ld1rqb { z18.b }, p1/Z, [x24, #16]\n" + "fmul z22.s, z23.s, z5.s[0]\n" + "fmla z26.s, p1/M, z7.s, z3.s\n" + "mov z3.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9923 // smmla z3.s, z9.b, z31.b\n" + ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" + "ld1rqb { z9.b }, p1/Z, [x24, #32]\n" + ".inst 0x450e9923 // smmla z3.s, z9.b, z14.b\n" + ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" + "mov z9.s, #0x0\n" + ".inst 0x451f9a49 // smmla z9.s, z18.b, z31.b\n" + "mov z31.s, #0x0\n" + ".inst 0x45069a5f // smmla z31.s, z18.b, z6.b\n" + "ld1rqb { z6.b }, p1/Z, [x24, #48]\n" + "ld1rqb { z18.b }, p1/Z, [x24, #64]\n" + ".inst 0x450e98c9 // smmla z9.s, z6.b, z14.b\n" + "fmul z14.s, z23.s, z5.s[1]\n" + ".inst 0x450298df // smmla z31.s, z6.b, z2.b\n" + "ld1rqb { z6.b }, p1/Z, [x24, #80]\n" + "fmul z2.s, z23.s, z5.s[2]\n" + "fmul z23.s, z23.s, z5.s[3]\n" + ".inst 0x451e9a43 // smmla z3.s, z18.b, z30.b\n" + ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x24, #96]\n" + ".inst 0x451e98c9 // smmla z9.s, z6.b, z30.b\n" + ".inst 0x451598df // smmla z31.s, z6.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x24, #112]\n" + "add x24, x24, #0x88\n" + ".inst 0x450498a3 // smmla z3.s, z5.b, z4.b\n" + ".inst 0x451198a7 // smmla z7.s, z5.b, z17.b\n" + ".inst 0x45049a49 // smmla z9.s, z18.b, z4.b\n" + ".inst 0x45119a5f // smmla z31.s, z18.b, z17.b\n" + "uzp1 z18.d, z3.d, z7.d\n" + "uzp2 z5.d, z3.d, z7.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp1 z6.d, z9.d, z31.d\n" + "uzp2 z9.d, z9.d, z31.d\n" + "scvtf z5.s, p1/m, z5.s\n" + "fmla z8.s, p1/M, z18.s, z22.s\n" + "scvtf z6.s, p1/m, z6.s\n" + "scvtf z9.s, p1/m, z9.s\n" + "fmla z29.s, p1/M, z5.s, z14.s\n" + "fmla z27.s, p1/M, z6.s, z2.s\n" + "fmla z10.s, p1/M, z9.s, z23.s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x10, x10, #0x8\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "st1w { z24.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z15.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z12.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z0.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z13.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z1.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z20.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z25.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z11.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z16.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z19.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z26.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z8.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z29.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" 
+ "st1w { z27.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z10.s }, p1, [x20]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x13, x13, #0x10\n" + "cmp x13, #0x10\n" + "mov %x[res_ptr], x9\n" + "madd %x[a_ptr], x20, x12, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x13, 9f\n" + "5:" // Row tail: Row loop + "add x25, %x[b_ptr], #0x10\n" + "mov x24, %x[width]\n" + "add x23, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "mov z24.b, #0x0\n" + "mov z15.b, #0x0\n" + "add x28, %x[a_ptr], #0x8\n" + "mov x22, %x[num_blocks]\n" + "mov z12.b, #0x0\n" + "mov z0.b, #0x0\n" + "7:" // Row tail: Block loop + "ld1b { z3.b }, p1/Z, [x25]\n" + "ld1b { z6.b }, p1/Z, [x25, #1, MUL VL]\n" + "mov z2.s, #0x0\n" + "mov z25.s, #0x0\n" + "ld1rqb { z26.b }, p1/Z, [x28]\n" + "ld1rqb { z21.b }, p1/Z, [x28, #16]\n" + "mov z27.s, #0x0\n" + "mov z19.s, #0x0\n" + "ld1b { z29.b }, p1/Z, [x25, #2, MUL VL]\n" + "ld1b { z16.b }, p1/Z, [x25, #3, MUL VL]\n" + "sub x21, x25, #0x10\n" + "sub x20, x28, #0x8\n" + "lsl z20.b, z3.b, #0x4\n" + "lsl z4.b, z6.b, #0x4\n" + "ld1rqb { z10.b }, p1/Z, [x28, #32]\n" + "ld1rqb { z23.b }, p1/Z, [x28, #48]\n" + "and z3.b, z3.b, #0xf0\n" + "and z6.b, z6.b, #0xf0\n" + "ld1rqb { z11.b }, p1/Z, [x28, #64]\n" + "ld1rqb { z7.b }, p1/Z, [x28, #80]\n" + "lsl z8.b, z29.b, #0x4\n" + "lsl z14.b, z16.b, #0x4\n" + "ld1rqb { z18.b }, p1/Z, [x28, #96]\n" + "ld1rqb { z30.b }, p1/Z, [x28, #112]\n" + ".inst 0x45149b42 // smmla z2.s, z26.b, z20.b\n" + ".inst 0x45049b59 // smmla z25.s, z26.b, z4.b\n" + "and z29.b, z29.b, #0xf0\n" + "ld1h { z17.s }, p1/Z, [x21]\n" + ".inst 0x45149abb // smmla z27.s, z21.b, z20.b\n" + ".inst 0x45049ab3 // smmla z19.s, z21.b, z4.b\n" + "and z16.b, z16.b, #0xf0\n" + "ld1h { z4.s }, p0/Z, [x20]\n" + "subs x22, x22, #0x1\n" + "add x28, x28, #0x88\n" + "fcvt z17.s, p1/m, z17.h\n" + "add x25, x25, #0x90\n" + ".inst 0x45089942 // smmla z2.s, z10.b, z8.b\n" + ".inst 0x450e9959 // smmla z25.s, z10.b, z14.b\n" + "fcvt z4.s, p1/m, z4.h\n" + ".inst 0x45089afb // smmla z27.s, z23.b, z8.b\n" + ".inst 0x450e9af3 // smmla z19.s, z23.b, z14.b\n" + "fscale z17.s, p1/m, z17.s, z28.s\n" + "mov z4.q, z4.q[0]\n" + ".inst 0x45039962 // smmla z2.s, z11.b, z3.b\n" + ".inst 0x45069979 // smmla z25.s, z11.b, z6.b\n" + "fmul z23.s, z17.s, z4.s[0]\n" + "fmul z9.s, z17.s, z4.s[1]\n" + "fmul z21.s, z17.s, z4.s[2]\n" + "fmul z4.s, z17.s, z4.s[3]\n" + ".inst 0x450398fb // smmla z27.s, z7.b, z3.b\n" + ".inst 0x450698f3 // smmla z19.s, z7.b, z6.b\n" + ".inst 0x451d9a42 // smmla z2.s, z18.b, z29.b\n" + ".inst 0x45109a59 // smmla z25.s, z18.b, z16.b\n" + ".inst 0x451d9bdb // smmla z27.s, z30.b, z29.b\n" + ".inst 0x45109bd3 // smmla z19.s, z30.b, z16.b\n" + "uzp1 z31.d, z2.d, z25.d\n" + "uzp2 z13.d, z2.d, z25.d\n" + "scvtf z31.s, p1/m, z31.s\n" + "uzp1 z17.d, z27.d, z19.d\n" + "uzp2 z18.d, z27.d, z19.d\n" + "scvtf z13.s, p1/m, z13.s\n" + "fmla z24.s, p1/M, z31.s, z23.s\n" + "scvtf z17.s, p1/m, z17.s\n" + "scvtf z18.s, p1/m, z18.s\n" + "fmla z15.s, p1/M, z13.s, z9.s\n" + "fmla z12.s, p1/M, z17.s, z21.s\n" + "fmla z0.s, p1/M, z18.s, z4.s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x13, #0x1\n" + "st1w { z24.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x13, #0x2\n" + "st1w { z15.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x13, #0x3\n" + "st1w { z12.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "st1w { z0.s }, p1, [x20]\n" + "8:" // Row tail: Accumulator store skip + "subs x24, x24, #0x8\n" 
+ "add %x[res_ptr], %x[res_ptr], #0x20\n" + "bne 6b\n" + "subs x13, x13, #0x4\n" + "add %x[a_ptr], %x[a_ptr], x12\n" + "mov %x[res_ptr], x23\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) + : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); + return; + } + else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { + GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) && + "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal " + "performance"); + } + else if (ggml_cpu_has_neon()) { + GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) && + "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 " + "quantization format for optimal performance"); + } +#endif +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + GGML_ASSERT(ggml_cpu_has_sve() && + "__ARM_FEATURE_SVE not defined, use the Q4_0_4_8 quantization format for optimal performance"); +#elif defined(__ARM_NEON) + GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && + "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " + "performance"); +#endif +} diff --git a/ggml-aarch64.h b/ggml-aarch64.h index 1f0767a99d103..d4d4dd01b9fb4 100644 --- a/ggml-aarch64.h +++ b/ggml-aarch64.h @@ -13,21 +13,23 @@ extern "C" { #endif // Quantization -void quantize_q8_0_aarch64(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k, int nrows_interleaved, int blocklen_per_row); +void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); // Quantization utilizing an importance matrix (a.k.a. 
"Activation aWare Quantization") -size_t quantize_q4_0_aarch64(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); - -block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len, unsigned int xor_mask); -block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len, unsigned int xor_mask); -block_q8_0x4 make_block_q8_0x4(const block_q8_0 * const in[4], unsigned int block_len); -block_q8_0x8 make_block_q8_0x8(const block_q8_0 * const in[8], unsigned int block_len); +size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); // GEMV -void ggml_gemv_q4_0_q8_0_aarch64 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); +void ggml_gemv_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); +void ggml_gemv_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); +void ggml_gemv_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); // GEMM -void ggml_gemm_q4_0_q8_0_aarch64 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); +void ggml_gemm_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); +void ggml_gemm_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); +void ggml_gemm_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); #ifdef __cplusplus } diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 1e8bb058cc290..7cfd74a7ede28 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -383,7 +383,9 @@ extern "C" { GGML_TYPE_F64 = 28, GGML_TYPE_IQ1_M = 29, GGML_TYPE_BF16 = 30, - GGML_TYPE_Q4_0_AARCH64 = 31, + GGML_TYPE_Q4_0_4_4 = 31, + GGML_TYPE_Q4_0_4_8 = 32, + GGML_TYPE_Q4_0_8_8 = 33, GGML_TYPE_COUNT, }; @@ -425,7 +427,9 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_0_AARCH64 = 25, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors }; // available tensor operations: @@ -2409,7 +2413,7 @@ extern "C" { typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, const void * GGML_RESTRICT y, size_t by, int nrc); - typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * 
GGML_RESTRICT y, int64_t k, int n, int b); + typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 7320000902f01..ad5300b44c2bc 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -14987,19 +14987,16 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte { VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb); } break; - case GGML_TYPE_Q4_0_AARCH64: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: { -#if defined(__ARM_FEATURE_SVE) - if (svcntw() == 8) { - VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8); - } - else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { - VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4); - } -#elif defined(__ARM_NEON) VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4); -#endif } break; + case GGML_TYPE_Q4_0_8_8: + { + VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8); + } break; + case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 3a481c0a3e722..956465dfd82b0 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -702,10 +702,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, #if defined (__ARM_FEATURE_MATMUL_INT8) .nrows = 2, + .from_float_to_mat = quantize_q8_0_4x8, #else .nrows = 1, + .from_float_to_mat = quantize_q8_0_4x4, #endif - .from_float_to_mat = quantize_q8_0_aarch64, }, [GGML_TYPE_Q8_1] = { .type_name = "q8_1", @@ -904,8 +905,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_BF16, .nrows = 1, }, - [GGML_TYPE_Q4_0_AARCH64] = { - .type_name = "q4_0_aarch64", + [GGML_TYPE_Q4_0_4_4] = { + .type_name = "q4_0_4x4", .blck_size = QK4_0, .type_size = sizeof(block_q4_0), .is_quantized = true, @@ -915,8 +916,36 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = NULL, .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, - .gemv = ggml_gemv_q4_0_q8_0_aarch64, - .gemm = ggml_gemm_q4_0_q8_0_aarch64, + .gemv = ggml_gemv_q4_0_4x4_q8_0, + .gemm = ggml_gemm_q4_0_4x4_q8_0, + }, + [GGML_TYPE_Q4_0_4_8] = { + .type_name = "q4_0_4x8", + .blck_size = QK4_0, + .type_size = sizeof(block_q4_0), + .is_quantized = true, + .to_float = NULL, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + .gemv = ggml_gemv_q4_0_4x8_q8_0, + .gemm = ggml_gemm_q4_0_4x8_q8_0, + }, + [GGML_TYPE_Q4_0_8_8] = { + .type_name = "q4_0_8x8", + .blck_size = QK4_0, + .type_size = sizeof(block_q4_0), + .is_quantized = true, + .to_float = NULL, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + .gemv = ggml_gemv_q4_0_8x8_q8_0, + .gemm = ggml_gemm_q4_0_8x8_q8_0, } }; @@ -3216,7 +3245,9 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break; case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break; case GGML_FTYPE_MOSTLY_IQ2_S: 
wtype = GGML_TYPE_IQ2_S; break; - case GGML_FTYPE_MOSTLY_Q4_0_AARCH64: wtype = GGML_TYPE_Q4_0_AARCH64; break; + case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break; + case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break; + case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -9461,7 +9492,9 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_AARCH64: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_add_q_f32(params, dst); } break; @@ -9837,7 +9870,9 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_AARCH64: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_add1_q_f32(params, dst); } break; @@ -9963,7 +9998,9 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_AARCH64: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: default: { GGML_ASSERT(false); @@ -12166,7 +12203,8 @@ static void ggml_compute_forward_mul_mat( enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; int64_t const vec_dot_num_rows = type_traits[type].nrows; - ggml_from_float_to_mat_t const from_float_to_mat = type_traits[vec_dot_type].from_float_to_mat; + ggml_from_float_to_mat_t const from_float_to_mat + = type_traits[vec_dot_type].from_float_to_mat; ggml_gemv_t const gemv = type_traits[type].gemv; ggml_gemm_t const gemm = type_traits[type].gemm; @@ -12236,7 +12274,7 @@ UseGgmlGemm1:; } if (from_float_to_mat && gemm && (ne11 >= 4) && (ne12 == 1) && (ne13 == 1)) { for (int64_t i11 = 0; i11 < ne11 / 4; ++i11) { - from_float_to_mat((float *)((char *) src1->data + i11 * 4 * nb11), (void *) wdata, ne10, 4, ggml_cpu_has_matmul_int8() ? 
8 : 4); + from_float_to_mat((float *)((char *) src1->data + i11 * 4 * nb11), (void *) wdata, ne10); wdata += row_size * 4; } for (int64_t i11 = (ne11 / 4) * 4; i11 < ne11; ++i11) { @@ -12790,7 +12828,9 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_AARCH64: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_out_prod_q_f32(params, dst); } break; @@ -12976,7 +13016,9 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_AARCH64: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: default: { GGML_ASSERT(false); @@ -13236,7 +13278,9 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_AARCH64: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_get_rows_q(params, dst); } break; @@ -13823,7 +13867,9 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: case GGML_TYPE_Q8_K: - case GGML_TYPE_Q4_0_AARCH64: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -20547,7 +20593,9 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q4_0_AARCH64: result = quantize_q4_0_aarch64(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); diff --git a/include/llama.h b/include/llama.h index bd108ec699c75..3970c3aebcd62 100644 --- a/include/llama.h +++ b/include/llama.h @@ -162,7 +162,9 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64 = 33, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama.cpp b/src/llama.cpp index 6b19d1b2a0363..0adb0afae118f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3782,7 +3782,9 @@ struct llama_model_loader { case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; - case GGML_TYPE_Q4_0_AARCH64: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64; break; + case GGML_TYPE_Q4_0_4_4: ftype = 
LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break; + case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break; + case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); @@ -4476,7 +4478,9 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; - case LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64: return "Q4_0_AARCH64"; + case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4"; + case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8"; + case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8"; default: return "unknown, may not work"; } @@ -17762,7 +17766,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ3_S; } - else if (new_type == GGML_TYPE_Q4_0_AARCH64) { + else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || + new_type == GGML_TYPE_Q4_0_8_8) { new_type = GGML_TYPE_Q4_0; } } @@ -18077,7 +18082,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; - case LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64: default_type = GGML_TYPE_Q4_0_AARCH64; break; + case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break; + case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break; + case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -18388,8 +18395,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s f32_data = (float *) f32_conv_buf.data(); } - if (new_type == GGML_TYPE_Q4_0_AARCH64) { - if ((ggml_cpu_has_neon() == 0) && (ggml_cpu_has_sve() == 0)) new_type = GGML_TYPE_Q4_0; + if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) { if ((nelements / tensor->ne[0]) % 4 != 0) new_type = GGML_TYPE_Q4_0; if (nthread > 1) nthread = 1; } From 5d10c218ebf23c6019f2fdcb88fb2e3b61f8b66c Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Fri, 31 May 2024 04:33:13 +0000 Subject: [PATCH 10/28] Arm AArch64: minor code change for resolving a build issue with server-windows --- ggml-aarch64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml-aarch64.c b/ggml-aarch64.c index d888031f315f8..b1a2e0148a33f 100644 --- a/ggml-aarch64.c +++ b/ggml-aarch64.c @@ -239,9 +239,9 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds out_ptr_B = (block_q4_0x4 *) malloc(sizeof(block_q4_0x4) * nb); out_ptr_B_start = out_ptr_B; } + block_q4_0 ** in_ptrs = (block_q4_0 **) malloc(sizeof(block_q4_0 *) * nrows_interleaved); for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) { - block_q4_0 * in_ptrs[nrows_interleaved]; for (int i = 0; i < nrows_interleaved; i++ ) { in_ptrs[i] = (block_q4_0 *) dst + (b + i * n_per_row) / QK4_0; @@ -267,6 +267,7 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds else if (nrows_interleaved == 4) memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, 
sizeof(block_q4_0x4) * nb); } if (out_ptr_B_start) free(out_ptr_B_start); + if (in_ptrs) free(in_ptrs); return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0)); } From 7ac03e5fe8ac63d87df37a07e72584fc3dcba633 Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Fri, 31 May 2024 18:44:25 +0000 Subject: [PATCH 11/28] retrigger checks From e2c1c47fa8d33363dddcf10143008f36d6fea3bb Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Wed, 5 Jun 2024 06:05:26 +0000 Subject: [PATCH 12/28] Arm AArch64: minor code changes for rebase --- ggml/src/ggml.c | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 956465dfd82b0..cd37aba823d56 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -12370,29 +12370,31 @@ UseGgmlGemm2:; //if (ith == 0) // printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1); + const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11; if ((ggml_n_dims(src0) == 2) && gemm && gemv) { - if (ne11 == 1) gemv(ne00, (float *)((char *) dst->data), (const char *) src0->data, (const char *) wdata, 1, ne01, ith, nth); + if (ne11 == 1) gemv(ne00, (float *)((char *) dst->data), (const char *) src0->data, (const char *) src1_wdata, 1, ne01, ith, nth); else { - for (int row_iter = 0; row_iter < ne11 / 16; row_iter++) { - gemm(ne00, (float *)((char *) dst->data + (row_iter * 16 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 16) * row_size : (row_iter * 16 * nb11)), 16, ne01, ith, nth); + for (int iter = 0; iter < ne11 / 16; iter++) { + gemm(ne00, (float *)((char *) dst->data + (iter * 16 * nb1)), (const char *) src0->data, (const char *) src1_wdata + (src1_col_stride * iter * 16), 16, ne01, ith, nth); } int rows_processed = (ne11 / 16) * 16; - for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 8; row_iter++) { - gemm(ne00, (float *)((char *) dst->data + ((rows_processed + row_iter * 8) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 8) * row_size : ((rows_processed + row_iter * 8) * nb11)), 8, ne01, ith, nth); + for (int iter = 0; iter < (ne11 - rows_processed) / 8; iter++) { + gemm(ne00, (float *)((char *) dst->data + ((rows_processed + iter * 8) * nb1)), (const char *) src0->data, (const char *) src1_wdata + (src1_col_stride * (rows_processed + iter * 8)), 8, ne01, ith, nth); } rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8; - for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { - gemm(ne00, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? 
(rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), 4, ne01, ith, nth); + for (int iter = 0; iter < (ne11 - rows_processed) / 4; iter++) { + gemm(ne00, (float *)((char *) dst->data + ((rows_processed + iter * 4) * nb1)), (const char *) src0->data, (const char *) src1_wdata + (src1_col_stride * (rows_processed + iter * 4)), 4, ne01, ith, nth); } rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4; - for (int row_iter = rows_processed; row_iter < ne11; row_iter++) { - gemv(ne00, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * row_size) : (row_iter * nb11)), 1, ne01, ith, nth); + for (int iter = rows_processed; iter < ne11; iter++) { + gemv(ne00, (float *)((char *) dst->data + (iter * nb1)), (const char *) src0->data, (const char *) src1_wdata + (src1_col_stride * iter), 1, ne01, ith, nth); } } } else if ((ggml_n_dims(src0) == 2) && gemv) { - for (int row_iter = 0; row_iter < ne11; row_iter++) { - gemv(ne00, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * row_size) : (row_iter * nb11)), 1, ne01, ith, nth); + for (int iter = 0; iter < ne11; iter++) { + gemv(ne00, (float *)((char *) dst->data + (iter * nb1)), (const char *) src0->data, (const char *) src1_wdata + (src1_col_stride * iter), 1, ne01, ith, nth); } } else { @@ -22030,12 +22032,4 @@ int ggml_cpu_has_matmul_int8(void) { #endif } -int ggml_cpu_has_sve(void) { -#if defined(__ARM_FEATURE_SVE) - return 1; -#else - return 0; -#endif -} - //////////////////////////////////////////////////////////////////////////////// From 79b6cdfe6964be5c5787af3bd6bac8e9ebe74022 Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Fri, 14 Jun 2024 12:30:32 +0000 Subject: [PATCH 13/28] Arm AArch64: minor changes to skip the pr#7433 vec_dot code for arm cpus with SVE VL not equal to 256 bits --- ggml/src/ggml-quants.c | 102 ++++++++++++++++++++++------------------- ggml/src/ggml.c | 1 - 2 files changed, 55 insertions(+), 48 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index ad5300b44c2bc..cbe377cf5caee 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3814,43 +3814,47 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r } #endif #if defined(__ARM_FEATURE_SVE) - const svbool_t ptrueh = svptrue_pat_b8(SV_VL16); - const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh); + if (svcntb() == QK8_0) { + const svbool_t ptrueh = svptrue_pat_b8(SV_VL16); + const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh); - svfloat32_t sumv0 = svdup_n_f32(0.0f); - svfloat32_t sumv1 = svdup_n_f32(0.0f); + svfloat32_t sumv0 = svdup_n_f32(0.0f); + svfloat32_t sumv1 = svdup_n_f32(0.0f); - assert(nb % 2 == 0); // TODO: handle odd nb + assert(nb % 2 == 0); // TODO: handle odd nb - for (int i = 0; i < nb; i += 2) { - const block_q4_0 * restrict x0 = &x[i + 0]; - const block_q4_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; + for (int i = 0; i < nb; i += 2) { + const block_q4_0 * restrict x0 = &x[i + 0]; + const block_q4_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; - // load x - const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); - const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); + // load x 
+ const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); + const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); - // 4-bit -> 8-bit - const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04)); - const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04)); + // 4-bit -> 8-bit + const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04)); + const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04)); - // sub 8 - const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8); - const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8); + // sub 8 + const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8); + const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8); - // load y - const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); - const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + // load y + const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); - // dot product - sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } + // dot product + sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } - *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); -#elif defined(__ARM_NEON) + *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + return; + } +#endif +#if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); @@ -5422,31 +5426,35 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r } #endif #if defined(__ARM_FEATURE_SVE) - svfloat32_t sumv0 = svdup_n_f32(0.0f); - svfloat32_t sumv1 = svdup_n_f32(0.0f); + if (svcntb() == QK8_0) { + svfloat32_t sumv0 = svdup_n_f32(0.0f); + svfloat32_t sumv1 = svdup_n_f32(0.0f); - assert(nb % 2 == 0); // TODO: handle odd nb + assert(nb % 2 == 0); // TODO: handle odd nb - for (int i = 0; i < nb; i += 2) { - const block_q8_0 * restrict x0 = &x[i + 0]; - const block_q8_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; + for (int i = 0; i < nb; i += 2) { + const block_q8_0 * restrict x0 = &x[i + 0]; + const block_q8_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; - // load x - const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); - const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs); + // load x + const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); + const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs); - // load y - const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); - const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + // load y + const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); - sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), 
svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } + sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } - *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); -#elif defined(__ARM_NEON) + *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + return; + } +#endif +#if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index cd37aba823d56..7400a0ec0d0ec 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -21901,7 +21901,6 @@ int ggml_cpu_has_neon(void) { int ggml_cpu_has_sve(void) { #if defined(__ARM_FEATURE_SVE) // TODO: Currently, SVE 256 bit is only supported. - GGML_ASSERT(svcntb() == QK8_0); return 1; #else return 0; From 3c1ad5fe3c673dca23f750f746e5bfcf7ff516f2 Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Fri, 14 Jun 2024 13:00:04 +0000 Subject: [PATCH 14/28] Arm AArch64: remove stale LLAMA_QKK_64 from CMakeLists.txt and delete build.zig --- build.zig | 173 ------------------------------------------------------ 1 file changed, 173 deletions(-) delete mode 100644 build.zig diff --git a/build.zig b/build.zig deleted file mode 100644 index 97fa42fdbb7c8..0000000000000 --- a/build.zig +++ /dev/null @@ -1,173 +0,0 @@ -// Compatible with Zig Version 0.11.0 -const std = @import("std"); -const ArrayList = std.ArrayList; -const Compile = std.Build.Step.Compile; -const ConfigHeader = std.Build.Step.ConfigHeader; -const Mode = std.builtin.Mode; -const CrossTarget = std.zig.CrossTarget; - -const Maker = struct { - builder: *std.build.Builder, - target: CrossTarget, - optimize: Mode, - enable_lto: bool, - - include_dirs: ArrayList([]const u8), - cflags: ArrayList([]const u8), - cxxflags: ArrayList([]const u8), - objs: ArrayList(*Compile), - - fn addInclude(m: *Maker, dir: []const u8) !void { - try m.include_dirs.append(dir); - } - fn addProjectInclude(m: *Maker, path: []const []const u8) !void { - try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path)); - } - fn addCFlag(m: *Maker, flag: []const u8) !void { - try m.cflags.append(flag); - } - fn addCxxFlag(m: *Maker, flag: []const u8) !void { - try m.cxxflags.append(flag); - } - fn addFlag(m: *Maker, flag: []const u8) !void { - try m.addCFlag(flag); - try m.addCxxFlag(flag); - } - - fn init(builder: *std.build.Builder) !Maker { - const target = builder.standardTargetOptions(.{}); - const zig_version = @import("builtin").zig_version_string; - const commit_hash = try std.ChildProcess.exec( - .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } }, - ); - try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt( - \\int LLAMA_BUILD_NUMBER = {}; - \\char const *LLAMA_COMMIT = "{s}"; - \\char const *LLAMA_COMPILER = "Zig {s}"; - \\char const *LLAMA_BUILD_TARGET = "{s}"; - \\ - , .{ 0, commit_hash.stdout[0 .. 
commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) })); - var m = Maker{ - .builder = builder, - .target = target, - .optimize = builder.standardOptimizeOption(.{}), - .enable_lto = false, - .include_dirs = ArrayList([]const u8).init(builder.allocator), - .cflags = ArrayList([]const u8).init(builder.allocator), - .cxxflags = ArrayList([]const u8).init(builder.allocator), - .objs = ArrayList(*Compile).init(builder.allocator), - }; - - try m.addCFlag("-std=c11"); - try m.addCxxFlag("-std=c++11"); - try m.addProjectInclude(&.{}); - try m.addProjectInclude(&.{"common"}); - return m; - } - - fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile { - const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize }); - if (o.target.getAbi() != .msvc) - o.defineCMacro("_GNU_SOURCE", null); - - if (std.mem.endsWith(u8, src, ".c")) { - o.addCSourceFiles(&.{src}, m.cflags.items); - o.linkLibC(); - } else { - o.addCSourceFiles(&.{src}, m.cxxflags.items); - if (o.target.getAbi() == .msvc) { - o.linkLibC(); // need winsdk + crt - } else { - // linkLibCpp already add (libc++ + libunwind + libc) - o.linkLibCpp(); - } - } - for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i }); - o.want_lto = m.enable_lto; - return o; - } - - fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile { - const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize }); - e.addCSourceFiles(&.{src}, m.cxxflags.items); - for (deps) |d| e.addObject(d); - for (m.objs.items) |o| e.addObject(o); - for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i }); - - // https://github.com/ziglang/zig/issues/15448 - if (e.target.getAbi() == .msvc) { - e.linkLibC(); // need winsdk + crt - } else { - // linkLibCpp already add (libc++ + libunwind + libc) - e.linkLibCpp(); - } - m.builder.installArtifact(e); - e.want_lto = m.enable_lto; - return e; - } -}; - -pub fn build(b: *std.build.Builder) !void { - var make = try Maker.init(b); - make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false; - - const ggml = make.obj("ggml", "ggml.c"); - const sgemm = make.obj("sgemm", "sgemm.cpp"); - const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c"); - const ggml_backend = make.obj("ggml-backend", "ggml-backend.c"); - const ggml_quants = make.obj("ggml-quants", "ggml-quants.c"); - const unicode = make.obj("unicode", "unicode.cpp"); - const unicode_data = make.obj("unicode-data", "unicode-data.cpp"); - const llama = make.obj("llama", "llama.cpp"); - const buildinfo = make.obj("common", "common/build-info.cpp"); - const common = make.obj("common", "common/common.cpp"); - const console = make.obj("console", "common/console.cpp"); - const sampling = make.obj("sampling", "common/sampling.cpp"); - const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp"); - const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp"); - const train = make.obj("train", "common/train.cpp"); - const clip = make.obj("clip", "examples/llava/clip.cpp"); - const llava = make.obj("llava", "examples/llava/llava.cpp"); - const ggml_aarch64 = make.obj("ggml-aarch64", "ggml-aarch64.c"); - - _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, ggml_aarch64, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser }); - _ = 
make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, ggml_aarch64, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo }); - _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, ggml_aarch64, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo }); - _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, ggml_aarch64, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo }); - _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, ggml_aarch64, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train }); - _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, ggml_aarch64, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train }); - - const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, ggml_aarch64, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava }); - if (server.target.isWindows()) { - server.linkSystemLibrary("ws2_32"); - } - - const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" }; - for (server_assets) |asset| { - const input_path = b.fmt("examples/server/public/{s}", .{asset}); - const output_path = b.fmt("examples/server/{s}.hpp", .{asset}); - - // Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`: - - const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize)); - defer b.allocator.free(input); - - var buf = std.ArrayList(u8).init(b.allocator); - defer buf.deinit(); - - for (input) |byte| { - try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte}); - } - - var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_"); - defer b.allocator.free(name); - std.mem.replaceScalar(u8, name, '.', '_'); - - try std.fs.cwd().writeFile(output_path, b.fmt( - "unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n", - .{ name, buf.items, name, input.len }, - )); - - std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path }); - } -} From a7055b7be5ba6da761f1b3b1d5b9e6a08576f011 Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Tue, 18 Jun 2024 08:02:37 +0000 Subject: [PATCH 15/28] Arm AArch64: add reference scalar gemm and gemv, and avoid dynamic memory allocations during quantization for Q4_0_4_4, Q4_0_4_8, and Q4_0_8_8 --- ggml-aarch64.c | 486 ++++++++++++++++++++++++++++---------------- ggml-aarch64.h | 12 +- ggml/include/ggml.h | 10 +- ggml/src/ggml.c | 41 +++- src/llama.cpp | 1 - 5 files changed, 357 insertions(+), 193 deletions(-) diff --git a/ggml-aarch64.c b/ggml-aarch64.c index b1a2e0148a33f..8347960942fee 100644 --- a/ggml-aarch64.c +++ b/ggml-aarch64.c @@ -33,11 +33,11 @@ // from bias offset form to pure sign form (this saves subtract // operations durin unpacking) // -static block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len, unsigned int xor_mask) { +static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int block_len, unsigned int xor_mask) { block_q4_0x4 out; for (int i = 
0; i < 4; i++) { - out.d[i] = in[i]->d; + out.d[i] = in[i].d; } for (int i = 0; i < QK4_0 * 2; i++) { @@ -45,7 +45,7 @@ static block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned i int src_id = (i % (4 * block_len)) / block_len; src_offset += (i % block_len); - out.qs[i] = in[src_id]->qs[src_offset] ^ xor_mask; + out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask; } return out; @@ -55,11 +55,11 @@ static block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned i // returns an interleaved block_q4_0x8 // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks // first, then interleave quants from 8 block_q4_0s in blocks of block_len -static block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len, unsigned int xor_mask) { +static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int block_len, unsigned int xor_mask) { block_q4_0x8 out; for (int i = 0; i < 8; i++) { - out.d[i] = in[i]->d; + out.d[i] = in[i].d; } for (int i = 0; i < QK4_0 * 4; i++) { @@ -67,7 +67,7 @@ static block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned i int src_id = (i % (8 * block_len)) / block_len; src_offset += (i % block_len); - out.qs[i] = in[src_id]->qs[src_offset] ^ xor_mask; + out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask; } return out; @@ -134,6 +134,8 @@ void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3); } } +#else + assert(false); #endif } @@ -222,6 +224,8 @@ void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k) y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3); } } +#else + assert(false); #endif } @@ -229,45 +233,33 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds assert(n_per_row % QK4_0 == 0); const int nb = n_per_row / QK4_0; - void * out_ptr_B = NULL; - void * out_ptr_B_start = NULL; + void * out_ptr = NULL; if (nrows_interleaved == 8) { - out_ptr_B = (block_q4_0x8 *) malloc(sizeof(block_q4_0x8) * nb); - out_ptr_B_start = out_ptr_B; + out_ptr = (block_q4_0x8 *) dst; } else if (nrows_interleaved == 4) { - out_ptr_B = (block_q4_0x4 *) malloc(sizeof(block_q4_0x4) * nb); - out_ptr_B_start = out_ptr_B; + out_ptr = (block_q4_0x4 *) dst; } - block_q4_0 ** in_ptrs = (block_q4_0 **) malloc(sizeof(block_q4_0 *) * nrows_interleaved); + block_q4_0 dst_tmp[nrows_interleaved]; for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) { - for (int i = 0; i < nrows_interleaved; i++ ) { - in_ptrs[i] = (block_q4_0 *) dst + (b + i * n_per_row) / QK4_0; - quantize_row_q4_0_reference(src + b + i * n_per_row, (block_q4_0 *) in_ptrs[i], n_per_row); - } - for (int64_t x = 0; x < nb; x++) { + + for (int i = 0; i < nrows_interleaved; i++ ) { + quantize_row_q4_0_reference(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0); + } + if (nrows_interleaved == 8) { - *(block_q4_0x8 *) out_ptr_B = make_block_q4_0x8((const block_q4_0 * const *) in_ptrs, blocklen_per_row, 0x88); - out_ptr_B = (block_q4_0x8 *) out_ptr_B + 1; + *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blocklen_per_row, 0x88); + out_ptr = (block_q4_0x8 *) out_ptr + 1; } else if (nrows_interleaved == 4) { - *(block_q4_0x4 *) out_ptr_B = make_block_q4_0x4((const block_q4_0 * const *) in_ptrs, blocklen_per_row, 0x88); - out_ptr_B = (block_q4_0x4 *) out_ptr_B + 1; - } - - for (int i = 0; i < nrows_interleaved; i++) { - in_ptrs[i]++; + *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, 
blocklen_per_row, 0x88); + out_ptr = (block_q4_0x4 *) out_ptr + 1; } } - out_ptr_B = out_ptr_B_start; - if (nrows_interleaved == 8) memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, sizeof(block_q4_0x8) * nb); - else if (nrows_interleaved == 4) memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, sizeof(block_q4_0x4) * nb); } - if (out_ptr_B_start) free(out_ptr_B_start); - if (in_ptrs) free(in_ptrs); return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0)); } @@ -302,25 +294,24 @@ size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_ } } -inline int64_t roundup(const int64_t a, const int64_t b) { - int64_t rem = a % b; +void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; - if (rem) { - return a + b - rem; - } else { - return a; - } -} + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); -void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy, int nr, int nc, int ith, int nth) { - UNUSED(n); UNUSED(s); + UNUSED(bs); UNUSED(vx); UNUSED(vy); UNUSED(nr); UNUSED(nc); - UNUSED(ith); - UNUSED(nth); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) if (svcntw() == 8) { @@ -332,19 +323,9 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, const void * restrict vx GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) && "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance"); #elif defined(__ARM_NEON) - int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); - int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); - size_t width = xend - x0; - - int64_t nb = n / QK4_0; - const void * b_ptr = (const void *)((const block_q4_0x4 *) vx + ((x0 / 4) * nb)); + const void * b_ptr = vx; const void * a_ptr = vy; - float * res_ptr = s + x0; - - assert(n % 32 == 0); - assert(width % 4 == 0); - - size_t num_blocks = n / 32; + float * res_ptr = s; __asm__ __volatile__( "movi v31.16b, #0x4\n" @@ -353,7 +334,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, const void * restrict vx "1:" // Column loop "add x22, %x[a_ptr], #0x2\n" "movi v29.16b, #0x0\n" - "mov x21, %x[num_blocks]\n" + "mov x21, %x[nb]\n" "2:" // Block loop "ldr q28, [%x[b_ptr], #0x0]\n" "ldr q27, [x22, #0x0]\n" @@ -390,26 +371,58 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, const void * restrict vx "scvtf v26.4s, v26.4s, #0x4\n" "fmla v29.4s, v26.4s, v16.4s\n" "cbnz x21, 2b\n" - "sub %x[width], %x[width], #0x4\n" + "sub %x[nc], %x[nc], #0x4\n" "str q29, [%x[res_ptr], #0x0]\n" "add %x[res_ptr], %x[res_ptr], #0x10\n" - "cbnz %x[width], 1b\n" - : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) - : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) + "cbnz %x[nc], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc) + : [a_ptr] "r" (a_ptr), [nb] "r" (nb) : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22" ); +#else + float sumf[4]; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 
0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + int sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } #endif } -void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy, int nr, int nc, int ith, int nth) { - UNUSED(n); +void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + UNUSED(s); + UNUSED(bs); UNUSED(vx); UNUSED(vy); UNUSED(nr); UNUSED(nc); - UNUSED(ith); - UNUSED(nth); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) if (svcntw() == 8) { @@ -418,19 +431,9 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, const void * restrict vx } #endif #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) - int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); - int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); - size_t width = xend - x0; - - int64_t nb = n / QK4_0; - const void * b_ptr = (const void *)((const block_q4_0x4 *) vx + ((x0 / 4) * nb)); + const void * b_ptr = vx; const void * a_ptr = vy; - float * res_ptr = s + x0; - - assert(n % 32 == 0); - assert(width % 4 == 0); - - size_t num_blocks = n / 32; + float * res_ptr = s; __asm__ __volatile__( "movi v2.16b, #0x4\n" @@ -439,7 +442,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, const void * restrict vx "1:" // Column loop "add x23, %x[a_ptr], #0x2\n" "movi v0.16b, #0x0\n" - "mov x22, %x[num_blocks]\n" + "mov x22, %x[nb]\n" "2:" // Block loop "ldr q31, [%x[b_ptr], #0x0]\n" "ldr q30, [%x[b_ptr], #0x10]\n" @@ -481,46 +484,68 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, const void * restrict vx "scvtf v29.4s, v29.4s, #0x4\n" "fmla v0.4s, v29.4s, v16.4s\n" "cbnz x22, 2b\n" - "sub %x[width], %x[width], #0x4\n" + "sub %x[nc], %x[nc], #0x4\n" "str q0, [%x[res_ptr], #0x0]\n" "add %x[res_ptr], %x[res_ptr], #0x10\n" - "cbnz %x[width], 1b\n" - : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) - : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) + "cbnz %x[nc], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc) + : [a_ptr] "r" (a_ptr), [nb] "r" (nb) : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23" ); #elif defined(__ARM_NEON) GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " "performance"); +#else + float sumf[4]; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + + for (int j = 
0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + int sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } #endif } -void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy, int nr, int nc, int ith, int nth) { - UNUSED(n); +void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + UNUSED(s); + UNUSED(bs); UNUSED(vx); UNUSED(vy); UNUSED(nr); UNUSED(nc); - UNUSED(ith); - UNUSED(nth); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) if (svcntw() == 8) { - int64_t x0 = roundup((ith * nc) / nth, (int64_t)8); - int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)8); - size_t width = xend - x0; - - int64_t nb = n / QK4_0; - const void * b_ptr = (const void *)((const block_q4_0x8 *) vx + ((x0 / 8) * nb)); + const void * b_ptr = vx; const void * a_ptr = vy; - float * res_ptr = s + x0; - - assert(n % 32 == 0); - assert(width % 8 == 0); - - size_t num_blocks = n / 32; + float * res_ptr = s; __asm__ __volatile__( "ptrue p0.b\n" @@ -528,7 +553,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, const void * restrict vx "1:" // Column loop "add x22, %x[a_ptr], #0x2\n" "mov z31.b, #0x0\n" - "mov x21, %x[num_blocks]\n" + "mov x21, %x[nb]\n" "2:" // Block loop "ld1b { z30.b }, p0/Z, [%x[b_ptr]]\n" "ld1b { z29.b }, p0/Z, [%x[b_ptr], #1, MUL VL]\n" @@ -572,12 +597,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, const void * restrict vx "scvtf z17.s, p0/m, z17.s\n" "fmla z31.s, p0/M, z17.s, z18.s\n" "cbnz x21, 2b\n" - "sub %x[width], %x[width], #0x8\n" + "sub %x[nc], %x[nc], #0x8\n" "st1w { z31.s }, p0, [%x[res_ptr]]\n" "add %x[res_ptr], %x[res_ptr], #0x20\n" - "cbnz %x[width], 1b\n" - : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) - : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) + "cbnz %x[nc], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc) + : [a_ptr] "r" (a_ptr), [nb] "r" (nb) : "memory", "p0", "x20", "x21", "x22", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); return; @@ -600,18 +625,51 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, const void * restrict vx GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " "performance"); +#else + float sumf[8]; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); + + for (int j = 0; j < 
ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + int sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } #endif } -void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy, int nr, int nc, int ith, int nth) { - UNUSED(n); +void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + UNUSED(s); + UNUSED(bs); UNUSED(vx); UNUSED(vy); UNUSED(nr); UNUSED(nc); - UNUSED(ith); - UNUSED(nth); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) if (svcntw() == 8) { @@ -623,36 +681,26 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, const void * restrict vx GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) && "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance"); #elif defined(__ARM_NEON) - int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); - int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); - size_t width = xend - x0; - - int64_t nb = n / QK4_0; - const void * b_ptr = (const void *)((const block_q4_0x4 *) vx + ((x0/4) * nb)); + const void * b_ptr = vx; const void * a_ptr = vy; - float * res_ptr = s + x0; - size_t res_stride = nc * sizeof(float); - - assert(n % 32 == 0); - assert(width % 4 == 0); - - size_t num_blocks = n / 32; + float * res_ptr = s; + size_t res_stride = bs * sizeof(float); __asm__ __volatile__( "mov x10, %x[nr]\n" "mov x9, #0x88\n" "cmp x10, #0x10\n" - "mul x9, %x[num_blocks], x9\n" + "mul x9, %x[nb], x9\n" "blt 4f\n" "1:" // Row loop "add x28, %x[b_ptr], #0x8\n" - "mov x27, %x[width]\n" + "mov x27, %x[nc]\n" "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" "2:" // Column loop "add x25, %x[a_ptr], #0x8\n" "movi v15.16b, #0x0\n" "movi v19.16b, #0x0\n" - "mov x24, %x[num_blocks]\n" + "mov x24, %x[nb]\n" "add x23, x25, x9\n" "movi v18.16b, #0x0\n" "movi v14.16b, #0x0\n" @@ -972,13 +1020,13 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, const void * restrict vx "cbz x10, 9f\n" "5:" // Row tail: Row loop "add x24, %x[b_ptr], #0x8\n" - "mov x23, %x[width]\n" + "mov x23, %x[nc]\n" "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" "6:" // Row tail: Column loop "movi v15.16b, #0x0\n" "movi v19.16b, #0x0\n" "add x25, %x[a_ptr], #0x8\n" - "mov x21, %x[num_blocks]\n" + "mov x21, %x[nb]\n" "movi v18.16b, #0x0\n" "movi v14.16b, #0x0\n" "7:" // Row tail: Block loop @@ -1085,21 +1133,63 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, const void * restrict vx "bgt 5b\n" "9:" // Row tail: Row loop skip : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) - : [b_ptr] "r" (b_ptr), 
[nr] "r" (nr), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); +#else + float sumf[4][4]; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + int sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } #endif } -void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy, int nr, int nc, int ith, int nth) { - UNUSED(n); +void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + UNUSED(s); + UNUSED(bs); UNUSED(vx); UNUSED(vy); UNUSED(nr); UNUSED(nc); - UNUSED(ith); - UNUSED(nth); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) if (svcntw() == 8) { @@ -1108,36 +1198,26 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, const void * restrict vx } #endif #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) - int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); - int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); - size_t width = xend - x0; - - int64_t nb = n / QK4_0; - const void * b_ptr = (const void *)((const block_q4_0x4 *) vx + ((x0 / 4) * nb)); + const void * b_ptr = vx; const void * a_ptr = vy; - float * res_ptr = s + x0; - size_t res_stride = nc * sizeof(float); - - assert(n % 32 == 0); - assert(width % 4 == 0); - - size_t num_blocks = n / 32; + float * res_ptr = s; + size_t res_stride = bs * sizeof(float); __asm__ __volatile__( "mov x10, %x[nr]\n" "mov x9, #0x88\n" "cmp x10, #0x10\n" - "mul x9, %x[num_blocks], x9\n" + "mul x9, %x[nb], x9\n" "blt 4f\n" "1:" // Row loop "add x28, %x[b_ptr], #0x8\n" - "mov x27, %x[width]\n" + "mov x27, %x[nc]\n" "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" "2:" // Column loop "add x25, %x[a_ptr], #0x8\n" "movi v2.16b, #0x0\n" "movi v10.16b, #0x0\n" - "mov 
x24, %x[num_blocks]\n" + "mov x24, %x[nb]\n" "add x23, x25, x9\n" "movi v12.16b, #0x0\n" "movi v28.16b, #0x0\n" @@ -1409,13 +1489,13 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, const void * restrict vx "cbz x10, 9f\n" "5:" // Row tail: Row loop "add x24, %x[b_ptr], #0x8\n" - "mov x23, %x[width]\n" + "mov x23, %x[nc]\n" "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" "6:" // Row tail: Column loop "movi v2.16b, #0x0\n" "movi v10.16b, #0x0\n" "add x25, %x[a_ptr], #0x8\n" - "mov x21, %x[num_blocks]\n" + "mov x21, %x[nb]\n" "movi v12.16b, #0x0\n" "movi v28.16b, #0x0\n" "7:" // Row tail: Block loop @@ -1510,42 +1590,74 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, const void * restrict vx "bgt 5b\n" "9:" // Row tail: Row loop skip : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) - : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); #elif defined(__ARM_NEON) GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " "performance"); +#else + float sumf[4][4]; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + int sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } #endif } -void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy, int nr, int nc, int ith, int nth) { - UNUSED(n); +void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + UNUSED(s); + UNUSED(bs); UNUSED(vx); UNUSED(vy); UNUSED(nr); UNUSED(nc); - UNUSED(ith); - UNUSED(nth); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) if (svcntw() == 8) { - int64_t x0 = roundup((ith * nc) 
/ nth, (int64_t)8); - int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)8); - size_t width = xend - x0; - - int64_t nb = n / QK4_0; - const void * b_ptr = (const void *)((const block_q4_0x8 *) vx + ((x0 / 8) * nb)); + const void * b_ptr = vx; const void * a_ptr = vy; - float * res_ptr = s + x0; - size_t res_stride = nc * sizeof(float); - - assert(n % 32 == 0); - assert(width % 8 == 0); - - size_t num_blocks = n / 32; + float * res_ptr = s; + size_t res_stride = bs * sizeof(float); __asm__ __volatile__( "mov x20, #0x4\n" @@ -1555,17 +1667,17 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, const void * restrict vx "ptrue p1.b\n" "whilelt p0.s, XZR, x20\n" "cmp x13, #0x10\n" - "mul x12, %x[num_blocks], x12\n" + "mul x12, %x[nb], x12\n" "blt 4f\n" "1:" // Row loop "add x11, %x[b_ptr], #0x10\n" - "mov x10, %x[width]\n" + "mov x10, %x[nc]\n" "add x9, %x[res_ptr], %x[res_stride], LSL #4\n" "2:" // Column loop "add x28, %x[a_ptr], #0x8\n" "mov z24.b, #0x0\n" "mov z15.b, #0x0\n" - "mov x27, %x[num_blocks]\n" + "mov x27, %x[nb]\n" "add x26, x28, x12\n" "mov z12.b, #0x0\n" "mov z0.b, #0x0\n" @@ -1844,13 +1956,13 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, const void * restrict vx "cbz x13, 9f\n" "5:" // Row tail: Row loop "add x25, %x[b_ptr], #0x10\n" - "mov x24, %x[width]\n" + "mov x24, %x[nc]\n" "add x23, %x[res_ptr], %x[res_stride], LSL #2\n" "6:" // Row tail: Column loop "mov z24.b, #0x0\n" "mov z15.b, #0x0\n" "add x28, %x[a_ptr], #0x8\n" - "mov x22, %x[num_blocks]\n" + "mov x22, %x[nb]\n" "mov z12.b, #0x0\n" "mov z0.b, #0x0\n" "7:" // Row tail: Block loop @@ -1946,7 +2058,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, const void * restrict vx "bgt 5b\n" "9:" // Row tail: Row loop skip : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) - : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); return; @@ -1969,5 +2081,37 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, const void * restrict vx GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " "performance"); +#else + float sumf[4][8]; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + int sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * 
blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } #endif } diff --git a/ggml-aarch64.h b/ggml-aarch64.h index d4d4dd01b9fb4..53f9d518d1ab2 100644 --- a/ggml-aarch64.h +++ b/ggml-aarch64.h @@ -22,14 +22,14 @@ size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT d size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); // GEMV -void ggml_gemv_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); -void ggml_gemv_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); -void ggml_gemv_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); +void ggml_gemv_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); // GEMM -void ggml_gemm_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); -void ggml_gemm_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); -void ggml_gemm_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); +void ggml_gemm_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); #ifdef __cplusplus } diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 7cfd74a7ede28..0c526c47e2cfc 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2397,7 +2397,6 @@ extern "C" { GGML_API int ggml_cpu_has_rpc (void); GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_matmul_int8(void); - GGML_API int ggml_cpu_has_sve (void); // // Internal types and functions exposed for tests and benchmarks @@ -2414,10 +2413,10 @@ extern "C" { typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, const void * GGML_RESTRICT y, size_t by, int nrc); typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); - typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, - int nr, int nc, int ith, int nth); - typedef void (*ggml_gemm_t) (int n, float * 
GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, - int nr, int nc, int ith, int nth); + typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, + const void * GGML_RESTRICT vy, int nr, int nc); + typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, + const void * GGML_RESTRICT vy, int nr, int nc); typedef struct { const char * type_name; @@ -2430,6 +2429,7 @@ extern "C" { ggml_vec_dot_t vec_dot; enum ggml_type vec_dot_type; int64_t nrows; // number of rows to process simultaneously; + int64_t ncols; // number of columns to process simultaneously; ggml_from_float_to_mat_t from_float_to_mat; ggml_gemv_t gemv; ggml_gemm_t gemm; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 7400a0ec0d0ec..1f6b5127d375e 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -916,6 +916,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = NULL, .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, + .ncols = 4, .gemv = ggml_gemv_q4_0_4x4_q8_0, .gemm = ggml_gemm_q4_0_4x4_q8_0, }, @@ -930,6 +931,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = NULL, .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, + .ncols = 4, .gemv = ggml_gemv_q4_0_4x8_q8_0, .gemm = ggml_gemm_q4_0_4x8_q8_0, }, @@ -944,6 +946,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = NULL, .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, + .ncols = 8, .gemv = ggml_gemv_q4_0_8x8_q8_0, .gemm = ggml_gemm_q4_0_8x8_q8_0, } @@ -12203,6 +12206,7 @@ static void ggml_compute_forward_mul_mat( enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; int64_t const vec_dot_num_rows = type_traits[type].nrows; + int64_t const matmul_num_cols = type_traits[type].ncols; ggml_from_float_to_mat_t const from_float_to_mat = type_traits[vec_dot_type].from_float_to_mat; ggml_gemv_t const gemv = type_traits[type].gemv; @@ -12372,32 +12376,49 @@ UseGgmlGemm2:; const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11; + int64_t src0_start = (ith * ne01) / nth; + int64_t src0_end = ((ith + 1) * ne01) / nth; + src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start; + src0_end = (src0_end % matmul_num_cols) ? 
src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end; + if ((ggml_n_dims(src0) == 2) && gemm && gemv) { - if (ne11 == 1) gemv(ne00, (float *)((char *) dst->data), (const char *) src0->data, (const char *) src1_wdata, 1, ne01, ith, nth); + if (src0_start >= src0_end) return; + if (ne11 == 1) + gemv(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01, + (const char *) src1_wdata, 1, src0_end - src0_start); else { for (int iter = 0; iter < ne11 / 16; iter++) { - gemm(ne00, (float *)((char *) dst->data + (iter * 16 * nb1)), (const char *) src0->data, (const char *) src1_wdata + (src1_col_stride * iter * 16), 16, ne01, ith, nth); + gemm(ne00, (float *)((char *) dst->data + (iter * 16 * nb1)) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter * 16), 16, + src0_end - src0_start); } int rows_processed = (ne11 / 16) * 16; for (int iter = 0; iter < (ne11 - rows_processed) / 8; iter++) { - gemm(ne00, (float *)((char *) dst->data + ((rows_processed + iter * 8) * nb1)), (const char *) src0->data, (const char *) src1_wdata + (src1_col_stride * (rows_processed + iter * 8)), 8, ne01, ith, nth); + gemm(ne00, (float *)((char *) dst->data + ((rows_processed + iter * 8) * nb1)) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, + (const char *) src1_wdata + (src1_col_stride * (rows_processed + iter * 8)), 8, src0_end - src0_start); } rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8; for (int iter = 0; iter < (ne11 - rows_processed) / 4; iter++) { - gemm(ne00, (float *)((char *) dst->data + ((rows_processed + iter * 4) * nb1)), (const char *) src0->data, (const char *) src1_wdata + (src1_col_stride * (rows_processed + iter * 4)), 4, ne01, ith, nth); + gemm(ne00, (float *)((char *) dst->data + ((rows_processed + iter * 4) * nb1)) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, + (const char *) src1_wdata + (src1_col_stride * (rows_processed + iter * 4)), 4, src0_end - src0_start); } rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4; for (int iter = rows_processed; iter < ne11; iter++) { - gemv(ne00, (float *)((char *) dst->data + (iter * nb1)), (const char *) src0->data, (const char *) src1_wdata + (src1_col_stride * iter), 1, ne01, ith, nth); + gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1, + src0_end - src0_start); } } - } - else if ((ggml_n_dims(src0) == 2) && gemv) { + } else if ((ggml_n_dims(src0) == 2) && gemv) { + if (src0_start >= src0_end) return; for (int iter = 0; iter < ne11; iter++) { - gemv(ne00, (float *)((char *) dst->data + (iter * nb1)), (const char *) src0->data, (const char *) src1_wdata + (src1_col_stride * iter), 1, ne01, ith, nth); + gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1, + src0_end - src0_start); } - } - else { + } else { // The first chunk comes from our thread_id, the rest will get auto-assigned. 
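The column split computed just above can be summarized by the following stand-alone sketch (a minimal illustration; the helper name is not part of the patch). Each thread takes a contiguous slice of the ne01 output columns and rounds both slice boundaries up to a multiple of the interleaved column count, so a block_q4_0x4 or block_q4_0x8 group is never split across threads; a thread whose aligned start reaches its aligned end simply has no work.

#include <stdint.h>

// Illustrative sketch of the per-thread [src0_start, src0_end) range used above.
// matmul_num_cols is 4 for Q4_0_4_4 / Q4_0_4_8 and 8 for Q4_0_8_8.
static void thread_col_range(int ith, int nth, int64_t ne01, int64_t matmul_num_cols,
                             int64_t * src0_start, int64_t * src0_end) {
    int64_t start = (ith * ne01) / nth;
    int64_t end   = ((ith + 1) * ne01) / nth;
    if (start % matmul_num_cols) start += matmul_num_cols - (start % matmul_num_cols);
    if (end   % matmul_num_cols) end   += matmul_num_cols - (end   % matmul_num_cols);
    *src0_start = start;
    *src0_end   = end;  // callers return early when start >= end
}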
int current_chunk = ith; diff --git a/src/llama.cpp b/src/llama.cpp index 0adb0afae118f..22cd387c5931e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -21693,7 +21693,6 @@ const char * llama_print_system_info(void) { #else s += "LLAMAFILE = 0 | "; #endif - s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | "; return s.c_str(); } From cce236bc4755d49ed6fc15ca549131faf16b3f8e Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Wed, 19 Jun 2024 06:15:28 +0000 Subject: [PATCH 16/28] Arm AArch64: add multithreaded quantization support for the new types: Q4_0_4_4, Q4_0_4_8, and Q4_0_8_8 --- src/llama.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 22cd387c5931e..3e72411e0d3d4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -18395,9 +18395,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s f32_data = (float *) f32_conv_buf.data(); } + int chunk_size_multiplier = 1; if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) { - if ((nelements / tensor->ne[0]) % 4 != 0) new_type = GGML_TYPE_Q4_0; - if (nthread > 1) nthread = 1; + if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0; + else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0; + if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8; + else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4; } LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type)); @@ -18412,7 +18415,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const int64_t nrows = tensor->ne[1]; static const int64_t min_chunk_size = 32 * 512; - const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row); + const int64_t chunk_size = (n_per_row >= min_chunk_size ? 
n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * + chunk_size_multiplier; const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; From 7a706067b5ef96b35d78f50052ec7329d676e0c5 Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Wed, 19 Jun 2024 16:15:13 +0000 Subject: [PATCH 17/28] Arm AArch64: minor code refactoring --- ggml-aarch64.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/ggml-aarch64.c b/ggml-aarch64.c index 8347960942fee..28a92759fac34 100644 --- a/ggml-aarch64.c +++ b/ggml-aarch64.c @@ -381,6 +381,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * ); #else float sumf[4]; + int sumi; const block_q8_0 * a_ptr = (const block_q8_0 *) vy; for (int x = 0; x < nc / ncols_interleaved; x++) { @@ -390,7 +391,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * for (int l = 0; l < nb; l++) { for (int k = 0; k < (qk / (2 * blocklen)); k++) { for (int j = 0; j < ncols_interleaved; j++) { - int sumi = 0; + sumi = 0; for (int i = 0; i < blocklen; ++i) { const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); @@ -498,6 +499,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * "performance"); #else float sumf[4]; + int sumi; const block_q8_0 * a_ptr = (const block_q8_0 *) vy; for (int x = 0; x < nc / ncols_interleaved; x++) { @@ -507,7 +509,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * for (int l = 0; l < nb; l++) { for (int k = 0; k < (qk / (2 * blocklen)); k++) { for (int j = 0; j < ncols_interleaved; j++) { - int sumi = 0; + sumi = 0; for (int i = 0; i < blocklen; ++i) { const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); @@ -627,6 +629,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * "performance"); #else float sumf[8]; + int sumi; const block_q8_0 * a_ptr = (const block_q8_0 *) vy; for (int x = 0; x < nc / ncols_interleaved; x++) { @@ -636,7 +639,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * for (int l = 0; l < nb; l++) { for (int k = 0; k < (qk / (2 * blocklen)); k++) { for (int j = 0; j < ncols_interleaved; j++) { - int sumi = 0; + sumi = 0; for (int i = 0; i < blocklen; ++i) { const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); @@ -1138,6 +1141,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * ); #else float sumf[4][4]; + int sumi; for (int y = 0; y < nr / 4; y++) { const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); @@ -1150,7 +1154,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * for (int k = 0; k < (qk / (2 * blocklen)); k++) { for (int m = 0; m < 4; m++) { for (int j = 0; j < ncols_interleaved; j++) { - int sumi = 0; + sumi = 0; for (int i = 0; i < blocklen; ++i) { const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j 
* blocklen + i] & 0xF0); @@ -1599,6 +1603,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * "performance"); #else float sumf[4][4]; + int sumi; for (int y = 0; y < nr / 4; y++) { const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); @@ -1611,7 +1616,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * for (int k = 0; k < (qk / (2 * blocklen)); k++) { for (int m = 0; m < 4; m++) { for (int j = 0; j < ncols_interleaved; j++) { - int sumi = 0; + sumi = 0; for (int i = 0; i < blocklen; ++i) { const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); @@ -2083,6 +2088,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * "performance"); #else float sumf[4][8]; + int sumi; for (int y = 0; y < nr / 4; y++) { const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); @@ -2095,7 +2101,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * for (int k = 0; k < (qk / (2 * blocklen)); k++) { for (int m = 0; m < 4; m++) { for (int j = 0; j < ncols_interleaved; j++) { - int sumi = 0; + sumi = 0; for (int i = 0; i < blocklen; ++i) { const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); From ffbfabb517466fbb2dce42a550466bcc9480a392 Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Sun, 23 Jun 2024 20:22:28 +0000 Subject: [PATCH 18/28] Arm AArch64: simplify logic for calling gemm and gemv functions in ggml_compute_forward_mul_mat --- ggml/src/ggml.c | 39 +++++++++------------------------------ 1 file changed, 9 insertions(+), 30 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 1f6b5127d375e..5fabcadf5bcba 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -12383,41 +12383,20 @@ UseGgmlGemm2:; if ((ggml_n_dims(src0) == 2) && gemm && gemv) { if (src0_start >= src0_end) return; - if (ne11 == 1) - gemv(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01, - (const char *) src1_wdata, 1, src0_end - src0_start); - else { - for (int iter = 0; iter < ne11 / 16; iter++) { - gemm(ne00, (float *)((char *) dst->data + (iter * 16 * nb1)) + src0_start, ne01, - (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter * 16), 16, - src0_end - src0_start); - } - int rows_processed = (ne11 / 16) * 16; - for (int iter = 0; iter < (ne11 - rows_processed) / 8; iter++) { - gemm(ne00, (float *)((char *) dst->data + ((rows_processed + iter * 8) * nb1)) + src0_start, ne01, - (const char *) src0->data + src0_start * nb01, - (const char *) src1_wdata + (src1_col_stride * (rows_processed + iter * 8)), 8, src0_end - src0_start); - } - rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8; - for (int iter = 0; iter < (ne11 - rows_processed) / 4; iter++) { - gemm(ne00, (float *)((char *) dst->data + ((rows_processed + iter * 4) * nb1)) + src0_start, ne01, - (const char *) src0->data + src0_start * nb01, - (const char *) src1_wdata + (src1_col_stride * (rows_processed + iter * 4)), 4, src0_end - src0_start); - } - rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4; - for (int iter = rows_processed; iter < ne11; iter++) { - gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + 
src0_start, ne01, - (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1, - src0_end - src0_start); - } - } + // If there are more than three rows in src1, use gemm; otherwise, use gemv. + if (ne11 > 3) + gemm(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01, + (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start); + for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) + gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1, + src0_end - src0_start); } else if ((ggml_n_dims(src0) == 2) && gemv) { if (src0_start >= src0_end) return; - for (int iter = 0; iter < ne11; iter++) { + for (int iter = 0; iter < ne11; iter++) gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01, (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1, src0_end - src0_start); - } } else { // The first chunk comes from our thread_id, the rest will get auto-assigned. int current_chunk = ith; From cbbfd69f423f033d8b854b0c9363a39520c92a28 Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Wed, 26 Jun 2024 07:32:53 +0000 Subject: [PATCH 19/28] Arm AArch64: minimize changes in ggml_compute_forward_mul_mat --- ggml/src/ggml.c | 83 ++++++++++++++++++++++--------------------------- 1 file changed, 37 insertions(+), 46 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 5fabcadf5bcba..babebc7bbb798 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -12276,24 +12276,20 @@ UseGgmlGemm1:; } } } - if (from_float_to_mat && gemm && (ne11 >= 4) && (ne12 == 1) && (ne13 == 1)) { - for (int64_t i11 = 0; i11 < ne11 / 4; ++i11) { - from_float_to_mat((float *)((char *) src1->data + i11 * 4 * nb11), (void *) wdata, ne10); - wdata += row_size * 4; + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + int64_t i11_processed = 0; + if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) { + for (int64_t i11 = 0; i11 < ne11 - ne11 % 4; i11 += 4) { + from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size * 4; + } + i11_processed = ne11 - ne11 % 4; } - for (int64_t i11 = (ne11 / 4) * 4; i11 < ne11; ++i11) { - from_float_to_vec_dot((float *)((char *) src1->data + i11 * nb11), (void *) wdata, ne10); + for (int64_t i11 = i11_processed; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); wdata += row_size; } - } - else { - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = 0; i11 < ne11; ++i11) { - from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); - wdata += row_size; - } - } } } @@ -12374,51 +12370,46 @@ UseGgmlGemm2:; //if (ith == 0) // printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1); - const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; - const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? 
ggml_row_size(vec_dot_type, ne10) : nb11; - int64_t src0_start = (ith * ne01) / nth; - int64_t src0_end = ((ith + 1) * ne01) / nth; - src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start; - src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end; - - if ((ggml_n_dims(src0) == 2) && gemm && gemv) { + if ((ggml_n_dims(src0) == 2) && gemv) { + const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11; + int64_t src0_start = (ith * ne01) / nth; + int64_t src0_end = ((ith + 1) * ne01) / nth; + src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start; + src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end; if (src0_start >= src0_end) return; + // If there are more than three rows in src1, use gemm; otherwise, use gemv. - if (ne11 > 3) + if (gemm && (ne11 > 3)) gemm(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start); - for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) - gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01, - (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1, - src0_end - src0_start); - } else if ((ggml_n_dims(src0) == 2) && gemv) { - if (src0_start >= src0_end) return; - for (int iter = 0; iter < ne11; iter++) + for (int iter = gemm ? ne11 - ne11 % 4 : 0; iter < ne11; iter++) gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01, (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1, src0_end - src0_start); - } else { - // The first chunk comes from our thread_id, the rest will get auto-assigned. - int current_chunk = ith; + return; + } - while (current_chunk < nchunk0 * nchunk1) { - const int64_t ith0 = current_chunk % nchunk0; - const int64_t ith1 = current_chunk / nchunk0; + // The first chunk comes from our thread_id, the rest will get auto-assigned. 
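The rule applied by the dispatch above can be captured in one small helper (a hedged sketch; the function name is illustrative and not taken from the patch): complete groups of four src1 rows go through the GEMM kernel in a single call, and the remaining zero to three rows fall back to per-row GEMV calls.

#include <stdint.h>

// Illustrative only: number of src1 rows covered by the GEMM kernel.
// For ne11 = 11 this returns 8, so rows 0..7 take the GEMM path and
// rows 8, 9 and 10 are handled by three separate GEMV calls.
static int64_t src1_rows_taken_by_gemm(int64_t ne11) {
    return ne11 > 3 ? ne11 - ne11 % 4 : 0;
}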
+ int current_chunk = ith; - const int64_t ir0_start = dr0 * ith0; - const int64_t ir0_end = MIN(ir0_start + dr0, nr0); + while (current_chunk < nchunk0 * nchunk1) { + const int64_t ith0 = current_chunk % nchunk0; + const int64_t ith1 = current_chunk / nchunk0; - const int64_t ir1_start = dr1 * ith1; - const int64_t ir1_end = MIN(ir1_start + dr1, nr1); + const int64_t ir0_start = dr0 * ith0; + const int64_t ir0_end = MIN(ir0_start + dr0, nr0); - ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); + const int64_t ir1_start = dr1 * ith1; + const int64_t ir1_end = MIN(ir1_start + dr1, nr1); - if (nth >= nchunk0 * nchunk1) { - break; - } + ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); - current_chunk = atomic_fetch_add(¶ms->shared->current_chunk, 1); + if (nth >= nchunk0 * nchunk1) { + break; } + + current_chunk = atomic_fetch_add(¶ms->shared->current_chunk, 1); } } From 356464454b50e2ee6aa6f4d9514ae68c6c5bc4c5 Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Wed, 3 Jul 2024 12:38:11 +0000 Subject: [PATCH 20/28] Arm AArch64: minor code refactoring, and add reference scalar code to quantize routines for new quant types --- ggml-aarch64.c | 110 +++++++++++++++++++++++++++++++++++--------- ggml-aarch64.h | 2 + ggml/include/ggml.h | 4 +- ggml/src/ggml.c | 9 ++-- 4 files changed, 98 insertions(+), 27 deletions(-) diff --git a/ggml-aarch64.c b/ggml-aarch64.c index 28a92759fac34..f5b6ec896cfb6 100644 --- a/ggml-aarch64.c +++ b/ggml-aarch64.c @@ -21,19 +21,19 @@ // Functions to create the interleaved data layout formats -// interleave 4 block_q4_0s in blocks of block_len +// interleave 4 block_q4_0s in blocks of interleave_blcksize // returns an interleaved block_q4_0x4 // in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks -// first, then interleave quants from 4 block_q4_0s in blocks of block_len +// first, then interleave quants from 4 block_q4_0s in blocks of interleave_blcksize // -// - in : an array of block_q4_0 pointers -// - block_len : the block_q4_0 quants bytes are interleaved in blocks of -// block_len bytes -// - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes -// from bias offset form to pure sign form (this saves subtract -// operations durin unpacking) +// - in : an array of block_q4_0 pointers +// - interleave_blcksize : the block_q4_0 quants bytes are interleaved in blocks of +// interleave_blcksize bytes +// - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes +// from bias offset form to pure sign form (this saves subtract +// operations durin unpacking) // -static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int block_len, unsigned int xor_mask) { +static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int interleave_blcksize, unsigned int xor_mask) { block_q4_0x4 out; for (int i = 0; i < 4; i++) { @@ -41,9 +41,9 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int block_len, u } for (int i = 0; i < QK4_0 * 2; i++) { - int src_offset = (i / (4 * block_len)) * block_len; - int src_id = (i % (4 * block_len)) / block_len; - src_offset += (i % block_len); + int src_offset = (i / (4 * interleave_blcksize)) * interleave_blcksize; + int src_id = (i % (4 * interleave_blcksize)) / interleave_blcksize; + src_offset += (i % interleave_blcksize); out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask; } @@ -51,11 +51,11 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 
* in, unsigned int block_len, u return out; } -// interleave 8 block_q4_0s in blocks of block_len +// interleave 8 block_q4_0s in blocks of interleave_blcksize // returns an interleaved block_q4_0x8 // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks -// first, then interleave quants from 8 block_q4_0s in blocks of block_len -static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int block_len, unsigned int xor_mask) { +// first, then interleave quants from 8 block_q4_0s in blocks of interleave_blcksize +static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int interleave_blcksize, unsigned int xor_mask) { block_q4_0x8 out; for (int i = 0; i < 8; i++) { @@ -63,9 +63,9 @@ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int block_len, u } for (int i = 0; i < QK4_0 * 4; i++) { - int src_offset = (i / (8 * block_len)) * block_len; - int src_id = (i % (8 * block_len)) / block_len; - src_offset += (i % block_len); + int src_offset = (i / (8 * interleave_blcksize)) * interleave_blcksize; + int src_id = (i % (8 * interleave_blcksize)) / interleave_blcksize; + src_offset += (i % interleave_blcksize); out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask; } @@ -135,7 +135,35 @@ void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) } } #else - assert(false); + // scalar + const int interleave_blcksize = 4; + float srcv[4][QK8_0]; + float id[4]; + + for (int i = 0; i < nb; i++) { + for (int row_iter = 0; row_iter < 4; row_iter++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j]; + amax = MAX(amax, fabsf(srcv[row_iter][j])); + } + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ? 1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + } + + for (int j = 0; j < QK8_0 * 4; j++) { + int src_offset = (j / (4 * interleave_blcksize)) * interleave_blcksize; + int src_id = (j % (4 * interleave_blcksize)) / interleave_blcksize; + src_offset += (j % interleave_blcksize); + + float x0 = srcv[src_id][src_offset] * id[src_id]; + y[i].qs[j] = roundf(x0);; + } + } #endif } @@ -225,11 +253,47 @@ void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k) } } #else - assert(false); + // scalar + const int interleave_blcksize = 8; + float srcv[4][QK8_0]; + float id[4]; + + for (int i = 0; i < nb; i++) { + for (int row_iter = 0; row_iter < 4; row_iter++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j]; + amax = MAX(amax, fabsf(srcv[row_iter][j])); + } + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ? 
1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + } + + for (int j = 0; j < QK8_0 * 4; j++) { + int src_offset = (j / (4 * interleave_blcksize)) * interleave_blcksize; + int src_id = (j % (4 * interleave_blcksize)) / interleave_blcksize; + src_offset += (j % interleave_blcksize); + + float x0 = srcv[src_id][src_offset] * id[src_id]; + y[i].qs[j] = roundf(x0);; + } + } #endif } -static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blocklen_per_row) { +void quantize_mat_q8_0(const float * restrict x, void * restrict vy, int64_t nrow, int64_t n_per_row, int64_t interleave_blcksize) { + assert(nrow == 4); + UNUSED(nrow); + if (interleave_blcksize == 4) quantize_q8_0_4x4(x, vy, n_per_row); + else if (interleave_blcksize == 8) quantize_q8_0_4x8(x, vy, n_per_row); + else assert(false); +} + +static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int interleave_blcksize) { assert(n_per_row % QK4_0 == 0); const int nb = n_per_row / QK4_0; @@ -251,11 +315,11 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds } if (nrows_interleaved == 8) { - *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blocklen_per_row, 0x88); + *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, interleave_blcksize, 0x88); out_ptr = (block_q4_0x8 *) out_ptr + 1; } else if (nrows_interleaved == 4) { - *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blocklen_per_row, 0x88); + *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, interleave_blcksize, 0x88); out_ptr = (block_q4_0x4 *) out_ptr + 1; } } diff --git a/ggml-aarch64.h b/ggml-aarch64.h index 53f9d518d1ab2..65ead1efed572 100644 --- a/ggml-aarch64.h +++ b/ggml-aarch64.h @@ -16,6 +16,8 @@ extern "C" { void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t interleave_blcksize); + // Quantization utilizing an importance matrix (a.k.a. 
"Activation aWare Quantization") size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 0c526c47e2cfc..0f663971d49c9 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2412,7 +2412,8 @@ extern "C" { typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, const void * GGML_RESTRICT y, size_t by, int nrc); - typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); + typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, + int64_t k, int64_t bx); typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, @@ -2430,6 +2431,7 @@ extern "C" { enum ggml_type vec_dot_type; int64_t nrows; // number of rows to process simultaneously; int64_t ncols; // number of columns to process simultaneously; + int64_t interleave_blcksize; // interleave elements in blocks of interleave_blcksize; ggml_from_float_to_mat_t from_float_to_mat; ggml_gemv_t gemv; ggml_gemm_t gemm; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index babebc7bbb798..6b5bdad163730 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -702,11 +702,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, #if defined (__ARM_FEATURE_MATMUL_INT8) .nrows = 2, - .from_float_to_mat = quantize_q8_0_4x8, #else .nrows = 1, - .from_float_to_mat = quantize_q8_0_4x4, #endif + .from_float_to_mat = quantize_mat_q8_0, }, [GGML_TYPE_Q8_1] = { .type_name = "q8_1", @@ -917,6 +916,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, .ncols = 4, + .interleave_blcksize = 4, .gemv = ggml_gemv_q4_0_4x4_q8_0, .gemm = ggml_gemm_q4_0_4x4_q8_0, }, @@ -932,6 +932,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, .ncols = 4, + .interleave_blcksize = 8, .gemv = ggml_gemv_q4_0_4x8_q8_0, .gemm = ggml_gemm_q4_0_4x8_q8_0, }, @@ -947,6 +948,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, .ncols = 8, + .interleave_blcksize = 8, .gemv = ggml_gemv_q4_0_8x8_q8_0, .gemm = ggml_gemm_q4_0_8x8_q8_0, } @@ -12207,6 +12209,7 @@ static void ggml_compute_forward_mul_mat( ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; int64_t const vec_dot_num_rows = type_traits[type].nrows; int64_t const matmul_num_cols = type_traits[type].ncols; + int64_t const interleave_blcksize = type_traits[type].interleave_blcksize; ggml_from_float_to_mat_t const from_float_to_mat = type_traits[vec_dot_type].from_float_to_mat; ggml_gemv_t const gemv = type_traits[type].gemv; @@ -12281,7 +12284,7 @@ UseGgmlGemm1:; int64_t i11_processed = 0; if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) { for (int64_t i11 = 0; i11 < ne11 - ne11 % 4; i11 += 4) { - from_float_to_mat((float *)((char *) 
src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, 4, ne10, interleave_blcksize); wdata += row_size * 4; } i11_processed = ne11 - ne11 % 4; From 110d143ecef69819c47507711104672e05d9c244 Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Wed, 3 Jul 2024 12:41:13 +0000 Subject: [PATCH 21/28] Arm AArch64: minor code refactoring --- ggml/include/ggml.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 0f663971d49c9..42dd224e69142 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2413,7 +2413,7 @@ extern "C" { typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, const void * GGML_RESTRICT y, size_t by, int nrc); typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, - int64_t k, int64_t bx); + int64_t k, int64_t bx); typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, From 4ff0b223c3d85b6fb0319302dcd71d2fdcdd94e1 Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Sat, 6 Jul 2024 19:15:55 +0000 Subject: [PATCH 22/28] Arm AArch64: minor code refactoring --- ggml/src/ggml.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 6b5bdad163730..bb515ee058ccf 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -12374,12 +12374,12 @@ UseGgmlGemm2:; // printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1); if ((ggml_n_dims(src0) == 2) && gemv) { - const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11; int64_t src0_start = (ith * ne01) / nth; int64_t src0_end = ((ith + 1) * ne01) / nth; src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start; - src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end; + src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end; if (src0_start >= src0_end) return; // If there are more than three rows in src1, use gemm; otherwise, use gemv. 
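For orientation, the activation-side helper introduced above, quantize_mat_q8_0, can be exercised on its own roughly as below. This is a hedged sketch rather than code from the patch: the row length of 256, the buffer sizing through ggml_row_size, and the direct inclusion of the internal ggml-aarch64.h header are assumptions made for the example.

#include <stdlib.h>
#include "ggml.h"
#include "ggml-aarch64.h"   // internal header declaring quantize_mat_q8_0

// Quantize 4 rows of f32 activations (256 values each) into the interleaved
// q8_0x4 layout consumed by the new kernels. interleave_blcksize must match
// the weight format: 4 for Q4_0_4_4, 8 for Q4_0_4_8 and Q4_0_8_8.
static void quantize_four_activation_rows(const float * src /* 4 * 256 floats */) {
    const int64_t n_per_row = 256;  // must be a multiple of QK8_0 (32)
    void * wdata = malloc(4 * ggml_row_size(GGML_TYPE_Q8_0, n_per_row));
    quantize_mat_q8_0(src, wdata, /*nrow=*/4, n_per_row, /*interleave_blcksize=*/8);
    // wdata can now be passed as the vy argument of ggml_gemm_q4_0_4x8_q8_0()
    free(wdata);
}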
@@ -12438,6 +12438,8 @@ static void ggml_compute_forward_mul_mat_id( ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + int64_t const matmul_num_cols = type_traits[type].ncols; + ggml_gemv_t const gemv = type_traits[type].gemv; // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == ggml_type_size(type)); @@ -12523,6 +12525,34 @@ static void ggml_compute_forward_mul_mat_id( const int64_t nr0 = ne01; // src0 rows const int64_t nr1 = cne1; // src1 rows + if (((ggml_n_dims(src0) - 1) == 2) && gemv) { + int64_t src0_cur_start = (ith * ne01) / nth; + int64_t src0_cur_end = ((ith + 1) * ne01) / nth; + src0_cur_start = (src0_cur_start % matmul_num_cols) ? src0_cur_start + matmul_num_cols - (src0_cur_start % matmul_num_cols): src0_cur_start; + src0_cur_end = (src0_cur_end % matmul_num_cols) ? src0_cur_end + matmul_num_cols - (src0_cur_end % matmul_num_cols): src0_cur_end; + if (src0_cur_start >= src0_cur_end) return; + + for (int ir1 = 0; ir1 < nr1; ir1++) { + struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1); + const int id = row_mapping.i1; // selected expert index + + const int64_t i11 = id % ne11; + const int64_t i12 = row_mapping.i2; // row index in src1 + + const int64_t i1 = id; // selected expert index + const int64_t i2 = i12; // row + + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12 * ne11) * row_size + : (i11 * nb11 + i12 * nb12)); + + gemv(ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01, + (const char *) src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start); + } + continue; + } + // distribute the thread work across the inner or outer loop based on which one is larger const int64_t nth0 = nr0 > nr1 ? 
nth : 1; // parallelize by src0 rows From 42724b4d02ddb60bdd8a93bd7d174402cfcf3ebb Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Mon, 8 Jul 2024 04:19:04 +0000 Subject: [PATCH 23/28] Arm AArch64: minor code refactoring --- ggml-aarch64.c | 6 ++---- ggml/include/ggml.h | 8 ++++---- ggml/src/ggml.c | 6 ++++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ggml-aarch64.c b/ggml-aarch64.c index f5b6ec896cfb6..d7f7f5ed580fa 100644 --- a/ggml-aarch64.c +++ b/ggml-aarch64.c @@ -5,9 +5,6 @@ #include "ggml-quants.h" #include "ggml-impl.h" -#define GGML_COMMON_IMPL_C -#include "ggml-common.h" - #include #include #include @@ -304,7 +301,8 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds else if (nrows_interleaved == 4) { out_ptr = (block_q4_0x4 *) dst; } - block_q4_0 dst_tmp[nrows_interleaved]; + assert(nrows_interleaved <= 8); + block_q4_0 dst_tmp[8]; for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) { diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 42dd224e69142..1e367753738d9 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2414,10 +2414,10 @@ extern "C" { const void * GGML_RESTRICT y, size_t by, int nrc); typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bx); - typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, - const void * GGML_RESTRICT vy, int nr, int nc); - typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, - const void * GGML_RESTRICT vy, int nr, int nc); + typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, + const void * GGML_RESTRICT y, int nr, int nc); + typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, + const void * GGML_RESTRICT y, int nr, int nc); typedef struct { const char * type_name; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index bb515ee058ccf..725e3fc7a7741 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -12383,13 +12383,15 @@ UseGgmlGemm2:; if (src0_start >= src0_end) return; // If there are more than three rows in src1, use gemm; otherwise, use gemv. - if (gemm && (ne11 > 3)) + if (gemm && (ne11 > 3)) { gemm(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start); - for (int iter = gemm ? ne11 - ne11 % 4 : 0; iter < ne11; iter++) + } + for (int iter = gemm ? 
ne11 - ne11 % 4 : 0; iter < ne11; iter++) { gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01, (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1, src0_end - src0_start); + } return; } From e5f4713d810c13af60d8fd09400df92ea6a30bdd Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Mon, 8 Jul 2024 17:09:24 +0000 Subject: [PATCH 24/28] rebase on the latest master commit 3fd62a6 and adapt to the new directory structure --- Makefile | 10 ++++- ggml/src/CMakeLists.txt | 1 + ggml-aarch64.c => ggml/src/ggml-aarch64.c | 0 ggml-aarch64.h => ggml/src/ggml-aarch64.h | 0 ggml/src/ggml.c | 45 ++++++----------------- 5 files changed, 21 insertions(+), 35 deletions(-) rename ggml-aarch64.c => ggml/src/ggml-aarch64.c (100%) rename ggml-aarch64.h => ggml/src/ggml-aarch64.h (100%) diff --git a/Makefile b/Makefile index bb6e2f968cf0f..20313782e0fe8 100644 --- a/Makefile +++ b/Makefile @@ -826,7 +826,8 @@ OBJ_GGML += \ ggml/src/ggml.o \ ggml/src/ggml-alloc.o \ ggml/src/ggml-backend.o \ - ggml/src/ggml-quants.o + ggml/src/ggml-quants.o \ + ggml/src/ggml-aarch64.o OBJ_LLAMA = \ src/llama.o \ @@ -959,6 +960,13 @@ ggml/src/ggml-quants.o: \ ggml/src/ggml-common.h $(CC) $(CFLAGS) -c $< -o $@ +ggml/src/ggml-aarch64.o: \ + ggml/src/ggml-aarch64.c \ + ggml/include/ggml.h \ + ggml/src/ggml-aarch64.h \ + ggml/src/ggml-common.h + $(CC) $(CFLAGS) -c $< -o $@ + ggml/src/ggml-blas.o: \ ggml/src/ggml-blas.cpp \ ggml/include/ggml-blas.h diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index c6694df678fff..aae5b8e9fe35c 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -1153,6 +1153,7 @@ add_library(ggml ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} + ggml-aarch64.c ggml-aarch64.h ) if (EMSCRIPTEN) diff --git a/ggml-aarch64.c b/ggml/src/ggml-aarch64.c similarity index 100% rename from ggml-aarch64.c rename to ggml/src/ggml-aarch64.c diff --git a/ggml-aarch64.h b/ggml/src/ggml-aarch64.h similarity index 100% rename from ggml-aarch64.h rename to ggml/src/ggml-aarch64.h diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 725e3fc7a7741..7505f0764083b 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -474,18 +474,6 @@ int64_t ggml_cycles_per_ms(void) { return CLOCKS_PER_SEC/1000; } -#ifdef GGML_PERF -#define ggml_perf_time_ms() ggml_time_ms() -#define ggml_perf_time_us() ggml_time_us() -#define ggml_perf_cycles() ggml_cycles() -#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms() -#else -#define ggml_perf_time_ms() 0 -#define ggml_perf_time_us() 0 -#define ggml_perf_cycles() 0 -#define ggml_perf_cycles_per_ms() 0 -#endif - // // cross-platform UTF-8 file paths // @@ -12272,29 +12260,23 @@ UseGgmlGemm1:; for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = ith; i11 < ne11; i11 += nth) { + int64_t i11_processed = 0; + if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) { + for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) { + from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), + (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), + 4, ne10, interleave_blcksize); + } + i11_processed = ne11 - ne11 % 4; + } + for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) { from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), ne10); } } } - for 
(int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - int64_t i11_processed = 0; - if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) { - for (int64_t i11 = 0; i11 < ne11 - ne11 % 4; i11 += 4) { - from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, 4, ne10, interleave_blcksize); - wdata += row_size * 4; - } - i11_processed = ne11 - ne11 % 4; - } - for (int64_t i11 = i11_processed; i11 < ne11; ++i11) { - from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); - wdata += row_size; - } - } - } + } if (ith == 0) { // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. @@ -12368,11 +12350,6 @@ UseGgmlGemm2:; const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; - // The first chunk comes from our thread_id, the rest will get auto-assigned. - int current_chunk = ith; - //if (ith == 0) - // printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1); - if ((ggml_n_dims(src0) == 2) && gemv) { const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11; From c2595d0b80b11f34864a54075038dd1a2cd9072d Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Tue, 9 Jul 2024 12:24:56 +0000 Subject: [PATCH 25/28] Arm AArch64: remove a redundant comment --- ggml/src/ggml.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 7505f0764083b..cd8a9f77060fc 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -21903,7 +21903,6 @@ int ggml_cpu_has_neon(void) { int ggml_cpu_has_sve(void) { #if defined(__ARM_FEATURE_SVE) - // TODO: Currently, SVE 256 bit is only supported. 
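The thread-to-row assignment implied by the reworked src1 conversion loop above can be written out as the following sketch (the function is illustrative, not part of the change): complete groups of four rows are dealt out round-robin by group index for the interleaved quantization path, and the trailing ne11 % 4 rows are then dealt out round-robin, one row per thread.

#include <stdint.h>

// Illustrative only: which thread quantizes src1 row i11 when nth threads run
// the conversion loop above.
static int thread_for_src1_row(int64_t i11, int64_t ne11, int nth) {
    const int64_t i11_processed = ne11 - ne11 % 4;   // rows covered by the 4-row path
    if (i11 < i11_processed) {
        return (int) ((i11 / 4) % nth);              // 4-row groups, round-robin by group
    }
    return (int) ((i11 - i11_processed) % nth);      // leftover rows, one per thread
}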
return 1; #else return 0; From a7abb78565487f6352a6b9853979358a85aa356e Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Tue, 9 Jul 2024 12:56:15 +0000 Subject: [PATCH 26/28] Arm AArch64: add pragma in ggml-aarch64.c to turn -Woverlength-strings warning off --- ggml/src/ggml-aarch64.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c index d7f7f5ed580fa..1f28b0f5744c7 100644 --- a/ggml/src/ggml-aarch64.c +++ b/ggml/src/ggml-aarch64.c @@ -14,6 +14,8 @@ #include "ggml-aarch64.h" +#pragma GCC diagnostic ignored "-Woverlength-strings" + #define UNUSED GGML_UNUSED // Functions to create the interleaved data layout formats From 0e84ef1aa7eb40178b71252893c46006241dc87e Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Tue, 9 Jul 2024 18:24:40 +0000 Subject: [PATCH 27/28] Arm AArch64: use __aarch64__ check to guard 64-bit neon kernels --- ggml/src/ggml-aarch64.c | 12 ++++++------ ggml/src/ggml.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c index 1f28b0f5744c7..008718634fe0d 100644 --- a/ggml/src/ggml-aarch64.c +++ b/ggml/src/ggml-aarch64.c @@ -386,7 +386,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) && "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance"); -#elif defined(__ARM_NEON) +#elif defined(__ARM_NEON) && defined(__aarch64__) const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -557,7 +557,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * : [a_ptr] "r" (a_ptr), [nb] "r" (nb) : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23" ); -#elif defined(__ARM_NEON) +#elif defined(__ARM_NEON) && defined(__aarch64__) GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " "performance"); @@ -687,7 +687,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) GGML_ASSERT(ggml_cpu_has_sve() && "__ARM_FEATURE_SVE not defined, use the Q4_0_4_8 quantization format for optimal performance"); -#elif defined(__ARM_NEON) +#elif defined(__ARM_NEON) && defined(__aarch64__) GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " "performance"); @@ -747,7 +747,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) && "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance"); -#elif defined(__ARM_NEON) +#elif defined(__ARM_NEON) && defined(__aarch64__) const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -1661,7 +1661,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) : "cc", "memory", "v0", "v1", "v2", "v3", 
"v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); -#elif defined(__ARM_NEON) +#elif defined(__ARM_NEON) && defined(__aarch64__) GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " "performance"); @@ -2146,7 +2146,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) GGML_ASSERT(ggml_cpu_has_sve() && "__ARM_FEATURE_SVE not defined, use the Q4_0_4_8 quantization format for optimal performance"); -#elif defined(__ARM_NEON) +#elif defined(__ARM_NEON) && defined(__aarch64__) GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " "performance"); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index cd8a9f77060fc..c0aced3d2d069 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -38,7 +38,7 @@ #include #endif -#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) +#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) #undef GGML_USE_LLAMAFILE #endif From c653eb1f1bbdd5e9cde7ce4e0d135ac50e64e26b Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Tue, 9 Jul 2024 19:19:24 +0000 Subject: [PATCH 28/28] Arm AArch64: update docs/build.md README to include compile time flags for buiilding the Q4_0_4_4 quant type --- docs/build.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/build.md b/docs/build.md index bf41bfdf9c2f8..d70f72f4c7b82 100644 --- a/docs/build.md +++ b/docs/build.md @@ -28,6 +28,7 @@ In order to build llama.cpp you have four different options. ``` - Notes: + - For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`. - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel. - For faster repeated compilation, install [ccache](https://ccache.dev/). - For debug builds, run `make LLAMA_DEBUG=1` @@ -41,6 +42,7 @@ In order to build llama.cpp you have four different options. **Notes**: + - For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`. - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel. - For faster repeated compilation, install [ccache](https://ccache.dev/). - For debug builds, there are two cases: