ggml : add AArch64 optimized GEMV and GEMM Q4 kernels (#5780)

* Arm AArch64: optimized GEMV and GEMM kernels for q4_0_q8_0, and q8_0_q8_0 quantization * Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions * Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions * Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions * Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions * Arm AArch64: add copyright claim only to ggml-aarch64.cpp and ggml-aarch64.h files * Arm AArch64: minor code refactoring for rebase * Arm AArch64: minor code refactoring for resolving a build issue with cmake * Arm AArch64: minor code refactoring to split the Q4_0_AARC64 type into three separate types: Q4_0_4_4, Q4_0_4_8, and Q4_0_8_8 * Arm AArch64: minor code change for resolving a build issue with server-windows * retrigger checks * Arm AArch64: minor code changes for rebase * Arm AArch64: minor changes to skip the pr#7433 vec_dot code for arm cpus with SVE VL not equal to 256 bits * Arm AArch64: remove stale LLAMA_QKK_64 from CMakeLists.txt and delete build.zig * Arm AArch64: add reference scalar gemm and gemv, and avoid dynamic memory allocations during quantization for Q4_0_4_4, Q4_0_4_8, and Q4_0_8_8 * Arm AArch64: add multithreaded quantization support for the new types: Q4_0_4_4, Q4_0_4_8, and Q4_0_8_8 * Arm AArch64: minor code refactoring * Arm AArch64: simplify logic for calling gemm and gemv functions in ggml_compute_forward_mul_mat * Arm AArch64: minimize changes in ggml_compute_forward_mul_mat * Arm AArch64: minor code refactoring, and add reference scalar code to quantize routines for new quant types * Arm AArch64: minor code refactoring * Arm AArch64: minor code refactoring * Arm AArch64: minor code refactoring * rebase on the latest master commit 3fd62a6 and adapt to the new directory structure * Arm AArch64: remove a redundant comment * Arm AArch64: add pragma in ggml-aarch64.c to turn -Woverlength-strings warning off * Arm AArch64: use __aarch64__ check to guard 64-bit neon kernels * Arm AArch64: update docs/build.md README to include compile time flags for buiilding the Q4_0_4_4 quant type
ggerganov · Jul 10, 2024 · 0f1a39f · 0f1a39f
1 parent 83321c6
commit 0f1a39f
Show file tree

Hide file tree

Showing 14 changed files with 2,534 additions and 53 deletions.
diff --git a/Makefile b/Makefile
@@ -835,7 +835,8 @@ OBJ_GGML += \
 	ggml/src/ggml.o \
 	ggml/src/ggml-alloc.o \
 	ggml/src/ggml-backend.o \
-	ggml/src/ggml-quants.o
+	ggml/src/ggml-quants.o \
+	ggml/src/ggml-aarch64.o
 
 OBJ_LLAMA = \
 	src/llama.o \
@@ -969,6 +970,13 @@ ggml/src/ggml-quants.o: \
 	ggml/src/ggml-common.h
 	$(CC) $(CFLAGS)    -c $< -o $@
 
+ggml/src/ggml-aarch64.o: \
+	ggml/src/ggml-aarch64.c \
+	ggml/include/ggml.h \
+	ggml/src/ggml-aarch64.h \
+	ggml/src/ggml-common.h
+	$(CC) $(CFLAGS)    -c $< -o $@
+
 ggml/src/ggml-blas.o: \
 	ggml/src/ggml-blas.cpp \
 	ggml/include/ggml-blas.h

diff --git a/Package.swift b/Package.swift
@@ -10,6 +10,7 @@ var sources = [
     "ggml/src/ggml-alloc.c",
     "ggml/src/ggml-backend.c",
     "ggml/src/ggml-quants.c",
+    "ggml/src/ggml-aarch64.c",
 ]
 
 var resources: [Resource] = []

diff --git a/docs/build.md b/docs/build.md
@@ -28,6 +28,7 @@ In order to build llama.cpp you have four different options.
         ```
 
   - Notes:
+    - For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
     - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
     - For faster repeated compilation, install [ccache](https://ccache.dev/).
     - For debug builds, run `make LLAMA_DEBUG=1`
@@ -41,6 +42,7 @@ In order to build llama.cpp you have four different options.
 
   **Notes**:
 
+    - For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
     - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
     - For faster repeated compilation, install [ccache](https://ccache.dev/).
     - For debug builds, there are two cases:

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
@@ -46,6 +46,9 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B",  },
     { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 6.14G, +0.0217 ppl @ Llama-3-8B",  },
     { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 7.96G, +0.0026 ppl @ Llama-3-8B",  },
+    { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
     { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "14.00G, +0.0020 ppl @ Mistral-7B",  },
     { "BF16",   LLAMA_FTYPE_MOSTLY_BF16,   "14.00G, -0.0050 ppl @ Mistral-7B",  },
     { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B",          },

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
@@ -383,6 +383,9 @@ extern "C" {
         GGML_TYPE_F64     = 28,
         GGML_TYPE_IQ1_M   = 29,
         GGML_TYPE_BF16    = 30,
+        GGML_TYPE_Q4_0_4_4 = 31,
+        GGML_TYPE_Q4_0_4_8 = 32,
+        GGML_TYPE_Q4_0_8_8 = 33,
         GGML_TYPE_COUNT,
     };
 
@@ -424,6 +427,9 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ4_XS  = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M   = 23, // except 1d tensors
         GGML_FTYPE_MOSTLY_BF16    = 24, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
     };
 
     // available tensor operations:
@@ -2406,6 +2412,12 @@ extern "C" {
     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
     typedef void (*ggml_vec_dot_t)   (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
                                       const void * GGML_RESTRICT y, size_t by, int nrc);
+    typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t nr,
+                                             int64_t k, int64_t bx);
+    typedef void (*ggml_gemv_t)      (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
+                                      const void * GGML_RESTRICT y, int nr, int nc);
+    typedef void (*ggml_gemm_t)      (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
+                                      const void * GGML_RESTRICT y, int nr, int nc);
 
     typedef struct {
         const char      * type_name;
@@ -2418,6 +2430,11 @@ extern "C" {
         ggml_vec_dot_t    vec_dot;
         enum ggml_type    vec_dot_type;
         int64_t           nrows; // number of rows to process simultaneously;
+        int64_t           ncols; // number of columns to process simultaneously;
+        int64_t           interleave_blcksize; // interleave elements in blocks of interleave_blcksize;
+        ggml_from_float_to_mat_t from_float_to_mat;
+        ggml_gemv_t       gemv;
+        ggml_gemm_t       gemm;
     } ggml_type_traits_t;
 
     GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
@@ -1153,6 +1153,7 @@ add_library(ggml
             ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
             ${GGML_SOURCES_BLAS}      ${GGML_HEADERS_BLAS}
             ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
+            ggml-aarch64.c            ggml-aarch64.h
             )
 
 if (EMSCRIPTEN)