Arm AArch64: optimized GEMV and GEMM kernels for q4_0_q8_0, and q8_0_q8_0 quantization #5780

Merged
merged 28 commits into from
Jul 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
002e36e
Arm AArch64: optimized GEMV and GEMM kernels for q4_0_q8_0, and q8_0_…
Dibakar Feb 28, 2024
340ef07
Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 qu…
Dibakar Apr 22, 2024
81215ff
Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 qu…
Dibakar Apr 23, 2024
6c8d826
Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 qu…
Dibakar Apr 25, 2024
43e1297
Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 qu…
Dibakar Apr 29, 2024
441ab64
Arm AArch64: add copyright claim only to ggml-aarch64.cpp and ggml-aa…
Dibakar Apr 29, 2024
8ee6779
Arm AArch64: minor code refactoring for rebase
Dibakar May 1, 2024
a657246
Arm AArch64: minor code refactoring for resolving a build issue with …
Dibakar May 16, 2024
746b57f
Arm AArch64: minor code refactoring to split the Q4_0_AARC64 type int…
Dibakar May 21, 2024
5d10c21
Arm AArch64: minor code change for resolving a build issue with serve…
Dibakar May 31, 2024
7ac03e5
retrigger checks
Dibakar May 31, 2024
e2c1c47
Arm AArch64: minor code changes for rebase
Dibakar Jun 5, 2024
79b6cdf
Arm AArch64: minor changes to skip the pr#7433 vec_dot code for arm c…
Dibakar Jun 14, 2024
3c1ad5f
Arm AArch64: remove stale LLAMA_QKK_64 from CMakeLists.txt and delete…
Dibakar Jun 14, 2024
a7055b7
Arm AArch64: add reference scalar gemm and gemv, and avoid dynamic me…
Dibakar Jun 18, 2024
cce236b
Arm AArch64: add multithreaded quantization support for the new types…
Dibakar Jun 19, 2024
7a70606
Arm AArch64: minor code refactoring
Dibakar Jun 19, 2024
ffbfabb
Arm AArch64: simplify logic for calling gemm and gemv functions in gg…
Dibakar Jun 23, 2024
cbbfd69
Arm AArch64: minimize changes in ggml_compute_forward_mul_mat
Dibakar Jun 26, 2024
3564644
Arm AArch64: minor code refactoring, and add reference scalar code to…
Dibakar Jul 3, 2024
110d143
Arm AArch64: minor code refactoring
Dibakar Jul 3, 2024
4ff0b22
Arm AArch64: minor code refactoring
Dibakar Jul 6, 2024
42724b4
Arm AArch64: minor code refactoring
Dibakar Jul 8, 2024
e5f4713
rebase on the latest master commit 3fd62a6 and adapt to the new direc…
Dibakar Jul 8, 2024
c2595d0
Arm AArch64: remove a redundant comment
Dibakar Jul 9, 2024
a7abb78
Arm AArch64: add pragma in ggml-aarch64.c to turn -Woverlength-string…
Dibakar Jul 9, 2024
0e84ef1
Arm AArch64: use __aarch64__ check to guard 64-bit neon kernels
Dibakar Jul 9, 2024
c653eb1
Arm AArch64: update docs/build.md README to include compile time flag…
Dibakar Jul 9, 2024
10 changes: 9 additions & 1 deletion Makefile
@@ -826,7 +826,8 @@ OBJ_GGML += \
ggml/src/ggml.o \
ggml/src/ggml-alloc.o \
ggml/src/ggml-backend.o \
ggml/src/ggml-quants.o
ggml/src/ggml-quants.o \
ggml/src/ggml-aarch64.o

OBJ_LLAMA = \
src/llama.o \
@@ -959,6 +960,13 @@ ggml/src/ggml-quants.o: \
ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@

ggml/src/ggml-aarch64.o: \
ggml/src/ggml-aarch64.c \
ggml/include/ggml.h \
ggml/src/ggml-aarch64.h \
ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@

ggml/src/ggml-blas.o: \
ggml/src/ggml-blas.cpp \
ggml/include/ggml-blas.h
1 change: 1 addition & 0 deletions Package.swift
@@ -10,6 +10,7 @@ var sources = [
"ggml/src/ggml-alloc.c",
"ggml/src/ggml-backend.c",
"ggml/src/ggml-quants.c",
"ggml/src/ggml-aarch64.c",
]

var resources: [Resource] = []
2 changes: 2 additions & 0 deletions docs/build.md
@@ -28,6 +28,7 @@ In order to build llama.cpp you have four different options.
```

- Notes:
- For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, run `make LLAMA_DEBUG=1`
@@ -41,6 +42,7 @@ In order to build llama.cpp you have four different options.

**Notes**:

- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, there are two cases:
3 changes: 3 additions & 0 deletions examples/quantize/quantize.cpp
@@ -46,6 +46,9 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
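These table entries make the new formats selectable quantization targets by name. As a usage sketch (assuming the `llama-quantize` binary name in use around the time of this PR), a model can be repacked with `./llama-quantize model-f16.gguf model-q4_0_4_4.gguf Q4_0_4_4`. The size and perplexity figures are identical across the three new entries because they carry the same Q4_0 data; they differ only in how blocks are interleaved in memory for the AArch64 GEMV/GEMM kernels.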
17 changes: 17 additions & 0 deletions ggml/include/ggml.h
@@ -383,6 +383,9 @@ extern "C" {
GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29,
GGML_TYPE_BF16 = 30,
GGML_TYPE_Q4_0_4_4 = 31,
GGML_TYPE_Q4_0_4_8 = 32,
GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_COUNT,
};

@@ -424,6 +427,9 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
};

// available tensor operations:
@@ -2406,6 +2412,12 @@ extern "C" {
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr,
int64_t k, int64_t bx);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);

typedef struct {
const char * type_name;
@@ -2418,6 +2430,11 @@ extern "C" {
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously;
int64_t ncols; // number of columns to process simultaneously;
int64_t interleave_blcksize; // interleave elements in blocks of interleave_blcksize;
ggml_from_float_to_mat_t from_float_to_mat;
ggml_gemv_t gemv;
ggml_gemm_t gemm;
} ggml_type_traits_t;

GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
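The header additions above are the extension point for the new kernels: each type's `ggml_type_traits_t` now optionally carries `gemv`/`gemm` kernel pointers, a `from_float_to_mat` routine for quantizing several activation rows at once, and `ncols`/`interleave_blcksize` metadata describing the interleaved block layout. Below is a minimal sketch of how a caller could inspect these traits and choose a kernel; the dispatch comments describe the general idea rather than the exact logic in `ggml_compute_forward_mul_mat`.

```c
// Sketch only: query the traits of one of the new interleaved types and
// decide which kernel to use. Assumes the ggml.h from this PR is on the
// include path.
#include <stdio.h>
#include "ggml.h"

int main(void) {
    ggml_type_traits_t tt = ggml_internal_get_type_traits(GGML_TYPE_Q4_0_4_4);

    printf("type        : %s\n", tt.type_name);
    // The activation side is quantized to vec_dot_type (q8_0 for these kernels).
    printf("vec_dot_type: %d\n", (int) tt.vec_dot_type);
    printf("ncols       : %lld\n", (long long) tt.ncols);

    if (tt.gemm) {
        // Batched path: tt.from_float_to_mat() quantizes a tile of activation
        // rows, then tt.gemm() multiplies it against the interleaved weights.
    } else if (tt.gemv) {
        // Single-row path (token-by-token decode): tt.gemv().
    } else {
        // Generic fallback: tt.vec_dot(), one output row at a time.
    }
    return 0;
}
```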
1 change: 1 addition & 0 deletions ggml/src/CMakeLists.txt
@@ -1153,6 +1153,7 @@ add_library(ggml
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
ggml-aarch64.c ggml-aarch64.h
)

if (EMSCRIPTEN)