-
Notifications
You must be signed in to change notification settings - Fork 9.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SYCL] iq2_s #6052
[SYCL] iq2_s #6052
Changes from 27 commits
08d3b40
9b030b9
81b6139
0af3ed7
87e5c86
f3a3ea1
1641c52
15617b8
32589a6
a553def
9fa92aa
7f70fbe
4b7aaae
7466e4e
f5fed74
36c7f02
7ea2e15
551f5a0
ada101e
d4b182c
e9377ba
69aaa3d
19772fa
871a135
ff4ace5
8c07b8f
4e6df37
619ce80
935eabd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -4436,6 +4436,24 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri | |||||
#endif | ||||||
} | ||||||
|
||||||
inline bool ggml_sycl_supports_mmq(enum ggml_type type) { | ||||||
switch (type) { | ||||||
case GGML_TYPE_Q4_0: | ||||||
case GGML_TYPE_Q4_1: | ||||||
case GGML_TYPE_Q5_0: | ||||||
case GGML_TYPE_Q5_1: | ||||||
case GGML_TYPE_Q8_0: | ||||||
case GGML_TYPE_Q2_K: | ||||||
case GGML_TYPE_Q3_K: | ||||||
case GGML_TYPE_Q4_K: | ||||||
case GGML_TYPE_Q5_K: | ||||||
case GGML_TYPE_Q6_K: | ||||||
return true; | ||||||
default: | ||||||
return false; | ||||||
} | ||||||
} | ||||||
|
||||||
template<typename dst_t> | ||||||
static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy, | ||||||
const sycl::nd_item<3> &item_ct1, | ||||||
|
@@ -4581,6 +4599,36 @@ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restr | |||||
|
||||||
} | ||||||
|
||||||
template<typename dst_t> | ||||||
static void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy, | ||||||
const sycl::nd_item<3> &item_ct1, | ||||||
const uint64_t *iq2s_grid, | ||||||
const uint8_t *ksigns_iq2xs, | ||||||
const uint8_t *kmask_iq2xs) { | ||||||
const int i = item_ct1.get_group(2); | ||||||
const block_iq2_s * x = (const block_iq2_s *) vx; | ||||||
|
||||||
const int tid = item_ct1.get_local_id(2); | ||||||
#if QK_K == 256 | ||||||
const int il = tid/8; // 0...3 | ||||||
abhilash1910 marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
const int ib = tid%8; // 0...7 | ||||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il; | ||||||
const uint8_t * qs = x[i].qs + 8*ib; | ||||||
const uint8_t * grid1 = (const uint8_t *)(iq2s_grid + qs[2*il+0]); | ||||||
const uint8_t * grid2 = (const uint8_t *)(iq2s_grid + qs[2*il+1]); | ||||||
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; | ||||||
const uint8_t signs = ksigns_iq2xs[(x[i].qh[ib] >> 3*il) & 7]; | ||||||
for (int j = 0; j < 4; ++j) { | ||||||
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); | ||||||
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); | ||||||
} | ||||||
#else | ||||||
assert(false); | ||||||
#endif | ||||||
|
||||||
} | ||||||
|
||||||
|
||||||
/* | ||||||
DPCT1110:4: The total declared local variable size in device function | ||||||
dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register | ||||||
|
@@ -7497,6 +7545,57 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq, | |||||
#endif | ||||||
} | ||||||
|
||||||
// Dot product of one IQ2_S 32-value sub-block with the matching Q8_1 block.
// Each of the four 8-value chunks is reconstructed from a 10-bit grid index
// (8 low bits in qs, 2 high bits in qh) with explicit per-value signs, then
// accumulated with dp4a against the int8 activations. Chunks 0-1 and 2-3 use
// separate accumulators because they carry separate 4-bit scales.
static __dpct_inline__ float
vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
    const block_q8_1 *__restrict__ bq8_1, const int &iqs,
    const uint64_t *iq2s_grid, const uint64_t *ksigns64) {
#if QK_K == 256
    const block_iq2_s * bq2 = (const block_iq2_s *) vbq;

    const int ib32 = iqs;
    const int8_t  * q8    = bq8_1[ib32].qs;
    const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
    const uint8_t ls2 = bq2->scales[ib32] >> 4;

    int sumi1 = 0;
    int sumi2 = 0;
    for (int l = 0; l < 4; ++l) {
        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
        // Expand the packed sign nibbles into per-byte 0x00/0xff masks.
        const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
            ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
        const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
            ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
        // Apply signs: (g ^ mask) - mask negates exactly the masked bytes.
        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
            grid[0] ^ signs0, signs0, std::minus<>());
        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
            grid[1] ^ signs1, signs1, std::minus<>());
        // First two chunks feed sumi1 (scale ls1), last two feed sumi2 (ls2).
        int & sumi = l < 2 ? sumi1 : sumi2;
        sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi);
        sumi = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi);
        q8 += 8;
    }
    const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
#else
    (void) ksigns64;
    assert(false);
    return 0.f;
#endif
}
|
||||||
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, | ||||||
int mmq_y, int nwarps, load_tiles_sycl_t load_tiles, int vdr, | ||||||
vec_dot_q_mul_mat_sycl_t vec_dot> | ||||||
|
@@ -8353,6 +8452,53 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * | |||||
} | ||||||
} | ||||||
|
||||||
|
||||||
template <int qk, int qi, typename block_q_t, int vdr> | ||||||
static void mul_mat_vec_q_iq2_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, | ||||||
const sycl::nd_item<3> &item_ct1, | ||||||
const uint64_t *iq2s_grid_ptr, const uint64_t *ksigns64_ptr ) { | ||||||
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + | ||||||
item_ct1.get_local_id(1); | ||||||
|
||||||
if (row >= nrows) { | ||||||
return; | ||||||
} | ||||||
|
||||||
const int blocks_per_row = ncols / qk; | ||||||
const int blocks_per_warp = vdr * WARP_SIZE / qi; | ||||||
|
||||||
// partial sum for each thread | ||||||
float tmp = 0.0f; | ||||||
|
||||||
const block_q_t * x = (const block_q_t *) vx; | ||||||
const block_q8_1 * y = (const block_q8_1 *) vy; | ||||||
|
||||||
for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; | ||||||
i += blocks_per_warp) { | ||||||
const int ibx = row*blocks_per_row + i; // x block index | ||||||
|
||||||
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx | ||||||
|
||||||
const int iqs = | ||||||
vdr * | ||||||
(item_ct1.get_local_id(2) % | ||||||
(qi / vdr)); // x block quant index when casting the quants to int | ||||||
|
||||||
tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs, iq2s_grid_ptr, ksigns64_ptr); | ||||||
} | ||||||
|
||||||
// sum up partial sums and write back result | ||||||
#pragma unroll | ||||||
for (int mask = 16; mask > 0; mask >>= 1) { | ||||||
tmp += | ||||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); | ||||||
} | ||||||
|
||||||
if (item_ct1.get_local_id(2) == 0) { | ||||||
dst[row] = tmp; | ||||||
} | ||||||
} | ||||||
|
||||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel> | ||||||
static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows, | ||||||
const sycl::nd_item<3> &item_ct1) { | ||||||
|
@@ -10096,6 +10242,36 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k, | |||||
} | ||||||
} | ||||||
|
||||||
template <typename dst_t> | ||||||
static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int k, | ||||||
dpct::queue_ptr stream) { | ||||||
const int nb = k / QK_K; | ||||||
{ | ||||||
iq2s_grid.init(*stream); | ||||||
ksigns_iq2xs.init(*stream); | ||||||
kmask_iq2xs.init(*stream); | ||||||
|
||||||
dpct::has_capability_or_fail(stream->get_device(), | ||||||
{sycl::aspect::fp16}); | ||||||
|
||||||
stream->submit([&](sycl::handler &cgh) { | ||||||
auto iq2s_grid_ptr_ct1 = iq2s_grid.get_ptr(); | ||||||
auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); | ||||||
auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); | ||||||
|
||||||
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * | ||||||
sycl::range<3>(1, 1, 32), | ||||||
sycl::range<3>(1, 1, 32)), | ||||||
[=](sycl::nd_item<3> item_ct1) { | ||||||
dequantize_block_iq2_s( | ||||||
vx, y, item_ct1, iq2s_grid_ptr_ct1, | ||||||
ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1); | ||||||
}); | ||||||
}); | ||||||
} | ||||||
} | ||||||
|
||||||
|
||||||
template <typename src_t, typename dst_t> | ||||||
static void convert_unary_sycl(const void *__restrict__ vx, | ||||||
dst_t *__restrict__ y, const int k, | ||||||
|
@@ -10150,6 +10326,8 @@ static to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) try { | |||||
return dequantize_row_iq3_s_sycl; | ||||||
case GGML_TYPE_IQ1_S: | ||||||
return dequantize_row_iq1_s_sycl; | ||||||
case GGML_TYPE_IQ2_S: | ||||||
return dequantize_row_iq2_s_sycl; | ||||||
case GGML_TYPE_F32: | ||||||
return convert_unary_sycl<float>; | ||||||
default: | ||||||
|
@@ -10194,6 +10372,8 @@ static to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) { | |||||
return dequantize_row_iq3_s_sycl; | ||||||
case GGML_TYPE_IQ1_S: | ||||||
return dequantize_row_iq1_s_sycl; | ||||||
case GGML_TYPE_IQ2_S: | ||||||
return dequantize_row_iq2_s_sycl; | ||||||
case GGML_TYPE_F16: | ||||||
return convert_unary_sycl<sycl::half>; | ||||||
default: | ||||||
|
@@ -10839,6 +11019,35 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy, | |||||
} | ||||||
} | ||||||
|
||||||
static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy, | ||||||
float *dst, const int ncols, | ||||||
const int nrows, | ||||||
dpct::queue_ptr stream) { | ||||||
GGML_ASSERT(ncols % QK_K == 0); | ||||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; | ||||||
const sycl::range<3> block_nums(1, 1, block_num_y); | ||||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); | ||||||
{ | ||||||
iq2s_grid.init(*stream); | ||||||
ksigns64.init(*stream); | ||||||
|
||||||
stream->submit([&](sycl::handler &cgh) { | ||||||
auto iq2s_grid_ptr_ct1 = iq2s_grid.get_ptr(); | ||||||
auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); | ||||||
|
||||||
cgh.parallel_for( | ||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims), | ||||||
[=](sycl::nd_item<3> item_ct1) | ||||||
[[intel::reqd_sub_group_size(32)]] { | ||||||
mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S, block_iq2_s, 1>( | ||||||
vx, vy, dst, ncols, nrows, item_ct1, | ||||||
iq2s_grid_ptr_ct1, ksigns64_ptr_ct1); | ||||||
}); | ||||||
}); | ||||||
} | ||||||
} | ||||||
|
||||||
|
||||||
static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, | ||||||
float *dst, const int ncols_x, | ||||||
const int nrows_x, const int ncols_y, | ||||||
|
@@ -13612,6 +13821,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC | |||||
case GGML_TYPE_IQ2_XXS: | ||||||
case GGML_TYPE_IQ2_XS: | ||||||
case GGML_TYPE_IQ1_S: | ||||||
case GGML_TYPE_IQ2_S: | ||||||
case GGML_TYPE_IQ3_XXS: | ||||||
return max_compute_capability >= VER_GEN9 ? 128 : 64; | ||||||
case GGML_TYPE_IQ3_S: | ||||||
|
@@ -13631,7 +13841,8 @@ inline void ggml_sycl_op_mul_mat_vec_q( | |||||
const int64_t src1_ncols, const int64_t src1_padded_row_size, | ||||||
const dpct::queue_ptr &stream) { | ||||||
|
||||||
GGML_ASSERT(ggml_nrows(src1) == 1); | ||||||
//GGML_ASSERT(ggml_nrows(src1) == 1); | ||||||
//GGML_ASSERT(ne10 % QK8_1 == 0); | ||||||
Comment on lines
+13844
to
+13845
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Delete comments |
||||||
|
||||||
const int64_t ne00 = src0->ne[0]; | ||||||
const int64_t row_diff = row_high - row_low; | ||||||
|
@@ -13682,6 +13893,9 @@ inline void ggml_sycl_op_mul_mat_vec_q( | |||||
case GGML_TYPE_IQ1_S: | ||||||
mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); | ||||||
break; | ||||||
case GGML_TYPE_IQ2_S: | ||||||
mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); | ||||||
break; | ||||||
default: | ||||||
GGML_ASSERT(false); | ||||||
break; | ||||||
|
@@ -13758,6 +13972,24 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec( | |||||
case GGML_TYPE_Q6_K: | ||||||
dequantize_mul_mat_vec_q6_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); | ||||||
break; | ||||||
case GGML_TYPE_IQ2_XXS: | ||||||
mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); | ||||||
break; | ||||||
case GGML_TYPE_IQ2_XS: | ||||||
mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); | ||||||
break; | ||||||
case GGML_TYPE_IQ3_XXS: | ||||||
mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); | ||||||
break; | ||||||
case GGML_TYPE_IQ3_S: | ||||||
mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); | ||||||
break; | ||||||
case GGML_TYPE_IQ1_S: | ||||||
mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); | ||||||
break; | ||||||
case GGML_TYPE_IQ2_S: | ||||||
mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); | ||||||
break; | ||||||
case GGML_TYPE_F16: | ||||||
convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); | ||||||
break; | ||||||
|
@@ -15177,7 +15409,7 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 | |||||
#ifdef GGML_SYCL_FORCE_DMMV | ||||||
const bool use_mul_mat_vec_q = false; | ||||||
#else | ||||||
const bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1; | ||||||
const bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
This is a breaking change for both Intel and Nvidia targets. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes actually the assertion should not be mandatory and ggml_nrows=1 wont allow the iq stages to run. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Iv tested with the suppression of the assert. It fails on both Intel and Nvidia targets. In both cases it is due to the function lacking the case for your new quantization type Intel (Intel(R) Level-Zero, Intel(R) Data Center GPU Max 1100 1.3 [1.3.28454]): Nvidia (NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.2]): There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually for both the cases, it should go to neither of the methods. It is strange that it is falling back to either dequantize_mul_mat_vec or op_mul_mat_q path. For this type only vectorised mul_mat_q-mmvq should be called. |
||||||
#endif // GGML_SYCL_FORCE_DMMV | ||||||
|
||||||
if (use_mul_mat_vec_q) { | ||||||
|
@@ -17031,9 +17263,13 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons | |||||
return false; | ||||||
} | ||||||
ggml_type a_type = a->type; | ||||||
if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ2_S || | ||||||
a_type == GGML_TYPE_IQ4_XS) { | ||||||
return false; | ||||||
|
||||||
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || | ||||||
a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S || | ||||||
a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) { | ||||||
if (b->ne[1] == 1 && ggml_nrows(b) > 1) { | ||||||
return false; | ||||||
} | ||||||
abhilash1910 marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
} | ||||||
return true; | ||||||
} break; | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is an unused function, not sure if it should be added.