Skip to content

Commit

Permalink
Add intrinsics polyfills for AMD
Browse files Browse the repository at this point in the history
---------

Co-authored-by: ardfork <[email protected]>
Co-authored-by: funnbot <[email protected]>
Co-authored-by: Engininja2 <[email protected]>
  • Loading branch information
4 people committed Aug 8, 2023
1 parent ab62128 commit 4024f91
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 17 deletions.
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,6 @@ if (LLAMA_HIPBLAS)
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)

Expand Down
1 change: 0 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,6 @@ ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y)
ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_FORCE_DMMV
ggml-cuda.o: CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(CXX) $(CXXFLAGS) -x hip -c -o $@ $<
Expand Down
53 changes: 38 additions & 15 deletions ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,29 @@

#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products

#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300

typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
return reinterpret_cast<const int&>(c);
}

static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
c = __builtin_amdgcn_sdot4(a, b, c, false);
#else
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
#endif
return c;
}
#endif

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
Expand Down Expand Up @@ -1396,8 +1419,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
return;
}

y[ib].ds.x = d;
y[ib].ds.y = sum;
reinterpret_cast<half&>(y[ib].ds.x) = d;
reinterpret_cast<half&>(y[ib].ds.y) = sum;
}

template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
Expand Down Expand Up @@ -1609,8 +1632,8 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
#else
const float2 dm8f = __half22float2(dm8);
const float2 ds8f = __half22float2(ds8);
const float d8d8 = dm8.x * ds8.x;
const float m8s8 = dm8.y * ds8.y;
const float d8d8 = __low2float(dm8) * __low2float(ds8);
const float m8s8 = __high2float(dm8) * __high2float(ds8);
#endif // GGML_CUDA_F16

// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
Expand Down Expand Up @@ -2380,7 +2403,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
}

return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
}

static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
Expand Down Expand Up @@ -2478,7 +2501,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
#pragma unroll
for (int i = 0; i < QR2_K; ++ i) {
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
d8[i] = bq8_1[bq8_offset + i].ds.x;
d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
}

return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
Expand Down Expand Up @@ -2605,7 +2628,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
#pragma unroll
for (int i = 0; i < QR3_K; ++i) {
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
d8[i] = bq8_1[bq8_offset + i].ds.x;
d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
}

return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
Expand Down Expand Up @@ -2782,7 +2805,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(

for (int i = 0; i < QR4_K; ++i) {
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
d8[i] = bq8i->ds.x;
d8[i] = __low2half(bq8i->ds);

const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
u[2*i+0] = q8[0];
Expand All @@ -2809,8 +2832,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
const float dall = bq4_K->d[0];
const float dmin = bq4_K->d[1];

const float d8_1 = bq8_1[0].ds.x;
const float d8_2 = bq8_1[1].ds.x;
const float d8_1 = __low2float(bq8_1[0].ds);
const float d8_2 = __low2float(bq8_1[1].ds);

const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
Expand Down Expand Up @@ -2977,7 +3000,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
#pragma unroll
for (int i = 0; i < QR5_K; ++i) {
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
d8[i] = bq8i->ds.x;
d8[i] = __low2float(bq8i->ds);

const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
u[2*i+0] = q8[0];
Expand All @@ -2995,8 +3018,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(

const float d = bq5_K->d;

const float d8_1 = bq8_1[0].ds.x;
const float d8_2 = bq8_1[1].ds.x;
const float d8_1 = __low2half(bq8_1[0].ds);
const float d8_2 = __low2half(bq8_1[1].ds);

const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
Expand Down Expand Up @@ -3157,7 +3180,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
#pragma unroll
for (int i = 0; i < QR6_K; ++i) {
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
}

return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
Expand Down Expand Up @@ -3336,7 +3359,7 @@ static __global__ void mul_mat_q(
*dsi_dst = *dsi_src;
} else {
float * dfi_dst = (float *) dsi_dst;
*dfi_dst = (*dsi_src).x;
*dfi_dst = __low2half(*dsi_src);
}
}

Expand Down

0 comments on commit 4024f91

Please sign in to comment.