diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp
index fd32ffed4dbe62..8dd1ebf2809782 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp
@@ -27,7 +27,7 @@ struct dynamic_quantize : public primitive_base<dynamic_quantize> {
     dynamic_quantize(const primitive_id& id,
                      const input_info& input,
                      const Attributes& attrs,
-                     const size_t input_size = 2)
+                     const size_t input_size = 3)
         : primitive_base(id, {input})
         , attrs(attrs)
         , input_size(input_size) {
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl
index 782ef876871b25..ca7c3d921b32bd 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl
@@ -6,6 +6,14 @@

 #define UINT64_MAX 0xFFFFFFFFFFFFFFFF

+#if ASYMMETRIC_QUANTIZATION && UNSIGNED_OUTPUT
+    #define TO_OUTPUT_TYPE_RTE(val)     convert_uchar_rte(val)
+    #define TO_OUTPUT_VEC_TYPE_RTE(val) convert_uchar8_rte(val)
+#else
+    #define TO_OUTPUT_TYPE_RTE(val)     convert_char_rte(val)
+    #define TO_OUTPUT_VEC_TYPE_RTE(val) convert_char8_rte(val)
+#endif
+
 #if OUTPUT_DIMS != 4
 #error "dynamic_quantize_gpu_ref.cl: Unsupported output dimension"
 #endif
@@ -93,8 +101,12 @@ KERNEL(dynamic_quantize_gpu_ref)(
 #if ASYMMETRIC_QUANTIZATION
     // need to support output data precision of i8 and u8
     OUTPUT1_TYPE scale = (OUTPUT1_TYPE)((CHAR_MAX - CHAR_MIN) / (max_val - min_val));
+# if UNSIGNED_OUTPUT
     OUTPUT1_TYPE zp = (OUTPUT1_TYPE)(-min_val * scale);
-#else
+# else // SIGNED_OUTPUT
+    OUTPUT1_TYPE zp = (OUTPUT1_TYPE)(-min_val * scale) - CHAR_MAX;
+# endif
+#else // !ASYMMETRIC_QUANTIZATION
     max_val = work_group_reduce_max(max_val);
     OUTPUT1_TYPE scale = 127.0h / max_val;
 #endif
@@ -107,43 +119,31 @@ KERNEL(dynamic_quantize_gpu_ref)(
         const uint out_offset = OUTPUT_GET_INDEX(b + b_off, f + f_off, y + y_off, x);

         half val = input[in_offset];
-#if ASYMMETRIC_QUANTIZATION
         val *= scale;
+#if ASYMMETRIC_QUANTIZATION
         val += zp;
-        // printf("1 val %f\n", val);
-        output[out_offset] = convert_uchar_rte(val);
-#else
-        val *= scale;
-        output[out_offset] = convert_char_rte(val);
 #endif
+        output[out_offset] = TO_OUTPUT_TYPE_RTE(val);
 #else
     const uint in_offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, 0);
     const uint out_offset = OUTPUT_GET_INDEX(b + b_off, f + f_off, y + y_off, 0);
     int x;
     for (x = 0; x < INPUT0_SIZE_X / 8; x++) {
         half8 val = as_half8(vload8(0, (ushort*)input + in_offset + x * 8));
-#if ASYMMETRIC_QUANTIZATION
         val *= scale;
+#if ASYMMETRIC_QUANTIZATION
         val += zp;
-        // printf("2 val %f\n", val);
-        vstore8(convert_uchar8_rte(val), 0, output + out_offset + x * 8);
-#else
-        val *= scale;
-        vstore8(convert_char8_rte(val), 0, output + out_offset + x * 8);
 #endif
+        vstore8(TO_OUTPUT_VEC_TYPE_RTE(val), 0, output + out_offset + x * 8);
     }
     x *= 8;
     for (; x < INPUT0_SIZE_X; x++) {
         half val = input[in_offset + x];
-#if ASYMMETRIC_QUANTIZATION
         val *= scale;
+#if ASYMMETRIC_QUANTIZATION
         val += zp;
-        // printf("3 val %f zp %f\n", val, zp);
-        output[out_offset + x] = convert_uchar_rte(val);
-#else
-        val *= scale;
-        output[out_offset + x] = convert_char_rte(val);
 #endif
+        output[out_offset + x] = TO_OUTPUT_TYPE_RTE(val);
     }
 #endif
 }
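Editor's note on the kernel change above (illustration only, not part of the patch): for signed i8 output the zero point is shifted down by CHAR_MAX so that q = x * scale + zp lands in the signed range rather than in [0, 255]; because the quantized value and the zero point move by the same constant, the dequantized value (q - zp) / scale is unchanged. A minimal standalone C++ sketch of that arithmetic, with made-up numbers, assuming a platform where char is signed (CHAR_MIN == -128) as OpenCL guarantees inside the kernel:

#include <cassert>
#include <climits>
#include <cmath>
#include <cstdio>

int main() {
    // Per-group statistics, as the kernel computes them before deriving scale/zp.
    const float min_val = -2.5f, max_val = 1.5f;
    const float x = 0.75f;  // an arbitrary element of the group

    const float scale = (CHAR_MAX - CHAR_MIN) / (max_val - min_val);  // 255 / range

    // Unsigned (u8) output: zp keeps q = x * scale + zp inside [0, 255].
    const float zp_u8 = -min_val * scale;
    const float q_u8  = std::nearbyint(x * scale + zp_u8);  // nearest-even, like convert_uchar_rte

    // Signed (i8) output: shifting zp by CHAR_MAX shifts q by the same amount,
    // moving it into the i8 range without changing what it represents.
    const float zp_i8 = -min_val * scale - CHAR_MAX;
    const float q_i8  = std::nearbyint(x * scale + zp_i8);

    // Dequantization is identical for both conventions.
    assert(std::fabs((q_u8 - zp_u8) / scale - (q_i8 - zp_i8) / scale) < 1e-6f);
    std::printf("u8: q=%.0f zp=%.3f | i8: q=%.0f zp=%.3f | dequant=%.4f\n",
                q_u8, zp_u8, q_i8, zp_i8, (q_i8 - zp_i8) / scale);
    return 0;
}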
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp
index edf6dc5aacb960..f432fa6ac5756d 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp
@@ -54,6 +54,7 @@ JitConstants DynamicQuantizeKernelRef::GetJitConstants(const dynamic_quantize_pa

     jit.AddConstant(MakeJitConstant("ASYMMETRIC_QUANTIZATION", params.use_asymmetric_quantization));
     jit.AddConstant(MakeJitConstant("GROUP_SCALES_WITH_ZP", params.combine_scales_and_zp));
+    jit.AddConstant(MakeJitConstant("UNSIGNED_OUTPUT", params.outputs[0].GetDType() == Datatype::UINT8 ? 1 : 0));

     auto group_sizes = params.group_sizes;
     group_sizes.resize(std::min((size_t)4, group_sizes.size()), 1);
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp
index 07c47009badc68..32e0533b662746 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp
@@ -147,6 +147,10 @@ TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_large_size) {
     this->test_dynamic_quantization(false, {11, 1, 1, 4096}, {2048, 1, 1, 4096});
 }

+TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_large_size_dynamic) {
+    this->test_dynamic_quantization(false, {-1, 1, 1, 4096}, {2048, 1, 1, 4096});
+}
+
 TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_small_size) {
     this->test_dynamic_quantization(false, {1, 1, 1, 4096}, {64, 1, 1, 4096});
 }
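Editor's sketch (hypothetical helper, not the repository's test code): the UNSIGNED_OUTPUT jit constant added above is derived from the primitive's output data type and selects which convert_*_rte macros the kernel compiles. A CPU-side reference for one quantization group that switches between u8 and i8 output the same way could look like the following; quantize_group_ref and QuantizedGroup are invented names, and std::nearbyint's nearest-even rounding under the default FE_TONEAREST mode matches the kernel's _rte conversions.

#include <algorithm>
#include <climits>
#include <cmath>
#include <cstdint>
#include <vector>

struct QuantizedGroup {
    std::vector<int16_t> data;  // wide enough to hold either u8 or i8 values
    float scale;
    float zero_point;
};

// Reference quantization of one group; assumes max_val > min_val, as the kernel does.
QuantizedGroup quantize_group_ref(const std::vector<float>& group, bool unsigned_output) {
    const auto [min_it, max_it] = std::minmax_element(group.begin(), group.end());
    const float min_val = *min_it, max_val = *max_it;

    const float scale = (CHAR_MAX - CHAR_MIN) / (max_val - min_val);
    // u8 keeps zp = -min * scale; i8 shifts it by CHAR_MAX, mirroring "# if UNSIGNED_OUTPUT".
    const float zp = unsigned_output ? -min_val * scale
                                     : -min_val * scale - CHAR_MAX;

    QuantizedGroup out{{}, scale, zp};
    out.data.reserve(group.size());
    for (float v : group)
        out.data.push_back(static_cast<int16_t>(std::nearbyint(v * scale + zp)));
    return out;
}

Such a reference could be compared element-wise against the kernel's i8 or u8 output buffer for the same group, alongside the returned scale and zero point.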