dyn_quan_opt is implemented
isanghao committed Sep 11, 2024
1 parent 513f21b commit ff083ea
Showing 3 changed files with 60 additions and 10 deletions.
@@ -16,6 +16,48 @@
#define AS_INPUT_TYPE_N(x) AS_TYPE_N(INPUT0_TYPE, VEC_SIZE, x)

REQD_SUB_GROUP_SIZE(SIMD)
#if QUANTIZE_GROUP_SIZE <= 128
KERNEL(dynamic_quantize_gpu_opt)(
OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
__global OUTPUT1_TYPE* output_scale) {
const uint bf = get_global_id(0);
const uint b = bf / INPUT0_FEATURE_NUM;
const uint f = bf % INPUT0_FEATURE_NUM;
const uint y_grp = get_global_id(1);

const uint input_offset = INPUT0_GET_INDEX(b, f, y_grp * QUANTIZE_GROUP_SIZE, 0);
const uint output_offset = OUTPUT_GET_INDEX(b, f, y_grp * QUANTIZE_GROUP_SIZE, 0);
const uint quantize_block = QUANTIZE_GROUP_SIZE / 4;
half4 input_0[quantize_block];
char4 quantized_value[quantize_block];
half max[quantize_block];

unroll_for (uint i = 0 ; i < quantize_block; ++i) {
input_0[i] = vload4(0, &input[input_offset + i * 4]);
max[i] = fmax(fmax(fabs(input_0[i][0]), fabs(input_0[i][1])), fmax(fabs(input_0[i][2]), fabs(input_0[i][3])));
}

half max_value = 0.001h;
// Reduce the per-chunk maxima eight at a time; assumes quantize_block is a multiple of 8
// (i.e. QUANTIZE_GROUP_SIZE is a multiple of 32).
for (uint i = 0 ; i < quantize_block; i+=8) {
half temp = fmax(fmax(fmax(max[i], max[i+1]), fmax(max[i+2], max[i+3])),
fmax(fmax(max[i+4], max[i+5]), fmax(max[i+6], max[i+7])));
max_value = fmax(max_value, temp);
}

half quan_scale = max_value / 128;

unroll_for (uint i = 0 ; i < quantize_block; ++i) {
quantized_value[i] = convert_char4(input_0[i] / (half4)quan_scale);
vstore4(quantized_value[i], 0, &output[output_offset + i * 4]);
}

output_scale[OUTPUT1_GET_INDEX(b, f, y_grp, 0)] = quan_scale;
}

#else

KERNEL(dynamic_quantize_gpu_opt)(
OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE* input,
@@ -78,3 +120,4 @@ KERNEL(dynamic_quantize_gpu_opt)(
if (sglid == 0 && local_id == 0)
output_scale[bf] = 1.0h / scale;
}
#endif
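For reference, the new branch above emits one signed 8-bit value per element and one half-precision scale per QUANTIZE_GROUP_SIZE elements along the innermost axis. Below is a minimal host-side sketch of the same math, written in plain C++ with float standing in for half and a hypothetical quantize_group helper; it is not part of the commit.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

// Reference for one group: scale = max(|x|) / 128 with a small floor,
// q[i] = (int8)(x[i] / scale); the scale is stored once per group.
static float quantize_group(const float* x, std::size_t group_size, std::int8_t* q) {
    float max_value = 0.001f;  // same floor the kernel uses
    for (std::size_t i = 0; i < group_size; ++i)
        max_value = std::max(max_value, std::fabs(x[i]));
    const float scale = max_value / 128.0f;
    for (std::size_t i = 0; i < group_size; ++i) {
        // The kernel relies on convert_char4 truncation; the sketch clamps explicitly
        // so the float-to-int8 conversion stays well defined in C++.
        const float v = std::min(std::max(x[i] / scale, -128.0f), 127.0f);
        q[i] = static_cast<std::int8_t>(v);
    }
    return scale;
}

With a group size of 128 and an inner dimension of 4096, each (b, f) row yields 4096 int8 values and 32 scales, one per y_grp, with each group handled by a single work-item.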
@@ -62,6 +62,7 @@ JitConstants DynamicQuantizeKernelOpt::GetJitConstants(const dynamic_quantize_pa
jit.AddConstant(MakeJitConstant("TOTAL_BLOCK_NUM", total_block_num));
jit.AddConstant(MakeJitConstant("ALIGNED_BLOCK_NUM", aligned_block_num));
jit.AddConstant(MakeJitConstant("BLOCK_NUM", block_num));
jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", params.group_size));
jit.Merge(GetTensorFriendlyWorkGroupsJit(params.outputs[0]));

return jit;
@@ -70,14 +71,20 @@ JitConstants DynamicQuantizeKernelOpt::GetJitConstants(const dynamic_quantize_pa
CommonDispatchData DynamicQuantizeKernelOpt::SetDefault(const dynamic_quantize_params& params) const {
CommonDispatchData dispatchData;

auto vec_size = get_match_vector_size(params);
auto bf_size = get_input_bf_size(params);
size_t total_block_num = bf_size.second / (simd * vec_size);
size_t batch = get_input_bf_size(params).first;
size_t block_num = (total_block_num > 32) ? 32 : total_block_num;

dispatchData.gws = {simd, block_num, batch};
dispatchData.lws = {simd, block_num, 1};
if (params.group_size > 128) {
auto vec_size = get_match_vector_size(params);
auto bf_size = get_input_bf_size(params);
size_t total_block_num = bf_size.second / (simd * vec_size);
size_t batch = get_input_bf_size(params).first;
size_t block_num = (total_block_num > 32) ? 32 : total_block_num;

dispatchData.gws = {simd, block_num, batch};
dispatchData.lws = {simd, block_num, 1};
} else {
auto bf_size = get_input_bf_size(params);
dispatchData.gws = {bf_size.first, bf_size.second / params.group_size, 1};
dispatchData.lws = {1, 1, 1};
}

return dispatchData;
}
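To make the split concrete, here is a small standalone sketch of the two dispatch shapes. The numbers (simd = 16, vec_size = 8, bf_size = {2, 4096}, group sizes 256 and 64) are assumed for illustration only and do not come from the commit.

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
    const std::size_t simd = 16, vec_size = 8;
    const std::size_t bf = 2, inner = 4096;  // bf = batch * feature, inner = per-row element count

    // group_size > 128 (e.g. 256): unchanged subgroup-reduction dispatch.
    const std::size_t total_block_num = inner / (simd * vec_size);          // 32
    const std::size_t block_num = std::min<std::size_t>(total_block_num, 32);
    std::printf("large groups: gws = {%zu, %zu, %zu}, lws = {%zu, %zu, 1}\n",
                simd, block_num, bf, simd, block_num);

    // group_size <= 128 (e.g. 64): one work-item per (bf, group) pair.
    const std::size_t group_size = 64;
    std::printf("small groups: gws = {%zu, %zu, 1}, lws = {1, 1, 1}\n",
                bf, inner / group_size);
    return 0;
}

In the small-group branch each work-item owns an entire quantization group, so no cross-lane reduction is needed and the local work-group size can stay at {1, 1, 1}.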
@@ -132,7 +139,7 @@ KernelsData DynamicQuantizeKernelOpt::GetKernelsData(const Params& params) const
}

KernelsPriority DynamicQuantizeKernelOpt::GetKernelsPriority(const Params& /*params*/) const {
return DONT_USE_IF_HAVE_SOMETHING_ELSE;
return FORCE_PRIORITY_1;
}

bool DynamicQuantizeKernelOpt::Validate(const Params& params) const {
@@ -878,7 +878,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
// OneDNN accuracy issue
if ((root->get_input_element_type(1) == ov::element::i8 || root->get_input_element_type(1) == ov::element::u8)
&& dynamic_quantization_group_size != UINT64_MAX) {
GPU_DEBUG_COUT << root->get_friendly_name() << " : dynamic quantization is not supported because of library accuracy issue" << std::endl;
GPU_DEBUG_TRACE << root->get_friendly_name() << " : dynamic quantization is not supported because of library accuracy issue" << std::endl;
return true;
}
return false;
