diff --git a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp new file mode 100644 index 00000000000000..69c148305fb94f --- /dev/null +++ b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/op/op.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace op { +namespace internal { + +/// \brief Operator performing Dynamic Quantize +class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op { +public: + OPENVINO_OP("DynamicQuantize", "gpu_opset"); + + DynamicQuantize() = default; + /// \brief Constructs an DynamicQuantize operation. + /// + /// \param data Input tensor with data + /// \param group_sizes Group sizes for dynamic quantization + /// \param dt_scale Data type for scale output + DynamicQuantize(const Output& data, std::vector group_sizes, element::Type dt_scale); + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + const std::vector& get_group_sizes() const { + return m_group_sizes; + }; + static std::vector shape_infer(const DynamicQuantize* op, + const std::vector& input_shapes, + const std::vector& group_sizes); + +private: + std::vector m_group_sizes; + element::Type m_dt_scale; +}; + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp new file mode 100644 index 00000000000000..74c0498e9a4425 --- /dev/null +++ b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp @@ -0,0 +1,71 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ov_ops/dynamic_quantize.hpp" + +#include "openvino/core/partial_shape.hpp" +#include "openvino/core/validation_util.hpp" +#include "openvino/op/variadic_split.hpp" +#include "variadic_split_shape_inference.hpp" + +namespace ov { +namespace op { +namespace internal { + +DynamicQuantize::DynamicQuantize(const Output& data, std::vector group_sizes, element::Type dt_scale) + : Op({data}), + m_group_sizes(std::move(group_sizes)), + m_dt_scale(dt_scale) { + OPENVINO_ASSERT(data.get_partial_shape().rank() == m_group_sizes.size(), + "FC input rank should be same as the rank of group_size ", + data.get_tensor_ptr()->get_partial_shape().rank(), + " / ", + m_group_sizes.size()); + set_output_size(2); + validate_and_infer_types(); +} + +void DynamicQuantize::validate_and_infer_types() { + std::vector input_shapes = {get_input_partial_shape(0)}; + + auto out_shapes = shape_infer(this, input_shapes, m_group_sizes); + set_output_type(0, element::i8, out_shapes[0]); + set_output_type(1, m_dt_scale, out_shapes[1]); +} + +std::shared_ptr DynamicQuantize::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), m_group_sizes, m_dt_scale); +} + +std::vector DynamicQuantize::shape_infer(const DynamicQuantize* op, + const std::vector& input_shapes, + const std::vector& group_sizes) { + std::vector out_shapes; + out_shapes.push_back(input_shapes[0]); + + auto scale_shape = input_shapes[0]; + OPENVINO_ASSERT(scale_shape.size() == group_sizes.size(), + "Scale_shape and group_size are supposed to have same rank: ", + scale_shape.size(), + " / ", + group_sizes.size()); + for (size_t i = 0; i < scale_shape.size(); i++) { + if (scale_shape[i].is_dynamic()) + continue; + + if (group_sizes[i] == UINT64_MAX) + scale_shape[i] = 1; + else { + scale_shape[i] /= group_sizes[i]; // if group_size is larger than shape, scale_shape will be 1 + scale_shape[i] = std::max(static_cast(scale_shape[i].get_length()), 1); + } + } + out_shapes.push_back(scale_shape); + return out_shapes; +} + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/fully_connected_compressed.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/fully_connected_compressed.hpp index 2c2e3c82089621..1112a3785317a3 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/fully_connected_compressed.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/fully_connected_compressed.hpp @@ -19,14 +19,23 @@ class FullyConnectedCompressed : public FullyConnected { FullyConnectedCompressed(const ov::Output &A, const ov::Output &B, const ov::Output &bias, - const ov::Output &decompression_scale, - const ov::Output &decompression_zero_point, + const ov::Output &w_decompression_scale, + const ov::Output &w_decompression_zero_point, + const ov::Output &a_decompression_scale, + const ov::element::Type output_type = ov::element::undefined); + + + FullyConnectedCompressed(const ov::Output &A, + const ov::Output &B, + const ov::Output &bias, + const ov::Output &w_decompression_scale, + const ov::Output &w_decompression_zero_point, const ov::element::Type output_type = ov::element::undefined); FullyConnectedCompressed(const ov::Output &A, const ov::Output &B, const ov::Output &bias, - const ov::Output &decompression_scale, + const ov::Output &w_decompression_scale, const ov::element::Type output_type = ov::element::undefined); std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp index 6ce8bb62407aa5..e61542a0c50dfc 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp @@ -287,3 +287,4 @@ REGISTER_FACTORY(internal, Placeholder); REGISTER_FACTORY(internal, SDPA); REGISTER_FACTORY(internal, IndirectSDPA); REGISTER_FACTORY(internal, RoPE); +REGISTER_FACTORY(internal, DynamicQuantize); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp new file mode 100644 index 00000000000000..d93e2f86eed144 --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp @@ -0,0 +1,57 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include "primitive.hpp" + +namespace cldnn { + +/// @brief Dynamic Quantize primitive +/// @details Performs dynamic quantization +struct dynamic_quantize : public primitive_base { + CLDNN_DECLARE_PRIMITIVE(dynamic_quantize); + + dynamic_quantize() : primitive_base("", {}), group_size(0) {} + + /// @brief Constructs dynamic_quantize primitive + /// @param id This primitive id + /// @param input Input primitive id + /// @param group_size Quantization group size + /// @param data_type Output data type of quantized + /// @param output_size Output data size of the primitive + dynamic_quantize(const primitive_id& id, + const input_info& input, + const uint64_t group_size, + const std::vector data_types = {optional_data_type(data_types::f16), optional_data_type(data_types::i8)}) + : primitive_base(id, {input}, 2, data_types), + group_size(group_size) {} + + uint64_t group_size = 0; + + size_t hash() const override { + size_t seed = primitive::hash(); + seed = hash_combine(seed, group_size); + return seed; + } + + bool operator==(const primitive& rhs) const override { + if (!compare_common_params(rhs)) + return false; + + auto rhs_casted = downcast(rhs); + + return group_size == rhs_casted.group_size; + } + + void save(BinaryOutputBuffer& ob) const override { + primitive_base::save(ob); + ob << group_size; + } + + void load(BinaryInputBuffer& ib) override { + primitive_base::load(ib); + ib >> group_size; + } +}; +} // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/fully_connected.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/fully_connected.hpp index 3287958b57bd28..e39078cb1011cc 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/fully_connected.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/fully_connected.hpp @@ -95,11 +95,46 @@ struct fully_connected : public primitive_base { compressed_weights(true), decompression_scale(decompression_scale), decompression_zero_point(decompression_zero_point), + dynamic_quantized_activation(false), input_size(input_size), weights_rank(weights_rank) { OPENVINO_ASSERT(!decompression_scale.empty(), "[GPU] Compressed fully connected requires at least decompression scale input"); } + /// @brief Constructs fully connected compressed layer. + /// @param id This primitive id. + /// @param input Input primitive id. + /// @param weights Primitive id containing weights data. + /// @param bias Primitive id containing bias data. + /// @param compression_scale Primitive id containing scale factors for weights decompression. + /// @param compression_zero_point Primitive id containing zero points for weights decompression. + /// @param activation_scale Primitive id containing scale factor for activation. + fully_connected(const primitive_id& id, + const input_info& input, + const primitive_id& weights, + const primitive_id& bias, + const primitive_id& decompression_scale, + const primitive_id& decompression_zero_point, + const input_info& activation_scale, + const data_types data_type, + const size_t input_size = 2, + const size_t weights_rank = 2) + : primitive_base(id, { input }, 1, {optional_data_type{data_type}}), + weights(weights), + bias(bias), + compressed_weights(true), + decompression_scale(decompression_scale), + decompression_zero_point(decompression_zero_point), + dynamic_quantized_activation(false), + activation_scale(activation_scale), + input_size(input_size), + weights_rank(weights_rank) { + if (activation_scale.is_valid()) + dynamic_quantized_activation = true; + + OPENVINO_ASSERT(!decompression_scale.empty(), "[GPU] Compressed fully connected requires at least decompression scale input"); + } + /// @brief Primitive id containing weights data. primitive_id weights; /// @brief Primitive id containing bias data. @@ -108,6 +143,8 @@ struct fully_connected : public primitive_base { bool compressed_weights = false; primitive_id decompression_scale = ""; primitive_id decompression_zero_point = ""; + bool dynamic_quantized_activation = false; + input_info activation_scale = {"", 0}; optional_value decompression_zero_point_scalar = optional_value(); /// @brief Primitive dimension size. @@ -123,6 +160,7 @@ struct fully_connected : public primitive_base { seed = hash_combine(seed, compressed_weights); seed = hash_combine(seed, !decompression_scale.empty()); seed = hash_combine(seed, !decompression_zero_point.empty()); + seed = hash_combine(seed, activation_scale.is_valid()); seed = hash_combine(seed, decompression_zero_point_scalar.has_value()); seed = hash_combine(seed, decompression_zero_point_scalar.value_or(0.0f)); return seed; @@ -140,6 +178,7 @@ struct fully_connected : public primitive_base { compressed_weights == rhs_casted.compressed_weights && decompression_scale.empty() == rhs_casted.decompression_scale.empty() && decompression_zero_point.empty() == rhs_casted.decompression_zero_point.empty() && + activation_scale.is_valid() == rhs_casted.activation_scale.is_valid() && decompression_zero_point_scalar.value_or(0.0f) == rhs_casted.decompression_zero_point_scalar.value_or(0.0f); } @@ -150,8 +189,10 @@ struct fully_connected : public primitive_base { ob << compressed_weights; ob << decompression_scale; ob << decompression_zero_point; + ob << activation_scale; ob << input_size; ob << weights_rank; + ob << dynamic_quantized_activation; if (decompression_zero_point_scalar.has_value()) { ob << true; @@ -169,8 +210,10 @@ struct fully_connected : public primitive_base { ib >> compressed_weights; ib >> decompression_scale; ib >> decompression_zero_point; + ib >> activation_scale; ib >> input_size; ib >> weights_rank; + ib >> dynamic_quantized_activation; bool has_value; ib >> has_value; @@ -197,6 +240,9 @@ struct fully_connected : public primitive_base { if (!decompression_zero_point.empty()) ret.push_back(decompression_zero_point); + if (activation_scale.is_valid()) + ret.push_back(activation_scale); + return ret; } }; diff --git a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp new file mode 100644 index 00000000000000..5c945f4c2d389c --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp @@ -0,0 +1,66 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ov_ops/dynamic_quantize.hpp" +#include "dynamic_quantize_inst.h" + +#include "primitive_type_base.h" +#include "json_object.h" +#include + +namespace cldnn { +GPU_DEFINE_PRIMITIVE_TYPE_ID(dynamic_quantize); + +layout dynamic_quantize_inst::calc_output_layout(dynamic_quantize_node const& node, kernel_impl_params const& impl_param) { + auto desc = impl_param.typed_desc(); + const auto& input_layout = impl_param.get_input_layout(); + auto output_type = data_types::i8; + auto output_format = input_layout.format; + + return layout(output_type, output_format, input_layout.get_tensor()); +} + +template +std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, uint64_t group_size) { + ov::op::internal::DynamicQuantize op; + auto output_format = act_layout.format; + + std::vector input_shapes = { + act_layout.get(), + }; + + std::vector shape_group_size(act_layout.get().size(), 1); + shape_group_size.back() = group_size; + + auto output_shapes = ov::op::internal::DynamicQuantize::shape_infer(&op, input_shapes, shape_group_size); + + return { layout(output_shapes[0], data_types::i8, output_format), layout(output_shapes[1], data_types::f16, output_format) }; +} + +template std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, uint64_t group_size); + +template +std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& /*node*/, const kernel_impl_params& impl_param) { + auto desc = impl_param.typed_desc(); + const auto& input_layout = impl_param.get_input_layout(); + return __calc_output_layouts(input_layout, UINT64_MAX /* TODO: handle group_size here */); +} + +template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& node, + const kernel_impl_params& impl_param); + +std::string dynamic_quantize_inst::to_string(dynamic_quantize_node const& node) { + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + + std::stringstream primitive_description; + + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +dynamic_quantize_inst::typed_primitive_inst(network& network, dynamic_quantize_node const& node) : parent(network, node) {} + +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/fully_connected.cpp index 28a65a40e3d692..287c3758586eb1 100644 --- a/src/plugins/intel_gpu/src/graph/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/graph/fully_connected.cpp @@ -280,6 +280,9 @@ std::string fully_connected_inst::to_string(fully_connected_node const& node) { fc_info.add("decompression zp value", desc->decompression_zero_point_scalar.value()); } } + if (desc->dynamic_quantized_activation) { + fc_info.add("activation scale id", desc->activation_scale.pid); + } node_info->add("fully connected info", fc_info); node_info->dump(primitive_description); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index 1f5098828872b9..9a51953ce58157 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -409,6 +409,8 @@ void prepare_primitive_fusing::fuse_bias(program &p) { fc_with_bias_prim->decompression_zero_point = desc->decompression_zero_point; if (desc->decompression_zero_point_scalar.has_value()) fc_with_bias_prim->decompression_zero_point_scalar = desc->decompression_zero_point_scalar.value(); + fc_with_bias_prim->activation_scale = desc->activation_scale; + fc_with_bias_prim->dynamic_quantized_activation = desc->dynamic_quantized_activation; } auto& new_fc_node = p.get_or_create(fc_with_bias_prim); fuse_bias_f(fc, new_fc_node, bias_node, eltw_node); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp new file mode 100644 index 00000000000000..91f141ae062723 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp @@ -0,0 +1,73 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/core/validation_util.hpp" +#include "primitive_base.hpp" +#include "dynamic_quantize/dynamic_quantize_kernel_ref.h" +#include "dynamic_quantize/dynamic_quantize_kernel_selector.h" +#include "dynamic_quantize_inst.h" + +namespace cldnn { +namespace ocl { + +struct dynamic_quantize_impl : typed_primitive_impl_ocl { + using parent = typed_primitive_impl_ocl; + using parent::parent; + using kernel_selector_t = kernel_selector::dynamic_quantize_kernel_selector; + using kernel_params_t = kernel_selector::dynamic_quantize_params; + + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::dynamic_quantize_impl); + + std::unique_ptr clone() const override { + return make_unique(*this); + } + + void load(BinaryInputBuffer& ib) override { + parent::load(ib); + if (is_dynamic()) { + auto& kernel_selector = kernel_selector_t::Instance(); + auto kernel_impl = kernel_selector.GetImplementation(_kernel_data.kernelName); + kernel_impl->GetUpdateDispatchDataFunc(_kernel_data); + } + } + + static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { + /// TODO: handle group_size here + auto params = get_default_params(impl_param, is_shape_agnostic); + params.outputs.push_back(convert_data_tensor(impl_param.get_output_layout(1))); + + return params; + } + + void update_dispatch_data(const kernel_impl_params& impl_param) override { + auto kernel_params = get_kernel_params(impl_param, true); + (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + } +}; + +namespace detail { + +attach_dynamic_quantize_impl::attach_dynamic_quantize_impl() { + auto types = { + data_types::f16, + data_types::i8 + }; + + auto formats = { + format::bfyx, + }; + + implementation_map::add(impl_types::ocl, + shape_types::any, + typed_primitive_impl_ocl::create, + types, + formats); +} + +} // namespace detail +} // namespace ocl +} // namespace cldnn + +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::dynamic_quantize_impl) +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::dynamic_quantize) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp index cadab1b29ec711..07f409ab052c75 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp @@ -23,6 +23,7 @@ void register_implementations() { REGISTER_OCL(depth_to_space); REGISTER_OCL(detection_output); REGISTER_OCL(dft); + REGISTER_OCL(dynamic_quantize); REGISTER_OCL(batch_to_space); REGISTER_OCL(experimental_detectron_detection_output); REGISTER_OCL(experimental_detectron_generate_proposals_single_image); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp index bacb3c60023c76..a0933a3ab582cd 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp @@ -20,6 +20,7 @@ #include "intel_gpu/primitives/deconvolution.hpp" #include "intel_gpu/primitives/depth_to_space.hpp" #include "intel_gpu/primitives/detection_output.hpp" +#include "intel_gpu/primitives/dynamic_quantize.hpp" #include "intel_gpu/primitives/eltwise.hpp" #include "intel_gpu/primitives/experimental_detectron_detection_output.hpp" #include "intel_gpu/primitives/experimental_detectron_prior_grid_generator.hpp" @@ -104,6 +105,7 @@ REGISTER_OCL(deconvolution); REGISTER_OCL(depth_to_space); REGISTER_OCL(detection_output); REGISTER_OCL(dft); +REGISTER_OCL(dynamic_quantize); REGISTER_OCL(experimental_detectron_detection_output); REGISTER_OCL(experimental_detectron_generate_proposals_single_image); REGISTER_OCL(experimental_detectron_prior_grid_generator); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp index 89a59324dd6f57..ef19a81942d511 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp @@ -19,6 +19,9 @@ namespace onednn { struct fully_connected_onednn : typed_primitive_onednn_impl { using parent = typed_primitive_onednn_impl; using parent::parent; + static constexpr int COMMON = 0; + static constexpr int PER_OC = 2; + static constexpr int GROUPED = 3; DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::onednn::fully_connected_onednn) @@ -60,20 +63,29 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { const auto weights_dt = instance.get_input_layout(1).data_type; auto weight_bitwidth = ov::element::Type(weights_dt).bitwidth(); OPENVINO_ASSERT(weight_bitwidth == 8 || weight_bitwidth == 4, "[GPU] oneDNN supports only 4bit/8bit compressed weights"); + int idx = prim->bias.empty() ? 2 : 3; if (!prim->decompression_scale.empty()) { - auto decompression_scale_idx = prim->bias.empty() ? 2 : 3; + auto decompression_scale_idx = idx++; auto scale_mem = instance.dep_memory_ptr(decompression_scale_idx); dnnl::memory::desc desc = onednn::layout_to_memory_desc(scale_mem->get_layout(), dnnl::memory::format_tag::a, true); args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, scale_mem->get_onednn_memory(desc)}); } if (!prim->decompression_zero_point.empty()) { - auto decompression_zp_idx = prim->bias.empty() ? 3 : 4; + auto decompression_zp_idx = idx++; auto zp_mem = instance.dep_memory_ptr(decompression_zp_idx); dnnl::memory::desc desc = onednn::layout_to_memory_desc(zp_mem->get_layout(), dnnl::memory::format_tag::a, true); args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS, zp_mem->get_onednn_memory(desc)}); } + + if (prim->activation_scale.is_valid()) { + auto activation_scale_idx = idx++; + auto act_scale_mem = instance.dep_memory_ptr(activation_scale_idx); + // TODO: handle group_size here + dnnl::memory::desc desc = onednn::layout_to_memory_desc(act_scale_mem->get_layout(), dnnl::memory::format_tag::a, true); + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0, act_scale_mem->get_onednn_memory(desc)}); + } } return args; @@ -231,6 +243,7 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { ob << input_size; ob << has_bias; ob << is_compressed; + ob << prim->dynamic_quantized_activation; bool has_decompression_scale = !prim->decompression_scale.empty(); if (has_decompression_scale) { @@ -256,9 +269,11 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { size_t input_size = 2; bool has_bias = false; bool is_compressed = false; + bool dynamic_quantized_activation; ib >> input_size; ib >> has_bias; ib >> is_compressed; + ib >> dynamic_quantized_activation; const kernel_impl_params* impl_params = reinterpret_cast(ib.getKernelImplParams()); auto prim = impl_params->typed_desc(); @@ -270,31 +285,40 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { ib >> _ds_group_size; ib >> make_data(&_ds_data_type, sizeof(dnnl::memory::data_type)); if (!is_four_bit_weight) - _attrs->set_scales(DNNL_ARG_WEIGHTS, 1 << 1, dnnl::memory::dims{}, _ds_data_type); + _attrs->set_scales(DNNL_ARG_WEIGHTS, PER_OC, dnnl::memory::dims{}, _ds_data_type); else - _attrs->set_scales(DNNL_ARG_WEIGHTS, (1 << 1) + (1 << 0), {_ds_group_size, 1}, _ds_data_type); + _attrs->set_scales(DNNL_ARG_WEIGHTS, GROUPED, {_ds_group_size, 1}, _ds_data_type); } bool has_decompression_zp = !prim->decompression_zero_point.empty() || prim->decompression_zero_point_scalar.has_value(); + auto& arg = impl_params->get_program().get_node(impl_params->desc->id).as(); + int idx = !arg.bias_term() ? 3 : 4; if (has_decompression_zp) { ib >> make_data(&_dzp_data_type, sizeof(dnnl::memory::data_type)); - auto& arg = impl_params->get_program().get_node(impl_params->desc->id).as(); - auto decompression_zp_idx = !arg.bias_term() ? 3 : 4; - auto dzp_layout = arg.get_dependency(decompression_zp_idx).get_output_layout(); + auto dzp_layout = arg.get_dependency(idx++).get_output_layout(); if (dzp_layout.count() == 1) { - _attrs->set_zero_points(DNNL_ARG_WEIGHTS, 0, dnnl::memory::dims{}, _dzp_data_type); + _attrs->set_zero_points(DNNL_ARG_WEIGHTS, COMMON, dnnl::memory::dims{}, _dzp_data_type); } else { auto ngroups = dzp_layout.get_dim(1); if (ngroups == 1) { - _attrs->set_zero_points(DNNL_ARG_WEIGHTS, 1 << 1, dnnl::memory::dims{}, _dzp_data_type); + _attrs->set_zero_points(DNNL_ARG_WEIGHTS, PER_OC, dnnl::memory::dims{}, _dzp_data_type); } else { - _attrs->set_zero_points(DNNL_ARG_WEIGHTS, (1 << 1) + (1 << 0), {_ds_group_size, 1}, _dzp_data_type); + _attrs->set_zero_points(DNNL_ARG_WEIGHTS, GROUPED, {_ds_group_size, 1}, _dzp_data_type); } } } + if (dynamic_quantized_activation) { + // TODO: it supports per-token activation scale only + auto partial_shape = impl_params->get_input_layout(0).get_partial_shape(); + auto innermost_len = partial_shape[partial_shape.size() - 1].get_length(); + + auto act_scale_data_type = convert_data_type(impl_params->get_input_layout(idx).data_type); + _attrs->set_scales(DNNL_ARG_SRC, GROUPED, dnnl::memory::dims{1, innermost_len}, act_scale_data_type); + } + if (is_compressed) { auto prim_desc = get_matmul_primitive_descriptor(*impl_params, ib.get_engine(), input_size, has_bias, *_attrs); _pd = *prim_desc; @@ -321,6 +345,7 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { dnnl::memory::data_type ds_data_type = dnnl::memory::data_type::undef; dnnl::memory::data_type dzp_data_type = dnnl::memory::data_type::undef; bool is_four_bit_weight = false; + int idx = !arg.bias_term() ? 1 : 2; // There may be a performance difference between InnerProduct and MatMul primitives in oneDNN, // so use MatMul only for weights compression and IP for all other cases. @@ -329,37 +354,47 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { auto weights_layout = impl_params.get_input_layout(1); is_four_bit_weight = weights_layout.data_type == data_types::u4 || weights_layout.data_type == data_types::i4; if (!prim->decompression_scale.empty()) { - auto decompression_scale_idx = !arg.bias_term() ? 2 : 3; + auto decompression_scale_idx = ++idx; ds_data_type = convert_data_type(arg.get_dependency(decompression_scale_idx).get_output_layout().data_type); auto ifm = arg.get_dependency(1).get_output_layout().get_dim(1); auto ngroups = arg.get_dependency(decompression_scale_idx).get_output_layout().get_dim(1); group_size = ifm / ngroups; if (!is_four_bit_weight) { // 8-bit quantized weight - attr->set_scales(DNNL_ARG_WEIGHTS, 1 << 1, dnnl::memory::dims{}, ds_data_type); + attr->set_scales(DNNL_ARG_WEIGHTS, PER_OC, dnnl::memory::dims{}, ds_data_type); } else { // OneDNN does not support scalar zero-point for s4 and u8 type. Need to broadcast it. - attr->set_scales(DNNL_ARG_WEIGHTS, (1 << 1) + (1 << 0), {group_size, 1}, ds_data_type); + attr->set_scales(DNNL_ARG_WEIGHTS, GROUPED, {group_size, 1}, ds_data_type); } } if (!prim->decompression_zero_point.empty()) { - auto decompression_zp_idx = !arg.bias_term() ? 3 : 4; + auto decompression_zp_idx = ++idx; auto dzp_layout = arg.get_dependency(decompression_zp_idx).get_output_layout(); dzp_data_type = convert_data_type(dzp_layout.data_type); if (dzp_layout.count() == 1) { - attr->set_zero_points(DNNL_ARG_WEIGHTS, 0, dnnl::memory::dims{}, dzp_data_type); + attr->set_zero_points(DNNL_ARG_WEIGHTS, COMMON, dnnl::memory::dims{}, dzp_data_type); } else { auto ngroups = dzp_layout.get_dim(1); if (ngroups == 1) { - attr->set_zero_points(DNNL_ARG_WEIGHTS, 1 << 1, dnnl::memory::dims{}, dzp_data_type); + attr->set_zero_points(DNNL_ARG_WEIGHTS, PER_OC, dnnl::memory::dims{}, dzp_data_type); } else { - attr->set_zero_points(DNNL_ARG_WEIGHTS, (1 << 1) + (1 << 0), {group_size, 1}, dzp_data_type); + attr->set_zero_points(DNNL_ARG_WEIGHTS, GROUPED, {group_size, 1}, dzp_data_type); } } } + if (prim->dynamic_quantized_activation) { + // Note: it supports per-token activation scale only + ++idx; + auto partial_shape = impl_params.input_layouts[0].get_partial_shape(); + auto innermost_len = partial_shape[partial_shape.size() - 1].get_length(); + + auto act_scale_data_type = convert_data_type(impl_params.input_layouts[idx].data_type); + attr->set_scales(DNNL_ARG_SRC, GROUPED, dnnl::memory::dims{1, innermost_len}, act_scale_data_type); + } + auto prim_desc = get_matmul_primitive_descriptor(impl_params, impl_params.prog->get_engine(), prim->input_size, !prim->bias.empty(), *attr); diff --git a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h new file mode 100644 index 00000000000000..49dd62c6332549 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h @@ -0,0 +1,46 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include "intel_gpu/primitives/dynamic_quantize.hpp" +#include "primitive_inst.h" + +#include + +namespace cldnn { + +template <> +struct typed_program_node : public typed_program_node_base { + using parent = typed_program_node_base; + +public: + using parent::parent; + + program_node& input(size_t index = 0) const { return get_dependency(index); } + std::vector get_shape_infer_dependencies() const override { return {}; } +}; + +using dynamic_quantize_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base { + using parent = typed_primitive_inst_base; + using parent::parent; + +public: + template + static std::vector calc_output_layouts(dynamic_quantize_node const& /*node*/, const kernel_impl_params& impl_params); + static layout calc_output_layout(dynamic_quantize_node const& node, kernel_impl_params const& impl_params); + + // Internal function to be used from fakealignment + template + static std::vector __calc_output_layouts(const layout &act_layout, uint64_t group_size); + static std::string to_string(dynamic_quantize_node const& node); + + typed_primitive_inst(network& network, dynamic_quantize_node const& node); +}; + +using dynamic_quantize_inst = typed_primitive_inst; + +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 8dc40d75967b27..7817e929d0d065 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -1072,7 +1072,11 @@ void network::execute_impl(const std::vector& events) { OPENVINO_ASSERT(!bin.empty(), "Failure loading binary from OV_GPU_LoadDumpRawBinary : " + dump_file); auto input_mem = get_primitive(inst->id())->dep_memory_ptr(i); - OPENVINO_ASSERT(input_mem->size() == bin.size(), "memory size mis-match for OV_GPU_LoadDumpRawBinary : " + layer_name); + if (input_mem->size() != bin.size()) { + std::cout << "WARNING: memory size mis-match for OV_GPU_LoadDumpRawBinary : " + layer_name + << " " << input_mem->size() << " / " << bin.size() << std::endl; + bin.resize(input_mem->size()); + } input_mem->copy_from(get_stream(), static_cast(&bin[0]), true); } diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 3268740984d519..7aa69629740206 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -30,6 +30,7 @@ #include "condition_inst.h" #include "gather_inst.h" #include "broadcast_inst.h" +#include "dynamic_quantize_inst.h" #include "experimental_detectron_roi_feature_extractor_inst.hpp" #include "impls/registry/implementation_map.hpp" #include "graph_optimizer/prepare_buffer_fusing.h" @@ -496,7 +497,7 @@ event::ptr primitive_inst::realloc_if_needed() { // Reuse state memory as output for kv cache if possible // otherwise clear _outputs for the cases when mem was reused previously if (_impl_params->can_be_optimized()) { - GPU_DEBUG_TRACE_DETAIL << id() << " : realloc_if_needed: Set kvcache output memmory as variable memory " << variable.get_memory()->buffer_ptr() + GPU_DEBUG_TRACE_DETAIL << id() << " : realloc_if_needed: Set kvcache output memory as variable memory " << variable.get_memory()->buffer_ptr() << " (ptr: " << variable.get_memory()->buffer_ptr() << ", actual_size: " << variable.get_actual_mem_size()/8 << " bytes" << ", variable layout " << variable.get_layout().to_short_string() << ")" << std::endl; @@ -596,10 +597,19 @@ event::ptr primitive_inst::realloc_if_needed() { auto fc_impl_params = *user->_impl_params; auto fc_input_layout = get_fake_aligned_params(fc_impl_params).input_layouts[0]; if (fc_input_layout.bytes_count() > updated_layouts[dep_idx].bytes_count()) { - GPU_DEBUG_TRACE_DETAIL << id() << ": increase output layout allocation size from " << actual_layouts[dep_idx].to_short_string() << " -> " + GPU_DEBUG_TRACE_DETAIL << id() << ": increase output layout allocation size from " + << actual_layouts[dep_idx].to_short_string() << " -> " << fc_input_layout.to_short_string() << " to meet the input buffer alignment requirements for FC\n"; updated_layouts[dep_idx] = fc_input_layout; } + + // dynamic quantization is only applied to activation of FC + if (get_node().is_type()) { + auto dyn_quan_scale_layout = dynamic_quantize_inst::__calc_output_layouts(updated_layouts[dep_idx], 0); + GPU_DEBUG_TRACE_DETAIL << "update layout of dynamic quantize scale parameter layout " + << dyn_quan_scale_layout[1].to_short_string() << std::endl; + updated_params.output_layouts[1] = dyn_quan_scale_layout[1]; + } } } } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl new file mode 100644 index 00000000000000..628bc69f3886df --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl @@ -0,0 +1,80 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/fetch_data.cl" + +#if OUTPUT_DIMS != 4 +#error "dynamic_quantize_gpu_opt.cl: Unsupported output dimension" +#endif + +#define VLOAD_N CAT(vload, VEC_SIZE) +#define VSTORE_N CAT(vstore, VEC_SIZE) +#define CONVERT_CHAR_N CAT(convert_char, VEC_SIZE) +#define AS_TYPE_N_(type, n, x) as_##type##n(x) +#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x) +#define AS_INPUT_TYPE_N(x) AS_TYPE_N(INPUT0_TYPE, VEC_SIZE, x) + +REQD_SUB_GROUP_SIZE(SIMD) +KERNEL(dynamic_quantize_gpu_opt)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + __global OUTPUT1_TYPE* output_scale) +{ + const uint bf = (uint)get_global_id(2); + const uint sglid = get_sub_group_local_id(); + const uint local_id = (uint)get_local_id(1); + + const uint block_size = SIMD * VEC_SIZE; + const uint b_offset = bf * INPUT0_FEATURE_PITCH; + + const uint offset = b_offset + VEC_SIZE * sglid; + + const uint iteration = ALIGNED_BLOCK_NUM / BLOCK_NUM; + + __local half local_mem[BLOCK_NUM]; + + MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_SIZE) val[iteration]; + MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_SIZE) abs_val; + half max = 0.0h; + half grp_max = 0.001h; + half max_value; + + unroll_for(int i = 0; i < iteration; ++i) { + if ((local_id * iteration + i) >= TOTAL_BLOCK_NUM) + continue; + + val[i] = AS_INPUT_TYPE_N(VLOAD_N(0, input + offset + ((local_id * iteration + i) * block_size))); + abs_val = fabs(val[i]); + + unroll_for (int j = 0; j < VEC_SIZE; j++) { + max = fmax(max, abs_val[j]); + } + + grp_max = fmax(grp_max, max); + } + + max_value = sub_group_reduce_max(grp_max); + if (sglid == 0) + local_mem[local_id] = max_value; + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int j = 0; j < BLOCK_NUM; j++) { + max_value = fmax(max_value, local_mem[j]); + } + + half scale = 127.0h / max_value; + + unroll_for(int i = 0; i < iteration; ++i) { + if ((local_id * iteration + i) >= TOTAL_BLOCK_NUM) + continue; + + val[i] *= scale; + VSTORE_N(CONVERT_CHAR_N(val[i]), 0, output + offset + ((local_id * iteration + i) * block_size)); + } + + if (sglid == 0 && local_id == 0) + output_scale[bf] = 1.0h / scale; +} diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl new file mode 100644 index 00000000000000..436276a67e48c0 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl @@ -0,0 +1,55 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/fetch_data.cl" + +#if OUTPUT_DIMS != 4 +#error "dynamic_quantize_gpu_ref.cl: Unsupported output dimension" +#endif + +KERNEL(dynamic_quantize_gpu_ref)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + __global OUTPUT1_TYPE* output_scale) +{ + const uint bf = (uint)get_global_id(0); + const uint b = (uint)get_global_id(0) / INPUT0_FEATURE_NUM; + const uint f = (uint)get_global_id(0) % INPUT0_FEATURE_NUM; + const uint y = (uint)get_global_id(1); + const uint scale_idx = OUTPUT1_GET_INDEX(b, f, y, 0); + + half max_val = 0.0001h; + for (int y_off = 0; y_off < (get_global_size(1) == 1 ? INPUT0_SIZE_Y : 1); y_off++) { + const uint offset = INPUT0_GET_INDEX(b, f, y + y_off, 0); + int x; + for (x = 0; x < INPUT0_SIZE_X / 8; x++) { + half8 val = as_half8(vload8(0, (ushort*)input + offset + x * 8)); + half8 abs_val = fabs(val); + + for (int j = 0; j < 8; j++) + max_val = fmax(max_val, abs_val[j]); + } + x *= 8; + for (; x < INPUT0_SIZE_X; x++) + max_val = fmax(max_val, fabs(input[offset + x])); + } + + half scale = 127.0h / max_val; + for (int y_off = 0; y_off < (get_global_size(1) == 1 ? INPUT0_SIZE_Y : 1); y_off++) { + const uint in_offset = INPUT0_GET_INDEX(b, f, y + y_off, 0); + const uint out_offset = OUTPUT_GET_INDEX(b, f, y + y_off, 0); + int x; + for (x = 0; x < INPUT0_SIZE_X / 8; x++) { + half8 val = as_half8(vload8(0, (ushort*)input + in_offset + x * 8)); + val *= scale; + vstore8(convert_char8(val), 0, output + out_offset + x * 8); + } + x *= 8; + for (; x < INPUT0_SIZE_X; x++) + output[out_offset + x] = convert_char(input[in_offset + x] * scale); + } + + output_scale[scale_idx] = 1.0h / scale; +} diff --git a/src/plugins/intel_gpu/src/kernel_selector/common_types.h b/src/plugins/intel_gpu/src/kernel_selector/common_types.h index d0aba8554eccc7..bf96ff16aa4c8f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/common_types.h +++ b/src/plugins/intel_gpu/src/kernel_selector/common_types.h @@ -98,7 +98,8 @@ enum class KernelType { UNIQUE_GATHER, RMS, SWIGLU, - ROPE + ROPE, + DYNAMIC_QUANTIZE }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector.cpp index 160e6744a03928..a4aac48b71cc5d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector.cpp @@ -188,6 +188,8 @@ KernelList kernel_selector_base::GetAllImplementations(const Params& params, Ker [](const PriorityPair& impl) { return std::move(impl.second); }); + } else { + GPU_DEBUG_COUT << "No implementation for " << params.layerID << " because of kernel type mismatch" << std::endl; } return result; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp new file mode 100644 index 00000000000000..d4b5268eaee4e4 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp @@ -0,0 +1,154 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "dynamic_quantize_kernel_opt.h" +#include "kernel_selector_utils.h" +#include + + +static constexpr size_t simd = 16; + +namespace kernel_selector { +static std::pair get_input_bf_size(const dynamic_quantize_params& params) { + size_t input_f = params.inputs[0].Feature().v; + size_t input_batch = params.inputs[0].Batch().v; + // 3D input + if (params.outputs[0].GetLayout() == DataLayout::bfyx) { + input_f = params.inputs[0].Y().v * params.inputs[0].X().v; + input_batch = params.inputs[0].Batch().v * params.inputs[0].Feature().v; + } + + return {input_batch, input_f}; +} + +static size_t get_match_vector_size(const dynamic_quantize_params& params) { + auto block_sizes = { 8, 4, 2 }; + + for (auto block_size : block_sizes) { + if (((params.inputs[0].X().v * params.inputs[0].Y().v) / simd) % block_size == 0) { + return block_size; + } + } + + return 1; +} + +ParamsKey DynamicQuantizeKernelOpt::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::INT8); + k.EnableDifferentTypes(); + k.EnableAllInputLayout(); + k.EnableAllOutputLayout(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDynamicShapesSupport(); + return k; +} + +JitConstants DynamicQuantizeKernelOpt::GetJitConstants(const dynamic_quantize_params& params) const { + JitConstants jit = MakeBaseParamsJitConstants(params); + + auto vec_size = get_match_vector_size(params); + auto bf_size = get_input_bf_size(params); + auto total_block_num = bf_size.second / (simd * vec_size); + size_t aligned_block_num = (total_block_num > 32) ? Align(total_block_num, 32) : total_block_num; + size_t block_num = (total_block_num > 32) ? 32 : total_block_num; + + jit.AddConstant(MakeJitConstant("VEC_SIZE", vec_size)); + jit.AddConstant(MakeJitConstant("SIMD", simd)); + jit.AddConstant(MakeJitConstant("TOTAL_BLOCK_NUM", total_block_num)); + jit.AddConstant(MakeJitConstant("ALIGNED_BLOCK_NUM", aligned_block_num)); + jit.AddConstant(MakeJitConstant("BLOCK_NUM", block_num)); + jit.Merge(GetTensorFriendlyWorkGroupsJit(params.outputs[0])); + + return jit; +} + +CommonDispatchData DynamicQuantizeKernelOpt::SetDefault(const dynamic_quantize_params& params) const { + CommonDispatchData dispatchData; + + auto vec_size = get_match_vector_size(params); + auto bf_size = get_input_bf_size(params); + size_t total_block_num = bf_size.second / (simd * vec_size); + size_t batch = get_input_bf_size(params).first; + size_t block_num = (total_block_num > 32) ? 32 : total_block_num; + + dispatchData.gws = {simd, block_num, batch}; + dispatchData.lws = {simd, block_num, 1}; + + return dispatchData; +} + +void DynamicQuantizeKernelOpt::GetUpdateDispatchDataFunc(KernelData& kd) const { + kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) { + const auto& prim_params = static_cast(params); + auto dispatchData = SetDefault(prim_params); + OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func"); + kd.kernels[0].params.workGroups.global = dispatchData.gws; + kd.kernels[0].params.workGroups.local = dispatchData.lws; + kd.kernels[0].skip_execution = false; + + GPU_DEBUG_TRACE_DETAIL << "Update Dispatch data DynamicQuantizeKernelOpt gws : " << dispatchData.gws[0] << ", " + << dispatchData.gws[1] << ", " << dispatchData.gws[2] << std::endl; + }; +} + +KernelsData DynamicQuantizeKernelOpt::GetKernelsData(const Params& params) const { + assert(params.GetType() == KernelType::DYNAMIC_QUANTIZE); + + if (!Validate(params)) + return {}; + + const dynamic_quantize_params& prim_params = static_cast(params); + auto dispatchData = SetDefault(prim_params); + + KernelData kd = KernelData::Default(params); + + auto cldnn_jit = GetJitConstants(prim_params); + auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, params); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); + + GetUpdateDispatchDataFunc(kd); + + auto& kernel = kd.kernels[0]; + FillCLKernelData(kernel, + dispatchData, + params.engineInfo, + kernelName, + jit, + entry_point, + EXE_MODE_DEFAULT, + false, + false, + 1, + GetFusedPrimitiveInputsCount(params), + static_cast(prim_params.outputs.size()), + prim_params.is_shape_agnostic); + + return {kd}; +} + +KernelsPriority DynamicQuantizeKernelOpt::GetKernelsPriority(const Params& /*params*/) const { + return FORCE_PRIORITY_2; +} + +bool DynamicQuantizeKernelOpt::Validate(const Params& params) const { + if (!KernelBaseOpenCL::Validate(params)) + return false; + + const auto& dq_params = static_cast(params); + + // Todo : Add proper exception here + if (((dq_params.inputs[0].X().v * dq_params.inputs[0].Y().v) % (simd * 2)) != 0) + return false; + + if (dq_params.inputs[0].GetPaddedVal() != 0 || dq_params.outputs[0].GetPaddedVal() != 0) + return false; + + return true; +} +} // namespace kernel_selector + diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.h new file mode 100644 index 00000000000000..3a95836080d2eb --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.h @@ -0,0 +1,30 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_base_opencl.h" +#include "dynamic_quantize_kernel_ref.h" + +namespace kernel_selector { +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// dynamic_quantize_params +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +class DynamicQuantizeKernelOpt : public KernelBaseOpenCL { +public: + DynamicQuantizeKernelOpt() : KernelBaseOpenCL("dynamic_quantize_gpu_opt") {} + virtual ~DynamicQuantizeKernelOpt() {} + + virtual JitConstants GetJitConstants(const dynamic_quantize_params& params) const; + virtual CommonDispatchData SetDefault(const dynamic_quantize_params& params) const; + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + Datatype GetAccumulatorType(const dynamic_quantize_params& params) const; + ParamsKey GetSupportedKey() const override; + +protected: + bool Validate(const Params&) const override; + void GetUpdateDispatchDataFunc(KernelData& kd) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp new file mode 100644 index 00000000000000..3b214848e2f8ad --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp @@ -0,0 +1,99 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "dynamic_quantize_kernel_ref.h" +#include "kernel_selector_utils.h" +#include + +namespace kernel_selector { +ParamsKey DynamicQuantizeKernelRef::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::INT8); + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDifferentTypes(); + k.EnableDynamicShapesSupport(); + return k; +} + +JitConstants DynamicQuantizeKernelRef::GetJitConstants(const dynamic_quantize_params& params) const { + JitConstants jit = MakeBaseParamsJitConstants(params); + + jit.Merge(GetTensorFriendlyWorkGroupsJit(params.outputs[0])); + + return jit; +} + +CommonDispatchData DynamicQuantizeKernelRef::SetDefault(const dynamic_quantize_params& params) const { + GPU_DEBUG_GET_INSTANCE(debug_config); + CommonDispatchData dispatchData; + + OPENVINO_ASSERT(params.outputs[0].GetLayout() == DataLayout::bfyx, "It supports only 4d tensor"); + dispatchData.gws = {params.outputs[0].Batch().v * params.outputs[0].Feature().v, 1, 1}; + dispatchData.lws = {1, 1, 1}; + + return dispatchData; +} + +void DynamicQuantizeKernelRef::GetUpdateDispatchDataFunc(KernelData& kd) const { + kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) { + const auto& prim_params = static_cast(params); + auto dispatchData = SetDefault(prim_params); + OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func"); + kd.kernels[0].params.workGroups.global = dispatchData.gws; + kd.kernels[0].params.workGroups.local = dispatchData.lws; + kd.kernels[0].skip_execution = false; + }; +} + +KernelsData DynamicQuantizeKernelRef::GetKernelsData(const Params& params) const { + assert(params.GetType() == KernelType::DYNAMIC_QUANTIZE); + + if (!Validate(params)) + return {}; + + const dynamic_quantize_params& prim_params = static_cast(params); + auto dispatchData = SetDefault(prim_params); + + KernelData kd = KernelData::Default(params); + + auto cldnn_jit = GetJitConstants(prim_params); + auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, params); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); + + GetUpdateDispatchDataFunc(kd); + + auto& kernel = kd.kernels[0]; + FillCLKernelData(kernel, + dispatchData, + params.engineInfo, + kernelName, + jit, + entry_point, + EXE_MODE_DEFAULT, + false, + false, + 1, + GetFusedPrimitiveInputsCount(params), + static_cast(prim_params.outputs.size()), + prim_params.is_shape_agnostic); + + return {kd}; +} + +KernelsPriority DynamicQuantizeKernelRef::GetKernelsPriority(const Params& /*params*/) const { + return FORCE_PRIORITY_8; +} + +bool DynamicQuantizeKernelRef::Validate(const Params& params) const { + if (!KernelBaseOpenCL::Validate(params)) + return false; + + return true; +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h new file mode 100644 index 00000000000000..ce52ed9fb19714 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h @@ -0,0 +1,33 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_base_opencl.h" + +namespace kernel_selector { +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// dynamic_quantize_params +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct dynamic_quantize_params : public base_params { + dynamic_quantize_params() : base_params(KernelType::DYNAMIC_QUANTIZE) {} +}; + +class DynamicQuantizeKernelRef : public KernelBaseOpenCL { +public: + DynamicQuantizeKernelRef() : KernelBaseOpenCL("dynamic_quantize_gpu_ref") {} + virtual ~DynamicQuantizeKernelRef() {} + + virtual JitConstants GetJitConstants(const dynamic_quantize_params& params) const; + virtual CommonDispatchData SetDefault(const dynamic_quantize_params& params) const; + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + Datatype GetAccumulatorType(const dynamic_quantize_params& params) const; + ParamsKey GetSupportedKey() const override; + +protected: + bool Validate(const Params&) const override; + void GetUpdateDispatchDataFunc(KernelData& kd) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp new file mode 100644 index 00000000000000..6ca9fbd2f5bd76 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp @@ -0,0 +1,18 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "dynamic_quantize_kernel_selector.h" +#include "dynamic_quantize_kernel_ref.h" +#include "dynamic_quantize_kernel_opt.h" + +namespace kernel_selector { +dynamic_quantize_kernel_selector::dynamic_quantize_kernel_selector() { + Attach(); + Attach(); +} + +KernelsData dynamic_quantize_kernel_selector::GetBestKernels(const Params& params) const { + return GetNaiveBestKernel(params, KernelType::DYNAMIC_QUANTIZE); +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.h new file mode 100644 index 00000000000000..962aca7234f601 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.h @@ -0,0 +1,23 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector { +class dynamic_quantize_kernel_selector : public kernel_selector_base { +public: + static dynamic_quantize_kernel_selector& Instance() { + static dynamic_quantize_kernel_selector instance_; + return instance_; + } + + dynamic_quantize_kernel_selector(); + + virtual ~dynamic_quantize_kernel_selector() {} + + KernelsData GetBestKernels(const Params& params) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/plugin/compiled_model.cpp b/src/plugins/intel_gpu/src/plugin/compiled_model.cpp index b4d3e658e410a6..b9729ca7bf0f20 100644 --- a/src/plugins/intel_gpu/src/plugin/compiled_model.cpp +++ b/src/plugins/intel_gpu/src/plugin/compiled_model.cpp @@ -256,9 +256,9 @@ ov::Any CompiledModel::get_property(const std::string& name) const { ov::PropertyName{ov::num_streams.name(), PropertyMutability::RO}, ov::PropertyName{ov::hint::num_requests.name(), PropertyMutability::RO}, ov::PropertyName{ov::hint::inference_precision.name(), PropertyMutability::RO}, + ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RO}, ov::PropertyName{ov::device::id.name(), PropertyMutability::RO}, ov::PropertyName{ov::execution_devices.name(), PropertyMutability::RO}, - ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RO} }; } else if (name == ov::model_name) { return decltype(ov::model_name)::value_type {m_model_name}; diff --git a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp new file mode 100644 index 00000000000000..0373251e45c051 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp @@ -0,0 +1,33 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ov_ops/dynamic_quantize.hpp" +#include "intel_gpu/plugin/program_builder.hpp" +#include "intel_gpu/plugin/common_utils.hpp" +#include "intel_gpu/primitives/dynamic_quantize.hpp" + +namespace ov { +namespace intel_gpu { + +static void CreateDynamicQuantizeOp(ProgramBuilder& p, const std::shared_ptr& op) { + validate_inputs_count(op, {1}); + auto inputs = p.GetInputInfo(op); + std::string primitive_name = layer_type_name_ID(op); + + auto group_sizes = op->get_group_sizes(); + for (size_t i = 0; i < group_sizes.size() - 1; i++) + OPENVINO_ASSERT(group_sizes[i] == 1, "Not supported group size at ", i, ": ", group_sizes[i]); + + OPENVINO_ASSERT(group_sizes.back() == UINT64_MAX, "Not supported group size: ", group_sizes.back()); + auto prim = cldnn::dynamic_quantize(primitive_name, + inputs[0], + op->get_group_sizes().back(), + get_output_data_types(op)); + p.add_primitive(*op, prim); +} + +REGISTER_FACTORY_IMPL(internal, DynamicQuantize); + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp index 9b3a6492ec364b..7c0c570f7cf54c 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp @@ -26,22 +26,24 @@ namespace ov { namespace intel_gpu { static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::shared_ptr& op) { - validate_inputs_count(op, {4, 5}); + validate_inputs_count(op, {4, 5, 6}); auto inputs = p.GetInputInfo(op); std::string primitive_name = layer_type_name_ID(op); auto supports_immad = p.get_engine().get_device_info().supports_immad; - const int INPUT_CNT_WITH_ZP = 5; auto input_name = inputs[0].pid; auto weights_name = inputs[1].pid; auto bias_name = inputs[2].pid; auto scale_name = inputs[3].pid; - auto zp_name = inputs.size() == INPUT_CNT_WITH_ZP ? inputs[4].pid : ""; + size_t input_idx = 4; + const size_t W_ZP_IDX = input_idx; + std::string zp_name = op->get_input_size() > input_idx ? inputs[input_idx++].pid : ""; + auto activation_scale_input = op->get_input_size() > input_idx ? inputs[input_idx++] : cldnn::input_info(); float zp_value = 0.0f; bool has_scalar_zp = false; - if (op->get_input_size() == INPUT_CNT_WITH_ZP) { - auto zp_const = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(INPUT_CNT_WITH_ZP-1)); + if (zp_name.size() > 0) { + auto zp_const = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(W_ZP_IDX)); if (zp_const && ov::shape_size(zp_const->get_output_shape(0)) == 1) { has_scalar_zp = true; zp_value = zp_const->cast_vector()[0]; @@ -55,10 +57,12 @@ static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::share bias_name, scale_name, has_scalar_zp && !supports_immad ? "" : zp_name, + activation_scale_input, cldnn::element_type_to_data_type(op->get_output_element_type(0)), op->get_input_partial_shape(0).size(), op->get_input_partial_shape(1).size()); + if (has_scalar_zp) { fc.decompression_zero_point_scalar = zp_value; } diff --git a/src/plugins/intel_gpu/src/plugin/program_builder.cpp b/src/plugins/intel_gpu/src/plugin/program_builder.cpp index 57e9bfdf444bf9..aae9b163b4f6bf 100644 --- a/src/plugins/intel_gpu/src/plugin/program_builder.cpp +++ b/src/plugins/intel_gpu/src/plugin/program_builder.cpp @@ -381,7 +381,7 @@ void validate_inputs_count(const std::shared_ptr& op, std::vectorget_input_size(), ") in )", + OPENVINO_THROW("Invalid inputs count (", op->get_input_size(), ") in ", op->get_friendly_name(), " (", op->get_type_name(), " ", op->get_type_info().version_id, ")"); } diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp new file mode 100644 index 00000000000000..a2e4a1a38983c8 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -0,0 +1,85 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "dynamic_quantize_fully_connected.hpp" + +#include "intel_gpu/op/fully_connected_compressed.hpp" +#include "intel_gpu/op/placeholder.hpp" +#include "ov_ops/dynamic_quantize.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "transformations/utils/utils.hpp" +#include "intel_gpu/runtime/debug_configuration.hpp" + +namespace ov { +namespace intel_gpu { + +DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(uint64_t group_size) { + GPU_DEBUG_GET_INSTANCE(debug_config); + using namespace ov::pass::pattern; + + // per-token quantization is supported + if (group_size != UINT64_MAX) { + GPU_DEBUG_TRACE << "Dynamic quantization is disabled " << group_size << std::endl; + return; + } + auto is_dynamic = [](const ov::Output& output) -> bool { + bool is_dynamic = output.get_node_shared_ptr()->get_output_partial_shape(0).is_dynamic(); + size_t num_inputs = output.get_node_shared_ptr()->get_input_size(); + for (size_t idx = 0; idx < num_inputs; idx++) { + is_dynamic |= output.get_node_shared_ptr()->get_input_partial_shape(idx).is_dynamic(); + } + return is_dynamic; + }; + + auto data = any_input(); + auto fully_connected_compressed3 = wrap_type({data, any_input(), any_input(), any_input()}, is_dynamic); + auto fully_connected_compressed4 = wrap_type({data, any_input(), any_input(), any_input(), any_input()}, is_dynamic); + auto fully_connected_compressed = std::make_shared(OutputVector{fully_connected_compressed3, fully_connected_compressed4}); + + + ov::matcher_pass_callback callback = [=](Matcher& m) { + if (transformation_callback(m.get_match_root())) { + return false; + } + const auto& pattern_map = m.get_pattern_value_map(); + const auto& m_data = pattern_map.at(data).get_node_shared_ptr(); + + auto m_fc = std::dynamic_pointer_cast(m.get_match_root()); + + auto weight_shape = m_fc->get_input_partial_shape(1); + const size_t innermost_size = weight_shape[weight_shape.size() - 1].get_length(); + if (group_size != UINT64_MAX && + (group_size == 0 || (innermost_size % group_size != 0 && innermost_size > group_size))) { + GPU_DEBUG_TRACE << "Dynamic quantization: shape is not aligned with group size " << innermost_size << " / " << group_size << std::endl; + return false; + } + + auto rank = m_fc->get_input_partial_shape(0).size(); + std::vector shape_group_size(rank, 1); + shape_group_size.back() = group_size; + auto dyn_quan = std::make_shared(m_data, shape_group_size, element::f16); + auto optional_w_zp = m_fc->get_input_size() > 4 ? m_fc->get_input_node_shared_ptr(4) : std::make_shared(); + + auto new_fc = std::make_shared(dyn_quan->output(0), + m_fc->get_input_node_shared_ptr(1), + m_fc->get_input_node_shared_ptr(2), + m_fc->get_input_node_shared_ptr(3), + optional_w_zp, + dyn_quan->output(1), + m_fc->get_output_type()); + ov::replace_node(m_fc, new_fc); + + new_fc->set_friendly_name(m_fc->get_friendly_name()); + ov::copy_runtime_info(m_fc, new_fc); + return true; + }; + auto m = std::make_shared(fully_connected_compressed, "DynamicQuantizeFullyConnected"); + this->register_matcher(m, callback); +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.hpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.hpp new file mode 100644 index 00000000000000..b5d956f7872b5c --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.hpp @@ -0,0 +1,19 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + +class DynamicQuantizeFullyConnected: public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("DynamicQuantizeFullyConnected", "0"); + DynamicQuantizeFullyConnected(uint64_t group_size); +}; + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/fully_connected_compressed.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/fully_connected_compressed.cpp index 571d4fcbd576ae..2e3819d7e850ee 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/fully_connected_compressed.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/fully_connected_compressed.cpp @@ -11,22 +11,36 @@ namespace op { FullyConnectedCompressed::FullyConnectedCompressed(const ov::Output& A, const ov::Output& B, const ov::Output& bias, - const ov::Output& decompression_scale, - const ov::Output& decompression_zero_point, + const ov::Output& w_decompression_scale, + const ov::Output& w_decompression_zero_point, + const ov::Output& a_decompression_scale, const ov::element::Type output_type) : FullyConnected(A, B, bias, output_type) { - set_argument(3, decompression_scale); - set_argument(4, decompression_zero_point); + set_argument(3, w_decompression_scale); + set_argument(4, w_decompression_zero_point); + set_argument(5, a_decompression_scale); validate_and_infer_types(); } FullyConnectedCompressed::FullyConnectedCompressed(const ov::Output& A, const ov::Output& B, const ov::Output& bias, - const ov::Output& decompression_scale, + const ov::Output& w_decompression_scale, + const ov::Output& w_decompression_zero_point, const ov::element::Type output_type) : FullyConnected(A, B, bias, output_type) { - set_argument(3, decompression_scale); + set_argument(3, w_decompression_scale); + set_argument(4, w_decompression_zero_point); + validate_and_infer_types(); +} + +FullyConnectedCompressed::FullyConnectedCompressed(const ov::Output& A, + const ov::Output& B, + const ov::Output& bias, + const ov::Output& w_decompression_scale, + const ov::element::Type output_type) + : FullyConnected(A, B, bias, output_type) { + set_argument(3, w_decompression_scale); validate_and_infer_types(); } @@ -46,10 +60,16 @@ std::shared_ptr FullyConnectedCompressed::clone_with_new_inputs(const new_args.at(3), new_args.at(4), m_output_type); + else if (new_args.size() == 6) + return std::make_shared(new_args.at(0), + new_args.at(1), + new_args.at(2), + new_args.at(3), + new_args.at(4), + new_args.at(6), + m_output_type); else - OPENVINO_THROW("Unexpected inputs count for FullyConnectedCompressed op: ", new_args.size()); -} - + OPENVINO_THROW("Unexpected inputs count for FullyConnectedCompressed op: ", new_args.size());} } // namespace op } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 1da31af3ac5c16..181dd39252f3a0 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -71,6 +71,7 @@ #include "plugin/transformations/unsqueeze_broadcast_reshape_matmul_fusion.hpp" #include "plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.hpp" #include "plugin/transformations/group_norm_composition.hpp" +#include "plugin/transformations/dynamic_quantize_fully_connected.hpp" #include "transformations/common_optimizations/rms_fusion.hpp" #include "transformations/common_optimizations/broadcast_elementwise_fusion.hpp" #include "transformations/common_optimizations/broadcast_transition.hpp" @@ -848,6 +849,25 @@ void TransformationsPipeline::apply(std::shared_ptr func) { pass_config->disable(); pass_config->disable(); + auto dynamic_quantization_group_size = config.get_property(ov::hint::dynamic_quantization_group_size); + if (device_info.supports_immad) { // XXX: 1048576 is considered per-token + pass_config->set_callback([=](const_node_ptr& root) -> bool { + if (root->get_input_node_shared_ptr(0)->get_element_type() == ov::element::Type_t::f32) { + GPU_DEBUG_TRACE << root->get_friendly_name() << " Dynamic quantization is turned off because input type is not supported" << std::endl; + return true; + } + + auto weight_shape = root->get_input_partial_shape(1); + const size_t innermost_size = weight_shape[weight_shape.size() - 1].get_length(); + if (innermost_size < 32) { + GPU_DEBUG_TRACE << "Dynamic quantization: shape is too small " << innermost_size << " / " << dynamic_quantization_group_size << std::endl; + return true; + } + return false; + }); + manager.register_pass(dynamic_quantization_group_size); + } + // This is supposed to be the last pass to ensure that we don't have name collisions until // GPU plugin stops using friendly names for program creation manager.register_pass(true); diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp index b7bb9947717ad0..6c745d67c733cc 100644 --- a/src/plugins/intel_gpu/src/runtime/execution_config.cpp +++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp @@ -46,6 +46,7 @@ void ExecutionConfig::set_default() { std::make_tuple(ov::hint::execution_mode, ov::hint::ExecutionMode::PERFORMANCE), std::make_tuple(ov::hint::num_requests, 0), std::make_tuple(ov::hint::enable_cpu_pinning, false), + std::make_tuple(ov::hint::dynamic_quantization_group_size, 0), std::make_tuple(ov::intel_gpu::hint::host_task_priority, ov::hint::Priority::MEDIUM), std::make_tuple(ov::intel_gpu::hint::queue_throttle, ov::intel_gpu::hint::ThrottleLevel::MEDIUM), @@ -192,6 +193,10 @@ void ExecutionConfig::apply_debug_options(const cldnn::device_info& info) { GPU_DEBUG_IF(debug_config->disable_dynamic_impl == 1) { set_property(ov::intel_gpu::use_only_static_kernels_for_dynamic_shape(true)); } + + GPU_DEBUG_IF(debug_config->enable_dynamic_quantize) { + set_property(ov::hint::dynamic_quantization_group_size(UINT64_MAX)); + } } void ExecutionConfig::apply_hints(const cldnn::device_info& info) { diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp index 9a439132af0bae..b667a17fb233b8 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp @@ -56,7 +56,9 @@ using MatmulWeightsDecompressionParams = std::tuple; // per-tensor zero-point + bool, // per-tensor zero-point + int // dynamic_quantization_group_size + >; class MatmulWeightsDecompression : public testing::WithParamInterface, virtual public ov::test::SubgraphBaseTest { @@ -69,6 +71,7 @@ class MatmulWeightsDecompression : public testing::WithParamInterface +#include +#include "intel_gpu/primitives/dynamic_quantize.hpp" +#include +#include + +#include "intel_gpu/runtime/compilation_context.hpp" +#include "fully_connected_inst.h" + +#include + +using namespace cldnn; +using namespace ::tests; + +class dynamic_quantization_gpu_tests: public ::testing::Test { +public: + + void test_dynamic_quantization(bool is_caching_test, bool is_dynamic, int batch = 1, int ifm = 1024) { + tests::random_generator rg(GET_SUITE_NAME); + auto& engine = get_test_engine(); + + long int batch_num = batch; + long int ifm_num = ifm; + + bool is_4d = true; + + auto input_ps = is_4d ? ov::PartialShape{ batch_num, 1, 1, ifm_num } : ov::PartialShape{ batch_num, ifm_num}; + auto dyn_input_ps = is_4d ? ov::PartialShape{ -1, 1, 1, ifm_num } : ov::PartialShape{ -1, ifm_num}; + auto input_mem = engine.allocate_memory({ input_ps, data_types::f32, format::bfyx }); + + auto input_data = rg.generate_random_1d(batch_num * ifm_num, -16.0f, 16.0f); + set_values(input_mem, input_data); + + auto in_layout_f32 = is_dynamic ? layout{ dyn_input_ps, data_types::f32, format::bfyx } + : layout{ input_ps, data_types::f32, format::bfyx }; + + auto in_layout = is_dynamic ? layout{ dyn_input_ps, data_types::f16, format::bfyx } + : layout{ input_ps, data_types::f16, format::bfyx }; + + auto reorder_1 = reorder("reorder_1", input_info("input"), layout{ input_ps, data_types::f16, format::bfyx }); + auto dyn_quan_prim = dynamic_quantize("dyn_quan_prim", input_info("reorder_1"), 32, {data_types::f16, data_types::i8}); + auto reorder_2 = reorder("reorder_2", input_info("dyn_quan_prim"), layout{ input_ps, data_types::f16, format::bfyx }); + + // Implemented dynamic quantize kernel + auto get_ref_results = [&]() { + topology topology( + input_layout("input", in_layout_f32), + reorder_1, + dyn_quan_prim, + reorder_2 + ); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + + ov::intel_gpu::ImplementationDesc dyn_quan_impl_desc = { format::bfyx, "dynamic_quantize_gpu_ref", impl_types::ocl }; + config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"dyn_quan_prim", dyn_quan_impl_desc} })); + + network network(engine, topology, config); + network.set_input_data("input", input_mem); + + auto outputs = network.execute(); + + auto output_layout = outputs.begin()->second.get_layout(); + auto output_mem = outputs.begin()->second.get_memory(); + + return engine.reinterpret_buffer(*output_mem, output_layout); + }; + + topology topology( + input_layout("input", in_layout_f32), + reorder_1, + dyn_quan_prim, + reorder_2 + ); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); + + network->set_input_data("input", input_mem); + + auto outputs = network->execute(); + + auto output_mem = outputs.begin()->second.get_memory(); + cldnn::mem_lock output_ptr (output_mem, get_test_stream()); + + auto ref_output_mem = get_ref_results(); + cldnn::mem_lock output_ptr_ref (ref_output_mem, get_test_stream()); + + size_t count = 0; + float max_diff = 0.f; + float avg = 0.f; + for (size_t i = 0; i < output_ptr_ref.size(); ++i) { + auto abs_diff = std::abs(output_ptr_ref[i] - output_ptr[i]); + if (max_diff < abs_diff) + max_diff = abs_diff; + avg = abs_diff; + count++; + OPENVINO_ASSERT(abs_diff < 1); + } + GPU_DEBUG_LOG << "---> count: " << count << ", max_diff:" << max_diff << ", avg_diff: " << (avg/count) << std::endl; + } +}; + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_large_size) { + this->test_dynamic_quantization(false, false, 2048, 4096); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_large_size_dynamic) { + this->test_dynamic_quantization(false, true, 2048, 4096); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_small_size) { + this->test_dynamic_quantization(false, false, 64, 4096); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_single_batch) { + this->test_dynamic_quantization(false, false, 1, 4096); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_ref_only) { + this->test_dynamic_quantization(false, false, 16, 33); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_ref_only_dynamic) { + this->test_dynamic_quantization(false, true, 16, 33); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_invalid) { + this->test_dynamic_quantization(false, false, 16, 7); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_unaligned) { + this->test_dynamic_quantization(false, false, 16, 32); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_unaligned_dynamic) { + this->test_dynamic_quantization(false, true, 16, 32); +} diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp index 73f159b1290d6f..104684964e2b2c 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp @@ -71,11 +71,11 @@ class check_hash_value: public ::testing::Test { const auto primitive_hash = primitve->hash(); const auto params_hash = prim_inst->get_fake_aligned_params(*prim_inst->get_impl_params()).hash(); if (!engine.get_device_info().supports_immad) { - ASSERT_EQ(primitive_hash, 14259723886449306729UL); - ASSERT_EQ(params_hash, 3365957578641948513UL); + ASSERT_EQ(primitive_hash, 8017451717095756666UL); + ASSERT_EQ(params_hash, 4374037685392472517UL); } else { - ASSERT_EQ(primitive_hash, 14259723886449306729UL); - ASSERT_EQ(params_hash, 9831190959346679696UL); + ASSERT_EQ(primitive_hash, 8017451717095756666UL); + ASSERT_EQ(params_hash, 17704411706121042861UL); } } diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index 7ab8ee9adda866..915b0d4c3954fe 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit 7ab8ee9adda866d675edeee7a3a6a29b2d0a1572 +Subproject commit 915b0d4c3954fe78f5e9b4dc28221b858f8450bd