From 2fe52fb5543d6efc602b385c6cf0a30bafd0ab04 Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Sat, 8 Jun 2024 19:16:07 +0900 Subject: [PATCH 01/29] [GPU] FC dynamic quantization with OneDNN Signed-off-by: Kim, Mingyu Signed-off-by: Min, Byungil --- .../include/intel_gpu/op/dynamic_quantize.hpp | 37 +++++ .../op/fully_connected_compressed.hpp | 13 ++ .../intel_gpu/plugin/primitives_list.hpp | 1 + .../intel_gpu/primitives/dynamic_quantize.hpp | 58 ++++++++ .../intel_gpu/primitives/fully_connected.hpp | 45 +++++++ .../intel_gpu/runtime/debug_configuration.hpp | 1 + .../intel_gpu/src/graph/dynamic_quantize.cpp | 63 +++++++++ .../intel_gpu/src/graph/fully_connected.cpp | 3 + .../graph_optimizer/build_implementations.cpp | 3 + .../prepare_primitive_fusing.cpp | 2 + .../src/graph/impls/ocl/dynamic_quantize.cpp | 73 ++++++++++ .../src/graph/impls/ocl/register.cpp | 1 + .../src/graph/impls/ocl/register.hpp | 2 + .../impls/onednn/fully_connected_onednn.cpp | 44 +++++- .../src/graph/include/dynamic_quantize_inst.h | 46 +++++++ src/plugins/intel_gpu/src/graph/network.cpp | 5 +- .../intel_gpu/src/graph/primitive_inst.cpp | 11 +- .../cl_kernels/dynamic_quantize_gpu_opt.cl | 57 ++++++++ .../cl_kernels/dynamic_quantize_gpu_ref.cl | 57 ++++++++ .../src/kernel_selector/common_types.h | 3 +- .../src/kernel_selector/kernel_selector.cpp | 2 + .../dynamic_quantize_kernel_opt.cpp | 127 ++++++++++++++++++ .../dynamic_quantize_kernel_opt.h | 30 +++++ .../dynamic_quantize_kernel_ref.cpp | 110 +++++++++++++++ .../dynamic_quantize_kernel_ref.h | 33 +++++ .../dynamic_quantize_kernel_selector.cpp | 18 +++ .../dynamic_quantize_kernel_selector.h | 23 ++++ .../intel_gpu/src/plugin/compiled_model.cpp | 1 + .../src/plugin/ops/dynamic_quantize.cpp | 38 ++++++ .../src/plugin/ops/fully_connected.cpp | 14 +- src/plugins/intel_gpu/src/plugin/plugin.cpp | 1 + .../intel_gpu/src/plugin/program_builder.cpp | 2 +- .../dynamic_quantize_fully_connected.cpp | 66 +++++++++ 
.../dynamic_quantize_fully_connected.hpp | 19 +++ .../transformations/op/dynamic_quantize.cpp | 53 ++++++++ .../op/fully_connected_compressed.cpp | 53 +++++--- .../src/plugin/transformations_pipeline.cpp | 9 ++ .../src/runtime/debug_configuration.cpp | 3 + .../src/runtime/execution_config.cpp | 1 + .../test_cases/dynamic_quantize_gpu_test.cpp | 123 +++++++++++++++++ 40 files changed, 1221 insertions(+), 30 deletions(-) create mode 100644 src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp create mode 100644 src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp create mode 100644 src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp create mode 100644 src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp create mode 100644 src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.h create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.h create mode 100644 src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp create mode 100644 
src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.hpp create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp create mode 100644 src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp new file mode 100644 index 00000000000000..ae8c1f5e7461d3 --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp @@ -0,0 +1,37 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/op/op.hpp" + +namespace ov { +namespace intel_gpu { +namespace op { + +/// \brief Operator performing Dynamic Quantize +class DynamicQuantize : public ov::op::Op { +public: + OPENVINO_OP("DynamicQuantize", "gpu_opset"); + + DynamicQuantize() = default; + /// \brief Constructs an DynamicQuantize operation. 
+ /// + /// \param data Input tensor with data + DynamicQuantize(const Output& data, size_t group_size); + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + size_t get_group_size() { return m_group_size; }; + +private: + size_t m_group_size; +}; + +std::vector shape_infer(const DynamicQuantize* op, std::vector input_shapes); + +} // namespace op +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/fully_connected_compressed.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/fully_connected_compressed.hpp index 2c2e3c82089621..217095934a6185 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/fully_connected_compressed.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/fully_connected_compressed.hpp @@ -29,7 +29,20 @@ class FullyConnectedCompressed : public FullyConnected { const ov::Output &decompression_scale, const ov::element::Type output_type = ov::element::undefined); + FullyConnectedCompressed(const OutputVector& inputs, + bool has_zp = true, + bool has_activation_scale = false, + const ov::element::Type output_type = ov::element::undefined); + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + bool get_has_zp() const { return m_has_zp; } + bool get_has_activation_scale() const { return m_has_activation_scale; } + + +protected: + bool m_has_zp; + bool m_has_activation_scale; }; } // namespace op diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp index 6ce8bb62407aa5..e61542a0c50dfc 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp @@ -287,3 +287,4 @@ REGISTER_FACTORY(internal, Placeholder); REGISTER_FACTORY(internal, SDPA); REGISTER_FACTORY(internal, IndirectSDPA); 
REGISTER_FACTORY(internal, RoPE); +REGISTER_FACTORY(internal, DynamicQuantize); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp new file mode 100644 index 00000000000000..fc051f54bb6940 --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp @@ -0,0 +1,58 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include "primitive.hpp" + +namespace cldnn { + +/// @brief Dynamic Quantize primitive +/// @details Performs dynamic quantization +struct dynamic_quantize : public primitive_base { + CLDNN_DECLARE_PRIMITIVE(dynamic_quantize); + + dynamic_quantize() : primitive_base("", {}), group_size(0) {} + + // [TODO] should fix size of outputs + /// @brief Constructs dynamic_quantize primitive + /// @param id This primitive id + /// @param input Input primitive id + /// @param output_size Output data size of the primitive + /// @param data_type Output data type of quantized + dynamic_quantize(const primitive_id& id, + const input_info& input, + const size_t group_size, + const std::vector data_types = {optional_data_type(data_types::f16), optional_data_type(data_types::i8)}, + const padding& output_padding = padding()) + : primitive_base(id, {input}, 2, data_types, {output_padding}) + , group_size(group_size) {} + + size_t group_size = 0; + + size_t hash() const override { + size_t seed = primitive::hash(); + seed = hash_combine(seed, group_size); + return seed; + } + + bool operator==(const primitive& rhs) const override { + if (!compare_common_params(rhs)) + return false; + + auto rhs_casted = downcast(rhs); + + return group_size == rhs_casted.group_size; + } + + void save(BinaryOutputBuffer& ob) const override { + primitive_base::save(ob); + ob << group_size; + } + + void load(BinaryInputBuffer& ib) override { + primitive_base::load(ib); + ib >> group_size; + } +}; +} // 
namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/fully_connected.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/fully_connected.hpp index 3287958b57bd28..2121bf9e122c49 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/fully_connected.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/fully_connected.hpp @@ -95,10 +95,44 @@ struct fully_connected : public primitive_base { compressed_weights(true), decompression_scale(decompression_scale), decompression_zero_point(decompression_zero_point), + dynamic_quantized_activation(false), input_size(input_size), weights_rank(weights_rank) { OPENVINO_ASSERT(!decompression_scale.empty(), "[GPU] Compressed fully connected requires at least decompression scale input"); } + /// @brief Constructs fully connected compressed layer. + /// @param id This primitive id. + /// @param input Input primitive id. + /// @param weights Primitive id containing weights data. + /// @param bias Primitive id containing bias data. + /// @param compression_scale Primitive id containing scale factors for weights decompression. + /// @param compression_zero_point Primitive id containing zero points for weights decompression. + /// @param activation_scale Primitive id containing scale factor for activation. 
+ fully_connected(const primitive_id& id, + const input_info& input, + const primitive_id& weights, + const primitive_id& bias, + const primitive_id& decompression_scale, + const primitive_id& decompression_zero_point, + const input_info& activation_scale, + const data_types data_type, + const size_t input_size = 2, + const size_t weights_rank = 2) + : primitive_base(id, { input }, 1, {optional_data_type{data_type}}), + weights(weights), + bias(bias), + compressed_weights(true), + decompression_scale(decompression_scale), + decompression_zero_point(decompression_zero_point), + dynamic_quantized_activation(false), + activation_scale(activation_scale), + input_size(input_size), + weights_rank(weights_rank) { + if (activation_scale.is_valid()) + dynamic_quantized_activation = true; + + OPENVINO_ASSERT(!decompression_scale.empty(), "[GPU] Compressed fully connected requires at least decompression scale input"); + } /// @brief Primitive id containing weights data. primitive_id weights; @@ -108,6 +142,8 @@ struct fully_connected : public primitive_base { bool compressed_weights = false; primitive_id decompression_scale = ""; primitive_id decompression_zero_point = ""; + bool dynamic_quantized_activation = false; + input_info activation_scale = {"", 0}; optional_value decompression_zero_point_scalar = optional_value(); /// @brief Primitive dimension size. 
@@ -123,6 +159,7 @@ struct fully_connected : public primitive_base { seed = hash_combine(seed, compressed_weights); seed = hash_combine(seed, !decompression_scale.empty()); seed = hash_combine(seed, !decompression_zero_point.empty()); + seed = hash_combine(seed, activation_scale.is_valid()); seed = hash_combine(seed, decompression_zero_point_scalar.has_value()); seed = hash_combine(seed, decompression_zero_point_scalar.value_or(0.0f)); return seed; @@ -140,6 +177,7 @@ struct fully_connected : public primitive_base { compressed_weights == rhs_casted.compressed_weights && decompression_scale.empty() == rhs_casted.decompression_scale.empty() && decompression_zero_point.empty() == rhs_casted.decompression_zero_point.empty() && + activation_scale.is_valid() == rhs_casted.activation_scale.is_valid() && decompression_zero_point_scalar.value_or(0.0f) == rhs_casted.decompression_zero_point_scalar.value_or(0.0f); } @@ -150,8 +188,10 @@ struct fully_connected : public primitive_base { ob << compressed_weights; ob << decompression_scale; ob << decompression_zero_point; + ob << activation_scale; ob << input_size; ob << weights_rank; + ob << dynamic_quantized_activation; if (decompression_zero_point_scalar.has_value()) { ob << true; @@ -169,8 +209,10 @@ struct fully_connected : public primitive_base { ib >> compressed_weights; ib >> decompression_scale; ib >> decompression_zero_point; + ib >> activation_scale; ib >> input_size; ib >> weights_rank; + ib >> dynamic_quantized_activation; bool has_value; ib >> has_value; @@ -197,6 +239,9 @@ struct fully_connected : public primitive_base { if (!decompression_zero_point.empty()) ret.push_back(decompression_zero_point); + if (activation_scale.is_valid()) + ret.push_back(activation_scale); + return ret; } }; diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp index 589b5dd96cb055..2f60afbd029734 100644 --- 
a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp @@ -129,6 +129,7 @@ class debug_configuration { int serialize_compile; // Serialize creating primitives and compiling kernels std::vector forced_impl_types; // Force implementation type either ocl or onednn int max_kernels_per_batch; // Maximum number of kernels in a batch during compiling kernels + int dynamic_quantization_group_size; // Set dynamic quantization group size. Default is 0.(ignored) int impls_cache_capacity; // The maximum number of entries in the kernel impl cache int enable_sdpa; // Allows to control SDPA decomposition int disable_async_compilation; // Disable async compilation diff --git a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp new file mode 100644 index 00000000000000..9938ba6f08358c --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp @@ -0,0 +1,63 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "intel_gpu/op/dynamic_quantize.hpp" +#include "dynamic_quantize_inst.h" + +#include "primitive_type_base.h" +#include "json_object.h" +#include + +namespace cldnn { +GPU_DEFINE_PRIMITIVE_TYPE_ID(dynamic_quantize); + +layout dynamic_quantize_inst::calc_output_layout(dynamic_quantize_node const& node, kernel_impl_params const& impl_param) { + auto desc = impl_param.typed_desc(); + auto input_layout = impl_param.get_input_layout(); + auto output_type = data_types::i8; + auto output_format = input_layout.format; + + return layout(output_type, output_format, input_layout.get_tensor()); +} + +template +std::vector dynamic_quantize_inst::__calc_output_layouts(layout &act_layout, size_t group_size) { + ov::intel_gpu::op::DynamicQuantize op; + auto output_format = act_layout.format; + + std::vector input_shapes = { + act_layout.get(), + }; + + std::vector output_shapes = 
shape_infer(&op, input_shapes); + + return { layout(output_shapes[0], data_types::i8, output_format), layout(output_shapes[1], data_types::f16, output_format) }; + +} +template std::vector dynamic_quantize_inst::__calc_output_layouts(layout &act_layout, size_t group_size); + +template +std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& /*node*/, const kernel_impl_params& impl_param) { + auto desc = impl_param.typed_desc(); + auto input_layout = impl_param.get_input_layout(); + return __calc_output_layouts(input_layout, 0 /*NOT IMPLEMENTED*/); +} + +template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& node, + const kernel_impl_params& impl_param); + +std::string dynamic_quantize_inst::to_string(dynamic_quantize_node const& node) { + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + + std::stringstream primitive_description; + + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +dynamic_quantize_inst::typed_primitive_inst(network& network, dynamic_quantize_node const& node) : parent(network, node) {} + +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/fully_connected.cpp index 23392dc9514772..a47aeff564ca96 100644 --- a/src/plugins/intel_gpu/src/graph/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/graph/fully_connected.cpp @@ -277,6 +277,9 @@ std::string fully_connected_inst::to_string(fully_connected_node const& node) { fc_info.add("decompression zp value", desc->decompression_zero_point_scalar.value()); } } + if (desc->dynamic_quantized_activation) { + fc_info.add("activation scale id", desc->activation_scale.pid); + } node_info->add("fully connected info", fc_info); node_info->dump(primitive_description); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp 
b/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp index 4c1b1008434144..84f1b26507f19e 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp @@ -6,6 +6,7 @@ #include "program_helpers.h" #include "intel_gpu/runtime/itt.hpp" +#include "intel_gpu/runtime/debug_configuration.hpp" using namespace cldnn; @@ -19,6 +20,7 @@ void build_implementations::run(program& p) { for (auto& n : p.get_processing_order()) { if (auto impl = n->get_selected_impl()) { auto params = n->get_kernel_impl_params(); + GPU_DEBUG_TRACE << "add_kernels_source: " << params->desc->id << std::endl; cache.add_kernels_source(*params, impl->get_kernels_source()); } } @@ -26,6 +28,7 @@ void build_implementations::run(program& p) { for (auto& n : p.get_processing_order()) { if (auto impl = n->get_selected_impl()) { auto params = n->get_kernel_impl_params(); + GPU_DEBUG_TRACE << "init_kernels: " << params->desc->id << std::endl; impl->init_kernels(cache, *params); impl->reset_kernels_source(); } diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index 4384f23367293d..33df55157c5225 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -409,6 +409,8 @@ void prepare_primitive_fusing::fuse_bias(program &p) { fc_with_bias_prim->decompression_zero_point = desc->decompression_zero_point; if (desc->decompression_zero_point_scalar.has_value()) fc_with_bias_prim->decompression_zero_point_scalar = desc->decompression_zero_point_scalar.value(); + fc_with_bias_prim->activation_scale = desc->activation_scale; + fc_with_bias_prim->dynamic_quantized_activation = desc->dynamic_quantized_activation; } auto& new_fc_node = 
p.get_or_create(fc_with_bias_prim); fuse_bias_f(fc, new_fc_node, bias_node, eltw_node); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp new file mode 100644 index 00000000000000..bf087cd4258e7f --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp @@ -0,0 +1,73 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/core/validation_util.hpp" +#include "primitive_base.hpp" +#include "dynamic_quantize/dynamic_quantize_kernel_ref.h" +#include "dynamic_quantize/dynamic_quantize_kernel_selector.h" +#include "dynamic_quantize_inst.h" + +namespace cldnn { +namespace ocl { + +struct dynamic_quantize_impl : typed_primitive_impl_ocl { + using parent = typed_primitive_impl_ocl; + using parent::parent; + using kernel_selector_t = kernel_selector::dynamic_quantize_kernel_selector; + using kernel_params_t = kernel_selector::dynamic_quantize_params; + + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::dynamic_quantize_impl); + + std::unique_ptr clone() const override { + return make_unique(*this); + } + + void load(BinaryInputBuffer& ib) override { + parent::load(ib); + if (is_dynamic()) { + auto& kernel_selector = kernel_selector_t::Instance(); + auto kernel_impl = kernel_selector.GetImplementation(_kernel_data.kernelName); + kernel_impl->GetUpdateDispatchDataFunc(_kernel_data); + } + } + + static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { + /// XXX: handle group size here + auto params = get_default_params(impl_param, is_shape_agnostic); + params.outputs.push_back(convert_data_tensor(impl_param.get_output_layout(1))); + + return params; + } + + void update_dispatch_data(const kernel_impl_params& impl_param) override { + auto kernel_params = get_kernel_params(impl_param, true); + (_kernel_data.update_dispatch_data_func)(kernel_params, 
_kernel_data); + } +}; + +namespace detail { + +attach_dynamic_quantize_impl::attach_dynamic_quantize_impl() { + auto types = { + data_types::f16, + data_types::i8 + }; + + auto formats = { + format::bfyx, + }; + + implementation_map::add(impl_types::ocl, + shape_types::any, + typed_primitive_impl_ocl::create, + types, + formats); +} + +} // namespace detail +} // namespace ocl +} // namespace cldnn + +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::dynamic_quantize_impl) +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::dynamic_quantize) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp index cadab1b29ec711..07f409ab052c75 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp @@ -23,6 +23,7 @@ void register_implementations() { REGISTER_OCL(depth_to_space); REGISTER_OCL(detection_output); REGISTER_OCL(dft); + REGISTER_OCL(dynamic_quantize); REGISTER_OCL(batch_to_space); REGISTER_OCL(experimental_detectron_detection_output); REGISTER_OCL(experimental_detectron_generate_proposals_single_image); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp index bacb3c60023c76..a0933a3ab582cd 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp @@ -20,6 +20,7 @@ #include "intel_gpu/primitives/deconvolution.hpp" #include "intel_gpu/primitives/depth_to_space.hpp" #include "intel_gpu/primitives/detection_output.hpp" +#include "intel_gpu/primitives/dynamic_quantize.hpp" #include "intel_gpu/primitives/eltwise.hpp" #include "intel_gpu/primitives/experimental_detectron_detection_output.hpp" #include "intel_gpu/primitives/experimental_detectron_prior_grid_generator.hpp" @@ -104,6 +105,7 @@ REGISTER_OCL(deconvolution); REGISTER_OCL(depth_to_space); REGISTER_OCL(detection_output); REGISTER_OCL(dft); 
+REGISTER_OCL(dynamic_quantize); REGISTER_OCL(experimental_detectron_detection_output); REGISTER_OCL(experimental_detectron_generate_proposals_single_image); REGISTER_OCL(experimental_detectron_prior_grid_generator); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp index 619797c1f78432..5cd06a16f734fb 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp @@ -60,20 +60,28 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { const auto weights_dt = instance.get_input_layout(1).data_type; auto weight_bitwidth = ov::element::Type(weights_dt).bitwidth(); OPENVINO_ASSERT(weight_bitwidth == 8 || weight_bitwidth == 4, "[GPU] oneDNN supports only 4bit/8bit compressed weights"); + int idx = prim->bias.empty() ? 2 : 3; if (!prim->decompression_scale.empty()) { - auto decompression_scale_idx = prim->bias.empty() ? 2 : 3; + auto decompression_scale_idx = idx++; auto scale_mem = instance.dep_memory_ptr(decompression_scale_idx); dnnl::memory::desc desc = onednn::layout_to_memory_desc(scale_mem->get_layout(), dnnl::memory::format_tag::a, true); args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, scale_mem->get_onednn_memory(desc)}); } if (!prim->decompression_zero_point.empty()) { - auto decompression_zp_idx = prim->bias.empty() ? 
3 : 4; + auto decompression_zp_idx = idx++; auto zp_mem = instance.dep_memory_ptr(decompression_zp_idx); dnnl::memory::desc desc = onednn::layout_to_memory_desc(zp_mem->get_layout(), dnnl::memory::format_tag::a, true); args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS, zp_mem->get_onednn_memory(desc)}); } + + if (prim->activation_scale.is_valid()) { + auto activation_scale_idx = idx++; + auto act_scale_mem = instance.dep_memory_ptr(activation_scale_idx); + dnnl::memory::desc desc = onednn::layout_to_memory_desc(act_scale_mem->get_layout(), dnnl::memory::format_tag::a, true); + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0, act_scale_mem->get_onednn_memory(desc)}); + } } return args; @@ -231,6 +239,7 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { ob << input_size; ob << has_bias; ob << is_compressed; + ob << prim->dynamic_quantized_activation; bool has_decompression_scale = !prim->decompression_scale.empty(); if (has_decompression_scale) { @@ -256,9 +265,11 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { size_t input_size = 2; bool has_bias = false; bool is_compressed = false; + bool dynamic_quantized_activation; ib >> input_size; ib >> has_bias; ib >> is_compressed; + ib >> dynamic_quantized_activation; const kernel_impl_params* impl_params = reinterpret_cast(ib.getKernelImplParams()); auto prim = impl_params->typed_desc(); @@ -295,6 +306,20 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { } } + if (dynamic_quantized_activation) { + int input_idx = has_bias ? 
2 : 1; + if (has_decompression_scale) + input_idx++; + if (has_decompression_zp) + input_idx++; + // Note: it supports per-token activation scale only + auto partial_shape = impl_params->get_input_layout(0).get_partial_shape(); + auto innermost_len = partial_shape[partial_shape.size() - 1].get_length(); + + auto act_scale_data_type = convert_data_type(impl_params->get_input_layout(input_idx).data_type); + _attrs->set_scales(DNNL_ARG_SRC, (1 << 1) | (1 << 0), dnnl::memory::dims{1, innermost_len}, act_scale_data_type); + } + if (is_compressed) { auto prim_desc = get_matmul_primitive_descriptor(*impl_params, ib.get_engine(), input_size, has_bias, *_attrs); _pd = *prim_desc; @@ -321,6 +346,7 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { dnnl::memory::data_type ds_data_type = dnnl::memory::data_type::undef; dnnl::memory::data_type dzp_data_type = dnnl::memory::data_type::undef; bool is_four_bit_weight = false; + int idx = !arg.bias_term() ? 1 : 2; // There may be a performance difference between InnerProduct and MatMul primitives in oneDNN, // so use MatMul only for weights compression and IP for all other cases. @@ -329,7 +355,7 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { auto weights_layout = impl_params.get_input_layout(1); is_four_bit_weight = weights_layout.data_type == data_types::u4 || weights_layout.data_type == data_types::i4; if (!prim->decompression_scale.empty()) { - auto decompression_scale_idx = !arg.bias_term() ? 2 : 3; + auto decompression_scale_idx = ++idx; ds_data_type = convert_data_type(arg.get_dependency(decompression_scale_idx).get_output_layout().data_type); auto ifm = arg.get_dependency(1).get_output_layout().get_dim(1); auto ngroups = arg.get_dependency(decompression_scale_idx).get_output_layout().get_dim(1); @@ -344,7 +370,7 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { } if (!prim->decompression_zero_point.empty()) { - auto decompression_zp_idx = !arg.bias_term() ? 
3 : 4; + auto decompression_zp_idx = ++idx; auto dzp_layout = arg.get_dependency(decompression_zp_idx).get_output_layout(); dzp_data_type = convert_data_type(dzp_layout.data_type); @@ -360,6 +386,16 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { } } + if (prim->dynamic_quantized_activation) { + // Note: it supports per-token activation scale only + ++idx; + auto partial_shape = impl_params.input_layouts[0].get_partial_shape(); + auto innermost_len = partial_shape[partial_shape.size() - 1].get_length(); + + auto act_scale_data_type = convert_data_type(impl_params.input_layouts[idx].data_type); + attr->set_scales(DNNL_ARG_SRC, (1 << 1) | (1 << 0), dnnl::memory::dims{1, innermost_len}, act_scale_data_type); + } + auto prim_desc = get_matmul_primitive_descriptor(impl_params, impl_params.prog->get_engine(), prim->input_size, !prim->bias.empty(), *attr); diff --git a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h new file mode 100644 index 00000000000000..6458b28dda877d --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h @@ -0,0 +1,46 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include "intel_gpu/primitives/dynamic_quantize.hpp" +#include "primitive_inst.h" + +#include + +namespace cldnn { + +template <> +struct typed_program_node : public typed_program_node_base { + using parent = typed_program_node_base; + +public: + using parent::parent; + + program_node& input(size_t index = 0) const { return get_dependency(index); } + std::vector get_shape_infer_dependencies() const override { return {}; } +}; + +using dynamic_quantize_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base { + using parent = typed_primitive_inst_base; + using parent::parent; + +public: + template + static std::vector calc_output_layouts(dynamic_quantize_node 
const& /*node*/, const kernel_impl_params& impl_params); + static layout calc_output_layout(dynamic_quantize_node const& node, kernel_impl_params const& impl_params); + + // Internal function to be used from fakealignment + template + static std::vector __calc_output_layouts(layout &act_layout, size_t group_size); + static std::string to_string(dynamic_quantize_node const& node); + + typed_primitive_inst(network& network, dynamic_quantize_node const& node); +}; + +using dynamic_quantize_inst = typed_primitive_inst; + +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 8dc40d75967b27..5a6bd94fe9dd1a 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -1072,7 +1072,10 @@ void network::execute_impl(const std::vector& events) { OPENVINO_ASSERT(!bin.empty(), "Failure loading binary from OV_GPU_LoadDumpRawBinary : " + dump_file); auto input_mem = get_primitive(inst->id())->dep_memory_ptr(i); - OPENVINO_ASSERT(input_mem->size() == bin.size(), "memory size mis-match for OV_GPU_LoadDumpRawBinary : " + layer_name); + if (input_mem->size() != bin.size()) { + std::cout << "WARNING: memory size mis-match for OV_GPU_LoadDumpRawBinary : " + layer_name << " " << input_mem->size() << " / " << bin.size() << std::endl; + bin.resize(input_mem->size()); + } input_mem->copy_from(get_stream(), static_cast(&bin[0]), true); } diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index f8267673722e64..8f5b12c7f6f7ee 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -30,6 +30,7 @@ #include "condition_inst.h" #include "gather_inst.h" #include "broadcast_inst.h" +#include "dynamic_quantize_inst.h" #include "experimental_detectron_roi_feature_extractor_inst.hpp" #include "implementation_map.hpp" #include 
"graph_optimizer/prepare_buffer_fusing.h" @@ -535,7 +536,7 @@ event::ptr primitive_inst::realloc_if_needed() { // Reuse state memory as output for kv cache if possible // otherwise clear _outputs for the cases when mem was reused previously if (_impl_params->can_be_optimized()) { - GPU_DEBUG_TRACE_DETAIL << id() << " : realloc_if_needed: Set kvcache output memmory as variable memory " << variable.get_memory()->buffer_ptr() + GPU_DEBUG_TRACE_DETAIL << id() << " : realloc_if_needed: Set kvcache output memory as variable memory " << variable.get_memory()->buffer_ptr() << " (ptr: " << variable.get_memory()->buffer_ptr() << ", actual_size: " << variable.get_actual_mem_size()/8 << " bytes" << ", variable layout " << variable.get_layout().to_short_string() << ")" << std::endl; @@ -639,7 +640,15 @@ event::ptr primitive_inst::realloc_if_needed() { << fc_input_layout.to_short_string() << " to meet the input buffer alignment requirements for FC\n"; updated_layouts[dep_idx] = fc_input_layout; } + + // dynamic quantization is only applied to activation of FC + if (get_node().is_type()) { + auto dyn_quan_scale_layout = dynamic_quantize_inst::__calc_output_layouts(updated_layouts[dep_idx], 0); + GPU_DEBUG_TRACE_DETAIL << "update layout of dynamic quantize scale parameter layout " << dyn_quan_scale_layout[1].to_short_string() << std::endl; + updated_params.output_layouts[1] = dyn_quan_scale_layout[1]; + } } + } } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl new file mode 100644 index 00000000000000..a61803cd4926ef --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl @@ -0,0 +1,57 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/fetch_data.cl" + +#if OUTPUT_DIMS != 4 +#error "dynamic_quantize_gpu_opt.cl: Unsupported output 
dimension" +#endif + +REQD_SUB_GROUP_SIZE(16) +KERNEL(dynamic_quantize_gpu_opt)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + __global OUTPUT1_TYPE* output_scale) +{ + const uint bf = (uint)get_global_id(2); + + const uint sglid = get_sub_group_local_id(); + const uint sgid = get_sub_group_id(); + const uint num_sg = get_num_sub_groups(); + const uint group_size = (INPUT0_FEATURE_PITCH / 16 / num_sg); + const uint offset = bf * INPUT0_FEATURE_PITCH + group_size * (sglid + 16 * sgid); + __local half partial_max[32]; // FIXME: 16 is an arbitrary number + half8 val; + half max; + half grp_max = 0.001h; + + unroll_for (int i = 0; i < group_size/8; ++i) { + val = fabs(as_half8(vload8(0, input + offset + (i * 8)))); + + max = fmax(fmax(fmax(val[0], val[1]), fmax(val[2], val[3])), + fmax(fmax(val[4], val[5]), fmax(val[6], val[7]))); + grp_max = fmax(grp_max, max); + } + + half max_value = sub_group_reduce_max(grp_max); + partial_max[sgid] = max_value; + barrier(CLK_LOCAL_MEM_FENCE); + + // calculate global max + max_value = partial_max[0]; + for (int i = 1; i < num_sg; i++) + max_value = fmax(max_value, partial_max[i]); + + half scale = 127.0h / max_value; + + unroll_for (int i = 0; i < group_size/8; ++i) { + val = as_half8(vload8(0, input + offset + i*8)); + val *= scale; + vstore8(convert_char8(val), 0, output + offset + i*8); + } + + if (sglid == 0 && sgid == 0) + output_scale[bf] = 1.0h / scale; +} diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl new file mode 100644 index 00000000000000..14e6f8383e07f7 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl @@ -0,0 +1,57 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/fetch_data.cl" + +#if OUTPUT_DIMS != 4 +#error 
"dynamic_quantize_gpu_ref.cl: Unsupported output dimension" +#endif + +KERNEL(dynamic_quantize_gpu_ref)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + __global OUTPUT1_TYPE* output_scale) +{ + const uint bf = (uint)get_global_id(0); + const uint b = (uint)get_global_id(0) / INPUT0_FEATURE_NUM; + const uint f = (uint)get_global_id(0) % INPUT0_FEATURE_NUM; + const uint y = (uint)get_global_id(1); + const uint scale_idx = OUTPUT1_GET_INDEX(b, f, y, 0); + + half max_val = 0.0001h; + for (int y_off = 0; y_off < (get_global_size(1) == 1 ? INPUT0_SIZE_Y : 1); y_off++) { + const uint offset = INPUT0_GET_INDEX(b, f, y + y_off, 0); + int x; + for (x = 0; x < INPUT0_SIZE_X / 8; x++) { + half8 val = as_half8(vload8(0, (ushort*)input + offset + x * 8)); + half8 abs_val = fabs(val); + + for (int j = 0; j < 8; j++) + max_val = fmax(max_val, abs_val[j]); + } + x *= 8; + for (; x < INPUT0_SIZE_X; x++) + max_val = fmax(max_val, fabs(input[offset + x])); + } + + half scale = 127.0h / max_val; + for (int y_off = 0; y_off < (get_global_size(1) == 1 ? 
INPUT0_SIZE_Y : 1); y_off++) { + const uint in_offset = INPUT0_GET_INDEX(b, f, y + y_off, 0); + const uint out_offset = OUTPUT_GET_INDEX(b, f, y + y_off, 0); + int x; + for (x = 0; x < INPUT0_SIZE_X / 8; x++) { + half8 val = as_half8(vload8(0, (ushort*)input + in_offset + x * 8)); + val *= scale; + vstore8(convert_char8(val), 0, output + out_offset + x * 8); + } + x *= 8; + for (; x < INPUT0_SIZE_X; x++) + output[out_offset + x] = convert_char(input[in_offset + x] * scale); + } + + ushort8 test = vload8(0, (ushort*)input + INPUT0_GET_INDEX(b, f, 0, 0)); + + output_scale[scale_idx] = 1.0h / scale; +} diff --git a/src/plugins/intel_gpu/src/kernel_selector/common_types.h b/src/plugins/intel_gpu/src/kernel_selector/common_types.h index d0aba8554eccc7..bf96ff16aa4c8f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/common_types.h +++ b/src/plugins/intel_gpu/src/kernel_selector/common_types.h @@ -98,7 +98,8 @@ enum class KernelType { UNIQUE_GATHER, RMS, SWIGLU, - ROPE + ROPE, + DYNAMIC_QUANTIZE }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector.cpp index 160e6744a03928..a4aac48b71cc5d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector.cpp @@ -188,6 +188,8 @@ KernelList kernel_selector_base::GetAllImplementations(const Params& params, Ker [](const PriorityPair& impl) { return std::move(impl.second); }); + } else { + GPU_DEBUG_COUT << "No implementation for " << params.layerID << " because of kernel type mismatch" << std::endl; } return result; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp new file mode 100644 
index 00000000000000..fcf9d43341bb13 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp @@ -0,0 +1,127 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "dynamic_quantize_kernel_opt.h" +#include "kernel_selector_utils.h" +#include + +namespace kernel_selector { +ParamsKey DynamicQuantizeKernelOpt::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableInputDataType(Datatype::INT8); + k.EnableInputDataType(Datatype::UINT8); + k.EnableInputDataType(Datatype::INT32); + k.EnableInputDataType(Datatype::INT64); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::INT32); + k.EnableOutputDataType(Datatype::INT64); + k.EnableDifferentTypes(); + k.EnableAllInputLayout(); + k.EnableAllOutputLayout(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDynamicShapesSupport(); + return k; +} + +JitConstants DynamicQuantizeKernelOpt::GetJitConstants(const dynamic_quantize_params& params) const { + JitConstants jit = MakeBaseParamsJitConstants(params); + + jit.Merge(GetTensorFriendlyWorkGroupsJit(params.outputs[0])); + + return jit; +} + +CommonDispatchData DynamicQuantizeKernelOpt::SetDefault(const dynamic_quantize_params& params) const { + GPU_DEBUG_GET_INSTANCE(debug_config); + CommonDispatchData dispatchData; + + dispatchData.gws = {64, 1, params.inputs[0].Batch().v * params.inputs[0].Feature().v}; + dispatchData.lws = {64, 1, 1}; + + return dispatchData; +} + +void DynamicQuantizeKernelOpt::GetUpdateDispatchDataFunc(KernelData& kd) const { + kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) { + const auto& prim_params = static_cast(params); + auto dispatchData = 
SetDefault(prim_params); + OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func"); + kd.kernels[0].params.workGroups.global = dispatchData.gws; + kd.kernels[0].params.workGroups.local = dispatchData.lws; + kd.kernels[0].skip_execution = false; + }; +} + +KernelsData DynamicQuantizeKernelOpt::GetKernelsData(const Params& params) const { + assert(params.GetType() == KernelType::DYNAMIC_QUANTIZE); + + if (!Validate(params)) + return {}; + + const dynamic_quantize_params& prim_params = static_cast(params); + auto dispatchData = SetDefault(prim_params); + + KernelData kd = KernelData::Default(params); + + auto cldnn_jit = GetJitConstants(prim_params); + auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, params); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); + + GetUpdateDispatchDataFunc(kd); + + auto& kernel = kd.kernels[0]; + FillCLKernelData(kernel, + dispatchData, + params.engineInfo, + kernelName, + jit, + entry_point, + EXE_MODE_DEFAULT, + false, + false, + 1, + GetFusedPrimitiveInputsCount(params), + static_cast(prim_params.outputs.size()), + prim_params.is_shape_agnostic); + + // std::cout << ">> Select dynamic_quantize_kernel_opt : " << prim_params.outputs.size() << std::endl; + + return {kd}; +} + +KernelsPriority DynamicQuantizeKernelOpt::GetKernelsPriority(const Params& /*params*/) const { + return FORCE_PRIORITY_2; +} + +Datatype DynamicQuantizeKernelOpt::GetAccumulatorType(const dynamic_quantize_params& params) const { + Datatype types[] = { Datatype::F32, Datatype::F16, Datatype::INT64, Datatype::INT32, Datatype::UINT32}; + + for (Datatype type : types) + for (auto& in : params.inputs) + if (in.GetDType() == type) + return type; + return Datatype::F32; +} + +bool DynamicQuantizeKernelOpt::Validate(const Params& params) const { + if (!KernelBaseOpenCL::Validate(params)) + return false; + + const auto& dq_params = static_cast(params); + + // Todo : Add proper exception here + if 
((dq_params.outputs[0].X().v * dq_params.outputs[0].Y().v % 32) != 0) + return false; + + return true; +} +} // namespace kernel_selector + diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.h new file mode 100644 index 00000000000000..3a95836080d2eb --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.h @@ -0,0 +1,30 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_base_opencl.h" +#include "dynamic_quantize_kernel_ref.h" + +namespace kernel_selector { +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// dynamic_quantize_params +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +class DynamicQuantizeKernelOpt : public KernelBaseOpenCL { +public: + DynamicQuantizeKernelOpt() : KernelBaseOpenCL("dynamic_quantize_gpu_opt") {} + virtual ~DynamicQuantizeKernelOpt() {} + + virtual JitConstants GetJitConstants(const dynamic_quantize_params& params) const; + virtual CommonDispatchData SetDefault(const dynamic_quantize_params& params) const; + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + Datatype GetAccumulatorType(const dynamic_quantize_params& params) const; + ParamsKey GetSupportedKey() const override; + +protected: + bool Validate(const Params&) const override; + void GetUpdateDispatchDataFunc(KernelData& kd) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp 
b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp new file mode 100644 index 00000000000000..46e7774e937b54 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp @@ -0,0 +1,110 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "dynamic_quantize_kernel_ref.h" +#include "kernel_selector_utils.h" +#include + +namespace kernel_selector { +ParamsKey DynamicQuantizeKernelRef::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::INT8); + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDifferentTypes(); + k.EnableDynamicShapesSupport(); + return k; +} + +JitConstants DynamicQuantizeKernelRef::GetJitConstants(const dynamic_quantize_params& params) const { + JitConstants jit = MakeBaseParamsJitConstants(params); + + jit.Merge(GetTensorFriendlyWorkGroupsJit(params.outputs[0])); + + return jit; +} + +CommonDispatchData DynamicQuantizeKernelRef::SetDefault(const dynamic_quantize_params& params) const { + GPU_DEBUG_GET_INSTANCE(debug_config); + CommonDispatchData dispatchData; + + dispatchData.gws = {params.outputs[0].Batch().v * params.outputs[0].Feature().v, 1, 1}; + dispatchData.lws = {1, 1, 1}; + + return dispatchData; +} + +void DynamicQuantizeKernelRef::GetUpdateDispatchDataFunc(KernelData& kd) const { + kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) { + const auto& prim_params = static_cast(params); + auto dispatchData = SetDefault(prim_params); + OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func"); + kd.kernels[0].params.workGroups.global = dispatchData.gws; + 
kd.kernels[0].params.workGroups.local = dispatchData.lws; + kd.kernels[0].skip_execution = false; + }; +} + +KernelsData DynamicQuantizeKernelRef::GetKernelsData(const Params& params) const { + assert(params.GetType() == KernelType::DYNAMIC_QUANTIZE); + + if (!Validate(params)) + return {}; + + const dynamic_quantize_params& prim_params = static_cast(params); + auto dispatchData = SetDefault(prim_params); + + KernelData kd = KernelData::Default(params); + + auto cldnn_jit = GetJitConstants(prim_params); + auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, params); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); + + GetUpdateDispatchDataFunc(kd); + + auto& kernel = kd.kernels[0]; + FillCLKernelData(kernel, + dispatchData, + params.engineInfo, + kernelName, + jit, + entry_point, + EXE_MODE_DEFAULT, + false, + false, + 1, + GetFusedPrimitiveInputsCount(params), + static_cast(prim_params.outputs.size()), + prim_params.is_shape_agnostic); + + return {kd}; +} + +KernelsPriority DynamicQuantizeKernelRef::GetKernelsPriority(const Params& /*params*/) const { + return FORCE_PRIORITY_8; +} + +Datatype DynamicQuantizeKernelRef::GetAccumulatorType(const dynamic_quantize_params& params) const { + Datatype types[] = { Datatype::F32, Datatype::F16, Datatype::INT64, Datatype::INT32, Datatype::UINT32}; + + for (Datatype type : types) + for (auto& in : params.inputs) + if (in.GetDType() == type) + return type; + return Datatype::F32; +} + +bool DynamicQuantizeKernelRef::Validate(const Params& params) const { + if (!KernelBaseOpenCL::Validate(params)) + return false; + + return true; +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h new file mode 100644 index 00000000000000..ce52ed9fb19714 --- /dev/null +++ 
b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h @@ -0,0 +1,33 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_base_opencl.h" + +namespace kernel_selector { +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// dynamic_quantize_params +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct dynamic_quantize_params : public base_params { + dynamic_quantize_params() : base_params(KernelType::DYNAMIC_QUANTIZE) {} +}; + +class DynamicQuantizeKernelRef : public KernelBaseOpenCL { +public: + DynamicQuantizeKernelRef() : KernelBaseOpenCL("dynamic_quantize_gpu_ref") {} + virtual ~DynamicQuantizeKernelRef() {} + + virtual JitConstants GetJitConstants(const dynamic_quantize_params& params) const; + virtual CommonDispatchData SetDefault(const dynamic_quantize_params& params) const; + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + Datatype GetAccumulatorType(const dynamic_quantize_params& params) const; + ParamsKey GetSupportedKey() const override; + +protected: + bool Validate(const Params&) const override; + void GetUpdateDispatchDataFunc(KernelData& kd) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp new file mode 100644 index 00000000000000..6ca9fbd2f5bd76 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp @@ -0,0 +1,18 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include 
"dynamic_quantize_kernel_selector.h" +#include "dynamic_quantize_kernel_ref.h" +#include "dynamic_quantize_kernel_opt.h" + +namespace kernel_selector { +dynamic_quantize_kernel_selector::dynamic_quantize_kernel_selector() { + Attach(); + Attach(); +} + +KernelsData dynamic_quantize_kernel_selector::GetBestKernels(const Params& params) const { + return GetNaiveBestKernel(params, KernelType::DYNAMIC_QUANTIZE); +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.h new file mode 100644 index 00000000000000..962aca7234f601 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.h @@ -0,0 +1,23 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector { +class dynamic_quantize_kernel_selector : public kernel_selector_base { +public: + static dynamic_quantize_kernel_selector& Instance() { + static dynamic_quantize_kernel_selector instance_; + return instance_; + } + + dynamic_quantize_kernel_selector(); + + virtual ~dynamic_quantize_kernel_selector() {} + + KernelsData GetBestKernels(const Params& params) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/plugin/compiled_model.cpp b/src/plugins/intel_gpu/src/plugin/compiled_model.cpp index b4d3e658e410a6..178e9b94d422aa 100644 --- a/src/plugins/intel_gpu/src/plugin/compiled_model.cpp +++ b/src/plugins/intel_gpu/src/plugin/compiled_model.cpp @@ -256,6 +256,7 @@ ov::Any CompiledModel::get_property(const std::string& name) const { ov::PropertyName{ov::num_streams.name(), PropertyMutability::RO}, ov::PropertyName{ov::hint::num_requests.name(), PropertyMutability::RO}, ov::PropertyName{ov::hint::inference_precision.name(), 
PropertyMutability::RO}, + ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RO}, ov::PropertyName{ov::device::id.name(), PropertyMutability::RO}, ov::PropertyName{ov::execution_devices.name(), PropertyMutability::RO}, ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RO} diff --git a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp new file mode 100644 index 00000000000000..e5a43b4c62541f --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp @@ -0,0 +1,38 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "intel_gpu/op/dynamic_quantize.hpp" +#include "intel_gpu/plugin/program_builder.hpp" +#include "intel_gpu/plugin/common_utils.hpp" +#include "intel_gpu/primitives/dynamic_quantize.hpp" + +namespace ov { +namespace op { +namespace internal { +using DynamicQuantize = ov::intel_gpu::op::DynamicQuantize; +} // namespace internal +} // namespace op +} // namespace ov + +namespace ov { +namespace intel_gpu { + +static void CreateDynamicQuantizeOp(ProgramBuilder& p, const std::shared_ptr& op) { + validate_inputs_count(op, {1}); + auto inputs = p.GetInputInfo(op); + std::string primitive_name = layer_type_name_ID(op); + + OPENVINO_ASSERT(op->get_group_size() == 1048576, "Not supported group size: ", op->get_group_size()); + auto prim = cldnn::dynamic_quantize(primitive_name, + inputs[0], + op->get_group_size(), + get_output_data_types(op) + ); + p.add_primitive(*op, prim); +} + +REGISTER_FACTORY_IMPL(internal, DynamicQuantize); + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp index 9b3a6492ec364b..36872f870c3adc 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp +++ 
b/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp @@ -26,22 +26,24 @@ namespace ov { namespace intel_gpu { static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::shared_ptr& op) { - validate_inputs_count(op, {4, 5}); + validate_inputs_count(op, {4, 5, 6}); auto inputs = p.GetInputInfo(op); std::string primitive_name = layer_type_name_ID(op); auto supports_immad = p.get_engine().get_device_info().supports_immad; - const int INPUT_CNT_WITH_ZP = 5; auto input_name = inputs[0].pid; auto weights_name = inputs[1].pid; auto bias_name = inputs[2].pid; auto scale_name = inputs[3].pid; - auto zp_name = inputs.size() == INPUT_CNT_WITH_ZP ? inputs[4].pid : ""; + size_t input_idx = 4; + const size_t INPUT_PORT_IDX = input_idx; + std::string zp_name = op->get_has_zp() ? inputs[input_idx++].pid : ""; + std::string activation_scale_name = op->get_has_activation_scale() ? inputs[input_idx++].pid : ""; float zp_value = 0.0f; bool has_scalar_zp = false; - if (op->get_input_size() == INPUT_CNT_WITH_ZP) { - auto zp_const = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(INPUT_CNT_WITH_ZP-1)); + if (op->get_has_zp()) { + auto zp_const = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(INPUT_PORT_IDX)); if (zp_const && ov::shape_size(zp_const->get_output_shape(0)) == 1) { has_scalar_zp = true; zp_value = zp_const->cast_vector()[0]; @@ -55,10 +57,12 @@ static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::share bias_name, scale_name, has_scalar_zp && !supports_immad ? 
"" : zp_name, + {activation_scale_name, 1}, cldnn::element_type_to_data_type(op->get_output_element_type(0)), op->get_input_partial_shape(0).size(), op->get_input_partial_shape(1).size()); + if (has_scalar_zp) { fc.decompression_zero_point_scalar = zp_value; } diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index e3f7c2cfe2f6b5..9243b558be99d3 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -555,6 +555,7 @@ std::vector Plugin::get_supported_properties() const { ov::PropertyName{ov::hint::num_requests.name(), PropertyMutability::RW}, ov::PropertyName{ov::hint::inference_precision.name(), PropertyMutability::RW}, ov::PropertyName{ov::hint::enable_cpu_pinning.name(), PropertyMutability::RW}, + ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RW}, ov::PropertyName{ov::device::id.name(), PropertyMutability::RW}, ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RW} }; diff --git a/src/plugins/intel_gpu/src/plugin/program_builder.cpp b/src/plugins/intel_gpu/src/plugin/program_builder.cpp index 57e9bfdf444bf9..aae9b163b4f6bf 100644 --- a/src/plugins/intel_gpu/src/plugin/program_builder.cpp +++ b/src/plugins/intel_gpu/src/plugin/program_builder.cpp @@ -381,7 +381,7 @@ void validate_inputs_count(const std::shared_ptr& op, std::vectorget_input_size(), ") in )", + OPENVINO_THROW("Invalid inputs count (", op->get_input_size(), ") in ", op->get_friendly_name(), " (", op->get_type_name(), " ", op->get_type_info().version_id, ")"); } diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp new file mode 100644 index 00000000000000..3ab890becef0ad --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -0,0 +1,66 
@@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "dynamic_quantize_fully_connected.hpp" + +#include "intel_gpu/op/fully_connected_compressed.hpp" +#include "intel_gpu/op/dynamic_quantize.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "transformations/utils/utils.hpp" + +namespace ov { +namespace intel_gpu { + +DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(size_t group_size) { + using namespace ov::pass::pattern; + + auto data = any_input(); + auto fully_connected_compressed3 = wrap_type({data, any_input(), any_input(), any_input()}); + auto fully_connected_compressed4 = wrap_type({data, any_input(), any_input(), any_input(), any_input()}); + auto fully_connected_compressed = std::make_shared(OutputVector{fully_connected_compressed3, fully_connected_compressed4}); + + ov::matcher_pass_callback callback = [=](Matcher& m) { + if (transformation_callback(m.get_match_root())) { + return false; + } + const auto& pattern_map = m.get_pattern_value_map(); + const auto& m_data = pattern_map.at(data).get_node_shared_ptr(); + + std::shared_ptr m_fc; + + if (pattern_map.find(fully_connected_compressed3) != pattern_map.end()) + m_fc = std::dynamic_pointer_cast(pattern_map.at(fully_connected_compressed3).get_node_shared_ptr()); + else if (pattern_map.find(fully_connected_compressed4) != pattern_map.end()) + m_fc = std::dynamic_pointer_cast(pattern_map.at(fully_connected_compressed4).get_node_shared_ptr()); + + const auto innermost_size = m_fc->get_input_partial_shape(0)[m_fc->get_input_partial_shape(0).size() - 1].get_length(); + if (group_size == 0 || (innermost_size % group_size != 0 && static_cast(innermost_size) > group_size)) + return false; + + OutputVector fc_inputs; + auto dyn_quan = std::make_shared(m_data, group_size); + for (size_t i = 0; i < m_fc->get_input_size(); i++) + 
fc_inputs.push_back(m_fc->get_input_node_shared_ptr(i)); + fc_inputs[0] = dyn_quan->output(0); + fc_inputs.push_back(dyn_quan->output(1)); + auto new_fc = std::make_shared(fc_inputs, + m_fc->get_has_zp(), + true, + m_fc->get_output_type()); + ov::replace_node(m_fc, new_fc); + + new_fc->set_friendly_name(m_fc->get_friendly_name()); + ov::copy_runtime_info(m_fc, new_fc); + + return true; + }; + auto m = std::make_shared(fully_connected_compressed, "DynamicQuantizeFullyConnected"); + this->register_matcher(m, callback); +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.hpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.hpp new file mode 100644 index 00000000000000..35ae934e91d6f3 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.hpp @@ -0,0 +1,19 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + +class DynamicQuantizeFullyConnected: public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("DynamicQuantizeFullyConnected", "0"); + DynamicQuantizeFullyConnected(size_t group_size); +}; + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp new file mode 100644 index 00000000000000..c8ca959f791f44 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp @@ -0,0 +1,53 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "intel_gpu/op/dynamic_quantize.hpp" +#include "openvino/core/partial_shape.hpp" +#include "openvino/core/validation_util.hpp" +#include "openvino/op/variadic_split.hpp" +#include 
"variadic_split_shape_inference.hpp" +#include "intel_gpu/runtime/debug_configuration.hpp" + +namespace ov { +namespace intel_gpu { +namespace op { + +DynamicQuantize::DynamicQuantize(const Output& data, size_t group_size) + : Op({data}) + , m_group_size(group_size) { + set_output_size(2); + validate_and_infer_types(); +} + +void DynamicQuantize::validate_and_infer_types() { + std::vector input_shapes = { + get_input_partial_shape(0) + }; + + auto out_shapes = shape_infer(this, input_shapes); + set_output_type(0, ov::element::Type_t::i8, out_shapes[0]); + set_output_type(1, ov::element::Type_t::f16, out_shapes[1]); +} + +std::shared_ptr DynamicQuantize::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), m_group_size); +} + +std::vector shape_infer(const DynamicQuantize* op, std::vector input_shapes) { + GPU_DEBUG_GET_INSTANCE(debug_config); + std::vector out_shapes; + out_shapes.push_back(input_shapes[0]); + // FIXME: generalize to N-dim case + auto scale_shape = input_shapes[0]; + for (size_t i = 2; i < scale_shape.size(); i++) + scale_shape[i] = 1; + out_shapes.push_back(scale_shape); + return out_shapes; +} + + +} // namespace op +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/fully_connected_compressed.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/fully_connected_compressed.cpp index 571d4fcbd576ae..c1586db5e56859 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/fully_connected_compressed.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/fully_connected_compressed.cpp @@ -14,7 +14,9 @@ FullyConnectedCompressed::FullyConnectedCompressed(const ov::Output& A, const ov::Output& decompression_scale, const ov::Output& decompression_zero_point, const ov::element::Type output_type) - : FullyConnected(A, B, bias, output_type) { + : FullyConnected(A, B, bias, output_type) + , 
m_has_zp(true) + , m_has_activation_scale(false) { set_argument(3, decompression_scale); set_argument(4, decompression_zero_point); validate_and_infer_types(); @@ -25,29 +27,46 @@ FullyConnectedCompressed::FullyConnectedCompressed(const ov::Output& A, const ov::Output& bias, const ov::Output& decompression_scale, const ov::element::Type output_type) - : FullyConnected(A, B, bias, output_type) { + : FullyConnected(A, B, bias, output_type) + , m_has_zp(false) + , m_has_activation_scale(false) { set_argument(3, decompression_scale); validate_and_infer_types(); } +FullyConnectedCompressed::FullyConnectedCompressed(const OutputVector& inputs, + bool has_zp, + bool has_activation_scale, + const ov::element::Type output_type) + : FullyConnected(inputs[0], inputs[1], inputs[2], output_type) + , m_has_zp(has_zp) + , m_has_activation_scale(has_activation_scale) +{ + for (size_t i = 3; i < inputs.size(); i++) + set_argument(i, inputs[i]); + validate_and_infer_types(); +} + std::shared_ptr FullyConnectedCompressed::clone_with_new_inputs(const ov::OutputVector& new_args) const { check_new_args_count(this, new_args); - if (new_args.size() == 4) - return std::make_shared(new_args.at(0), - new_args.at(1), - new_args.at(2), - new_args.at(3), - m_output_type); - else if (new_args.size() == 5) - return std::make_shared(new_args.at(0), - new_args.at(1), - new_args.at(2), - new_args.at(3), - new_args.at(4), - m_output_type); - else - OPENVINO_THROW("Unexpected inputs count for FullyConnectedCompressed op: ", new_args.size()); + auto input_size = new_args.size(); + size_t expected_inputs = 4; + if (m_has_zp) + expected_inputs++; + if (m_has_activation_scale) + expected_inputs++; + NODE_VALIDATION_CHECK(this, + input_size == expected_inputs, + "Number of inputs is incorrect. 
Current value is: ", + input_size, + ", expected ", + expected_inputs); + + return std::make_shared(new_args, + m_has_zp, + m_has_activation_scale, + m_output_type); } } // namespace op diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index dbe7e858c1e6fe..c436f5f91b8196 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -71,6 +71,7 @@ #include "plugin/transformations/unsqueeze_broadcast_reshape_matmul_fusion.hpp" #include "plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.hpp" #include "plugin/transformations/group_norm_composition.hpp" +#include "plugin/transformations/dynamic_quantize_fully_connected.hpp" #include "transformations/common_optimizations/rms_fusion.hpp" #include "transformations/common_optimizations/broadcast_elementwise_fusion.hpp" #include "transformations/common_optimizations/broadcast_transition.hpp" @@ -847,6 +848,14 @@ void TransformationsPipeline::apply(std::shared_ptr func) { pass_config->disable(); pass_config->disable(); + auto dynamic_quantization_group_size = config.get_property(ov::hint::dynamic_quantization_group_size); + GPU_DEBUG_IF(cldnn::debug_configuration::get_instance()->dynamic_quantization_group_size > 0) { + dynamic_quantization_group_size = cldnn::debug_configuration::get_instance()->dynamic_quantization_group_size; + } + + if (device_info.supports_immad && dynamic_quantization_group_size == 1048576) // XXX: 1048576 is considered per-token + manager.register_pass(dynamic_quantization_group_size); + // This is supposed to be the last pass to ensure that we don't have name collisions until // GPU plugin stops using friendly names for program creation manager.register_pass(true); diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp index 
9a34d77ae47f3e..7f5859702ee1a6 100644 --- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp +++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp @@ -166,6 +166,7 @@ static void print_help_messages() { " For example fc:onednn gemm:onednn reduce:ocl do:cpu" " For primitives fc, gemm, do, reduce, concat are supported. Separated by space."); message_list.emplace_back("OV_GPU_MaxKernelsPerBatch", "Maximum number of kernels in a batch during compiling kernels"); + message_list.emplace_back("OV_GPU_DynamicQuantizationGroupSize", "Set dynamic quantization group size"); message_list.emplace_back("OV_GPU_ImplsCacheCapacity", "The maximum number of entries in the kernel impl cache"); message_list.emplace_back("OV_GPU_DisableAsyncCompilation", "Disable async compilation"); message_list.emplace_back("OV_GPU_DisableWinogradConv", "Disable Winograd convolution"); @@ -237,6 +238,7 @@ debug_configuration::debug_configuration() , base_batch_for_memory_estimation(-1) , serialize_compile(0) , max_kernels_per_batch(0) + , dynamic_quantization_group_size(0) , impls_cache_capacity(-1) , enable_sdpa(-1) , disable_async_compilation(0) @@ -289,6 +291,7 @@ debug_configuration::debug_configuration() std::string forced_impl_types_str; get_gpu_debug_env_var("ForceImplTypes", forced_impl_types_str); get_gpu_debug_env_var("MaxKernelsPerBatch", max_kernels_per_batch); + get_gpu_debug_env_var("DynamicQuantizationGroupSize", dynamic_quantization_group_size); get_gpu_debug_env_var("ImplsCacheCapacity", impls_cache_capacity); get_gpu_debug_env_var("EnableSDPA", enable_sdpa); get_gpu_debug_env_var("DisableAsyncCompilation", disable_async_compilation); diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp index b7bb9947717ad0..b59932a4a3c2d9 100644 --- a/src/plugins/intel_gpu/src/runtime/execution_config.cpp +++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp @@ -46,6 +46,7 @@ void 
ExecutionConfig::set_default() { std::make_tuple(ov::hint::execution_mode, ov::hint::ExecutionMode::PERFORMANCE), std::make_tuple(ov::hint::num_requests, 0), std::make_tuple(ov::hint::enable_cpu_pinning, false), + std::make_tuple(ov::hint::dynamic_quantization_group_size, 0), std::make_tuple(ov::intel_gpu::hint::host_task_priority, ov::hint::Priority::MEDIUM), std::make_tuple(ov::intel_gpu::hint::queue_throttle, ov::intel_gpu::hint::ThrottleLevel::MEDIUM), diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp new file mode 100644 index 00000000000000..abea5a0fe6f2aa --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp @@ -0,0 +1,123 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "intel_gpu/primitives/tile.hpp" +#include "intel_gpu/runtime/internal_properties.hpp" +#include "intel_gpu/runtime/layout.hpp" +#include "openvino/core/partial_shape.hpp" +#include "test_utils.h" +#include "random_generator.hpp" +#include "network_test.h" +#include +#include +#include "intel_gpu/primitives/dynamic_quantize.hpp" +#include +#include + +#include "intel_gpu/runtime/compilation_context.hpp" +#include "fully_connected_inst.h" + +#include + +using namespace cldnn; +using namespace ::tests; + +class dynamic_quantization_gpu_tests: public ::testing::Test { +public: + + void test_dynamic_quantization(bool is_caching_test, bool is_dynamic, int batch = 1, int ifm = 1024) { + tests::random_generator rg(GET_SUITE_NAME); + auto& engine = get_test_engine(); + + long int batch_num = batch; + long int ifm_num = ifm; + + bool is_4d = true; + + auto input_ps = is_4d ? ov::PartialShape{ batch_num, 1, 1, ifm_num } : ov::PartialShape{ batch_num, ifm_num}; + auto dyn_input_ps = is_4d ? 
ov::PartialShape{ -1, 1, 1, ifm_num } : ov::PartialShape{ -1, ifm_num}; + auto input_mem = engine.allocate_memory({ input_ps, data_types::f32, format::bfyx }); + + auto input_data = rg.generate_random_1d(batch_num * ifm_num, -16.0f, 16.0f); + set_values(input_mem, input_data); + + auto in_layout_f32 = is_dynamic ? layout{ dyn_input_ps, data_types::f32, format::bfyx } + : layout{ input_ps, data_types::f32, format::bfyx }; + + auto in_layout = is_dynamic ? layout{ dyn_input_ps, data_types::f16, format::bfyx } + : layout{ input_ps, data_types::f16, format::bfyx }; + + auto reorder_1 = reorder("reorder_1", input_info("input"), layout{ input_ps, data_types::f16, format::bfyx }); + auto dyn_quan_prim = dynamic_quantize("dyn_quan_prim", input_info("reorder_1"), 32, {data_types::f16, data_types::i8}); + auto reorder_2 = reorder("reorder_2", input_info("dyn_quan_prim"), layout{ input_ps, data_types::f16, format::bfyx }); + + // Implemented dynamic quantize kernel + auto get_ref_results = [&]() { + topology topology( + input_layout("input", in_layout_f32), + reorder_1, + dyn_quan_prim, + reorder_2 + ); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + + ov::intel_gpu::ImplementationDesc dyn_quan_impl_desc = { format::bfyx, "dynamic_quantize_gpu_ref", impl_types::ocl }; + config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"dyn_quan_prim", dyn_quan_impl_desc} })); + + network network(engine, topology, config); + network.set_input_data("input", input_mem); + + auto outputs = network.execute(); + + auto output_layout = outputs.begin()->second.get_layout(); + auto output_mem = outputs.begin()->second.get_memory(); + + return engine.reinterpret_buffer(*output_mem, output_layout); + }; + + topology topology( + input_layout("input", in_layout_f32), + reorder_1, + dyn_quan_prim, + reorder_2 + ); + + auto config = 
get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); + + network->set_input_data("input", input_mem); + + auto outputs = network->execute(); + + auto output_mem = outputs.begin()->second.get_memory(); + cldnn::mem_lock output_ptr (output_mem, get_test_stream()); + + auto ref_output_mem = get_ref_results(); + cldnn::mem_lock output_ptr_ref (ref_output_mem, get_test_stream()); + + size_t count = 0; + float max_diff = 0.f; + float avg = 0.f; + for (size_t i = 0; i < output_ptr_ref.size(); ++i) { + auto abs_diff = std::abs(output_ptr_ref[i] - output_ptr[i]); + if (max_diff < abs_diff) + max_diff = abs_diff; + avg = abs_diff; + count++; + // OPENVINO_ASSERT(abs_diff < 256); + } + /*GPU_DEBUG_LOG*/std::cout << "---> count: " << count << ", max_diff:" << max_diff << ", avg_diff: " << (avg/count) << std::endl; + } +}; + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_large_size) { + this->test_dynamic_quantization(false, false, 2048, 4096); +} + From bc3c4e85afb24435c53267628c2e5c1c8f8d362b Mon Sep 17 00:00:00 2001 From: "Min, Byung-il" Date: Tue, 2 Jul 2024 12:19:24 +0900 Subject: [PATCH 02/29] Modify dynamic quantize kernels Optimize dynamic_quantize_opt kernel Signed-off-by: Min, Byung-il --- .../cl_kernels/dynamic_quantize_gpu_opt.cl | 78 +++++++++++++------ .../dynamic_quantize_kernel_opt.cpp | 62 +++++++++++++-- .../test_cases/dynamic_quantize_gpu_test.cpp | 39 +++++++++- 3 files changed, 146 insertions(+), 33 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl index a61803cd4926ef..fedc1a2858259f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl +++ 
b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl @@ -8,7 +8,14 @@ #error "dynamic_quantize_gpu_opt.cl: Unsupported output dimension" #endif -REQD_SUB_GROUP_SIZE(16) +#define VLOAD_N CAT(vload, VEC_SIZE) +#define VSTORE_N CAT(vstore, VEC_SIZE) +#define CONVERT_CHAR_N CAT(convert_char, VEC_SIZE) +#define AS_TYPE_N_(type, n, x) as_##type##n(x) +#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x) +#define AS_INPUT_TYPE_N(x) AS_TYPE_N(INPUT0_TYPE, VEC_SIZE, x) + +REQD_SUB_GROUP_SIZE(SIMD) KERNEL(dynamic_quantize_gpu_opt)( OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input, @@ -16,42 +23,66 @@ KERNEL(dynamic_quantize_gpu_opt)( __global OUTPUT1_TYPE* output_scale) { const uint bf = (uint)get_global_id(2); const uint sglid = get_sub_group_local_id(); - const uint sgid = get_sub_group_id(); - const uint num_sg = get_num_sub_groups(); - const uint group_size = (INPUT0_FEATURE_PITCH / 16 / num_sg); - const uint offset = bf * INPUT0_FEATURE_PITCH + group_size * (sglid + 16 * sgid); - __local half partial_max[32]; // FIXME: 16 is an arbitrary number - half8 val; - half max; + const uint local_id = (uint)get_local_id(1); + + const uint block_size = SIMD * VEC_SIZE; + const uint b_offset = bf * INPUT0_FEATURE_PITCH; + + const uint offset = b_offset + VEC_SIZE * sglid; + + const uint iteration = ALIGNED_BLOCK_NUM / BLOCK_NUM; + + __local half local_mem[BLOCK_NUM]; + + MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_SIZE) val[iteration]; + MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_SIZE) abs_val; half max = 0.0h; half grp_max = 0.001h; + half max_value; - unroll_for (int i = 0; i < group_size/8; ++i) { - val = fabs(as_half8(vload8(0, input + offset + (i * 8)))); + unroll_for(int i = 0; i < iteration; ++i) { + if ((local_id * iteration + i) >= TOTAL_BLOCK_NUM) + continue; + + val[i] = AS_INPUT_TYPE_N(VLOAD_N(0, input + offset + ((local_id * iteration + i) * block_size))); + abs_val = fabs(val[i]); + + #if VEC_SIZE == 8 + max = fmax(fmax(fmax(abs_val[0], 
abs_val[1]), fmax(abs_val[2], abs_val[3])), + fmax(fmax(abs_val[4], abs_val[5]), fmax(abs_val[6], abs_val[7]))); + #else + for (int j = 0; j < VEC_SIZE; j++) { + max = fmax(max, abs_val[j]); + } + #endif - max = fmax(fmax(fmax(val[0], val[1]), fmax(val[2], val[3])), - fmax(fmax(val[4], val[5]), fmax(val[6], val[7]))); grp_max = fmax(grp_max, max); } - half max_value = sub_group_reduce_max(grp_max); - partial_max[sgid] = max_value; barrier(CLK_LOCAL_MEM_FENCE); - // calculate global max - max_value = partial_max[0]; - for (int i = 1; i < num_sg; i++) - max_value = fmax(max_value, partial_max[i]); + max_value = sub_group_reduce_max(grp_max); + if (sglid == 0) + local_mem[local_id] = max_value; + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int j = 0; j < BLOCK_NUM; j++) { + max_value = fmax(max_value, local_mem[j]); + } half scale = 127.0h / max_value; - unroll_for (int i = 0; i < group_size/8; ++i) { - val = as_half8(vload8(0, input + offset + i*8)); - val *= scale; - vstore8(convert_char8(val), 0, output + offset + i*8); + unroll_for(int i = 0; i < iteration; ++i) { + if ((local_id * iteration + i) >= TOTAL_BLOCK_NUM) + continue; + + val[i] *= scale; + VSTORE_N(CONVERT_CHAR_N(val[i]), 0, output + offset + ((local_id * iteration + i) * block_size)); } - if (sglid == 0 && sgid == 0) + if (sglid == 0 && local_id == 0) output_scale[bf] = 1.0h / scale; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp index fcf9d43341bb13..72e061fb66eda7 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp @@ -6,7 +6,34 @@ #include "kernel_selector_utils.h" #include + +static constexpr size_t simd = 16; + namespace kernel_selector { +static std::pair 
get_input_bf_size(const dynamic_quantize_params& params) { + size_t input_f = params.inputs[0].Feature().v; + size_t input_batch = params.inputs[0].Batch().v; + // 3D input + if (params.outputs[0].GetLayout() == DataLayout::bfyx) { + input_f = params.inputs[0].Y().v * params.inputs[0].X().v; + input_batch = params.inputs[0].Batch().v * params.inputs[0].Feature().v; + } + + return {input_batch, input_f}; +} + +static size_t get_match_vector_size(const dynamic_quantize_params& params) { + auto block_sizes = { 8, 4, 2 }; + + for (auto block_size : block_sizes) { + if (((params.inputs[0].X().v * params.inputs[0].Y().v) / simd) % block_size == 0) { + return block_size; + } + } + + return 1; +} + ParamsKey DynamicQuantizeKernelOpt::GetSupportedKey() const { ParamsKey k; k.EnableInputDataType(Datatype::F16); @@ -34,8 +61,23 @@ ParamsKey DynamicQuantizeKernelOpt::GetSupportedKey() const { JitConstants DynamicQuantizeKernelOpt::GetJitConstants(const dynamic_quantize_params& params) const { JitConstants jit = MakeBaseParamsJitConstants(params); + auto vec_size = get_match_vector_size(params); + auto bf_size = get_input_bf_size(params); + auto total_block_num = bf_size.second / (simd * vec_size); + size_t aligned_block_num = (total_block_num > 32) ? Align(total_block_num, 32) : total_block_num; + size_t block_num = (total_block_num > 32) ? 
32 : total_block_num; + + jit.AddConstant(MakeJitConstant("VEC_SIZE", vec_size)); + jit.AddConstant(MakeJitConstant("SIMD", simd)); + jit.AddConstant(MakeJitConstant("TOTAL_BLOCK_NUM", total_block_num)); + jit.AddConstant(MakeJitConstant("ALIGNED_BLOCK_NUM", aligned_block_num)); + jit.AddConstant(MakeJitConstant("BLOCK_NUM", block_num)); jit.Merge(GetTensorFriendlyWorkGroupsJit(params.outputs[0])); + GPU_DEBUG_LOG << "DynamicQuantizeKernelOpt VEC_SIZE(" << vec_size << ") input bfyx (" << params.inputs[0].Batch().v + << ", " << params.inputs[0].Feature().v << ", " << params.inputs[0].Y().v << ", " << params.inputs[0].X().v << ")" << std::endl; + + return jit; } @@ -43,8 +85,14 @@ CommonDispatchData DynamicQuantizeKernelOpt::SetDefault(const dynamic_quantize_p GPU_DEBUG_GET_INSTANCE(debug_config); CommonDispatchData dispatchData; - dispatchData.gws = {64, 1, params.inputs[0].Batch().v * params.inputs[0].Feature().v}; - dispatchData.lws = {64, 1, 1}; + auto vec_size = get_match_vector_size(params); + auto bf_size = get_input_bf_size(params); + size_t total_block_num = bf_size.second / (simd * vec_size); + size_t batch = get_input_bf_size(params).first; + size_t block_num = (total_block_num > 32) ? 
32 : total_block_num; + + dispatchData.gws = {simd, block_num, batch}; + dispatchData.lws = {simd, block_num, 1}; return dispatchData; } @@ -57,6 +105,9 @@ void DynamicQuantizeKernelOpt::GetUpdateDispatchDataFunc(KernelData& kd) const { kd.kernels[0].params.workGroups.global = dispatchData.gws; kd.kernels[0].params.workGroups.local = dispatchData.lws; kd.kernels[0].skip_execution = false; + + GPU_DEBUG_LOG << "Update Dispatch data DynamicQuantizeKernelOpt gws : " << dispatchData.gws[0] << ", " + << dispatchData.gws[1] << ", " << dispatchData.gws[2] << std::endl; }; } @@ -92,8 +143,6 @@ KernelsData DynamicQuantizeKernelOpt::GetKernelsData(const Params& params) const static_cast(prim_params.outputs.size()), prim_params.is_shape_agnostic); - // std::cout << ">> Select dynamic_quantize_kernel_opt : " << prim_params.outputs.size() << std::endl; - return {kd}; } @@ -118,7 +167,10 @@ bool DynamicQuantizeKernelOpt::Validate(const Params& params) const { const auto& dq_params = static_cast(params); // Todo : Add proper exception here - if ((dq_params.outputs[0].X().v * dq_params.outputs[0].Y().v % 32) != 0) + if (((dq_params.inputs[0].X().v * dq_params.inputs[0].Y().v) % (simd * 2)) != 0) + return false; + + if (dq_params.inputs[0].GetPaddedVal() != 0 || dq_params.outputs[0].GetPaddedVal() != 0) return false; return true; diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp index abea5a0fe6f2aa..3e6163bb10db14 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp @@ -97,10 +97,10 @@ class dynamic_quantization_gpu_tests: public ::testing::Test { auto outputs = network->execute(); auto output_mem = outputs.begin()->second.get_memory(); - cldnn::mem_lock output_ptr (output_mem, get_test_stream()); + cldnn::mem_lock output_ptr (output_mem, get_test_stream()); 
auto ref_output_mem = get_ref_results(); - cldnn::mem_lock output_ptr_ref (ref_output_mem, get_test_stream()); + cldnn::mem_lock output_ptr_ref (ref_output_mem, get_test_stream()); size_t count = 0; float max_diff = 0.f; @@ -111,9 +111,9 @@ class dynamic_quantization_gpu_tests: public ::testing::Test { max_diff = abs_diff; avg = abs_diff; count++; - // OPENVINO_ASSERT(abs_diff < 256); + OPENVINO_ASSERT(abs_diff < 1); } - /*GPU_DEBUG_LOG*/std::cout << "---> count: " << count << ", max_diff:" << max_diff << ", avg_diff: " << (avg/count) << std::endl; + GPU_DEBUG_LOG << "---> count: " << count << ", max_diff:" << max_diff << ", avg_diff: " << (avg/count) << std::endl; } }; @@ -121,3 +121,34 @@ TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_large_size) { this->test_dynamic_quantization(false, false, 2048, 4096); } +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_large_size_dynamic) { + this->test_dynamic_quantization(false, true, 2048, 4096); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_small_size) { + this->test_dynamic_quantization(false, false, 64, 4096); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_single_batch) { + this->test_dynamic_quantization(false, false, 1, 4096); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_ref_only) { + this->test_dynamic_quantization(false, false, 16, 33); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_ref_only_dynamic) { + this->test_dynamic_quantization(false, true, 16, 33); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_invalid) { + this->test_dynamic_quantization(false, false, 16, 7); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_unaligned) { + this->test_dynamic_quantization(false, false, 16, 32); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_unaligned_dynamic) { + this->test_dynamic_quantization(false, true, 16, 32); +} From e7ea3106b3c78cc3ee73ef7cd2dde216855d1fb6 Mon Sep 17 00:00:00 2001 From: 
"Kim, Mingyu" Date: Fri, 5 Jul 2024 12:23:00 +0900 Subject: [PATCH 03/29] code cleanup & accuracy fix --- .../intel_gpu/primitives/dynamic_quantize.hpp | 1 - .../intel_gpu/runtime/debug_configuration.hpp | 1 - .../graph_optimizer/build_implementations.cpp | 2 -- .../cl_kernels/dynamic_quantize_gpu_opt.cl | 4 +--- .../dynamic_quantize_kernel_opt.cpp | 14 ++------------ .../dynamic_quantize_kernel_ref.cpp | 2 -- .../intel_gpu/src/plugin/compiled_model.cpp | 1 - src/plugins/intel_gpu/src/plugin/plugin.cpp | 1 - .../src/plugin/transformations_pipeline.cpp | 6 +++--- .../intel_gpu/src/runtime/debug_configuration.cpp | 3 --- 10 files changed, 6 insertions(+), 29 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp index fc051f54bb6940..b1bdea17b66839 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp @@ -14,7 +14,6 @@ struct dynamic_quantize : public primitive_base { dynamic_quantize() : primitive_base("", {}), group_size(0) {} - // [TODO] should fix size of outputs /// @brief Constructs dynamic_quantize primitive /// @param id This primitive id /// @param input Input primitive id diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp index 2f60afbd029734..589b5dd96cb055 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp @@ -129,7 +129,6 @@ class debug_configuration { int serialize_compile; // Serialize creating primitives and compiling kernels std::vector forced_impl_types; // Force implementation type either ocl or onednn int max_kernels_per_batch; // Maximum number of kernels in a batch during compiling kernels - int 
dynamic_quantization_group_size; // Set dynamic quantization group size. Default is 0.(ignored) int impls_cache_capacity; // The maximum number of entries in the kernel impl cache int enable_sdpa; // Allows to control SDPA decomposition int disable_async_compilation; // Disable async compilation diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp index 84f1b26507f19e..d52b3709b7ad46 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp @@ -20,7 +20,6 @@ void build_implementations::run(program& p) { for (auto& n : p.get_processing_order()) { if (auto impl = n->get_selected_impl()) { auto params = n->get_kernel_impl_params(); - GPU_DEBUG_TRACE << "add_kernels_source: " << params->desc->id << std::endl; cache.add_kernels_source(*params, impl->get_kernels_source()); } } @@ -28,7 +27,6 @@ void build_implementations::run(program& p) { for (auto& n : p.get_processing_order()) { if (auto impl = n->get_selected_impl()) { auto params = n->get_kernel_impl_params(); - GPU_DEBUG_TRACE << "init_kernels: " << params->desc->id << std::endl; impl->init_kernels(cache, *params); impl->reset_kernels_source(); } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl index fedc1a2858259f..8b7c30f467fc6a 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl @@ -33,7 +33,7 @@ KERNEL(dynamic_quantize_gpu_opt)( const uint iteration = ALIGNED_BLOCK_NUM / BLOCK_NUM; - __local int local_mem[BLOCK_NUM]; + __local half local_mem[BLOCK_NUM]; MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_SIZE) val[iteration]; MAKE_VECTOR_TYPE(INPUT0_TYPE, 
VEC_SIZE) abs_val; @@ -60,8 +60,6 @@ KERNEL(dynamic_quantize_gpu_opt)( grp_max = fmax(grp_max, max); } - barrier(CLK_LOCAL_MEM_FENCE); - max_value = sub_group_reduce_max(grp_max); if (sglid == 0) local_mem[local_id] = max_value; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp index 72e061fb66eda7..bbe5727cd8d79f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp @@ -37,17 +37,7 @@ static size_t get_match_vector_size(const dynamic_quantize_params& params) { ParamsKey DynamicQuantizeKernelOpt::GetSupportedKey() const { ParamsKey k; k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::F32); - k.EnableInputDataType(Datatype::INT8); - k.EnableInputDataType(Datatype::UINT8); - k.EnableInputDataType(Datatype::INT32); - k.EnableInputDataType(Datatype::INT64); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::F32); k.EnableOutputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::UINT8); - k.EnableOutputDataType(Datatype::INT32); - k.EnableOutputDataType(Datatype::INT64); k.EnableDifferentTypes(); k.EnableAllInputLayout(); k.EnableAllOutputLayout(); @@ -74,7 +64,7 @@ JitConstants DynamicQuantizeKernelOpt::GetJitConstants(const dynamic_quantize_pa jit.AddConstant(MakeJitConstant("BLOCK_NUM", block_num)); jit.Merge(GetTensorFriendlyWorkGroupsJit(params.outputs[0])); - GPU_DEBUG_LOG << "DynamicQuantizeKernelOpt VEC_SIZE(" << vec_size << ") input bfyx (" << params.inputs[0].Batch().v + GPU_DEBUG_TRACE_DETAIL << "DynamicQuantizeKernelOpt VEC_SIZE(" << vec_size << ") input bfyx (" << params.inputs[0].Batch().v << ", " << params.inputs[0].Feature().v << ", " << params.inputs[0].Y().v << ", " << 
params.inputs[0].X().v << ")" << std::endl; @@ -106,7 +96,7 @@ void DynamicQuantizeKernelOpt::GetUpdateDispatchDataFunc(KernelData& kd) const { kd.kernels[0].params.workGroups.local = dispatchData.lws; kd.kernels[0].skip_execution = false; - GPU_DEBUG_LOG << "Update Dispatch data DynamicQuantizeKernelOpt gws : " << dispatchData.gws[0] << ", " + GPU_DEBUG_TRACE_DETAIL << "Update Dispatch data DynamicQuantizeKernelOpt gws : " << dispatchData.gws[0] << ", " << dispatchData.gws[1] << ", " << dispatchData.gws[2] << std::endl; }; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp index 46e7774e937b54..9bf5088dfa84e3 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp @@ -10,8 +10,6 @@ namespace kernel_selector { ParamsKey DynamicQuantizeKernelRef::GetSupportedKey() const { ParamsKey k; k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::F16); k.EnableOutputDataType(Datatype::INT8); k.EnableInputLayout(DataLayout::bfyx); k.EnableOutputLayout(DataLayout::bfyx); diff --git a/src/plugins/intel_gpu/src/plugin/compiled_model.cpp b/src/plugins/intel_gpu/src/plugin/compiled_model.cpp index 178e9b94d422aa..b9729ca7bf0f20 100644 --- a/src/plugins/intel_gpu/src/plugin/compiled_model.cpp +++ b/src/plugins/intel_gpu/src/plugin/compiled_model.cpp @@ -259,7 +259,6 @@ ov::Any CompiledModel::get_property(const std::string& name) const { ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RO}, ov::PropertyName{ov::device::id.name(), PropertyMutability::RO}, ov::PropertyName{ov::execution_devices.name(), PropertyMutability::RO}, - 
ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RO} }; } else if (name == ov::model_name) { return decltype(ov::model_name)::value_type {m_model_name}; diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index 9243b558be99d3..ac540bd2925c46 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -557,7 +557,6 @@ std::vector Plugin::get_supported_properties() const { ov::PropertyName{ov::hint::enable_cpu_pinning.name(), PropertyMutability::RW}, ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RW}, ov::PropertyName{ov::device::id.name(), PropertyMutability::RW}, - ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RW} }; return supported_properties; diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index c436f5f91b8196..5b96bc468e3e30 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -849,10 +849,10 @@ void TransformationsPipeline::apply(std::shared_ptr func) { pass_config->disable(); auto dynamic_quantization_group_size = config.get_property(ov::hint::dynamic_quantization_group_size); - GPU_DEBUG_IF(cldnn::debug_configuration::get_instance()->dynamic_quantization_group_size > 0) { - dynamic_quantization_group_size = cldnn::debug_configuration::get_instance()->dynamic_quantization_group_size; + GPU_DEBUG_IF(cldnn::debug_configuration::get_instance()->enable_dynamic_quantize) { + dynamic_quantization_group_size = 1048576; } - + dynamic_quantization_group_size = 1048576; // XXX: temporal enabling for test if (device_info.supports_immad && dynamic_quantization_group_size == 1048576) // XXX: 1048576 is considered per-token 
manager.register_pass(dynamic_quantization_group_size); diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp index 7f5859702ee1a6..9a34d77ae47f3e 100644 --- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp +++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp @@ -166,7 +166,6 @@ static void print_help_messages() { " For example fc:onednn gemm:onednn reduce:ocl do:cpu" " For primitives fc, gemm, do, reduce, concat are supported. Separated by space."); message_list.emplace_back("OV_GPU_MaxKernelsPerBatch", "Maximum number of kernels in a batch during compiling kernels"); - message_list.emplace_back("OV_GPU_DynamicQuantizationGroupSize", "Set dynamic quantization group size"); message_list.emplace_back("OV_GPU_ImplsCacheCapacity", "The maximum number of entries in the kernel impl cache"); message_list.emplace_back("OV_GPU_DisableAsyncCompilation", "Disable async compilation"); message_list.emplace_back("OV_GPU_DisableWinogradConv", "Disable Winograd convolution"); @@ -238,7 +237,6 @@ debug_configuration::debug_configuration() , base_batch_for_memory_estimation(-1) , serialize_compile(0) , max_kernels_per_batch(0) - , dynamic_quantization_group_size(0) , impls_cache_capacity(-1) , enable_sdpa(-1) , disable_async_compilation(0) @@ -291,7 +289,6 @@ debug_configuration::debug_configuration() std::string forced_impl_types_str; get_gpu_debug_env_var("ForceImplTypes", forced_impl_types_str); get_gpu_debug_env_var("MaxKernelsPerBatch", max_kernels_per_batch); - get_gpu_debug_env_var("DynamicQuantizationGroupSize", dynamic_quantization_group_size); get_gpu_debug_env_var("ImplsCacheCapacity", impls_cache_capacity); get_gpu_debug_env_var("EnableSDPA", enable_sdpa); get_gpu_debug_env_var("DisableAsyncCompilation", disable_async_compilation); From 99a6f1c2e55e2112724d22de01dcd345eb68d85e Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Tue, 23 Jul 2024 10:22:22 +0900 Subject: 
[PATCH 04/29] [GPU] restric dynamic_quantization condition for unittest pass --- .../dynamic_quantize_fully_connected.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp index 3ab890becef0ad..52a802c2771496 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -37,9 +37,17 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(size_t group_size) else if (pattern_map.find(fully_connected_compressed4) != pattern_map.end()) m_fc = std::dynamic_pointer_cast(pattern_map.at(fully_connected_compressed4).get_node_shared_ptr()); - const auto innermost_size = m_fc->get_input_partial_shape(0)[m_fc->get_input_partial_shape(0).size() - 1].get_length(); + if (m_data->get_element_type() == ov::element::Type_t::f32) + return false; + if (!m_data->is_dynamic()) + return false; + + auto weight_shape = m_fc->get_input_partial_shape(1); + const auto innermost_size = weight_shape[weight_shape.size() - 1].get_length(); if (group_size == 0 || (innermost_size % group_size != 0 && static_cast(innermost_size) > group_size)) return false; + if (innermost_size < 32) + return false; OutputVector fc_inputs; auto dyn_quan = std::make_shared(m_data, group_size); From 4d4e1a1880dc48c6c1925fbf789f93307e9d3c04 Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Tue, 23 Jul 2024 14:41:07 +0900 Subject: [PATCH 05/29] New test for dynamic quantization basic test passes dyn_quan test is newly introduced with accuracy issue corner_cases fails --- .../dynamic_quantize_fully_connected.cpp | 2 +- .../dynamic/matmul_weights_decompression.cpp | 46 +++++++++++++++---- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git 
a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp index 52a802c2771496..19367c86bb4aca 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -39,7 +39,7 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(size_t group_size) if (m_data->get_element_type() == ov::element::Type_t::f32) return false; - if (!m_data->is_dynamic()) + if (!m_fc->is_dynamic()) return false; auto weight_shape = m_fc->get_input_partial_shape(1); diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp index 9a439132af0bae..07d20db369028a 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp @@ -56,7 +56,9 @@ using MatmulWeightsDecompressionParams = std::tuple; // per-tensor zero-point + bool, // per-tensor zero-point + int // dynamic_quantization_group_size + >; class MatmulWeightsDecompression : public testing::WithParamInterface, virtual public ov::test::SubgraphBaseTest { @@ -69,6 +71,7 @@ class MatmulWeightsDecompression : public testing::WithParamInterface Date: Fri, 26 Jul 2024 10:55:10 +0900 Subject: [PATCH 06/29] [GPU] option cleanup for per-token quantization --- .../include/openvino/runtime/properties.hpp | 2 +- .../include/intel_gpu/op/dynamic_quantize.hpp | 6 ++-- .../intel_gpu/primitives/dynamic_quantize.hpp | 4 +-- .../intel_gpu/src/graph/dynamic_quantize.cpp | 4 +-- .../src/graph/include/dynamic_quantize_inst.h | 2 +- .../src/plugin/ops/dynamic_quantize.cpp | 2 +- .../dynamic_quantize_fully_connected.cpp | 
30 +++++++++++++++---- .../dynamic_quantize_fully_connected.hpp | 2 +- .../transformations/op/dynamic_quantize.cpp | 4 +-- .../src/plugin/transformations_pipeline.cpp | 6 ++-- 10 files changed, 39 insertions(+), 23 deletions(-) diff --git a/src/inference/include/openvino/runtime/properties.hpp b/src/inference/include/openvino/runtime/properties.hpp index e0f7df1b16b0c2..7320f9c0921ca7 100644 --- a/src/inference/include/openvino/runtime/properties.hpp +++ b/src/inference/include/openvino/runtime/properties.hpp @@ -571,7 +571,7 @@ static constexpr Property execution_mode{"EXECUTION_MODE_HINT"}; * might result in better accuracy, but the drawback is worse performance. Group size equal 0 means dynamic * quantization optimization is disabled. */ -static constexpr Property dynamic_quantization_group_size{ +static constexpr Property dynamic_quantization_group_size{ "DYNAMIC_QUANTIZATION_GROUP_SIZE"}; /** diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp index ae8c1f5e7461d3..10bc9aa13cb51c 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp @@ -19,15 +19,15 @@ class DynamicQuantize : public ov::op::Op { /// \brief Constructs an DynamicQuantize operation. 
/// /// \param data Input tensor with data - DynamicQuantize(const Output& data, size_t group_size); + DynamicQuantize(const Output& data, int64_t group_size); void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; - size_t get_group_size() { return m_group_size; }; + int64_t get_group_size() { return m_group_size; }; private: - size_t m_group_size; + int64_t m_group_size; }; std::vector shape_infer(const DynamicQuantize* op, std::vector input_shapes); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp index b1bdea17b66839..f732f66dad0eb2 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp @@ -21,13 +21,13 @@ struct dynamic_quantize : public primitive_base { /// @param data_type Output data type of quantized dynamic_quantize(const primitive_id& id, const input_info& input, - const size_t group_size, + const int64_t group_size, const std::vector data_types = {optional_data_type(data_types::f16), optional_data_type(data_types::i8)}, const padding& output_padding = padding()) : primitive_base(id, {input}, 2, data_types, {output_padding}) , group_size(group_size) {} - size_t group_size = 0; + int64_t group_size = 0; size_t hash() const override { size_t seed = primitive::hash(); diff --git a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp index 9938ba6f08358c..bbd3d3e9426388 100644 --- a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp @@ -22,7 +22,7 @@ layout dynamic_quantize_inst::calc_output_layout(dynamic_quantize_node const& no } template -std::vector dynamic_quantize_inst::__calc_output_layouts(layout &act_layout, size_t group_size) { +std::vector 
dynamic_quantize_inst::__calc_output_layouts(layout &act_layout, int64_t group_size) { ov::intel_gpu::op::DynamicQuantize op; auto output_format = act_layout.format; @@ -35,7 +35,7 @@ std::vector dynamic_quantize_inst::__calc_output_layouts(layout &act_lay return { layout(output_shapes[0], data_types::i8, output_format), layout(output_shapes[1], data_types::f16, output_format) }; } -template std::vector dynamic_quantize_inst::__calc_output_layouts(layout &act_layout, size_t group_size); +template std::vector dynamic_quantize_inst::__calc_output_layouts(layout &act_layout, int64_t group_size); template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& /*node*/, const kernel_impl_params& impl_param) { diff --git a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h index 6458b28dda877d..8c548295180740 100644 --- a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h @@ -35,7 +35,7 @@ class typed_primitive_inst : public typed_primitive_inst_base< // Internal function to be used from fakealignment template - static std::vector __calc_output_layouts(layout &act_layout, size_t group_size); + static std::vector __calc_output_layouts(layout &act_layout, int64_t group_size); static std::string to_string(dynamic_quantize_node const& node); typed_primitive_inst(network& network, dynamic_quantize_node const& node); diff --git a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp index e5a43b4c62541f..bd65804ec283b5 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp @@ -23,7 +23,7 @@ static void CreateDynamicQuantizeOp(ProgramBuilder& p, const std::shared_ptrget_group_size() == 1048576, "Not supported group size: ", op->get_group_size()); + 
OPENVINO_ASSERT(op->get_group_size() == -1, "Not supported group size: ", op->get_group_size()); auto prim = cldnn::dynamic_quantize(primitive_name, inputs[0], op->get_group_size(), diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp index 19367c86bb4aca..65ecb1c3d245ae 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -11,18 +11,27 @@ #include "openvino/pass/pattern/op/or.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" #include "transformations/utils/utils.hpp" +#include "intel_gpu/runtime/debug_configuration.hpp" namespace ov { namespace intel_gpu { -DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(size_t group_size) { +DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(int64_t group_size) { + GPU_DEBUG_GET_INSTANCE(debug_config); using namespace ov::pass::pattern; + // per-token quantization is supported + if (group_size != -1) { + GPU_DEBUG_TRACE << "Dynamic quantization is disabled " << group_size << std::endl; + return; + } + auto data = any_input(); auto fully_connected_compressed3 = wrap_type({data, any_input(), any_input(), any_input()}); auto fully_connected_compressed4 = wrap_type({data, any_input(), any_input(), any_input(), any_input()}); auto fully_connected_compressed = std::make_shared(OutputVector{fully_connected_compressed3, fully_connected_compressed4}); + ov::matcher_pass_callback callback = [=](Matcher& m) { if (transformation_callback(m.get_match_root())) { return false; @@ -37,17 +46,26 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(size_t group_size) else if (pattern_map.find(fully_connected_compressed4) != pattern_map.end()) m_fc = 
std::dynamic_pointer_cast(pattern_map.at(fully_connected_compressed4).get_node_shared_ptr()); - if (m_data->get_element_type() == ov::element::Type_t::f32) + if (m_data->get_element_type() == ov::element::Type_t::f32) { + GPU_DEBUG_TRACE << m_fc->get_friendly_name() << " Dynamic quantization is turned off because input type is not supported" << std::endl; return false; - if (!m_fc->is_dynamic()) + } + if (!m_fc->is_dynamic()) { + GPU_DEBUG_TRACE << m_fc->get_friendly_name() << " Dynamic quantization is turned off because static shape is not supported" << std::endl; return false; + } auto weight_shape = m_fc->get_input_partial_shape(1); - const auto innermost_size = weight_shape[weight_shape.size() - 1].get_length(); - if (group_size == 0 || (innermost_size % group_size != 0 && static_cast(innermost_size) > group_size)) + const int64_t innermost_size = weight_shape[weight_shape.size() - 1].get_length(); + if (group_size != -1 && + (group_size == 0 || (innermost_size % group_size != 0 && innermost_size > group_size))) { + GPU_DEBUG_TRACE << "Dynamic quantization: shape is not aligned with group size " << innermost_size << " / " << group_size << std::endl; return false; - if (innermost_size < 32) + } + if (innermost_size < 32) { + GPU_DEBUG_TRACE << "Dynamic quantization: shape is too small " << innermost_size << " / " << group_size << std::endl; return false; + } OutputVector fc_inputs; auto dyn_quan = std::make_shared(m_data, group_size); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.hpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.hpp index 35ae934e91d6f3..6de7d355cbb8a2 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.hpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.hpp @@ -12,7 +12,7 @@ namespace intel_gpu { class DynamicQuantizeFullyConnected: public ov::pass::MatcherPass { public: 
OPENVINO_RTTI("DynamicQuantizeFullyConnected", "0"); - DynamicQuantizeFullyConnected(size_t group_size); + DynamicQuantizeFullyConnected(int64_t group_size); }; } // namespace intel_gpu diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp index c8ca959f791f44..f9af3543d487a9 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp @@ -7,13 +7,12 @@ #include "openvino/core/validation_util.hpp" #include "openvino/op/variadic_split.hpp" #include "variadic_split_shape_inference.hpp" -#include "intel_gpu/runtime/debug_configuration.hpp" namespace ov { namespace intel_gpu { namespace op { -DynamicQuantize::DynamicQuantize(const Output& data, size_t group_size) +DynamicQuantize::DynamicQuantize(const Output& data, int64_t group_size) : Op({data}) , m_group_size(group_size) { set_output_size(2); @@ -36,7 +35,6 @@ std::shared_ptr DynamicQuantize::clone_with_new_inputs(const ov::OutputVec } std::vector shape_infer(const DynamicQuantize* op, std::vector input_shapes) { - GPU_DEBUG_GET_INSTANCE(debug_config); std::vector out_shapes; out_shapes.push_back(input_shapes[0]); // FIXME: generalize to N-dim case diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 5b96bc468e3e30..a75da1fd219236 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -850,10 +850,10 @@ void TransformationsPipeline::apply(std::shared_ptr func) { auto dynamic_quantization_group_size = config.get_property(ov::hint::dynamic_quantization_group_size); GPU_DEBUG_IF(cldnn::debug_configuration::get_instance()->enable_dynamic_quantize) { - dynamic_quantization_group_size = 1048576; + dynamic_quantization_group_size = -1; 
} - dynamic_quantization_group_size = 1048576; // XXX: temporal enabling for test - if (device_info.supports_immad && dynamic_quantization_group_size == 1048576) // XXX: 1048576 is considered per-token + + if (device_info.supports_immad) manager.register_pass(dynamic_quantization_group_size); // This is supposed to be the last pass to ensure that we don't have name collisions until From 3dd6acd7ac1cf2551a3b08fb95b94f7f29bd5f9b Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Fri, 26 Jul 2024 11:04:20 +0900 Subject: [PATCH 07/29] minor fix --- .../src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl | 2 -- .../subgraph_tests/dynamic/matmul_weights_decompression.cpp | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl index 14e6f8383e07f7..436276a67e48c0 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl @@ -51,7 +51,5 @@ KERNEL(dynamic_quantize_gpu_ref)( output[out_offset + x] = convert_char(input[in_offset + x] * scale); } - ushort8 test = vload8(0, (ushort*)input + INPUT0_GET_INDEX(b, f, 0, 0)); - output_scale[scale_idx] = 1.0h / scale; } diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp index 07d20db369028a..00726a1d591be7 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp @@ -89,7 +89,7 @@ class MatmulWeightsDecompression : public testing::WithParamInterface Date: Fri, 26 Jul 2024 12:17:26 +0900 Subject: [PATCH 08/29] code cleanup --- 
.../intel_gpu/primitives/dynamic_quantize.hpp | 8 ++++---- .../intel_gpu/primitives/fully_connected.hpp | 3 ++- .../intel_gpu/src/graph/dynamic_quantize.cpp | 6 +++--- .../graph_optimizer/build_implementations.cpp | 1 - .../src/graph/impls/ocl/dynamic_quantize.cpp | 2 +- .../impls/onednn/fully_connected_onednn.cpp | 16 ++++++---------- .../dynamic_quantize_kernel_opt.cpp | 10 ---------- .../dynamic_quantize_kernel_ref.cpp | 10 ---------- .../dynamic/matmul_weights_decompression.cpp | 6 +++--- 9 files changed, 19 insertions(+), 43 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp index f732f66dad0eb2..c43aab50c82c52 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp @@ -17,14 +17,14 @@ struct dynamic_quantize : public primitive_base { /// @brief Constructs dynamic_quantize primitive /// @param id This primitive id /// @param input Input primitive id - /// @param output_size Output data size of the primitive + /// @param group_size Quantization group size /// @param data_type Output data type of quantized + /// @param output_size Output data size of the primitive dynamic_quantize(const primitive_id& id, const input_info& input, const int64_t group_size, - const std::vector data_types = {optional_data_type(data_types::f16), optional_data_type(data_types::i8)}, - const padding& output_padding = padding()) - : primitive_base(id, {input}, 2, data_types, {output_padding}) + const std::vector data_types = {optional_data_type(data_types::f16), optional_data_type(data_types::i8)}) + : primitive_base(id, {input}, 2, data_types) , group_size(group_size) {} int64_t group_size = 0; diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/fully_connected.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/fully_connected.hpp index 
2121bf9e122c49..e39078cb1011cc 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/fully_connected.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/fully_connected.hpp @@ -100,7 +100,8 @@ struct fully_connected : public primitive_base { weights_rank(weights_rank) { OPENVINO_ASSERT(!decompression_scale.empty(), "[GPU] Compressed fully connected requires at least decompression scale input"); } - /// @brief Constructs fully connected compressed layer. + + /// @brief Constructs fully connected compressed layer. /// @param id This primitive id. /// @param input Input primitive id. /// @param weights Primitive id containing weights data. diff --git a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp index bbd3d3e9426388..bd7839c9cbe40a 100644 --- a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp @@ -30,18 +30,18 @@ std::vector dynamic_quantize_inst::__calc_output_layouts(layout &act_lay act_layout.get(), }; - std::vector output_shapes = shape_infer(&op, input_shapes); + auto output_shapes = shape_infer(&op, input_shapes); return { layout(output_shapes[0], data_types::i8, output_format), layout(output_shapes[1], data_types::f16, output_format) }; - } + template std::vector dynamic_quantize_inst::__calc_output_layouts(layout &act_layout, int64_t group_size); template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& /*node*/, const kernel_impl_params& impl_param) { auto desc = impl_param.typed_desc(); auto input_layout = impl_param.get_input_layout(); - return __calc_output_layouts(input_layout, 0 /*NOT IMPLEMENTED*/); + return __calc_output_layouts(input_layout, 0 /* TODO: handle group_size here */); } template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& node, diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp 
b/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp index d52b3709b7ad46..4c1b1008434144 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp @@ -6,7 +6,6 @@ #include "program_helpers.h" #include "intel_gpu/runtime/itt.hpp" -#include "intel_gpu/runtime/debug_configuration.hpp" using namespace cldnn; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp index bf087cd4258e7f..91f141ae062723 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp @@ -33,7 +33,7 @@ struct dynamic_quantize_impl : typed_primitive_impl_ocl { } static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { - /// XXX: handle group size here + /// TODO: handle group_size here auto params = get_default_params(impl_param, is_shape_agnostic); params.outputs.push_back(convert_data_tensor(impl_param.get_output_layout(1))); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp index 5cd06a16f734fb..d84f5d222512a5 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp @@ -79,6 +79,7 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { if (prim->activation_scale.is_valid()) { auto activation_scale_idx = idx++; auto act_scale_mem = instance.dep_memory_ptr(activation_scale_idx); + // TODO: handle group_size here dnnl::memory::desc desc = onednn::layout_to_memory_desc(act_scale_mem->get_layout(), dnnl::memory::format_tag::a, true); args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0, act_scale_mem->get_onednn_memory(desc)}); 
} @@ -287,12 +288,12 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { } bool has_decompression_zp = !prim->decompression_zero_point.empty() || prim->decompression_zero_point_scalar.has_value(); + auto& arg = impl_params->get_program().get_node(impl_params->desc->id).as(); + int idx = !arg.bias_term() ? 3 : 4; if (has_decompression_zp) { ib >> make_data(&_dzp_data_type, sizeof(dnnl::memory::data_type)); - auto& arg = impl_params->get_program().get_node(impl_params->desc->id).as(); - auto decompression_zp_idx = !arg.bias_term() ? 3 : 4; - auto dzp_layout = arg.get_dependency(decompression_zp_idx).get_output_layout(); + auto dzp_layout = arg.get_dependency(idx++).get_output_layout(); if (dzp_layout.count() == 1) { _attrs->set_zero_points(DNNL_ARG_WEIGHTS, 0, dnnl::memory::dims{}, _dzp_data_type); @@ -307,16 +308,11 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { } if (dynamic_quantized_activation) { - int input_idx = has_bias ? 2 : 1; - if (has_decompression_scale) - input_idx++; - if (has_decompression_zp) - input_idx++; - // Note: it supports per-token activation scale only + // TODO: it supports per-token activation scale only auto partial_shape = impl_params->get_input_layout(0).get_partial_shape(); auto innermost_len = partial_shape[partial_shape.size() - 1].get_length(); - auto act_scale_data_type = convert_data_type(impl_params->get_input_layout(input_idx).data_type); + auto act_scale_data_type = convert_data_type(impl_params->get_input_layout(idx).data_type); _attrs->set_scales(DNNL_ARG_SRC, (1 << 1) | (1 << 0), dnnl::memory::dims{1, innermost_len}, act_scale_data_type); } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp index bbe5727cd8d79f..159cca70f68c03 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp 
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp @@ -140,16 +140,6 @@ KernelsPriority DynamicQuantizeKernelOpt::GetKernelsPriority(const Params& /*par return FORCE_PRIORITY_2; } -Datatype DynamicQuantizeKernelOpt::GetAccumulatorType(const dynamic_quantize_params& params) const { - Datatype types[] = { Datatype::F32, Datatype::F16, Datatype::INT64, Datatype::INT32, Datatype::UINT32}; - - for (Datatype type : types) - for (auto& in : params.inputs) - if (in.GetDType() == type) - return type; - return Datatype::F32; -} - bool DynamicQuantizeKernelOpt::Validate(const Params& params) const { if (!KernelBaseOpenCL::Validate(params)) return false; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp index 9bf5088dfa84e3..f1b2e229fcdec0 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp @@ -89,16 +89,6 @@ KernelsPriority DynamicQuantizeKernelRef::GetKernelsPriority(const Params& /*par return FORCE_PRIORITY_8; } -Datatype DynamicQuantizeKernelRef::GetAccumulatorType(const dynamic_quantize_params& params) const { - Datatype types[] = { Datatype::F32, Datatype::F16, Datatype::INT64, Datatype::INT32, Datatype::UINT32}; - - for (Datatype type : types) - for (auto& in : params.inputs) - if (in.GetDType() == type) - return type; - return Datatype::F32; -} - bool DynamicQuantizeKernelRef::Validate(const Params& params) const { if (!KernelBaseOpenCL::Validate(params)) return false; diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp index 
00726a1d591be7..0425ba82d1c405 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp @@ -374,9 +374,9 @@ INSTANTIATE_TEST_SUITE_P(MatMulCompressedWeights_corner_cases_big, MatmulWeightsDecompression::get_test_case_name); -// FIXME: per_tensor_zp=0 is not supported -// FIXME: transpose_weights is not supported -// FIXME: weight precision u4 is only supported +// per_tensor_zp=0 is not supported +// transpose_weights is not supported +// weight precision u4 is only supported INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_dyn_quan, MatmulWeightsDecompression, ::testing::Combine(::testing::Values(ShapeParams{{{-1, -1, 4096}, {{1, 1, 4096}}}, {1, 4096, 4096}}), // shape From c48e41f96abdbce705905e3b5a85194a1594b7b8 Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Sat, 27 Jul 2024 12:13:39 +0900 Subject: [PATCH 09/29] update onednn version --- src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index 7ab8ee9adda866..915b0d4c3954fe 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit 7ab8ee9adda866d675edeee7a3a6a29b2d0a1572 +Subproject commit 915b0d4c3954fe78f5e9b4dc28221b858f8450bd From b64e9744e61ce349899055f186b8bcf8c1a57e3d Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Sat, 27 Jul 2024 12:31:33 +0900 Subject: [PATCH 10/29] changing group size to size_t --- .../include/openvino/runtime/properties.hpp | 2 +- .../include/intel_gpu/op/dynamic_quantize.hpp | 7 ++++--- .../intel_gpu/primitives/dynamic_quantize.hpp | 4 ++-- src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp | 4 ++-- .../src/graph/include/dynamic_quantize_inst.h | 2 +- 
.../intel_gpu/src/plugin/ops/dynamic_quantize.cpp | 2 +- .../dynamic_quantize_fully_connected.cpp | 12 ++++++++---- .../dynamic_quantize_fully_connected.hpp | 2 +- .../plugin/transformations/op/dynamic_quantize.cpp | 2 +- .../src/plugin/transformations_pipeline.cpp | 5 ++--- 10 files changed, 23 insertions(+), 19 deletions(-) diff --git a/src/inference/include/openvino/runtime/properties.hpp b/src/inference/include/openvino/runtime/properties.hpp index 7320f9c0921ca7..e0f7df1b16b0c2 100644 --- a/src/inference/include/openvino/runtime/properties.hpp +++ b/src/inference/include/openvino/runtime/properties.hpp @@ -571,7 +571,7 @@ static constexpr Property execution_mode{"EXECUTION_MODE_HINT"}; * might result in better accuracy, but the drawback is worse performance. Group size equal 0 means dynamic * quantization optimization is disabled. */ -static constexpr Property dynamic_quantization_group_size{ +static constexpr Property dynamic_quantization_group_size{ "DYNAMIC_QUANTIZATION_GROUP_SIZE"}; /** diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp index 10bc9aa13cb51c..194bb26e792896 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp @@ -19,15 +19,16 @@ class DynamicQuantize : public ov::op::Op { /// \brief Constructs an DynamicQuantize operation. 
/// /// \param data Input tensor with data - DynamicQuantize(const Output& data, int64_t group_size); + /// \param group_size Group size for dynamic quantization + DynamicQuantize(const Output& data, size_t group_size); void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; - int64_t get_group_size() { return m_group_size; }; + size_t get_group_size() { return m_group_size; }; private: - int64_t m_group_size; + size_t m_group_size; }; std::vector shape_infer(const DynamicQuantize* op, std::vector input_shapes); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp index c43aab50c82c52..3ad8b4cebde7b0 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp @@ -22,12 +22,12 @@ struct dynamic_quantize : public primitive_base { /// @param output_size Output data size of the primitive dynamic_quantize(const primitive_id& id, const input_info& input, - const int64_t group_size, + const size_t group_size, const std::vector data_types = {optional_data_type(data_types::f16), optional_data_type(data_types::i8)}) : primitive_base(id, {input}, 2, data_types) , group_size(group_size) {} - int64_t group_size = 0; + size_t group_size = 0; size_t hash() const override { size_t seed = primitive::hash(); diff --git a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp index bd7839c9cbe40a..5cb68e00ab7814 100644 --- a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp @@ -22,7 +22,7 @@ layout dynamic_quantize_inst::calc_output_layout(dynamic_quantize_node const& no } template -std::vector dynamic_quantize_inst::__calc_output_layouts(layout &act_layout, int64_t group_size) { +std::vector 
dynamic_quantize_inst::__calc_output_layouts(layout &act_layout, size_t group_size) { ov::intel_gpu::op::DynamicQuantize op; auto output_format = act_layout.format; @@ -35,7 +35,7 @@ std::vector dynamic_quantize_inst::__calc_output_layouts(layout &act_lay return { layout(output_shapes[0], data_types::i8, output_format), layout(output_shapes[1], data_types::f16, output_format) }; } -template std::vector dynamic_quantize_inst::__calc_output_layouts(layout &act_layout, int64_t group_size); +template std::vector dynamic_quantize_inst::__calc_output_layouts(layout &act_layout, size_t group_size); template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& /*node*/, const kernel_impl_params& impl_param) { diff --git a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h index 8c548295180740..6458b28dda877d 100644 --- a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h @@ -35,7 +35,7 @@ class typed_primitive_inst : public typed_primitive_inst_base< // Internal function to be used from fakealignment template - static std::vector __calc_output_layouts(layout &act_layout, int64_t group_size); + static std::vector __calc_output_layouts(layout &act_layout, size_t group_size); static std::string to_string(dynamic_quantize_node const& node); typed_primitive_inst(network& network, dynamic_quantize_node const& node); diff --git a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp index bd65804ec283b5..e5a43b4c62541f 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp @@ -23,7 +23,7 @@ static void CreateDynamicQuantizeOp(ProgramBuilder& p, const std::shared_ptrget_group_size() == -1, "Not supported group size: ", op->get_group_size()); + 
OPENVINO_ASSERT(op->get_group_size() == 1048576, "Not supported group size: ", op->get_group_size()); auto prim = cldnn::dynamic_quantize(primitive_name, inputs[0], op->get_group_size(), diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp index 65ecb1c3d245ae..7e420252fa4259 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -16,12 +16,16 @@ namespace ov { namespace intel_gpu { -DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(int64_t group_size) { +DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(size_t group_size) { GPU_DEBUG_GET_INSTANCE(debug_config); using namespace ov::pass::pattern; + GPU_DEBUG_IF(cldnn::debug_configuration::get_instance()->enable_dynamic_quantize) { + group_size = 1048576; + } + // per-token quantization is supported - if (group_size != -1) { + if (group_size != 1048576) { GPU_DEBUG_TRACE << "Dynamic quantization is disabled " << group_size << std::endl; return; } @@ -56,8 +60,8 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(int64_t group_size) } auto weight_shape = m_fc->get_input_partial_shape(1); - const int64_t innermost_size = weight_shape[weight_shape.size() - 1].get_length(); - if (group_size != -1 && + const size_t innermost_size = weight_shape[weight_shape.size() - 1].get_length(); + if (group_size != 1048576 && (group_size == 0 || (innermost_size % group_size != 0 && innermost_size > group_size))) { GPU_DEBUG_TRACE << "Dynamic quantization: shape is not aligned with group size " << innermost_size << " / " << group_size << std::endl; return false; diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.hpp 
b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.hpp index 6de7d355cbb8a2..35ae934e91d6f3 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.hpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.hpp @@ -12,7 +12,7 @@ namespace intel_gpu { class DynamicQuantizeFullyConnected: public ov::pass::MatcherPass { public: OPENVINO_RTTI("DynamicQuantizeFullyConnected", "0"); - DynamicQuantizeFullyConnected(int64_t group_size); + DynamicQuantizeFullyConnected(size_t group_size); }; } // namespace intel_gpu diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp index f9af3543d487a9..d15dcd80e28432 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp @@ -12,7 +12,7 @@ namespace ov { namespace intel_gpu { namespace op { -DynamicQuantize::DynamicQuantize(const Output& data, int64_t group_size) +DynamicQuantize::DynamicQuantize(const Output& data, size_t group_size) : Op({data}) , m_group_size(group_size) { set_output_size(2); diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index a75da1fd219236..74125c22e7a3d7 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -850,10 +850,9 @@ void TransformationsPipeline::apply(std::shared_ptr func) { auto dynamic_quantization_group_size = config.get_property(ov::hint::dynamic_quantization_group_size); GPU_DEBUG_IF(cldnn::debug_configuration::get_instance()->enable_dynamic_quantize) { - dynamic_quantization_group_size = -1; + dynamic_quantization_group_size = 1048576; } - - if (device_info.supports_immad) + if 
(device_info.supports_immad) // XXX: 1048576 is considered per-token manager.register_pass(dynamic_quantization_group_size); // This is supposed to be the last pass to ensure that we don't have name collisions until From 0268abad21c68fdceb2d1ea9581d3fbee5fc9591 Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Sat, 27 Jul 2024 14:33:28 +0900 Subject: [PATCH 11/29] code cleanup for review --- .../dynamic_quantize_fully_connected.cpp | 33 +++++++------------ .../transformations/op/dynamic_quantize.cpp | 4 +-- .../src/plugin/transformations_pipeline.cpp | 13 +++++--- .../src/runtime/execution_config.cpp | 4 +++ 4 files changed, 26 insertions(+), 28 deletions(-) diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp index 7e420252fa4259..5c7c987f3062ff 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -20,19 +20,23 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(size_t group_size) GPU_DEBUG_GET_INSTANCE(debug_config); using namespace ov::pass::pattern; - GPU_DEBUG_IF(cldnn::debug_configuration::get_instance()->enable_dynamic_quantize) { - group_size = 1048576; - } - // per-token quantization is supported if (group_size != 1048576) { GPU_DEBUG_TRACE << "Dynamic quantization is disabled " << group_size << std::endl; return; } + auto is_dynamic = [](const ov::Output& output) -> bool { + bool is_dynamic = output.get_node_shared_ptr()->get_output_partial_shape(0).is_dynamic(); + size_t num_inputs = output.get_node_shared_ptr()->get_input_size(); + for (size_t idx = 0; idx < num_inputs; idx++) { + is_dynamic |= output.get_node_shared_ptr()->get_input_partial_shape(idx).is_dynamic(); + } + return is_dynamic; + }; auto data = any_input(); - auto fully_connected_compressed3 = 
wrap_type({data, any_input(), any_input(), any_input()}); - auto fully_connected_compressed4 = wrap_type({data, any_input(), any_input(), any_input(), any_input()}); + auto fully_connected_compressed3 = wrap_type({data, any_input(), any_input(), any_input()}, is_dynamic); + auto fully_connected_compressed4 = wrap_type({data, any_input(), any_input(), any_input(), any_input()}, is_dynamic); auto fully_connected_compressed = std::make_shared(OutputVector{fully_connected_compressed3, fully_connected_compressed4}); @@ -43,21 +47,7 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(size_t group_size) const auto& pattern_map = m.get_pattern_value_map(); const auto& m_data = pattern_map.at(data).get_node_shared_ptr(); - std::shared_ptr m_fc; - - if (pattern_map.find(fully_connected_compressed3) != pattern_map.end()) - m_fc = std::dynamic_pointer_cast(pattern_map.at(fully_connected_compressed3).get_node_shared_ptr()); - else if (pattern_map.find(fully_connected_compressed4) != pattern_map.end()) - m_fc = std::dynamic_pointer_cast(pattern_map.at(fully_connected_compressed4).get_node_shared_ptr()); - - if (m_data->get_element_type() == ov::element::Type_t::f32) { - GPU_DEBUG_TRACE << m_fc->get_friendly_name() << " Dynamic quantization is turned off because input type is not supported" << std::endl; - return false; - } - if (!m_fc->is_dynamic()) { - GPU_DEBUG_TRACE << m_fc->get_friendly_name() << " Dynamic quantization is turned off because static shape is not supported" << std::endl; - return false; - } + auto m_fc = std::dynamic_pointer_cast(m.get_match_root()); auto weight_shape = m_fc->get_input_partial_shape(1); const size_t innermost_size = weight_shape[weight_shape.size() - 1].get_length(); @@ -85,7 +75,6 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(size_t group_size) new_fc->set_friendly_name(m_fc->get_friendly_name()); ov::copy_runtime_info(m_fc, new_fc); - return true; }; auto m = std::make_shared(fully_connected_compressed, 
"DynamicQuantizeFullyConnected"); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp index d15dcd80e28432..fbe7b61562ddf3 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp @@ -25,8 +25,8 @@ void DynamicQuantize::validate_and_infer_types() { }; auto out_shapes = shape_infer(this, input_shapes); - set_output_type(0, ov::element::Type_t::i8, out_shapes[0]); - set_output_type(1, ov::element::Type_t::f16, out_shapes[1]); + set_output_type(0, ov::element::i8, out_shapes[0]); + set_output_type(1, ov::element::f16, out_shapes[1]); } std::shared_ptr DynamicQuantize::clone_with_new_inputs(const ov::OutputVector& new_args) const { diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 74125c22e7a3d7..50761ce805b2db 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -849,11 +849,16 @@ void TransformationsPipeline::apply(std::shared_ptr func) { pass_config->disable(); auto dynamic_quantization_group_size = config.get_property(ov::hint::dynamic_quantization_group_size); - GPU_DEBUG_IF(cldnn::debug_configuration::get_instance()->enable_dynamic_quantize) { - dynamic_quantization_group_size = 1048576; - } - if (device_info.supports_immad) // XXX: 1048576 is considered per-token + if (device_info.supports_immad) { // XXX: 1048576 is considered per-token + pass_config->set_callback([=](const_node_ptr& root) -> bool { + if (root->get_input_node_shared_ptr(0)->get_element_type() == ov::element::Type_t::f32) { + GPU_DEBUG_TRACE << root->get_friendly_name() << " Dynamic quantization is turned off because input type is not supported" << std::endl; + return true; + } + return false; + 
}); manager.register_pass(dynamic_quantization_group_size); + } // This is supposed to be the last pass to ensure that we don't have name collisions until // GPU plugin stops using friendly names for program creation diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp index b59932a4a3c2d9..745ec1004d3ac7 100644 --- a/src/plugins/intel_gpu/src/runtime/execution_config.cpp +++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp @@ -193,6 +193,10 @@ void ExecutionConfig::apply_debug_options(const cldnn::device_info& info) { GPU_DEBUG_IF(debug_config->disable_dynamic_impl == 1) { set_property(ov::intel_gpu::use_only_static_kernels_for_dynamic_shape(true)); } + + GPU_DEBUG_IF(debug_config->enable_dynamic_quantize) { + set_property(ov::hint::dynamic_quantization_group_size(1048576)); + } } void ExecutionConfig::apply_hints(const cldnn::device_info& info) { From f3cd46e6e0a576221f80e820977c07f1b8b87451 Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Sat, 27 Jul 2024 15:30:16 +0900 Subject: [PATCH 12/29] fix for code review --- .../transformations/dynamic_quantize_fully_connected.cpp | 4 ---- .../intel_gpu/src/plugin/transformations_pipeline.cpp | 7 +++++++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp index 5c7c987f3062ff..8e12b312029617 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -56,10 +56,6 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(size_t group_size) GPU_DEBUG_TRACE << "Dynamic quantization: shape is not aligned with group size " << innermost_size << " / " << group_size << std::endl; return false; } - if (innermost_size < 32) { - 
GPU_DEBUG_TRACE << "Dynamic quantization: shape is too small " << innermost_size << " / " << group_size << std::endl; - return false; - } OutputVector fc_inputs; auto dyn_quan = std::make_shared(m_data, group_size); diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 50761ce805b2db..bb50907f0fda6d 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -855,6 +855,13 @@ void TransformationsPipeline::apply(std::shared_ptr func) { GPU_DEBUG_TRACE << root->get_friendly_name() << " Dynamic quantization is turned off because input type is not supported" << std::endl; return true; } + + auto weight_shape = root->get_input_partial_shape(1); + const size_t innermost_size = weight_shape[weight_shape.size() - 1].get_length(); + if (innermost_size < 32) { + GPU_DEBUG_TRACE << "Dynamic quantization: shape is too small " << innermost_size << " / " << dynamic_quantization_group_size << std::endl; + return true; + } return false; }); manager.register_pass(dynamic_quantization_group_size); From daac4a560e08d649e7cf8d3054e838d2686fadd6 Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Sat, 27 Jul 2024 15:50:06 +0900 Subject: [PATCH 13/29] update for code review --- .../cl_kernels/dynamic_quantize_gpu_opt.cl | 11 +++-------- .../dynamic_quantize/dynamic_quantize_kernel_opt.cpp | 5 ----- .../dynamic_quantize/dynamic_quantize_kernel_ref.cpp | 1 + 3 files changed, 4 insertions(+), 13 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl index 8b7c30f467fc6a..628bc69f3886df 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl @@ -48,14 +48,9 @@ 
KERNEL(dynamic_quantize_gpu_opt)( val[i] = AS_INPUT_TYPE_N(VLOAD_N(0, input + offset + ((local_id * iteration + i) * block_size))); abs_val = fabs(val[i]); - #if VEC_SIZE == 8 - max = fmax(fmax(fmax(abs_val[0], abs_val[1]), fmax(abs_val[2], abs_val[3])), - fmax(fmax(abs_val[4], abs_val[5]), fmax(abs_val[6], abs_val[7]))); - #else - for (int j = 0; j < VEC_SIZE; j++) { - max = fmax(max, abs_val[j]); - } - #endif + unroll_for (int j = 0; j < VEC_SIZE; j++) { + max = fmax(max, abs_val[j]); + } grp_max = fmax(grp_max, max); } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp index 159cca70f68c03..d4b5268eaee4e4 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp @@ -64,15 +64,10 @@ JitConstants DynamicQuantizeKernelOpt::GetJitConstants(const dynamic_quantize_pa jit.AddConstant(MakeJitConstant("BLOCK_NUM", block_num)); jit.Merge(GetTensorFriendlyWorkGroupsJit(params.outputs[0])); - GPU_DEBUG_TRACE_DETAIL << "DynamicQuantizeKernelOpt VEC_SIZE(" << vec_size << ") input bfyx (" << params.inputs[0].Batch().v - << ", " << params.inputs[0].Feature().v << ", " << params.inputs[0].Y().v << ", " << params.inputs[0].X().v << ")" << std::endl; - - return jit; } CommonDispatchData DynamicQuantizeKernelOpt::SetDefault(const dynamic_quantize_params& params) const { - GPU_DEBUG_GET_INSTANCE(debug_config); CommonDispatchData dispatchData; auto vec_size = get_match_vector_size(params); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp index f1b2e229fcdec0..3b214848e2f8ad 100644 --- 
a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp @@ -33,6 +33,7 @@ CommonDispatchData DynamicQuantizeKernelRef::SetDefault(const dynamic_quantize_p GPU_DEBUG_GET_INSTANCE(debug_config); CommonDispatchData dispatchData; + OPENVINO_ASSERT(params.outputs[0].GetLayout() == DataLayout::bfyx, "It supports only 4d tensor"); dispatchData.gws = {params.outputs[0].Batch().v * params.outputs[0].Feature().v, 1, 1}; dispatchData.lws = {1, 1, 1}; From d4c3c0d6a51c4fa7f4d24c0eec9726b2c9431748 Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Thu, 1 Aug 2024 10:50:24 +0900 Subject: [PATCH 14/29] introduce gsnum --- .../include/openvino/runtime/properties.hpp | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/inference/include/openvino/runtime/properties.hpp b/src/inference/include/openvino/runtime/properties.hpp index e0f7df1b16b0c2..afff187f064327 100644 --- a/src/inference/include/openvino/runtime/properties.hpp +++ b/src/inference/include/openvino/runtime/properties.hpp @@ -560,6 +560,30 @@ inline std::istream& operator>>(std::istream& is, ExecutionMode& mode) { */ static constexpr Property execution_mode{"EXECUTION_MODE_HINT"}; +struct GSNum { + constexpr GSNum() : gs_num{0} {}; + + constexpr GSNum(const uint64_t num_) : gs_num{num_} {} + + constexpr operator uint64_t() const { + return gs_num; + } + + uint64_t gs_num = 0; +}; + +/** + * @brief Dynamic quantization for per-token granularity + * @ingroup ov_runtime_cpp_prop_api + */ +static constexpr GSNum PER_TOKEN{UINT64_MAX}; + +/** + * @brief Disable dynamic quantization + * @ingroup ov_runtime_cpp_prop_api + */ +static constexpr GSNum DISABLED{0}; + /** * @brief This property defines group size for dynamic quantization optimization * @ingroup ov_runtime_cpp_prop_api @@ -571,7 +595,7 @@ static constexpr Property 
execution_mode{"EXECUTION_MODE_HINT"}; * might result in better accuracy, but the drawback is worse performance. Group size equal 0 means dynamic * quantization optimization is disabled. */ -static constexpr Property dynamic_quantization_group_size{ +static constexpr Property dynamic_quantization_group_size{ "DYNAMIC_QUANTIZATION_GROUP_SIZE"}; /** From f01a4a2104816d415bb11ca5c3fa3498187d5c46 Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Thu, 1 Aug 2024 10:49:38 +0900 Subject: [PATCH 15/29] move dyn_quan to common op --- .../include/ov_ops}/dynamic_quantize.hpp | 11 ++++++---- .../src/ov_ops}/dynamic_quantize.cpp | 20 +++++++++---------- .../intel_gpu/src/graph/dynamic_quantize.cpp | 6 +++--- .../src/plugin/ops/dynamic_quantize.cpp | 12 ++--------- .../dynamic_quantize_fully_connected.cpp | 4 ++-- 5 files changed, 24 insertions(+), 29 deletions(-) rename src/{plugins/intel_gpu/include/intel_gpu/op => common/transformations/include/ov_ops}/dynamic_quantize.hpp (64%) rename src/{plugins/intel_gpu/src/plugin/transformations/op => common/transformations/src/ov_ops}/dynamic_quantize.cpp (73%) diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp similarity index 64% rename from src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp rename to src/common/transformations/include/ov_ops/dynamic_quantize.hpp index 194bb26e792896..e38b6672d3c8ef 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp +++ b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp @@ -5,13 +5,14 @@ #pragma once #include "openvino/op/op.hpp" +#include "transformations_visibility.hpp" namespace ov { -namespace intel_gpu { namespace op { +namespace internal { /// \brief Operator performing Dynamic Quantize -class DynamicQuantize : public ov::op::Op { +class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op { public: OPENVINO_OP("DynamicQuantize", "gpu_opset"); @@ 
-20,18 +21,20 @@ class DynamicQuantize : public ov::op::Op { /// /// \param data Input tensor with data /// \param group_size Group size for dynamic quantization - DynamicQuantize(const Output& data, size_t group_size); + /// \param dt_scale Data type for scale output + DynamicQuantize(const Output& data, size_t group_size, element::Type dt_scale); void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; size_t get_group_size() { return m_group_size; }; + static std::vector shape_infer(const DynamicQuantize* op, std::vector input_shapes); private: size_t m_group_size; + element::Type m_dt_scale; }; -std::vector shape_infer(const DynamicQuantize* op, std::vector input_shapes); } // namespace op } // namespace intel_gpu diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp similarity index 73% rename from src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp rename to src/common/transformations/src/ov_ops/dynamic_quantize.cpp index fbe7b61562ddf3..a2f08d6ee09dbf 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp +++ b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp @@ -2,19 +2,20 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "intel_gpu/op/dynamic_quantize.hpp" +#include "ov_ops/dynamic_quantize.hpp" #include "openvino/core/partial_shape.hpp" #include "openvino/core/validation_util.hpp" #include "openvino/op/variadic_split.hpp" #include "variadic_split_shape_inference.hpp" namespace ov { -namespace intel_gpu { namespace op { +namespace internal { -DynamicQuantize::DynamicQuantize(const Output& data, size_t group_size) +DynamicQuantize::DynamicQuantize(const Output& data, size_t group_size, element::Type dt_scale) : Op({data}) - , m_group_size(group_size) { + , m_group_size(group_size) + , m_dt_scale(dt_scale) { set_output_size(2); 
validate_and_infer_types(); } @@ -25,16 +26,16 @@ void DynamicQuantize::validate_and_infer_types() { }; auto out_shapes = shape_infer(this, input_shapes); - set_output_type(0, ov::element::i8, out_shapes[0]); - set_output_type(1, ov::element::f16, out_shapes[1]); + set_output_type(0, element::i8, out_shapes[0]); + set_output_type(1, m_dt_scale, out_shapes[1]); } std::shared_ptr DynamicQuantize::clone_with_new_inputs(const ov::OutputVector& new_args) const { check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_group_size); + return std::make_shared(new_args.at(0), m_group_size, m_dt_scale); } -std::vector shape_infer(const DynamicQuantize* op, std::vector input_shapes) { +std::vector DynamicQuantize::shape_infer(const DynamicQuantize* op, std::vector input_shapes) { std::vector out_shapes; out_shapes.push_back(input_shapes[0]); // FIXME: generalize to N-dim case @@ -45,7 +46,6 @@ std::vector shape_infer(const DynamicQuantize* op, std::vector return out_shapes; } - +} // namespace internal } // namespace op -} // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp index 5cb68e00ab7814..0469acc5c9d537 100644 --- a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "intel_gpu/op/dynamic_quantize.hpp" +#include "ov_ops/dynamic_quantize.hpp" #include "dynamic_quantize_inst.h" #include "primitive_type_base.h" @@ -23,14 +23,14 @@ layout dynamic_quantize_inst::calc_output_layout(dynamic_quantize_node const& no template std::vector dynamic_quantize_inst::__calc_output_layouts(layout &act_layout, size_t group_size) { - ov::intel_gpu::op::DynamicQuantize op; + ov::op::internal::DynamicQuantize op; auto output_format = act_layout.format; std::vector input_shapes = { act_layout.get(), }; - auto output_shapes = 
shape_infer(&op, input_shapes); + auto output_shapes = ov::op::internal::DynamicQuantize::shape_infer(&op, input_shapes); return { layout(output_shapes[0], data_types::i8, output_format), layout(output_shapes[1], data_types::f16, output_format) }; } diff --git a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp index e5a43b4c62541f..3ee6aef3568784 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp @@ -2,23 +2,15 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "intel_gpu/op/dynamic_quantize.hpp" +#include "ov_ops/dynamic_quantize.hpp" #include "intel_gpu/plugin/program_builder.hpp" #include "intel_gpu/plugin/common_utils.hpp" #include "intel_gpu/primitives/dynamic_quantize.hpp" -namespace ov { -namespace op { -namespace internal { -using DynamicQuantize = ov::intel_gpu::op::DynamicQuantize; -} // namespace internal -} // namespace op -} // namespace ov - namespace ov { namespace intel_gpu { -static void CreateDynamicQuantizeOp(ProgramBuilder& p, const std::shared_ptr& op) { +static void CreateDynamicQuantizeOp(ProgramBuilder& p, const std::shared_ptr& op) { validate_inputs_count(op, {1}); auto inputs = p.GetInputInfo(op); std::string primitive_name = layer_type_name_ID(op); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp index 8e12b312029617..94f79e86d41535 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -5,7 +5,7 @@ #include "dynamic_quantize_fully_connected.hpp" #include "intel_gpu/op/fully_connected_compressed.hpp" -#include "intel_gpu/op/dynamic_quantize.hpp" +#include "ov_ops/dynamic_quantize.hpp" #include 
"openvino/core/rt_info.hpp" #include "openvino/pass/pattern/op/or.hpp" @@ -58,7 +58,7 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(size_t group_size) } OutputVector fc_inputs; - auto dyn_quan = std::make_shared(m_data, group_size); + auto dyn_quan = std::make_shared(m_data, group_size, element::f16); for (size_t i = 0; i < m_fc->get_input_size(); i++) fc_inputs.push_back(m_fc->get_input_node_shared_ptr(i)); fc_inputs[0] = dyn_quan->output(0); From 1df4e1f6072137dd1a67393268e3997329a981be Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Thu, 1 Aug 2024 11:36:59 +0900 Subject: [PATCH 16/29] macro for FC mask --- .../impls/onednn/fully_connected_onednn.cpp | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp index d84f5d222512a5..01dfae49ef2f35 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp @@ -19,6 +19,9 @@ namespace onednn { struct fully_connected_onednn : typed_primitive_onednn_impl { using parent = typed_primitive_onednn_impl; using parent::parent; + static constexpr int COMMON = 0; + static constexpr int PER_OC = 2; + static constexpr int GROUPED = 3; DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::onednn::fully_connected_onednn) @@ -282,9 +285,9 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { ib >> _ds_group_size; ib >> make_data(&_ds_data_type, sizeof(dnnl::memory::data_type)); if (!is_four_bit_weight) - _attrs->set_scales(DNNL_ARG_WEIGHTS, 1 << 1, dnnl::memory::dims{}, _ds_data_type); + _attrs->set_scales(DNNL_ARG_WEIGHTS, PER_OC, dnnl::memory::dims{}, _ds_data_type); else - _attrs->set_scales(DNNL_ARG_WEIGHTS, (1 << 1) + (1 << 0), {_ds_group_size, 1}, _ds_data_type); + _attrs->set_scales(DNNL_ARG_WEIGHTS, GROUPED, {_ds_group_size, 1}, 
_ds_data_type); } bool has_decompression_zp = !prim->decompression_zero_point.empty() || prim->decompression_zero_point_scalar.has_value(); @@ -296,13 +299,13 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { auto dzp_layout = arg.get_dependency(idx++).get_output_layout(); if (dzp_layout.count() == 1) { - _attrs->set_zero_points(DNNL_ARG_WEIGHTS, 0, dnnl::memory::dims{}, _dzp_data_type); + _attrs->set_zero_points(DNNL_ARG_WEIGHTS, COMMON, dnnl::memory::dims{}, _dzp_data_type); } else { auto ngroups = dzp_layout.get_dim(1); if (ngroups == 1) { - _attrs->set_zero_points(DNNL_ARG_WEIGHTS, 1 << 1, dnnl::memory::dims{}, _dzp_data_type); + _attrs->set_zero_points(DNNL_ARG_WEIGHTS, PER_OC, dnnl::memory::dims{}, _dzp_data_type); } else { - _attrs->set_zero_points(DNNL_ARG_WEIGHTS, (1 << 1) + (1 << 0), {_ds_group_size, 1}, _dzp_data_type); + _attrs->set_zero_points(DNNL_ARG_WEIGHTS, GROUPED, {_ds_group_size, 1}, _dzp_data_type); } } } @@ -313,7 +316,7 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { auto innermost_len = partial_shape[partial_shape.size() - 1].get_length(); auto act_scale_data_type = convert_data_type(impl_params->get_input_layout(idx).data_type); - _attrs->set_scales(DNNL_ARG_SRC, (1 << 1) | (1 << 0), dnnl::memory::dims{1, innermost_len}, act_scale_data_type); + _attrs->set_scales(DNNL_ARG_SRC, GROUPED, dnnl::memory::dims{1, innermost_len}, act_scale_data_type); } if (is_compressed) { @@ -358,10 +361,10 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { group_size = ifm / ngroups; if (!is_four_bit_weight) { // 8-bit quantized weight - attr->set_scales(DNNL_ARG_WEIGHTS, 1 << 1, dnnl::memory::dims{}, ds_data_type); + attr->set_scales(DNNL_ARG_WEIGHTS, PER_OC, dnnl::memory::dims{}, ds_data_type); } else { // OneDNN does not support scalar zero-point for s4 and u8 type. Need to broadcast it. 
- attr->set_scales(DNNL_ARG_WEIGHTS, (1 << 1) + (1 << 0), {group_size, 1}, ds_data_type); + attr->set_scales(DNNL_ARG_WEIGHTS, GROUPED, {group_size, 1}, ds_data_type); } } @@ -371,13 +374,13 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { dzp_data_type = convert_data_type(dzp_layout.data_type); if (dzp_layout.count() == 1) { - attr->set_zero_points(DNNL_ARG_WEIGHTS, 0, dnnl::memory::dims{}, dzp_data_type); + attr->set_zero_points(DNNL_ARG_WEIGHTS, COMMON, dnnl::memory::dims{}, dzp_data_type); } else { auto ngroups = dzp_layout.get_dim(1); if (ngroups == 1) { - attr->set_zero_points(DNNL_ARG_WEIGHTS, 1 << 1, dnnl::memory::dims{}, dzp_data_type); + attr->set_zero_points(DNNL_ARG_WEIGHTS, PER_OC, dnnl::memory::dims{}, dzp_data_type); } else { - attr->set_zero_points(DNNL_ARG_WEIGHTS, (1 << 1) + (1 << 0), {group_size, 1}, dzp_data_type); + attr->set_zero_points(DNNL_ARG_WEIGHTS, GROUPED, {group_size, 1}, dzp_data_type); } } } @@ -389,7 +392,7 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { auto innermost_len = partial_shape[partial_shape.size() - 1].get_length(); auto act_scale_data_type = convert_data_type(impl_params.input_layouts[idx].data_type); - attr->set_scales(DNNL_ARG_SRC, (1 << 1) | (1 << 0), dnnl::memory::dims{1, innermost_len}, act_scale_data_type); + attr->set_scales(DNNL_ARG_SRC, GROUPED, dnnl::memory::dims{1, innermost_len}, act_scale_data_type); } auto prim_desc = get_matmul_primitive_descriptor(impl_params, impl_params.prog->get_engine(), From 845e2a5859749e90067dfe61d96f6fc9e9dbd3ac Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Fri, 2 Aug 2024 17:18:21 +0900 Subject: [PATCH 17/29] group_size is made as vector now --- .../include/ov_ops/dynamic_quantize.hpp | 8 +++--- .../src/ov_ops/dynamic_quantize.cpp | 25 +++++++++++++++---- .../intel_gpu/src/graph/dynamic_quantize.cpp | 7 ++++-- .../src/plugin/ops/dynamic_quantize.cpp | 8 ++++-- .../dynamic_quantize_fully_connected.cpp | 10 +++++--- 
.../src/runtime/execution_config.cpp | 2 +- 6 files changed, 43 insertions(+), 17 deletions(-) diff --git a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp index e38b6672d3c8ef..5d7af9e94b6731 100644 --- a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp +++ b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp @@ -22,16 +22,16 @@ class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op { /// \param data Input tensor with data /// \param group_size Group size for dynamic quantization /// \param dt_scale Data type for scale output - DynamicQuantize(const Output& data, size_t group_size, element::Type dt_scale); + DynamicQuantize(const Output& data, std::vector group_size, element::Type dt_scale); void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; - size_t get_group_size() { return m_group_size; }; - static std::vector shape_infer(const DynamicQuantize* op, std::vector input_shapes); + std::vector get_group_size() const { return m_group_size; }; + static std::vector shape_infer(const DynamicQuantize* op, std::vector input_shapes, const std::vector group_size); private: - size_t m_group_size; + std::vector m_group_size; element::Type m_dt_scale; }; diff --git a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp index a2f08d6ee09dbf..cf1eaa37f8a70d 100644 --- a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp +++ b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp @@ -12,10 +12,12 @@ namespace ov { namespace op { namespace internal { -DynamicQuantize::DynamicQuantize(const Output& data, size_t group_size, element::Type dt_scale) +DynamicQuantize::DynamicQuantize(const Output& data, std::vector group_size, element::Type dt_scale) : Op({data}) , m_group_size(group_size) , m_dt_scale(dt_scale) { + 
OPENVINO_ASSERT(data.get_tensor_ptr()->get_partial_shape().rank() == group_size.size(), "FC input rank should be same as the rank of group_size ", + data.get_tensor_ptr()->get_partial_shape().rank(), " / ", group_size.size()); set_output_size(2); validate_and_infer_types(); } @@ -25,7 +27,7 @@ void DynamicQuantize::validate_and_infer_types() { get_input_partial_shape(0) }; - auto out_shapes = shape_infer(this, input_shapes); + auto out_shapes = shape_infer(this, input_shapes, m_group_size); set_output_type(0, element::i8, out_shapes[0]); set_output_type(1, m_dt_scale, out_shapes[1]); } @@ -35,13 +37,26 @@ std::shared_ptr DynamicQuantize::clone_with_new_inputs(const ov::OutputVec return std::make_shared(new_args.at(0), m_group_size, m_dt_scale); } -std::vector DynamicQuantize::shape_infer(const DynamicQuantize* op, std::vector input_shapes) { +std::vector DynamicQuantize::shape_infer(const DynamicQuantize* op, std::vector input_shapes, const std::vector group_size) { std::vector out_shapes; out_shapes.push_back(input_shapes[0]); // FIXME: generalize to N-dim case auto scale_shape = input_shapes[0]; - for (size_t i = 2; i < scale_shape.size(); i++) - scale_shape[i] = 1; + OPENVINO_ASSERT(scale_shape.size() == group_size.size(), "Scale_shape and group_size are supposed to have same rank: " + , scale_shape.size() + , " / " + , group_size.size()); + for (size_t i = 0; i < scale_shape.size(); i++) { + if (scale_shape[i].is_dynamic()) + continue; + + if (group_size[i] == UINT64_MAX) + scale_shape[i] = 1; + else { + scale_shape[i] /= group_size[i]; // if group_size is larger than shape, scale_shape will be 1 + scale_shape[i] = std::max(scale_shape[i].get_length(), 1L); + } + } out_shapes.push_back(scale_shape); return out_shapes; } diff --git a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp index 0469acc5c9d537..5fb47b8de0e348 100644 --- a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp +++ 
b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp @@ -30,7 +30,10 @@ std::vector dynamic_quantize_inst::__calc_output_layouts(layout &act_lay act_layout.get(), }; - auto output_shapes = ov::op::internal::DynamicQuantize::shape_infer(&op, input_shapes); + std::vector shape_group_size(act_layout.get().size(), 1); + shape_group_size.back() = group_size; + + auto output_shapes = ov::op::internal::DynamicQuantize::shape_infer(&op, input_shapes, shape_group_size); return { layout(output_shapes[0], data_types::i8, output_format), layout(output_shapes[1], data_types::f16, output_format) }; } @@ -41,7 +44,7 @@ template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& /*node*/, const kernel_impl_params& impl_param) { auto desc = impl_param.typed_desc(); auto input_layout = impl_param.get_input_layout(); - return __calc_output_layouts(input_layout, 0 /* TODO: handle group_size here */); + return __calc_output_layouts(input_layout, UINT64_MAX /* TODO: handle group_size here */); } template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& node, diff --git a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp index 3ee6aef3568784..c42c378ad90d27 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp @@ -15,10 +15,14 @@ static void CreateDynamicQuantizeOp(ProgramBuilder& p, const std::shared_ptrget_group_size() == 1048576, "Not supported group size: ", op->get_group_size()); + auto group_size = op->get_group_size(); + for (size_t i = 0; i < group_size.size() - 1; i++) + OPENVINO_ASSERT(group_size[i] == 1, "Not supported group size at ", i, ": ", group_size[i]); + + OPENVINO_ASSERT(group_size.back() == UINT64_MAX, "Not supported group size: ", group_size.back()); auto prim = cldnn::dynamic_quantize(primitive_name, inputs[0], - op->get_group_size(), + 
op->get_group_size().back(), get_output_data_types(op) ); p.add_primitive(*op, prim); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp index 94f79e86d41535..f3e2cc35c6c730 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -21,7 +21,7 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(size_t group_size) using namespace ov::pass::pattern; // per-token quantization is supported - if (group_size != 1048576) { + if (group_size != UINT64_MAX) { GPU_DEBUG_TRACE << "Dynamic quantization is disabled " << group_size << std::endl; return; } @@ -51,14 +51,18 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(size_t group_size) auto weight_shape = m_fc->get_input_partial_shape(1); const size_t innermost_size = weight_shape[weight_shape.size() - 1].get_length(); - if (group_size != 1048576 && + if (group_size != UINT64_MAX && (group_size == 0 || (innermost_size % group_size != 0 && innermost_size > group_size))) { GPU_DEBUG_TRACE << "Dynamic quantization: shape is not aligned with group size " << innermost_size << " / " << group_size << std::endl; return false; } + auto rank = m_fc->get_input_partial_shape(0).size(); + std::vector shape_group_size(rank, 1); + shape_group_size.back() = group_size; + auto dyn_quan = std::make_shared(m_data, shape_group_size, element::f16); + OutputVector fc_inputs; - auto dyn_quan = std::make_shared(m_data, group_size, element::f16); for (size_t i = 0; i < m_fc->get_input_size(); i++) fc_inputs.push_back(m_fc->get_input_node_shared_ptr(i)); fc_inputs[0] = dyn_quan->output(0); diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp index 745ec1004d3ac7..6c745d67c733cc 100644 --- 
a/src/plugins/intel_gpu/src/runtime/execution_config.cpp +++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp @@ -195,7 +195,7 @@ void ExecutionConfig::apply_debug_options(const cldnn::device_info& info) { } GPU_DEBUG_IF(debug_config->enable_dynamic_quantize) { - set_property(ov::hint::dynamic_quantization_group_size(1048576)); + set_property(ov::hint::dynamic_quantization_group_size(UINT64_MAX)); } } From 2b878a4832ebbb4c473a7f4fa0c36eb986c6934e Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Fri, 2 Aug 2024 17:26:23 +0900 Subject: [PATCH 18/29] update for code review --- src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp index 36872f870c3adc..eef29fc9119e06 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp @@ -38,7 +38,7 @@ static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::share size_t input_idx = 4; const size_t INPUT_PORT_IDX = input_idx; std::string zp_name = op->get_has_zp() ? inputs[input_idx++].pid : ""; - std::string activation_scale_name = op->get_has_activation_scale() ? inputs[input_idx++].pid : ""; + auto activation_scale_input = op->get_has_activation_scale() ? inputs[input_idx++] : cldnn::input_info(); float zp_value = 0.0f; bool has_scalar_zp = false; @@ -57,7 +57,7 @@ static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::share bias_name, scale_name, has_scalar_zp && !supports_immad ? 
"" : zp_name, - {activation_scale_name, 1}, + activation_scale_input, cldnn::element_type_to_data_type(op->get_output_element_type(0)), op->get_input_partial_shape(0).size(), op->get_input_partial_shape(1).size()); From 35ea02f1c96302d4cdc613f170f658b696017677 Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Sat, 3 Aug 2024 09:54:12 +0900 Subject: [PATCH 19/29] update for code review --- .../op/fully_connected_compressed.hpp | 24 +++--- .../src/plugin/ops/fully_connected.cpp | 10 +-- .../dynamic_quantize_fully_connected.cpp | 16 ++-- .../op/fully_connected_compressed.cpp | 83 ++++++++++--------- .../dynamic/matmul_weights_decompression.cpp | 4 +- 5 files changed, 67 insertions(+), 70 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/fully_connected_compressed.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/fully_connected_compressed.hpp index 217095934a6185..1112a3785317a3 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/fully_connected_compressed.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/fully_connected_compressed.hpp @@ -19,30 +19,26 @@ class FullyConnectedCompressed : public FullyConnected { FullyConnectedCompressed(const ov::Output &A, const ov::Output &B, const ov::Output &bias, - const ov::Output &decompression_scale, - const ov::Output &decompression_zero_point, + const ov::Output &w_decompression_scale, + const ov::Output &w_decompression_zero_point, + const ov::Output &a_decompression_scale, const ov::element::Type output_type = ov::element::undefined); + FullyConnectedCompressed(const ov::Output &A, const ov::Output &B, const ov::Output &bias, - const ov::Output &decompression_scale, + const ov::Output &w_decompression_scale, + const ov::Output &w_decompression_zero_point, const ov::element::Type output_type = ov::element::undefined); - FullyConnectedCompressed(const OutputVector& inputs, - bool has_zp = true, - bool has_activation_scale = false, + FullyConnectedCompressed(const ov::Output &A, + const ov::Output &B, 
+ const ov::Output &bias, + const ov::Output &w_decompression_scale, const ov::element::Type output_type = ov::element::undefined); std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; - - bool get_has_zp() const { return m_has_zp; } - bool get_has_activation_scale() const { return m_has_activation_scale; } - - -protected: - bool m_has_zp; - bool m_has_activation_scale; }; } // namespace op diff --git a/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp index eef29fc9119e06..7c0c570f7cf54c 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp @@ -36,14 +36,14 @@ static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::share auto bias_name = inputs[2].pid; auto scale_name = inputs[3].pid; size_t input_idx = 4; - const size_t INPUT_PORT_IDX = input_idx; - std::string zp_name = op->get_has_zp() ? inputs[input_idx++].pid : ""; - auto activation_scale_input = op->get_has_activation_scale() ? inputs[input_idx++] : cldnn::input_info(); + const size_t W_ZP_IDX = input_idx; + std::string zp_name = op->get_input_size() > input_idx ? inputs[input_idx++].pid : ""; + auto activation_scale_input = op->get_input_size() > input_idx ? 
inputs[input_idx++] : cldnn::input_info(); float zp_value = 0.0f; bool has_scalar_zp = false; - if (op->get_has_zp()) { - auto zp_const = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(INPUT_PORT_IDX)); + if (zp_name.size() > 0) { + auto zp_const = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(W_ZP_IDX)); if (zp_const && ov::shape_size(zp_const->get_output_shape(0)) == 1) { has_scalar_zp = true; zp_value = zp_const->cast_vector()[0]; diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp index f3e2cc35c6c730..c0255b892e2602 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -5,6 +5,7 @@ #include "dynamic_quantize_fully_connected.hpp" #include "intel_gpu/op/fully_connected_compressed.hpp" +#include #include "ov_ops/dynamic_quantize.hpp" #include "openvino/core/rt_info.hpp" @@ -61,15 +62,14 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(size_t group_size) std::vector shape_group_size(rank, 1); shape_group_size.back() = group_size; auto dyn_quan = std::make_shared(m_data, shape_group_size, element::f16); + auto optional_w_zp = m_fc->get_input_size() > 4 ? 
m_fc->get_input_node_shared_ptr(4) : std::make_shared(); - OutputVector fc_inputs; - for (size_t i = 0; i < m_fc->get_input_size(); i++) - fc_inputs.push_back(m_fc->get_input_node_shared_ptr(i)); - fc_inputs[0] = dyn_quan->output(0); - fc_inputs.push_back(dyn_quan->output(1)); - auto new_fc = std::make_shared(fc_inputs, - m_fc->get_has_zp(), - true, + auto new_fc = std::make_shared(dyn_quan->output(0), + m_fc->get_input_node_shared_ptr(1), + m_fc->get_input_node_shared_ptr(2), + m_fc->get_input_node_shared_ptr(3), + optional_w_zp, + dyn_quan->output(1), m_fc->get_output_type()); ov::replace_node(m_fc, new_fc); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/fully_connected_compressed.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/fully_connected_compressed.cpp index c1586db5e56859..2e3819d7e850ee 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/fully_connected_compressed.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/fully_connected_compressed.cpp @@ -11,64 +11,65 @@ namespace op { FullyConnectedCompressed::FullyConnectedCompressed(const ov::Output& A, const ov::Output& B, const ov::Output& bias, - const ov::Output& decompression_scale, - const ov::Output& decompression_zero_point, + const ov::Output& w_decompression_scale, + const ov::Output& w_decompression_zero_point, + const ov::Output& a_decompression_scale, const ov::element::Type output_type) - : FullyConnected(A, B, bias, output_type) - , m_has_zp(true) - , m_has_activation_scale(false) { - set_argument(3, decompression_scale); - set_argument(4, decompression_zero_point); + : FullyConnected(A, B, bias, output_type) { + set_argument(3, w_decompression_scale); + set_argument(4, w_decompression_zero_point); + set_argument(5, a_decompression_scale); validate_and_infer_types(); } FullyConnectedCompressed::FullyConnectedCompressed(const ov::Output& A, const ov::Output& B, const ov::Output& bias, - const ov::Output& decompression_scale, + const ov::Output& 
w_decompression_scale, + const ov::Output& w_decompression_zero_point, const ov::element::Type output_type) - : FullyConnected(A, B, bias, output_type) - , m_has_zp(false) - , m_has_activation_scale(false) { - set_argument(3, decompression_scale); + : FullyConnected(A, B, bias, output_type) { + set_argument(3, w_decompression_scale); + set_argument(4, w_decompression_zero_point); validate_and_infer_types(); } -FullyConnectedCompressed::FullyConnectedCompressed(const OutputVector& inputs, - bool has_zp, - bool has_activation_scale, - const ov::element::Type output_type) - : FullyConnected(inputs[0], inputs[1], inputs[2], output_type) - , m_has_zp(has_zp) - , m_has_activation_scale(has_activation_scale) -{ - for (size_t i = 3; i < inputs.size(); i++) - set_argument(i, inputs[i]); +FullyConnectedCompressed::FullyConnectedCompressed(const ov::Output& A, + const ov::Output& B, + const ov::Output& bias, + const ov::Output& w_decompression_scale, + const ov::element::Type output_type) + : FullyConnected(A, B, bias, output_type) { + set_argument(3, w_decompression_scale); validate_and_infer_types(); } std::shared_ptr FullyConnectedCompressed::clone_with_new_inputs(const ov::OutputVector& new_args) const { check_new_args_count(this, new_args); - auto input_size = new_args.size(); - auto expected_inputs = 4; - if (m_has_zp) - expected_inputs++; - if (m_has_activation_scale) - expected_inputs++; - NODE_VALIDATION_CHECK(this, - input_size == m_has_zp, - "Number of inputs is incorrect. 
Current value is: ", - input_size, - ", expected ", - expected_inputs); - - return std::make_shared(new_args, - m_has_zp, - m_has_activation_scale, - m_output_type); -} - + if (new_args.size() == 4) + return std::make_shared(new_args.at(0), + new_args.at(1), + new_args.at(2), + new_args.at(3), + m_output_type); + else if (new_args.size() == 5) + return std::make_shared(new_args.at(0), + new_args.at(1), + new_args.at(2), + new_args.at(3), + new_args.at(4), + m_output_type); + else if (new_args.size() == 6) + return std::make_shared(new_args.at(0), + new_args.at(1), + new_args.at(2), + new_args.at(3), + new_args.at(4), + new_args.at(6), + m_output_type); + else + OPENVINO_THROW("Unexpected inputs count for FullyConnectedCompressed op: ", new_args.size());} } // namespace op } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp index 0425ba82d1c405..11c972a5c1378f 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp @@ -383,10 +383,10 @@ INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_dyn_quan, ::testing::Values(ov::element::u4), ::testing::Values(ov::element::f16), ::testing::Values(false), - ::testing::Values(true), + ::testing::ValuesIn(add_decompression_sub), ::testing::Values(true), ::testing::Values(true), // per_tensor_zp - ::testing::Values(1048576)), + ::testing::Values(UINT64_MAX)), MatmulWeightsDecompression::get_test_case_name); } // namespace From dd9eda0c28d0ad1e9f26bf4490b23213a86137db Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Sat, 3 Aug 2024 11:54:55 +0900 Subject: [PATCH 20/29] reverted property change --- .../include/openvino/runtime/properties.hpp | 26 +------------------ 
src/plugins/intel_gpu/src/plugin/plugin.cpp | 2 +- 2 files changed, 2 insertions(+), 26 deletions(-) diff --git a/src/inference/include/openvino/runtime/properties.hpp b/src/inference/include/openvino/runtime/properties.hpp index afff187f064327..e0f7df1b16b0c2 100644 --- a/src/inference/include/openvino/runtime/properties.hpp +++ b/src/inference/include/openvino/runtime/properties.hpp @@ -560,30 +560,6 @@ inline std::istream& operator>>(std::istream& is, ExecutionMode& mode) { */ static constexpr Property execution_mode{"EXECUTION_MODE_HINT"}; -struct GSNum { - constexpr GSNum() : gs_num{0} {}; - - constexpr GSNum(const uint64_t num_) : gs_num{num_} {} - - constexpr operator uint64_t() const { - return gs_num; - } - - uint64_t gs_num = 0; -}; - -/** - * @brief Dynamic quantization for per-token granularity - * @ingroup ov_runtime_cpp_prop_api - */ -static constexpr GSNum PER_TOKEN{UINT64_MAX}; - -/** - * @brief Disable dynamic quantization - * @ingroup ov_runtime_cpp_prop_api - */ -static constexpr GSNum DISABLED{0}; - /** * @brief This property defines group size for dynamic quantization optimization * @ingroup ov_runtime_cpp_prop_api @@ -595,7 +571,7 @@ static constexpr GSNum DISABLED{0}; * might result in better accuracy, but the drawback is worse performance. Group size equal 0 means dynamic * quantization optimization is disabled. 
*/ -static constexpr Property dynamic_quantization_group_size{ +static constexpr Property dynamic_quantization_group_size{ "DYNAMIC_QUANTIZATION_GROUP_SIZE"}; /** diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index ac540bd2925c46..e3f7c2cfe2f6b5 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -555,8 +555,8 @@ std::vector Plugin::get_supported_properties() const { ov::PropertyName{ov::hint::num_requests.name(), PropertyMutability::RW}, ov::PropertyName{ov::hint::inference_precision.name(), PropertyMutability::RW}, ov::PropertyName{ov::hint::enable_cpu_pinning.name(), PropertyMutability::RW}, - ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RW}, ov::PropertyName{ov::device::id.name(), PropertyMutability::RW}, + ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RW} }; return supported_properties; From 4d5a5208c5633d945bff0730d58bd181faeb51d5 Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Sat, 3 Aug 2024 11:57:25 +0900 Subject: [PATCH 21/29] group_size -> group_sizes --- .../include/ov_ops/dynamic_quantize.hpp | 10 ++++---- .../src/ov_ops/dynamic_quantize.cpp | 24 +++++++++---------- .../src/plugin/ops/dynamic_quantize.cpp | 10 ++++---- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp index 5d7af9e94b6731..6125d73a33cca3 100644 --- a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp +++ b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp @@ -20,18 +20,18 @@ class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op { /// \brief Constructs an DynamicQuantize operation. 
/// /// \param data Input tensor with data - /// \param group_size Group size for dynamic quantization + /// \param group_sizes Group sizes for dynamic quantization /// \param dt_scale Data type for scale output - DynamicQuantize(const Output& data, std::vector group_size, element::Type dt_scale); + DynamicQuantize(const Output& data, std::vector group_sizes, element::Type dt_scale); void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; - std::vector get_group_size() const { return m_group_size; }; - static std::vector shape_infer(const DynamicQuantize* op, std::vector input_shapes, const std::vector group_size); + std::vector get_group_sizes() const { return m_group_sizes; }; + static std::vector shape_infer(const DynamicQuantize* op, std::vector input_shapes, const std::vector group_sizes); private: - std::vector m_group_size; + std::vector m_group_sizes; element::Type m_dt_scale; }; diff --git a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp index cf1eaa37f8a70d..f87e24cd79982f 100644 --- a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp +++ b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp @@ -12,12 +12,12 @@ namespace ov { namespace op { namespace internal { -DynamicQuantize::DynamicQuantize(const Output& data, std::vector group_size, element::Type dt_scale) +DynamicQuantize::DynamicQuantize(const Output& data, std::vector group_sizes, element::Type dt_scale) : Op({data}) - , m_group_size(group_size) + , m_group_sizes(group_sizes) , m_dt_scale(dt_scale) { - OPENVINO_ASSERT(data.get_tensor_ptr()->get_partial_shape().rank() == group_size.size(), "FC input rank should be same as the rank of group_size ", - data.get_tensor_ptr()->get_partial_shape().rank(), " / ", group_size.size()); + OPENVINO_ASSERT(data.get_partial_shape().rank() == group_sizes.size(), "FC input rank should be same as the rank 
of group_size ", + data.get_tensor_ptr()->get_partial_shape().rank(), " / ", group_sizes.size()); set_output_size(2); validate_and_infer_types(); } @@ -27,33 +27,33 @@ void DynamicQuantize::validate_and_infer_types() { get_input_partial_shape(0) }; - auto out_shapes = shape_infer(this, input_shapes, m_group_size); + auto out_shapes = shape_infer(this, input_shapes, m_group_sizes); set_output_type(0, element::i8, out_shapes[0]); set_output_type(1, m_dt_scale, out_shapes[1]); } std::shared_ptr DynamicQuantize::clone_with_new_inputs(const ov::OutputVector& new_args) const { check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_group_size, m_dt_scale); + return std::make_shared(new_args.at(0), m_group_sizes, m_dt_scale); } -std::vector DynamicQuantize::shape_infer(const DynamicQuantize* op, std::vector input_shapes, const std::vector group_size) { +std::vector DynamicQuantize::shape_infer(const DynamicQuantize* op, std::vector input_shapes, const std::vector group_sizes) { std::vector out_shapes; out_shapes.push_back(input_shapes[0]); - // FIXME: generalize to N-dim case + auto scale_shape = input_shapes[0]; - OPENVINO_ASSERT(scale_shape.size() == group_size.size(), "Scale_shape and group_size are supposed to have same rank: " + OPENVINO_ASSERT(scale_shape.size() == group_sizes.size(), "Scale_shape and group_size are supposed to have same rank: " , scale_shape.size() , " / " - , group_size.size()); + , group_sizes.size()); for (size_t i = 0; i < scale_shape.size(); i++) { if (scale_shape[i].is_dynamic()) continue; - if (group_size[i] == UINT64_MAX) + if (group_sizes[i] == UINT64_MAX) scale_shape[i] = 1; else { - scale_shape[i] /= group_size[i]; // if group_size is larger than shape, scale_shape will be 1 + scale_shape[i] /= group_sizes[i]; // if group_size is larger than shape, scale_shape will be 1 scale_shape[i] = std::max(scale_shape[i].get_length(), 1L); } } diff --git a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp 
b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp index c42c378ad90d27..c10161638bb7eb 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp @@ -15,14 +15,14 @@ static void CreateDynamicQuantizeOp(ProgramBuilder& p, const std::shared_ptrget_group_size(); - for (size_t i = 0; i < group_size.size() - 1; i++) - OPENVINO_ASSERT(group_size[i] == 1, "Not supported group size at ", i, ": ", group_size[i]); + auto group_sizes = op->get_group_sizes(); + for (size_t i = 0; i < group_sizes.size() - 1; i++) + OPENVINO_ASSERT(group_sizes[i] == 1, "Not supported group size at ", i, ": ", group_sizes[i]); - OPENVINO_ASSERT(group_size.back() == UINT64_MAX, "Not supported group size: ", group_size.back()); + OPENVINO_ASSERT(group_sizes.back() == UINT64_MAX, "Not supported group size: ", group_sizes.back()); auto prim = cldnn::dynamic_quantize(primitive_name, inputs[0], - op->get_group_size().back(), + op->get_group_sizes().back(), get_output_data_types(op) ); p.add_primitive(*op, prim); From f02ba031dd655ef70a7c7592d45d481d29d6d093 Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Sat, 3 Aug 2024 14:39:03 +0900 Subject: [PATCH 22/29] ci fix --- .../include/ov_ops/dynamic_quantize.hpp | 14 +++++--- .../src/ov_ops/dynamic_quantize.cpp | 32 +++++++++++-------- .../unit/test_cases/hash_key_gpu_test.cpp | 8 ++--- 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp index 6125d73a33cca3..c7b1504cf161d0 100644 --- a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp +++ b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp @@ -27,8 +27,12 @@ class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op { void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; - 
std::vector get_group_sizes() const { return m_group_sizes; }; - static std::vector shape_infer(const DynamicQuantize* op, std::vector input_shapes, const std::vector group_sizes); + std::vector get_group_sizes() const { + return m_group_sizes; + }; + static std::vector shape_infer(const DynamicQuantize* op, + std::vector input_shapes, + const std::vector group_sizes); private: std::vector m_group_sizes; @@ -36,6 +40,6 @@ class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op { }; -} // namespace op -} // namespace intel_gpu -} // namespace ov +} // namespace op +} // namespace intel_gpu +} // namespace ov diff --git a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp index f87e24cd79982f..f8aa503edc2211 100644 --- a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp +++ b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp @@ -13,19 +13,20 @@ namespace op { namespace internal { DynamicQuantize::DynamicQuantize(const Output& data, std::vector group_sizes, element::Type dt_scale) - : Op({data}) - , m_group_sizes(group_sizes) - , m_dt_scale(dt_scale) { - OPENVINO_ASSERT(data.get_partial_shape().rank() == group_sizes.size(), "FC input rank should be same as the rank of group_size ", - data.get_tensor_ptr()->get_partial_shape().rank(), " / ", group_sizes.size()); + : Op({data}), + m_group_sizes(group_sizes), + m_dt_scale(dt_scale) { + OPENVINO_ASSERT(data.get_partial_shape().rank() == group_sizes.size(), + "FC input rank should be same as the rank of group_size ", + data.get_tensor_ptr()->get_partial_shape().rank(), + " / ", + group_sizes.size()); set_output_size(2); validate_and_infer_types(); } void DynamicQuantize::validate_and_infer_types() { - std::vector input_shapes = { - get_input_partial_shape(0) - }; + std::vector input_shapes = {get_input_partial_shape(0)}; auto out_shapes = shape_infer(this, input_shapes, m_group_sizes); set_output_type(0, element::i8, 
out_shapes[0]); @@ -37,15 +38,18 @@ std::shared_ptr DynamicQuantize::clone_with_new_inputs(const ov::OutputVec return std::make_shared(new_args.at(0), m_group_sizes, m_dt_scale); } -std::vector DynamicQuantize::shape_infer(const DynamicQuantize* op, std::vector input_shapes, const std::vector group_sizes) { +std::vector DynamicQuantize::shape_infer(const DynamicQuantize* op, + std::vector input_shapes, + const std::vector group_sizes) { std::vector out_shapes; out_shapes.push_back(input_shapes[0]); auto scale_shape = input_shapes[0]; - OPENVINO_ASSERT(scale_shape.size() == group_sizes.size(), "Scale_shape and group_size are supposed to have same rank: " - , scale_shape.size() - , " / " - , group_sizes.size()); + OPENVINO_ASSERT(scale_shape.size() == group_sizes.size(), + "Scale_shape and group_size are supposed to have same rank: ", + scale_shape.size(), + " / ", + group_sizes.size()); for (size_t i = 0; i < scale_shape.size(); i++) { if (scale_shape[i].is_dynamic()) continue; @@ -53,7 +57,7 @@ std::vector DynamicQuantize::shape_infer(const DynamicQuantize if (group_sizes[i] == UINT64_MAX) scale_shape[i] = 1; else { - scale_shape[i] /= group_sizes[i]; // if group_size is larger than shape, scale_shape will be 1 + scale_shape[i] /= group_sizes[i]; // if group_size is larger than shape, scale_shape will be 1 scale_shape[i] = std::max(scale_shape[i].get_length(), 1L); } } diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp index 7cda59a7629d2f..c2f6c1a31988e6 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp @@ -71,11 +71,11 @@ class check_hash_value: public ::testing::Test { const auto primitive_hash = primitve->hash(); const auto params_hash = primitve->type->get_fake_aligned_params(*prim_inst->get_impl_params()).hash(); if (!engine.get_device_info().supports_immad) { - 
ASSERT_EQ(primitive_hash, 14259723886449306729UL); - ASSERT_EQ(params_hash, 3365957578641948513UL); + ASSERT_EQ(primitive_hash, 8017451717095756666UL); + ASSERT_EQ(params_hash, 4374037685392472517UL); } else { - ASSERT_EQ(primitive_hash, 14259723886449306729UL); - ASSERT_EQ(params_hash, 9831190959346679696UL); + ASSERT_EQ(primitive_hash, 8017451717095756666UL); + ASSERT_EQ(params_hash, 17704411706121042861UL); } } From 4380f496f8b1acdb7f667ef39fb78fec63a93328 Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Tue, 6 Aug 2024 12:22:28 +0900 Subject: [PATCH 23/29] style fix --- .../include/ov_ops/dynamic_quantize.hpp | 4 ++-- .../src/ov_ops/dynamic_quantize.cpp | 16 ++++++++-------- .../src/plugin/ops/dynamic_quantize.cpp | 3 +-- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp index c7b1504cf161d0..01a6d56a0ce47c 100644 --- a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp +++ b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp @@ -28,7 +28,7 @@ class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op { std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; std::vector get_group_sizes() const { - return m_group_sizes; + return m_group_sizes; }; static std::vector shape_infer(const DynamicQuantize* op, std::vector input_shapes, @@ -40,6 +40,6 @@ class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op { }; +} // namespace internal } // namespace op -} // namespace intel_gpu } // namespace ov diff --git a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp index f8aa503edc2211..bc1ec4c18f0ab1 100644 --- a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp +++ b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp @@ -17,10 +17,10 @@ DynamicQuantize::DynamicQuantize(const 
Output& data, std::vector g m_group_sizes(group_sizes), m_dt_scale(dt_scale) { OPENVINO_ASSERT(data.get_partial_shape().rank() == group_sizes.size(), - "FC input rank should be same as the rank of group_size ", - data.get_tensor_ptr()->get_partial_shape().rank(), - " / ", - group_sizes.size()); + "FC input rank should be same as the rank of group_size ", + data.get_tensor_ptr()->get_partial_shape().rank(), + " / ", + group_sizes.size()); set_output_size(2); validate_and_infer_types(); } @@ -46,10 +46,10 @@ std::vector DynamicQuantize::shape_infer(const DynamicQuantize auto scale_shape = input_shapes[0]; OPENVINO_ASSERT(scale_shape.size() == group_sizes.size(), - "Scale_shape and group_size are supposed to have same rank: ", - scale_shape.size(), - " / ", - group_sizes.size()); + "Scale_shape and group_size are supposed to have same rank: ", + scale_shape.size(), + " / ", + group_sizes.size()); for (size_t i = 0; i < scale_shape.size(); i++) { if (scale_shape[i].is_dynamic()) continue; diff --git a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp index c10161638bb7eb..0373251e45c051 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp @@ -23,8 +23,7 @@ static void CreateDynamicQuantizeOp(ProgramBuilder& p, const std::shared_ptrget_group_sizes().back(), - get_output_data_types(op) - ); + get_output_data_types(op)); p.add_primitive(*op, prim); } From b6a15c6d4899e08d561af9d9a3caec35588a7886 Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Tue, 6 Aug 2024 13:32:48 +0900 Subject: [PATCH 24/29] cpplint fix --- .../include/intel_gpu/primitives/dynamic_quantize.hpp | 4 ++-- src/plugins/intel_gpu/src/graph/network.cpp | 3 ++- src/plugins/intel_gpu/src/graph/primitive_inst.cpp | 7 ++++--- .../dynamic/matmul_weights_decompression.cpp | 1 - 4 files changed, 8 insertions(+), 7 deletions(-) diff --git 
a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp index 3ad8b4cebde7b0..19d4de1ecb3b0f 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp @@ -24,8 +24,8 @@ struct dynamic_quantize : public primitive_base { const input_info& input, const size_t group_size, const std::vector data_types = {optional_data_type(data_types::f16), optional_data_type(data_types::i8)}) - : primitive_base(id, {input}, 2, data_types) - , group_size(group_size) {} + : primitive_base(id, {input}, 2, data_types), + group_size(group_size) {} size_t group_size = 0; diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 5a6bd94fe9dd1a..7817e929d0d065 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -1073,7 +1073,8 @@ void network::execute_impl(const std::vector& events) { auto input_mem = get_primitive(inst->id())->dep_memory_ptr(i); if (input_mem->size() != bin.size()) { - std::cout << "WARNING: memory size mis-match for OV_GPU_LoadDumpRawBinary : " + layer_name << " " << input_mem->size() << " / " << bin.size() << std::endl; + std::cout << "WARNING: memory size mis-match for OV_GPU_LoadDumpRawBinary : " + layer_name + << " " << input_mem->size() << " / " << bin.size() << std::endl; bin.resize(input_mem->size()); } diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 8f5b12c7f6f7ee..2f84b5cf1c03cd 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -636,7 +636,8 @@ event::ptr primitive_inst::realloc_if_needed() { auto fc_impl_params = *user->_impl_params; auto fc_input_layout = 
user->get_node().type()->get_fake_aligned_params(fc_impl_params).input_layouts[0]; if (fc_input_layout.bytes_count() > updated_layouts[dep_idx].bytes_count()) { - GPU_DEBUG_TRACE_DETAIL << id() << ": increase output layout allocation size from " << actual_layouts[dep_idx].to_short_string() << " -> " + GPU_DEBUG_TRACE_DETAIL << id() << ": increase output layout allocation size from " + << actual_layouts[dep_idx].to_short_string() << " -> " << fc_input_layout.to_short_string() << " to meet the input buffer alignment requirements for FC\n"; updated_layouts[dep_idx] = fc_input_layout; } @@ -644,11 +645,11 @@ event::ptr primitive_inst::realloc_if_needed() { // dynamic quantization is only applied to activation of FC if (get_node().is_type()) { auto dyn_quan_scale_layout = dynamic_quantize_inst::__calc_output_layouts(updated_layouts[dep_idx], 0); - GPU_DEBUG_TRACE_DETAIL << "update layout of dynamic quantize scale parameter layout " << dyn_quan_scale_layout[1].to_short_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << "update layout of dynamic quantize scale parameter layout " + << dyn_quan_scale_layout[1].to_short_string() << std::endl; updated_params.output_layouts[1] = dyn_quan_scale_layout[1]; } } - } } diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp index 11c972a5c1378f..b667a17fb233b8 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp @@ -276,7 +276,6 @@ class MatmulWeightsDecompression : public testing::WithParamInterfaceconfiguration.insert({ov::hint::dynamic_quantization_group_size(dyn_quan_group_size)}); - } void generate_inputs(const std::vector& target_input_static_shapes) override { From 334be2da0e469a92b0e46d6a185ec689b00ab493 Mon Sep 17 00:00:00 
2001 From: "Kim, Mingyu" Date: Tue, 6 Aug 2024 17:07:11 +0900 Subject: [PATCH 25/29] build fix --- src/common/transformations/src/ov_ops/dynamic_quantize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp index bc1ec4c18f0ab1..9bb678c1878b41 100644 --- a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp +++ b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp @@ -58,7 +58,7 @@ std::vector DynamicQuantize::shape_infer(const DynamicQuantize scale_shape[i] = 1; else { scale_shape[i] /= group_sizes[i]; // if group_size is larger than shape, scale_shape will be 1 - scale_shape[i] = std::max(scale_shape[i].get_length(), 1L); + scale_shape[i] = std::max(static_cast(scale_shape[i].get_length()), 1); } } out_shapes.push_back(scale_shape); From 796ba79b534855941bf0aa6df60af64234fe4ef1 Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Wed, 7 Aug 2024 12:27:18 +0900 Subject: [PATCH 26/29] style fix --- src/common/transformations/include/ov_ops/dynamic_quantize.hpp | 1 - src/common/transformations/src/ov_ops/dynamic_quantize.cpp | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp index 01a6d56a0ce47c..f2a20be3eb5614 100644 --- a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp +++ b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp @@ -39,7 +39,6 @@ class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op { element::Type m_dt_scale; }; - } // namespace internal } // namespace op } // namespace ov diff --git a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp index 9bb678c1878b41..04553bb02d7d88 100644 --- a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp +++ 
b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp @@ -3,6 +3,7 @@ // #include "ov_ops/dynamic_quantize.hpp" + #include "openvino/core/partial_shape.hpp" #include "openvino/core/validation_util.hpp" #include "openvino/op/variadic_split.hpp" From 54922444761a108b6eaa55be2c27c5606360adde Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Thu, 8 Aug 2024 11:00:57 +0900 Subject: [PATCH 27/29] fix for review --- .../include/ov_ops/dynamic_quantize.hpp | 6 +++--- .../transformations/src/ov_ops/dynamic_quantize.cpp | 10 +++++----- src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp | 8 ++++---- .../src/graph/include/dynamic_quantize_inst.h | 2 +- .../dynamic_quantize_fully_connected.cpp | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp index f2a20be3eb5614..64facb690d012d 100644 --- a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp +++ b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp @@ -27,12 +27,12 @@ class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op { void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; - std::vector get_group_sizes() const { + const std::vector &get_group_sizes() const { return m_group_sizes; }; static std::vector shape_infer(const DynamicQuantize* op, - std::vector input_shapes, - const std::vector group_sizes); + const std::vector &input_shapes, + const std::vector &group_sizes); private: std::vector m_group_sizes; diff --git a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp index 04553bb02d7d88..d03ffcffe9f596 100644 --- a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp +++ b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp @@ -15,13 +15,13 @@ namespace internal { 
DynamicQuantize::DynamicQuantize(const Output& data, std::vector group_sizes, element::Type dt_scale) : Op({data}), - m_group_sizes(group_sizes), + m_group_sizes(std::move(group_sizes)), m_dt_scale(dt_scale) { - OPENVINO_ASSERT(data.get_partial_shape().rank() == group_sizes.size(), + OPENVINO_ASSERT(data.get_partial_shape().rank() == m_group_sizes.size(), "FC input rank should be same as the rank of group_size ", data.get_tensor_ptr()->get_partial_shape().rank(), " / ", - group_sizes.size()); + m_group_sizes.size()); set_output_size(2); validate_and_infer_types(); } @@ -40,8 +40,8 @@ std::shared_ptr DynamicQuantize::clone_with_new_inputs(const ov::OutputVec } std::vector DynamicQuantize::shape_infer(const DynamicQuantize* op, - std::vector input_shapes, - const std::vector group_sizes) { + const std::vector &input_shapes, + const std::vector &group_sizes) { std::vector out_shapes; out_shapes.push_back(input_shapes[0]); diff --git a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp index 5fb47b8de0e348..95bc33d57f0975 100644 --- a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp @@ -14,7 +14,7 @@ GPU_DEFINE_PRIMITIVE_TYPE_ID(dynamic_quantize); layout dynamic_quantize_inst::calc_output_layout(dynamic_quantize_node const& node, kernel_impl_params const& impl_param) { auto desc = impl_param.typed_desc(); - auto input_layout = impl_param.get_input_layout(); + const auto& input_layout = impl_param.get_input_layout(); auto output_type = data_types::i8; auto output_format = input_layout.format; @@ -22,7 +22,7 @@ layout dynamic_quantize_inst::calc_output_layout(dynamic_quantize_node const& no } template -std::vector dynamic_quantize_inst::__calc_output_layouts(layout &act_layout, size_t group_size) { +std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, size_t group_size) { ov::op::internal::DynamicQuantize op; auto 
output_format = act_layout.format; @@ -38,12 +38,12 @@ std::vector dynamic_quantize_inst::__calc_output_layouts(layout &act_lay return { layout(output_shapes[0], data_types::i8, output_format), layout(output_shapes[1], data_types::f16, output_format) }; } -template std::vector dynamic_quantize_inst::__calc_output_layouts(layout &act_layout, size_t group_size); +template std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, size_t group_size); template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& /*node*/, const kernel_impl_params& impl_param) { auto desc = impl_param.typed_desc(); - auto input_layout = impl_param.get_input_layout(); + const auto& input_layout = impl_param.get_input_layout(); return __calc_output_layouts(input_layout, UINT64_MAX /* TODO: handle group_size here */); } diff --git a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h index 6458b28dda877d..70bafde89c4273 100644 --- a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h @@ -35,7 +35,7 @@ class typed_primitive_inst : public typed_primitive_inst_base< // Internal function to be used from fakealignment template - static std::vector __calc_output_layouts(layout &act_layout, size_t group_size); + static std::vector __calc_output_layouts(const layout &act_layout, size_t group_size); static std::string to_string(dynamic_quantize_node const& node); typed_primitive_inst(network& network, dynamic_quantize_node const& node); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp index c0255b892e2602..14245266b5c9a6 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp +++ 
b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -5,7 +5,7 @@ #include "dynamic_quantize_fully_connected.hpp" #include "intel_gpu/op/fully_connected_compressed.hpp" -#include +#include "intel_gpu/op/placeholder.hpp" #include "ov_ops/dynamic_quantize.hpp" #include "openvino/core/rt_info.hpp" From 109687c094d5eaeb16d57f27a13ea0978f31c625 Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Thu, 8 Aug 2024 13:04:35 +0900 Subject: [PATCH 28/29] fix for style --- .../transformations/include/ov_ops/dynamic_quantize.hpp | 6 +++--- src/common/transformations/src/ov_ops/dynamic_quantize.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp index 64facb690d012d..8f7859e6b569e8 100644 --- a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp +++ b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp @@ -27,12 +27,12 @@ class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op { void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; - const std::vector &get_group_sizes() const { + const std::vector& get_group_sizes() const { return m_group_sizes; }; static std::vector shape_infer(const DynamicQuantize* op, - const std::vector &input_shapes, - const std::vector &group_sizes); + const std::vector& input_shapes, + const std::vector& group_sizes); private: std::vector m_group_sizes; diff --git a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp index d03ffcffe9f596..ea8fa663c18f07 100644 --- a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp +++ b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp @@ -40,8 +40,8 @@ std::shared_ptr DynamicQuantize::clone_with_new_inputs(const ov::OutputVec } std::vector 
DynamicQuantize::shape_infer(const DynamicQuantize* op, - const std::vector &input_shapes, - const std::vector &group_sizes) { + const std::vector& input_shapes, + const std::vector& group_sizes) { std::vector out_shapes; out_shapes.push_back(input_shapes[0]); From 43665f5c803f806e19fbb3931b5ed3c5d5537a0d Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Thu, 8 Aug 2024 14:53:05 +0900 Subject: [PATCH 29/29] change group_size format to uint64_t --- .../transformations/include/ov_ops/dynamic_quantize.hpp | 8 ++++---- .../transformations/src/ov_ops/dynamic_quantize.cpp | 4 ++-- .../include/intel_gpu/primitives/dynamic_quantize.hpp | 4 ++-- src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp | 4 ++-- .../intel_gpu/src/graph/include/dynamic_quantize_inst.h | 2 +- .../transformations/dynamic_quantize_fully_connected.cpp | 2 +- .../transformations/dynamic_quantize_fully_connected.hpp | 2 +- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp index 8f7859e6b569e8..69c148305fb94f 100644 --- a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp +++ b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp @@ -22,20 +22,20 @@ class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op { /// \param data Input tensor with data /// \param group_sizes Group sizes for dynamic quantization /// \param dt_scale Data type for scale output - DynamicQuantize(const Output& data, std::vector group_sizes, element::Type dt_scale); + DynamicQuantize(const Output& data, std::vector group_sizes, element::Type dt_scale); void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; - const std::vector& get_group_sizes() const { + const std::vector& get_group_sizes() const { return m_group_sizes; }; static std::vector shape_infer(const DynamicQuantize* op, const 
std::vector& input_shapes, - const std::vector& group_sizes); + const std::vector& group_sizes); private: - std::vector m_group_sizes; + std::vector m_group_sizes; element::Type m_dt_scale; }; diff --git a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp index ea8fa663c18f07..74c0498e9a4425 100644 --- a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp +++ b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp @@ -13,7 +13,7 @@ namespace ov { namespace op { namespace internal { -DynamicQuantize::DynamicQuantize(const Output& data, std::vector group_sizes, element::Type dt_scale) +DynamicQuantize::DynamicQuantize(const Output& data, std::vector group_sizes, element::Type dt_scale) : Op({data}), m_group_sizes(std::move(group_sizes)), m_dt_scale(dt_scale) { @@ -41,7 +41,7 @@ std::shared_ptr DynamicQuantize::clone_with_new_inputs(const ov::OutputVec std::vector DynamicQuantize::shape_infer(const DynamicQuantize* op, const std::vector& input_shapes, - const std::vector& group_sizes) { + const std::vector& group_sizes) { std::vector out_shapes; out_shapes.push_back(input_shapes[0]); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp index 19d4de1ecb3b0f..d93e2f86eed144 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp @@ -22,12 +22,12 @@ struct dynamic_quantize : public primitive_base { /// @param output_size Output data size of the primitive dynamic_quantize(const primitive_id& id, const input_info& input, - const size_t group_size, + const uint64_t group_size, const std::vector data_types = {optional_data_type(data_types::f16), optional_data_type(data_types::i8)}) : primitive_base(id, {input}, 2, data_types), group_size(group_size) {} - size_t group_size = 0; + 
uint64_t group_size = 0; size_t hash() const override { size_t seed = primitive::hash(); diff --git a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp index 95bc33d57f0975..5c945f4c2d389c 100644 --- a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp @@ -22,7 +22,7 @@ layout dynamic_quantize_inst::calc_output_layout(dynamic_quantize_node const& no } template -std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, size_t group_size) { +std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, uint64_t group_size) { ov::op::internal::DynamicQuantize op; auto output_format = act_layout.format; @@ -38,7 +38,7 @@ std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &a return { layout(output_shapes[0], data_types::i8, output_format), layout(output_shapes[1], data_types::f16, output_format) }; } -template std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, size_t group_size); +template std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, uint64_t group_size); template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& /*node*/, const kernel_impl_params& impl_param) { diff --git a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h index 70bafde89c4273..49dd62c6332549 100644 --- a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h @@ -35,7 +35,7 @@ class typed_primitive_inst : public typed_primitive_inst_base< // Internal function to be used from fakealignment template - static std::vector __calc_output_layouts(const layout &act_layout, size_t group_size); + static std::vector __calc_output_layouts(const layout &act_layout, uint64_t 
group_size); static std::string to_string(dynamic_quantize_node const& node); typed_primitive_inst(network& network, dynamic_quantize_node const& node); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp index 14245266b5c9a6..a2e4a1a38983c8 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -17,7 +17,7 @@ namespace ov { namespace intel_gpu { -DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(size_t group_size) { +DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(uint64_t group_size) { GPU_DEBUG_GET_INSTANCE(debug_config); using namespace ov::pass::pattern; diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.hpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.hpp index 35ae934e91d6f3..b5d956f7872b5c 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.hpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.hpp @@ -12,7 +12,7 @@ namespace intel_gpu { class DynamicQuantizeFullyConnected: public ov::pass::MatcherPass { public: OPENVINO_RTTI("DynamicQuantizeFullyConnected", "0"); - DynamicQuantizeFullyConnected(size_t group_size); + DynamicQuantizeFullyConnected(uint64_t group_size); }; } // namespace intel_gpu