forked from openvinotoolkit/openvino
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[GPU] Dynamic quantization for OneDNN FC (openvinotoolkit#25372)
### Details: - Integrate OneDNN dynamic quantization - Only per-token quantization is enabled ### Tickets: - 144522 --------- Signed-off-by: Kim, Mingyu <[email protected]> Signed-off-by: Min, Byungil <[email protected]> Signed-off-by: Min, Byung-il <[email protected]> Co-authored-by: Min, Byung-il <[email protected]>
- Loading branch information
Showing
39 changed files
with
1,387 additions
and
53 deletions.
There are no files selected for viewing
44 changes: 44 additions & 0 deletions
44
src/common/transformations/include/ov_ops/dynamic_quantize.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
// Copyright (C) 2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#pragma once | ||
|
||
#include "openvino/op/op.hpp" | ||
#include "transformations_visibility.hpp" | ||
|
||
namespace ov { | ||
namespace op { | ||
namespace internal { | ||
|
||
/// \brief Operator performing Dynamic Quantize
///
/// The operation has one data input and two outputs: output 0 carries the quantized data
/// (i8, same shape as the input) and output 1 carries the scales (element type \p dt_scale).
class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op {
public:
    OPENVINO_OP("DynamicQuantize", "gpu_opset");

    DynamicQuantize() = default;
    /// \brief Constructs a DynamicQuantize operation.
    ///
    /// \param data Input tensor with data
    /// \param group_sizes Group sizes for dynamic quantization; must contain one entry per
    ///                    dimension of \p data
    /// \param dt_scale Data type for scale output
    DynamicQuantize(const Output<Node>& data, std::vector<uint64_t> group_sizes, element::Type dt_scale);

    void validate_and_infer_types() override;

    std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;

    /// \brief Returns the group sizes this operation was constructed with.
    const std::vector<uint64_t>& get_group_sizes() const {
        return m_group_sizes;
    }

    /// \brief Infers the output shapes of the operation.
    ///
    /// \param op Operation the shapes are inferred for
    /// \param input_shapes Shapes of the inputs; only the first entry is read
    /// \param group_sizes Group sizes, one entry per dimension of the input shape
    /// \return Two shapes: the quantized data shape followed by the scale shape
    static std::vector<ov::PartialShape> shape_infer(const DynamicQuantize* op,
                                                     const std::vector<ov::PartialShape>& input_shapes,
                                                     const std::vector<uint64_t>& group_sizes);

private:
    std::vector<uint64_t> m_group_sizes;  ///< Group size per input dimension
    element::Type m_dt_scale;             ///< Element type of the scale output
};
|
||
} // namespace internal | ||
} // namespace op | ||
} // namespace ov |
71 changes: 71 additions & 0 deletions
71
src/common/transformations/src/ov_ops/dynamic_quantize.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
// Copyright (C) 2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#include "ov_ops/dynamic_quantize.hpp" | ||
|
||
#include "openvino/core/partial_shape.hpp" | ||
#include "openvino/core/validation_util.hpp" | ||
#include "openvino/op/variadic_split.hpp" | ||
#include "variadic_split_shape_inference.hpp" | ||
|
||
namespace ov { | ||
namespace op { | ||
namespace internal { | ||
|
||
// Builds the operation on top of a single data input.
// The group-size vector must provide exactly one entry per input dimension; otherwise the
// assertion below fails and reports both ranks.
DynamicQuantize::DynamicQuantize(const Output<Node>& data, std::vector<uint64_t> group_sizes, element::Type dt_scale)
    : Op({data}),
      m_group_sizes(std::move(group_sizes)),
      m_dt_scale(dt_scale) {
    const auto group_rank = m_group_sizes.size();
    OPENVINO_ASSERT(data.get_partial_shape().rank() == group_rank,
                    "FC input rank should be same as the rank of group_size ",
                    data.get_tensor_ptr()->get_partial_shape().rank(),
                    " / ",
                    group_rank);

    // Two outputs are produced: the quantized data and the scales.
    set_output_size(2);
    validate_and_infer_types();
}
|
||
void DynamicQuantize::validate_and_infer_types() { | ||
std::vector<ov::PartialShape> input_shapes = {get_input_partial_shape(0)}; | ||
|
||
auto out_shapes = shape_infer(this, input_shapes, m_group_sizes); | ||
set_output_type(0, element::i8, out_shapes[0]); | ||
set_output_type(1, m_dt_scale, out_shapes[1]); | ||
} | ||
|
||
// Creates a copy of this operation attached to the given inputs, keeping the same group sizes
// and scale element type.
std::shared_ptr<Node> DynamicQuantize::clone_with_new_inputs(const ov::OutputVector& new_args) const {
    check_new_args_count(this, new_args);

    const auto& data = new_args.at(0);
    return std::make_shared<DynamicQuantize>(data, m_group_sizes, m_dt_scale);
}
|
||
std::vector<ov::PartialShape> DynamicQuantize::shape_infer(const DynamicQuantize* op, | ||
const std::vector<ov::PartialShape>& input_shapes, | ||
const std::vector<uint64_t>& group_sizes) { | ||
std::vector<ov::PartialShape> out_shapes; | ||
out_shapes.push_back(input_shapes[0]); | ||
|
||
auto scale_shape = input_shapes[0]; | ||
OPENVINO_ASSERT(scale_shape.size() == group_sizes.size(), | ||
"Scale_shape and group_size are supposed to have same rank: ", | ||
scale_shape.size(), | ||
" / ", | ||
group_sizes.size()); | ||
for (size_t i = 0; i < scale_shape.size(); i++) { | ||
if (scale_shape[i].is_dynamic()) | ||
continue; | ||
|
||
if (group_sizes[i] == UINT64_MAX) | ||
scale_shape[i] = 1; | ||
else { | ||
scale_shape[i] /= group_sizes[i]; // if group_size is larger than shape, scale_shape will be 1 | ||
scale_shape[i] = std::max(static_cast<int>(scale_shape[i].get_length()), 1); | ||
} | ||
} | ||
out_shapes.push_back(scale_shape); | ||
return out_shapes; | ||
} | ||
|
||
} // namespace internal | ||
} // namespace op | ||
} // namespace ov |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
57 changes: 57 additions & 0 deletions
57
src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
// Copyright (C) 2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#pragma once | ||
#include "primitive.hpp" | ||
|
||
namespace cldnn { | ||
|
||
/// @brief Dynamic Quantize primitive
/// @details Performs dynamic quantization. The primitive is declared with two outputs, and
///          group_size takes part in hashing, equality comparison and serialization.
struct dynamic_quantize : public primitive_base<dynamic_quantize> {
    CLDNN_DECLARE_PRIMITIVE(dynamic_quantize);

    dynamic_quantize() : primitive_base("", {}), group_size(0) {}

    /// @brief Constructs dynamic_quantize primitive
    /// @param id This primitive id
    /// @param input Input primitive id
    /// @param group_size Quantization group size
    /// @param data_types Data types of the two outputs of the primitive
    // NOTE(review): the default lists f16 before i8 - confirm this matches the output order
    // expected by the code that consumes these types.
    dynamic_quantize(const primitive_id& id,
                     const input_info& input,
                     const uint64_t group_size,
                     const std::vector<optional_data_type> data_types = {optional_data_type(data_types::f16), optional_data_type(data_types::i8)})
        : primitive_base(id, {input}, 2, data_types),
          group_size(group_size) {}

    /// Quantization group size.
    uint64_t group_size = 0;

    /// @brief Combines the common primitive hash with group_size.
    size_t hash() const override {
        size_t seed = primitive::hash();
        seed = hash_combine(seed, group_size);
        return seed;
    }

    /// @brief Two primitives are equal when the common parameters and group_size match.
    bool operator==(const primitive& rhs) const override {
        if (!compare_common_params(rhs))
            return false;

        // Bind by reference: plain `auto` would copy the whole primitive just to read one field.
        const auto& rhs_casted = downcast<const dynamic_quantize>(rhs);

        return group_size == rhs_casted.group_size;
    }

    /// @brief Serializes the base primitive followed by group_size.
    void save(BinaryOutputBuffer& ob) const override {
        primitive_base<dynamic_quantize>::save(ob);
        ob << group_size;
    }

    /// @brief Restores the base primitive followed by group_size, mirroring save().
    void load(BinaryInputBuffer& ib) override {
        primitive_base<dynamic_quantize>::load(ib);
        ib >> group_size;
    }
};
} // namespace cldnn |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
// Copyright (C) 2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#include "ov_ops/dynamic_quantize.hpp" | ||
#include "dynamic_quantize_inst.h" | ||
|
||
#include "primitive_type_base.h" | ||
#include "json_object.h" | ||
#include <string> | ||
|
||
namespace cldnn { | ||
GPU_DEFINE_PRIMITIVE_TYPE_ID(dynamic_quantize); | ||
|
||
// Computes the single output layout: i8 data that keeps the format and tensor of the input
// layout. Neither the node nor the primitive descriptor is consulted.
layout dynamic_quantize_inst::calc_output_layout(dynamic_quantize_node const& node, kernel_impl_params const& impl_param) {
    const auto& input_layout = impl_param.get_input_layout();
    const auto output_type = data_types::i8;
    const auto output_format = input_layout.format;

    return layout(output_type, output_format, input_layout.get_tensor());
}
|
||
template<typename ShapeType> | ||
std::vector<layout> dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, uint64_t group_size) { | ||
ov::op::internal::DynamicQuantize op; | ||
auto output_format = act_layout.format; | ||
|
||
std::vector<ShapeType> input_shapes = { | ||
act_layout.get<ShapeType>(), | ||
}; | ||
|
||
std::vector<uint64_t> shape_group_size(act_layout.get<ShapeType>().size(), 1); | ||
shape_group_size.back() = group_size; | ||
|
||
auto output_shapes = ov::op::internal::DynamicQuantize::shape_infer(&op, input_shapes, shape_group_size); | ||
|
||
return { layout(output_shapes[0], data_types::i8, output_format), layout(output_shapes[1], data_types::f16, output_format) }; | ||
} | ||
|
||
template std::vector<layout> dynamic_quantize_inst::__calc_output_layouts<ov::PartialShape>(const layout &act_layout, uint64_t group_size); | ||
|
||
template<typename ShapeType> | ||
std::vector<layout> dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& /*node*/, const kernel_impl_params& impl_param) { | ||
auto desc = impl_param.typed_desc<dynamic_quantize>(); | ||
const auto& input_layout = impl_param.get_input_layout(); | ||
return __calc_output_layouts<ov::PartialShape>(input_layout, UINT64_MAX /* TODO: handle group_size here */); | ||
} | ||
|
||
template std::vector<layout> dynamic_quantize_inst::calc_output_layouts<ov::PartialShape>(dynamic_quantize_node const& node, | ||
const kernel_impl_params& impl_param); | ||
|
||
// Returns the textual dump of the node's JSON description.
std::string dynamic_quantize_inst::to_string(dynamic_quantize_node const& node) {
    auto node_info = node.desc_to_json();

    std::stringstream primitive_description;
    node_info->dump(primitive_description);

    return primitive_description.str();
}
|
||
// Forwards construction to the parent instance type; no additional state is set up here.
dynamic_quantize_inst::typed_primitive_inst(network& network, dynamic_quantize_node const& node)
    : parent(network, node) {}
|
||
} // namespace cldnn |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.