[GPU] Dynamic quantization for OneDNN FC #25372

Merged Aug 8, 2024 · 29 commits (changes shown from 26 commits)

Commits
2fe52fb
[GPU] FC dynamic quantization with OneDNN
isanghao Jun 8, 2024
bc3c4e8
Modify dynamic quantize kernels
byungilm Jul 2, 2024
e7ea310
code cleanup & accuracy fix
isanghao Jul 5, 2024
99a6f1c
[GPU] restric dynamic_quantization condition for unittest pass
isanghao Jul 23, 2024
4d4e1a1
New test for dynamic quantization
isanghao Jul 23, 2024
f931fc9
[GPU] option cleanup for per-token quantization
isanghao Jul 26, 2024
3dd6acd
minor fix
isanghao Jul 26, 2024
b3629b4
code cleanup
isanghao Jul 26, 2024
c48e41f
update onednn version
isanghao Jul 27, 2024
b64e974
changing group size to size_t
isanghao Jul 27, 2024
0268aba
code cleanup for review
isanghao Jul 27, 2024
f3cd46e
fix for code review
isanghao Jul 27, 2024
daac4a5
update for code review
isanghao Jul 27, 2024
d4c3c0d
introduce gsnum
isanghao Aug 1, 2024
f01a4a2
move dyn_quan to common op
isanghao Aug 1, 2024
1df4e1f
macro for FC mask
isanghao Aug 1, 2024
845e2a5
group_size is made as vector now
isanghao Aug 2, 2024
2b878a4
update for code review
isanghao Aug 2, 2024
35ea02f
update for code review
isanghao Aug 3, 2024
dd9eda0
reverted property change
isanghao Aug 3, 2024
4d5a520
group_size -> group_sizes
isanghao Aug 3, 2024
f02ba03
ci fix
isanghao Aug 3, 2024
4380f49
style fix
isanghao Aug 6, 2024
b6a15c6
cpplint fix
isanghao Aug 6, 2024
334be2d
build fix
isanghao Aug 6, 2024
796ba79
style fix
isanghao Aug 7, 2024
5492244
fix for review
isanghao Aug 8, 2024
109687c
fix for style
isanghao Aug 8, 2024
43665f5
change group_size format to uint64_t
isanghao Aug 8, 2024
44 changes: 44 additions & 0 deletions src/common/transformations/include/ov_ops/dynamic_quantize.hpp
@@ -0,0 +1,44 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/op/op.hpp"
#include "transformations_visibility.hpp"

namespace ov {
namespace op {
namespace internal {

/// \brief Operator performing Dynamic Quantize
class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op {
public:
OPENVINO_OP("DynamicQuantize", "gpu_opset");

DynamicQuantize() = default;
/// \brief Constructs a DynamicQuantize operation.
///
/// \param data Input tensor with data
/// \param group_sizes Group sizes for dynamic quantization
/// \param dt_scale Data type for scale output
DynamicQuantize(const Output<Node>& data, std::vector<size_t> group_sizes, element::Type dt_scale);

void validate_and_infer_types() override;

std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;
std::vector<size_t> get_group_sizes() const {
return m_group_sizes;
};
static std::vector<ov::PartialShape> shape_infer(const DynamicQuantize* op,
std::vector<ov::PartialShape> input_shapes,
const std::vector<size_t> group_sizes);

private:
std::vector<size_t> m_group_sizes;
element::Type m_dt_scale;
};

} // namespace internal
} // namespace op
} // namespace ov
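
For illustration, here is a minimal sketch of instantiating this op (a hypothetical example rather than code from the PR; the shapes and group sizes are assumptions):

#include <cstdint>
#include <memory>
#include <vector>

#include "openvino/op/parameter.hpp"
#include "ov_ops/dynamic_quantize.hpp"

// Hypothetical per-token setup for a [batch, seq, hidden] activation:
// group size 1 keeps one scale per batch/seq element, while UINT64_MAX
// groups the entire hidden axis under a single scale.
std::shared_ptr<ov::op::internal::DynamicQuantize> make_dyn_quan_example() {
    auto act = std::make_shared<ov::op::v0::Parameter>(ov::element::f16,
                                                       ov::PartialShape{-1, -1, 4096});
    std::vector<size_t> group_sizes{1, 1, UINT64_MAX};
    auto dq = std::make_shared<ov::op::internal::DynamicQuantize>(act, group_sizes, ov::element::f16);
    // dq->output(0): i8 quantized data, same shape as the input
    // dq->output(1): f16 scales, inferred as {-1, -1, 1}
    return dq;
}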
71 changes: 71 additions & 0 deletions src/common/transformations/src/ov_ops/dynamic_quantize.cpp
@@ -0,0 +1,71 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "ov_ops/dynamic_quantize.hpp"

#include "openvino/core/partial_shape.hpp"
#include "openvino/core/validation_util.hpp"
#include "openvino/op/variadic_split.hpp"
#include "variadic_split_shape_inference.hpp"

namespace ov {
namespace op {
namespace internal {

DynamicQuantize::DynamicQuantize(const Output<Node>& data, std::vector<size_t> group_sizes, element::Type dt_scale)
: Op({data}),
m_group_sizes(group_sizes),
m_dt_scale(dt_scale) {
OPENVINO_ASSERT(data.get_partial_shape().rank() == group_sizes.size(),
"FC input rank should be the same as the rank of group_sizes: ",
data.get_tensor_ptr()->get_partial_shape().rank(),
" / ",
group_sizes.size());
set_output_size(2);
validate_and_infer_types();
}

void DynamicQuantize::validate_and_infer_types() {
std::vector<ov::PartialShape> input_shapes = {get_input_partial_shape(0)};

auto out_shapes = shape_infer(this, input_shapes, m_group_sizes);
set_output_type(0, element::i8, out_shapes[0]);
set_output_type(1, m_dt_scale, out_shapes[1]);
}

std::shared_ptr<Node> DynamicQuantize::clone_with_new_inputs(const ov::OutputVector& new_args) const {
check_new_args_count(this, new_args);
return std::make_shared<DynamicQuantize>(new_args.at(0), m_group_sizes, m_dt_scale);
}

std::vector<ov::PartialShape> DynamicQuantize::shape_infer(const DynamicQuantize* op,
std::vector<ov::PartialShape> input_shapes,
const std::vector<size_t> group_sizes) {
std::vector<ov::PartialShape> out_shapes;
out_shapes.push_back(input_shapes[0]);

auto scale_shape = input_shapes[0];
OPENVINO_ASSERT(scale_shape.size() == group_sizes.size(),
"scale_shape and group_sizes are supposed to have the same rank: ",
scale_shape.size(),
" / ",
group_sizes.size());
for (size_t i = 0; i < scale_shape.size(); i++) {
if (scale_shape[i].is_dynamic())
continue;

if (group_sizes[i] == UINT64_MAX)
scale_shape[i] = 1;
else {
scale_shape[i] /= group_sizes[i]; // if group_size is larger than shape, scale_shape will be 1
scale_shape[i] = std::max(static_cast<int>(scale_shape[i].get_length()), 1);
}
}
out_shapes.push_back(scale_shape);
return out_shapes;
}

} // namespace internal
} // namespace op
} // namespace ov
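
To make the scale-shape rule concrete, the following standalone sketch mirrors the loop above (a hypothetical helper, not code from the PR):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// A group size of UINT64_MAX collapses the axis to a single scale; otherwise
// the axis extent is divided by the group size and clamped to at least 1.
std::vector<int64_t> scale_shape_for(const std::vector<int64_t>& data_shape,
                                     const std::vector<uint64_t>& group_sizes) {
    std::vector<int64_t> scale(data_shape.size());
    for (size_t i = 0; i < data_shape.size(); ++i) {
        if (group_sizes[i] == UINT64_MAX)
            scale[i] = 1;
        else
            scale[i] = std::max<int64_t>(data_shape[i] / static_cast<int64_t>(group_sizes[i]), 1);
    }
    return scale;
}

int main() {
    // Per-token: one scale per token, whole hidden axis in one group.
    for (auto d : scale_shape_for({1, 128, 4096}, {1, 1, UINT64_MAX}))
        std::cout << d << ' ';  // prints: 1 128 1
    std::cout << '\n';
    // Grouped: hidden axis split into groups of 32 -> 4096 / 32 = 128 scales.
    for (auto d : scale_shape_for({1, 128, 4096}, {1, 1, 32}))
        std::cout << d << ' ';  // prints: 1 128 128
    std::cout << '\n';
}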
@@ -19,14 +19,23 @@ class FullyConnectedCompressed : public FullyConnected {
FullyConnectedCompressed(const ov::Output<Node> &A,
const ov::Output<Node> &B,
const ov::Output<Node> &bias,
const ov::Output<Node> &decompression_scale,
const ov::Output<Node> &decompression_zero_point,
const ov::Output<Node> &w_decompression_scale,
const ov::Output<Node> &w_decompression_zero_point,
const ov::Output<Node> &a_decompression_scale,
const ov::element::Type output_type = ov::element::undefined);


FullyConnectedCompressed(const ov::Output<Node> &A,
const ov::Output<Node> &B,
const ov::Output<Node> &bias,
const ov::Output<Node> &w_decompression_scale,
const ov::Output<Node> &w_decompression_zero_point,
const ov::element::Type output_type = ov::element::undefined);

FullyConnectedCompressed(const ov::Output<Node> &A,
const ov::Output<Node> &B,
const ov::Output<Node> &bias,
const ov::Output<Node> &decompression_scale,
const ov::Output<Node> &w_decompression_scale,
const ov::element::Type output_type = ov::element::undefined);

std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;
@@ -287,3 +287,4 @@ REGISTER_FACTORY(internal, Placeholder);
REGISTER_FACTORY(internal, SDPA);
REGISTER_FACTORY(internal, IndirectSDPA);
REGISTER_FACTORY(internal, RoPE);
REGISTER_FACTORY(internal, DynamicQuantize);
@@ -0,0 +1,57 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once
#include "primitive.hpp"

namespace cldnn {

/// @brief Dynamic Quantize primitive
/// @details Performs dynamic quantization
struct dynamic_quantize : public primitive_base<dynamic_quantize> {
CLDNN_DECLARE_PRIMITIVE(dynamic_quantize);

dynamic_quantize() : primitive_base("", {}), group_size(0) {}

/// @brief Constructs dynamic_quantize primitive
/// @param id This primitive id
/// @param input Input primitive id
/// @param group_size Quantization group size
/// @param data_types Output data types of the quantized data and its scale
dynamic_quantize(const primitive_id& id,
const input_info& input,
const size_t group_size,
const std::vector<optional_data_type> data_types = {optional_data_type(data_types::f16), optional_data_type(data_types::i8)})
: primitive_base(id, {input}, 2, data_types),
group_size(group_size) {}

size_t group_size = 0;

size_t hash() const override {
size_t seed = primitive::hash();
seed = hash_combine(seed, group_size);
return seed;
}

bool operator==(const primitive& rhs) const override {
if (!compare_common_params(rhs))
return false;

auto rhs_casted = downcast<const dynamic_quantize>(rhs);

return group_size == rhs_casted.group_size;
}

void save(BinaryOutputBuffer& ob) const override {
primitive_base<dynamic_quantize>::save(ob);
ob << group_size;
}

void load(BinaryInputBuffer& ib) override {
primitive_base<dynamic_quantize>::load(ib);
ib >> group_size;
}
};
} // namespace cldnn
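
A sketch of wiring the primitive into a topology follows; the header paths and surrounding setup are assumptions, and only the constructor itself comes from the diff above:

#include <intel_gpu/graph/topology.hpp>          // assumed header locations
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/dynamic_quantize.hpp>

// Assumed setup: `in_layout` describes the f16 activation to be quantized.
cldnn::topology build_dyn_quan_topology(const cldnn::layout& in_layout) {
    cldnn::topology topology;
    topology.add(cldnn::input_layout("input", in_layout));
    // Group size 32; the primitive exposes two outputs, the quantized
    // data and the computed scales.
    topology.add(cldnn::dynamic_quantize("dyn_quan", cldnn::input_info("input"), 32));
    return topology;
}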
@@ -95,11 +95,46 @@ struct fully_connected : public primitive_base<fully_connected> {
compressed_weights(true),
decompression_scale(decompression_scale),
decompression_zero_point(decompression_zero_point),
dynamic_quantized_activation(false),
input_size(input_size),
weights_rank(weights_rank) {
OPENVINO_ASSERT(!decompression_scale.empty(), "[GPU] Compressed fully connected requires at least decompression scale input");
}

/// @brief Constructs fully connected compressed layer.
/// @param id This primitive id.
/// @param input Input primitive id.
/// @param weights Primitive id containing weights data.
/// @param bias Primitive id containing bias data.
/// @param decompression_scale Primitive id containing scale factors for weights decompression.
/// @param decompression_zero_point Primitive id containing zero points for weights decompression.
/// @param activation_scale Input info of the dynamically computed activation scale.
fully_connected(const primitive_id& id,
const input_info& input,
const primitive_id& weights,
const primitive_id& bias,
const primitive_id& decompression_scale,
const primitive_id& decompression_zero_point,
const input_info& activation_scale,
const data_types data_type,
const size_t input_size = 2,
const size_t weights_rank = 2)
: primitive_base(id, { input }, 1, {optional_data_type{data_type}}),
weights(weights),
bias(bias),
compressed_weights(true),
decompression_scale(decompression_scale),
decompression_zero_point(decompression_zero_point),
dynamic_quantized_activation(false),
activation_scale(activation_scale),
input_size(input_size),
weights_rank(weights_rank) {
if (activation_scale.is_valid())
dynamic_quantized_activation = true;

OPENVINO_ASSERT(!decompression_scale.empty(), "[GPU] Compressed fully connected requires at least decompression scale input");
}

/// @brief Primitive id containing weights data.
primitive_id weights;
/// @brief Primitive id containing bias data.
@@ -108,6 +143,8 @@ struct fully_connected : public primitive_base<fully_connected> {
bool compressed_weights = false;
primitive_id decompression_scale = "";
primitive_id decompression_zero_point = "";
bool dynamic_quantized_activation = false;
input_info activation_scale = {"", 0};
optional_value<float> decompression_zero_point_scalar = optional_value<float>();

/// @brief Primitive dimension size.
@@ -123,6 +160,7 @@ struct fully_connected : public primitive_base<fully_connected> {
seed = hash_combine(seed, compressed_weights);
seed = hash_combine(seed, !decompression_scale.empty());
seed = hash_combine(seed, !decompression_zero_point.empty());
seed = hash_combine(seed, activation_scale.is_valid());
seed = hash_combine(seed, decompression_zero_point_scalar.has_value());
seed = hash_combine(seed, decompression_zero_point_scalar.value_or(0.0f));
return seed;
@@ -140,6 +178,7 @@ struct fully_connected : public primitive_base<fully_connected> {
compressed_weights == rhs_casted.compressed_weights &&
decompression_scale.empty() == rhs_casted.decompression_scale.empty() &&
decompression_zero_point.empty() == rhs_casted.decompression_zero_point.empty() &&
activation_scale.is_valid() == rhs_casted.activation_scale.is_valid() &&
decompression_zero_point_scalar.value_or(0.0f) == rhs_casted.decompression_zero_point_scalar.value_or(0.0f);
}

@@ -150,8 +189,10 @@ struct fully_connected : public primitive_base<fully_connected> {
ob << compressed_weights;
ob << decompression_scale;
ob << decompression_zero_point;
ob << activation_scale;
ob << input_size;
ob << weights_rank;
ob << dynamic_quantized_activation;

if (decompression_zero_point_scalar.has_value()) {
ob << true;
@@ -169,8 +210,10 @@ struct fully_connected : public primitive_base<fully_connected> {
ib >> compressed_weights;
ib >> decompression_scale;
ib >> decompression_zero_point;
ib >> activation_scale;
ib >> input_size;
ib >> weights_rank;
ib >> dynamic_quantized_activation;

bool has_value;
ib >> has_value;
@@ -197,6 +240,9 @@ struct fully_connected : public primitive_base<fully_connected> {
if (!decompression_zero_point.empty())
ret.push_back(decompression_zero_point);

if (activation_scale.is_valid())
ret.push_back(activation_scale);

return ret;
}
};
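A companion sketch for the new overload, with illustrative primitive ids; port 1 of dynamic_quantize carries the scales, as the GPU shape inference later in this diff shows:

#include <intel_gpu/primitives/fully_connected.hpp>  // assumed header location

// Hypothetical wiring: the activation and its per-token scales come from
// the two outputs of a preceding dynamic_quantize primitive.
cldnn::fully_connected make_compressed_fc_example() {
    return cldnn::fully_connected("fc",
                                  cldnn::input_info("dyn_quan", 0),  // quantized activation (i8)
                                  "weights",
                                  "bias",
                                  "decomp_scale",
                                  "decomp_zp",
                                  cldnn::input_info("dyn_quan", 1),  // activation scales (f16)
                                  cldnn::data_types::f16);
}

Because activation_scale.is_valid() is true here, the constructor sets dynamic_quantized_activation and appends the scale to the primitive's input list, as shown in the diff above.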
66 changes: 66 additions & 0 deletions src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp
@@ -0,0 +1,66 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "ov_ops/dynamic_quantize.hpp"
#include "dynamic_quantize_inst.h"

#include "primitive_type_base.h"
#include "json_object.h"
#include <string>

namespace cldnn {
GPU_DEFINE_PRIMITIVE_TYPE_ID(dynamic_quantize);

layout dynamic_quantize_inst::calc_output_layout(dynamic_quantize_node const& node, kernel_impl_params const& impl_param) {
auto desc = impl_param.typed_desc<dynamic_quantize>();
auto input_layout = impl_param.get_input_layout();
auto output_type = data_types::i8;
auto output_format = input_layout.format;

return layout(output_type, output_format, input_layout.get_tensor());
}

template<typename ShapeType>
std::vector<layout> dynamic_quantize_inst::__calc_output_layouts(layout &act_layout, size_t group_size) {
ov::op::internal::DynamicQuantize op;
auto output_format = act_layout.format;

std::vector<ShapeType> input_shapes = {
act_layout.get<ShapeType>(),
};

std::vector<uint64_t> shape_group_size(act_layout.get<ShapeType>().size(), 1);
shape_group_size.back() = group_size;

auto output_shapes = ov::op::internal::DynamicQuantize::shape_infer(&op, input_shapes, shape_group_size);

return { layout(output_shapes[0], data_types::i8, output_format), layout(output_shapes[1], data_types::f16, output_format) };
}

template std::vector<layout> dynamic_quantize_inst::__calc_output_layouts<ov::PartialShape>(layout &act_layout, size_t group_size);

template<typename ShapeType>
std::vector<layout> dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& /*node*/, const kernel_impl_params& impl_param) {
auto desc = impl_param.typed_desc<dynamic_quantize>();
auto input_layout = impl_param.get_input_layout();
return __calc_output_layouts<ov::PartialShape>(input_layout, UINT64_MAX /* TODO: handle group_size here */);
}

template std::vector<layout> dynamic_quantize_inst::calc_output_layouts<ov::PartialShape>(dynamic_quantize_node const& node,
const kernel_impl_params& impl_param);

std::string dynamic_quantize_inst::to_string(dynamic_quantize_node const& node) {
auto desc = node.get_primitive();
auto node_info = node.desc_to_json();

std::stringstream primitive_description;

node_info->dump(primitive_description);

return primitive_description.str();
}

dynamic_quantize_inst::typed_primitive_inst(network& network, dynamic_quantize_node const& node) : parent(network, node) {}

} // namespace cldnn
3 changes: 3 additions & 0 deletions src/plugins/intel_gpu/src/graph/fully_connected.cpp
@@ -277,6 +277,9 @@ std::string fully_connected_inst::to_string(fully_connected_node const& node) {
fc_info.add("decompression zp value", desc->decompression_zero_point_scalar.value());
}
}
if (desc->dynamic_quantized_activation) {
fc_info.add("activation scale id", desc->activation_scale.pid);
}

node_info->add("fully connected info", fc_info);
node_info->dump(primitive_description);
@@ -409,6 +409,8 @@ void prepare_primitive_fusing::fuse_bias(program &p) {
fc_with_bias_prim->decompression_zero_point = desc->decompression_zero_point;
if (desc->decompression_zero_point_scalar.has_value())
fc_with_bias_prim->decompression_zero_point_scalar = desc->decompression_zero_point_scalar.value();
fc_with_bias_prim->activation_scale = desc->activation_scale;
fc_with_bias_prim->dynamic_quantized_activation = desc->dynamic_quantized_activation;
}
auto& new_fc_node = p.get_or_create(fc_with_bias_prim);
fuse_bias_f(fc, new_fc_node, bias_node, eltw_node);