Skip to content

Commit

Permalink
[GPU] Dynamic quantization for OneDNN FC (openvinotoolkit#25372)
Browse files Browse the repository at this point in the history
### Details:
 - Integrate OneDNN dynamic quantization
 - Only per-token quantization is enabled for now

### Tickets:
 - 144522

---------

Signed-off-by: Kim, Mingyu <[email protected]>
Signed-off-by: Min, Byungil <[email protected]>
Signed-off-by: Min, Byung-il <[email protected]>
Co-authored-by: Min, Byung-il <[email protected]>
  • Loading branch information
isanghao and byungilm authored Aug 8, 2024
1 parent 8b3ae94 commit 7d6ffd3
Show file tree
Hide file tree
Showing 39 changed files with 1,387 additions and 53 deletions.
44 changes: 44 additions & 0 deletions src/common/transformations/include/ov_ops/dynamic_quantize.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/op/op.hpp"
#include "transformations_visibility.hpp"

namespace ov {
namespace op {
namespace internal {

/// \brief Operator performing Dynamic Quantize
class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op {
public:
OPENVINO_OP("DynamicQuantize", "gpu_opset");

DynamicQuantize() = default;
/// \brief Constructs a DynamicQuantize operation.
///
/// \param data Input tensor with data
/// \param group_sizes Group sizes for dynamic quantization; one entry per input dimension
///        (UINT64_MAX means the whole dimension forms a single quantization group)
/// \param dt_scale Data type for scale output
DynamicQuantize(const Output<Node>& data, std::vector<uint64_t> group_sizes, element::Type dt_scale);

void validate_and_infer_types() override;

std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;
/// \brief Returns the per-dimension group sizes used for quantization.
const std::vector<uint64_t>& get_group_sizes() const {
return m_group_sizes;
};
/// \brief Computes output shapes: [0] quantized data (same shape as input), [1] per-group scales.
static std::vector<ov::PartialShape> shape_infer(const DynamicQuantize* op,
const std::vector<ov::PartialShape>& input_shapes,
const std::vector<uint64_t>& group_sizes);

private:
std::vector<uint64_t> m_group_sizes;  // one group size per input dimension
element::Type m_dt_scale;             // element type of the scale output
};

} // namespace internal
} // namespace op
} // namespace ov
71 changes: 71 additions & 0 deletions src/common/transformations/src/ov_ops/dynamic_quantize.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "ov_ops/dynamic_quantize.hpp"

#include "openvino/core/partial_shape.hpp"
#include "openvino/core/validation_util.hpp"
#include "openvino/op/variadic_split.hpp"
#include "variadic_split_shape_inference.hpp"

namespace ov {
namespace op {
namespace internal {

/// Constructs a DynamicQuantize op over \p data with per-dimension \p group_sizes
/// and a scale output of type \p dt_scale. The op has two outputs: quantized data and scales.
DynamicQuantize::DynamicQuantize(const Output<Node>& data, std::vector<uint64_t> group_sizes, element::Type dt_scale)
    : Op({data}),
      m_group_sizes(std::move(group_sizes)),
      m_dt_scale(dt_scale) {
    // Each input dimension needs a matching group-size entry. Fetch the rank once and
    // reuse it in the message (the original re-fetched it via get_tensor_ptr()).
    const auto input_rank = data.get_partial_shape().rank();
    OPENVINO_ASSERT(input_rank == m_group_sizes.size(),
                    "FC input rank should be same as the rank of group_size ",
                    input_rank,
                    " / ",
                    m_group_sizes.size());
    set_output_size(2);
    validate_and_infer_types();
}

/// Infers and sets the two output types/shapes: output 0 is the i8 quantized data,
/// output 1 holds the scales in the configured scale data type.
void DynamicQuantize::validate_and_infer_types() {
    const std::vector<ov::PartialShape> input_shapes{get_input_partial_shape(0)};

    const auto output_shapes = shape_infer(this, input_shapes, m_group_sizes);
    set_output_type(0, element::i8, output_shapes[0]);
    set_output_type(1, m_dt_scale, output_shapes[1]);
}

/// Creates a copy of this op bound to \p new_args, preserving group sizes and scale type.
std::shared_ptr<Node> DynamicQuantize::clone_with_new_inputs(const ov::OutputVector& new_args) const {
    check_new_args_count(this, new_args);
    const auto& data_input = new_args.at(0);
    return std::make_shared<DynamicQuantize>(data_input, m_group_sizes, m_dt_scale);
}

/// Computes the output shapes for dynamic quantization.
/// Output 0 (quantized data) keeps the input shape unchanged.
/// Output 1 (scales) has, per dimension, size / group_size scales (at least 1);
/// a group size of UINT64_MAX collapses the whole dimension to a single scale.
std::vector<ov::PartialShape> DynamicQuantize::shape_infer(const DynamicQuantize* op,
                                                           const std::vector<ov::PartialShape>& input_shapes,
                                                           const std::vector<uint64_t>& group_sizes) {
    std::vector<ov::PartialShape> out_shapes;
    out_shapes.push_back(input_shapes[0]);

    auto scale_shape = input_shapes[0];
    OPENVINO_ASSERT(scale_shape.size() == group_sizes.size(),
                    "Scale_shape and group_size are supposed to have same rank: ",
                    scale_shape.size(),
                    " / ",
                    group_sizes.size());
    for (size_t i = 0; i < scale_shape.size(); i++) {
        // Dynamic dimensions stay dynamic in the scale shape.
        if (scale_shape[i].is_dynamic())
            continue;

        if (group_sizes[i] == UINT64_MAX) {
            // Whole dimension is one group -> single scale.
            scale_shape[i] = 1;
        } else {
            scale_shape[i] /= group_sizes[i];  // if group_size is larger than shape, scale_shape will be 1
            // Use int64_t here: the original static_cast<int> narrowed the 64-bit dimension length.
            // NOTE(review): if group_size does not evenly divide a static dim, Dimension::operator/=
            // may yield a dynamic dim and get_length() would throw — confirm callers guarantee divisibility.
            scale_shape[i] = std::max<int64_t>(scale_shape[i].get_length(), 1);
        }
    }
    out_shapes.push_back(scale_shape);
    return out_shapes;
}

} // namespace internal
} // namespace op
} // namespace ov
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,23 @@ class FullyConnectedCompressed : public FullyConnected {
FullyConnectedCompressed(const ov::Output<Node> &A,
const ov::Output<Node> &B,
const ov::Output<Node> &bias,
const ov::Output<Node> &decompression_scale,
const ov::Output<Node> &decompression_zero_point,
const ov::Output<Node> &w_decompression_scale,
const ov::Output<Node> &w_decompression_zero_point,
const ov::Output<Node> &a_decompression_scale,
const ov::element::Type output_type = ov::element::undefined);


FullyConnectedCompressed(const ov::Output<Node> &A,
const ov::Output<Node> &B,
const ov::Output<Node> &bias,
const ov::Output<Node> &w_decompression_scale,
const ov::Output<Node> &w_decompression_zero_point,
const ov::element::Type output_type = ov::element::undefined);

FullyConnectedCompressed(const ov::Output<Node> &A,
const ov::Output<Node> &B,
const ov::Output<Node> &bias,
const ov::Output<Node> &decompression_scale,
const ov::Output<Node> &w_decompression_scale,
const ov::element::Type output_type = ov::element::undefined);

std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -287,3 +287,4 @@ REGISTER_FACTORY(internal, Placeholder);
REGISTER_FACTORY(internal, SDPA);
REGISTER_FACTORY(internal, IndirectSDPA);
REGISTER_FACTORY(internal, RoPE);
REGISTER_FACTORY(internal, DynamicQuantize);
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once
#include "primitive.hpp"

namespace cldnn {

/// @brief Dynamic Quantize primitive
/// @details Performs dynamic quantization of the input, producing quantized data and per-group scales.
struct dynamic_quantize : public primitive_base<dynamic_quantize> {
    CLDNN_DECLARE_PRIMITIVE(dynamic_quantize);

    dynamic_quantize() : primitive_base("", {}), group_size(0) {}

    /// @brief Constructs dynamic_quantize primitive
    /// @param id This primitive id
    /// @param input Input primitive id
    /// @param group_size Quantization group size
    /// @param out_data_types Output data types of the primitive's two outputs.
    ///        Renamed from `data_types` — the old name shadowed the `data_types` enum
    ///        used in its own default argument.
    ///        NOTE(review): the default order {f16, i8} looks inverted relative to
    ///        calc_output_layouts (i8 data, f16 scale) — confirm against callers.
    dynamic_quantize(const primitive_id& id,
                     const input_info& input,
                     const uint64_t group_size,
                     const std::vector<optional_data_type> out_data_types = {optional_data_type(data_types::f16), optional_data_type(data_types::i8)})
        : primitive_base(id, {input}, 2, out_data_types),
          group_size(group_size) {}

    /// @brief Number of consecutive elements sharing one quantization scale.
    uint64_t group_size = 0;

    size_t hash() const override {
        size_t seed = primitive::hash();
        seed = hash_combine(seed, group_size);
        return seed;
    }

    bool operator==(const primitive& rhs) const override {
        if (!compare_common_params(rhs))
            return false;

        auto rhs_casted = downcast<const dynamic_quantize>(rhs);

        return group_size == rhs_casted.group_size;
    }

    void save(BinaryOutputBuffer& ob) const override {
        primitive_base<dynamic_quantize>::save(ob);
        ob << group_size;
    }

    void load(BinaryInputBuffer& ib) override {
        primitive_base<dynamic_quantize>::load(ib);
        ib >> group_size;
    }
};
} // namespace cldnn
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,46 @@ struct fully_connected : public primitive_base<fully_connected> {
compressed_weights(true),
decompression_scale(decompression_scale),
decompression_zero_point(decompression_zero_point),
dynamic_quantized_activation(false),
input_size(input_size),
weights_rank(weights_rank) {
OPENVINO_ASSERT(!decompression_scale.empty(), "[GPU] Compressed fully connected requires at least decompression scale input");
}

/// @brief Constructs fully connected compressed layer with a dynamically quantized activation.
/// @param id This primitive id.
/// @param input Input primitive id.
/// @param weights Primitive id containing weights data.
/// @param bias Primitive id containing bias data.
/// @param decompression_scale Primitive id containing scale factors for weights decompression.
/// @param decompression_zero_point Primitive id containing zero points for weights decompression.
/// @param activation_scale Input containing the scale factor for the activation; an invalid
///        (empty) input means the activation is not dynamically quantized.
/// @param data_type Output data type of the primitive.
fully_connected(const primitive_id& id,
const input_info& input,
const primitive_id& weights,
const primitive_id& bias,
const primitive_id& decompression_scale,
const primitive_id& decompression_zero_point,
const input_info& activation_scale,
const data_types data_type,
const size_t input_size = 2,
const size_t weights_rank = 2)
: primitive_base(id, { input }, 1, {optional_data_type{data_type}}),
weights(weights),
bias(bias),
compressed_weights(true),
decompression_scale(decompression_scale),
decompression_zero_point(decompression_zero_point),
dynamic_quantized_activation(false),
activation_scale(activation_scale),
input_size(input_size),
weights_rank(weights_rank) {
// A valid activation-scale input is what marks the activation as dynamically quantized.
if (activation_scale.is_valid())
dynamic_quantized_activation = true;

OPENVINO_ASSERT(!decompression_scale.empty(), "[GPU] Compressed fully connected requires at least decompression scale input");
}

/// @brief Primitive id containing weights data.
primitive_id weights;
/// @brief Primitive id containing bias data.
Expand All @@ -108,6 +143,8 @@ struct fully_connected : public primitive_base<fully_connected> {
bool compressed_weights = false;
primitive_id decompression_scale = "";
primitive_id decompression_zero_point = "";
bool dynamic_quantized_activation = false;
input_info activation_scale = {"", 0};
optional_value<float> decompression_zero_point_scalar = optional_value<float>();

/// @brief Primitive dimension size.
Expand All @@ -123,6 +160,7 @@ struct fully_connected : public primitive_base<fully_connected> {
seed = hash_combine(seed, compressed_weights);
seed = hash_combine(seed, !decompression_scale.empty());
seed = hash_combine(seed, !decompression_zero_point.empty());
seed = hash_combine(seed, activation_scale.is_valid());
seed = hash_combine(seed, decompression_zero_point_scalar.has_value());
seed = hash_combine(seed, decompression_zero_point_scalar.value_or(0.0f));
return seed;
Expand All @@ -140,6 +178,7 @@ struct fully_connected : public primitive_base<fully_connected> {
compressed_weights == rhs_casted.compressed_weights &&
decompression_scale.empty() == rhs_casted.decompression_scale.empty() &&
decompression_zero_point.empty() == rhs_casted.decompression_zero_point.empty() &&
activation_scale.is_valid() == rhs_casted.activation_scale.is_valid() &&
decompression_zero_point_scalar.value_or(0.0f) == rhs_casted.decompression_zero_point_scalar.value_or(0.0f);
}

Expand All @@ -150,8 +189,10 @@ struct fully_connected : public primitive_base<fully_connected> {
ob << compressed_weights;
ob << decompression_scale;
ob << decompression_zero_point;
ob << activation_scale;
ob << input_size;
ob << weights_rank;
ob << dynamic_quantized_activation;

if (decompression_zero_point_scalar.has_value()) {
ob << true;
Expand All @@ -169,8 +210,10 @@ struct fully_connected : public primitive_base<fully_connected> {
ib >> compressed_weights;
ib >> decompression_scale;
ib >> decompression_zero_point;
ib >> activation_scale;
ib >> input_size;
ib >> weights_rank;
ib >> dynamic_quantized_activation;

bool has_value;
ib >> has_value;
Expand All @@ -197,6 +240,9 @@ struct fully_connected : public primitive_base<fully_connected> {
if (!decompression_zero_point.empty())
ret.push_back(decompression_zero_point);

if (activation_scale.is_valid())
ret.push_back(activation_scale);

return ret;
}
};
Expand Down
66 changes: 66 additions & 0 deletions src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "ov_ops/dynamic_quantize.hpp"
#include "dynamic_quantize_inst.h"

#include "primitive_type_base.h"
#include "json_object.h"
#include <string>

namespace cldnn {
GPU_DEFINE_PRIMITIVE_TYPE_ID(dynamic_quantize);

// Static-shape path: the quantized output mirrors the input's shape and format; data is emitted as i8.
layout dynamic_quantize_inst::calc_output_layout(dynamic_quantize_node const& node, kernel_impl_params const& impl_param) {
auto desc = impl_param.typed_desc<dynamic_quantize>();  // presumably kept for the typed-desc downcast check; otherwise unused — confirm
const auto& input_layout = impl_param.get_input_layout();
auto output_type = data_types::i8;
auto output_format = input_layout.format;

return layout(output_type, output_format, input_layout.get_tensor());
}

// Shared shape-inference helper: delegates to the core DynamicQuantize op.
// Grouping is applied along the innermost dimension only (all other dims use group size 1).
template<typename ShapeType>
std::vector<layout> dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, uint64_t group_size) {
    ov::op::internal::DynamicQuantize op;
    const auto out_fmt = act_layout.format;
    const auto act_shape = act_layout.get<ShapeType>();

    std::vector<ShapeType> in_shapes{act_shape};

    std::vector<uint64_t> per_dim_group_sizes(act_shape.size(), 1);
    per_dim_group_sizes.back() = group_size;

    auto inferred = ov::op::internal::DynamicQuantize::shape_infer(&op, in_shapes, per_dim_group_sizes);

    // Output 0: quantized data (i8); output 1: per-group scales (f16).
    return { layout(inferred[0], data_types::i8, out_fmt), layout(inferred[1], data_types::f16, out_fmt) };
}

template std::vector<layout> dynamic_quantize_inst::__calc_output_layouts<ov::PartialShape>(const layout &act_layout, uint64_t group_size);

// Dynamic-shape path: validate the descriptor type, then defer to the shared helper.
template<typename ShapeType>
std::vector<layout> dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& /*node*/, const kernel_impl_params& impl_param) {
    // Keep the typed-desc downcast even though desc is otherwise unused here.
    auto desc = impl_param.typed_desc<dynamic_quantize>();
    const auto& act_layout = impl_param.get_input_layout();
    return __calc_output_layouts<ov::PartialShape>(act_layout, UINT64_MAX /* TODO: handle group_size here */);
}

template std::vector<layout> dynamic_quantize_inst::calc_output_layouts<ov::PartialShape>(dynamic_quantize_node const& node,
const kernel_impl_params& impl_param);

// Renders the node's JSON description into a string for logging/debug dumps.
std::string dynamic_quantize_inst::to_string(dynamic_quantize_node const& node) {
    auto desc = node.get_primitive();
    auto node_info = node.desc_to_json();

    std::stringstream ss;
    node_info->dump(ss);

    return ss.str();
}

dynamic_quantize_inst::typed_primitive_inst(network& network, dynamic_quantize_node const& node) : parent(network, node) {}

} // namespace cldnn
3 changes: 3 additions & 0 deletions src/plugins/intel_gpu/src/graph/fully_connected.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,9 @@ std::string fully_connected_inst::to_string(fully_connected_node const& node) {
fc_info.add("decompression zp value", desc->decompression_zero_point_scalar.value());
}
}
if (desc->dynamic_quantized_activation) {
fc_info.add("activation scale id", desc->activation_scale.pid);
}

node_info->add("fully connected info", fc_info);
node_info->dump(primitive_description);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,8 @@ void prepare_primitive_fusing::fuse_bias(program &p) {
fc_with_bias_prim->decompression_zero_point = desc->decompression_zero_point;
if (desc->decompression_zero_point_scalar.has_value())
fc_with_bias_prim->decompression_zero_point_scalar = desc->decompression_zero_point_scalar.value();
fc_with_bias_prim->activation_scale = desc->activation_scale;
fc_with_bias_prim->dynamic_quantized_activation = desc->dynamic_quantized_activation;
}
auto& new_fc_node = p.get_or_create(fc_with_bias_prim);
fuse_bias_f(fc, new_fc_node, bias_node, eltw_node);
Expand Down
Loading

0 comments on commit 7d6ffd3

Please sign in to comment.