Skip to content

Commit

Permalink
[GPU] Implement bias on internal FC op
Browse files: browse the repository at this point in the history
Added bias semantics support for internal FC op
  • Loading branch information
dnkurek committed Mar 8, 2024
1 parent 0852c4e commit 20daf03
Show file tree
Hide file tree
Showing 13 changed files with 124 additions and 61 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class FullyConnected : public ov::op::Op {

FullyConnected(const ov::Output<Node>& A,
const ov::Output<Node>& B,
const ov::Output<Node>& bias,
const ov::element::Type output_type = ov::element::undefined);

bool visit_attributes(ov::AttributeVisitor &visitor) override;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@ class FullyConnectedCompressed : public FullyConnected {

FullyConnectedCompressed(const ov::Output<Node> &A,
const ov::Output<Node> &B,
const ov::Output<Node> &bias,
const ov::Output<Node> &decompression_scale,
const ov::Output<Node> &decompression_zero_point,
const ov::element::Type output_type = ov::element::undefined);

FullyConnectedCompressed(const ov::Output<Node> &A,
const ov::Output<Node> &B,
const ov::Output<Node> &bias,
const ov::Output<Node> &decompression_scale,
const ov::element::Type output_type = ov::element::undefined);

Expand Down
14 changes: 8 additions & 6 deletions src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,15 @@ namespace ov {
namespace intel_gpu {

static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::shared_ptr<op::FullyConnectedCompressed>& op) {
validate_inputs_count(op, {3, 4});
validate_inputs_count(op, {4, 5});
auto inputs = p.GetInputInfo(op);
std::string primitive_name = layer_type_name_ID(op);

auto input_name = inputs[0].pid;
auto weights_name = inputs[1].pid;
auto scale_name = inputs[2].pid;
auto zp_name = inputs.size() == 4 ? inputs[3].pid : "";
auto bias_name = inputs[2].pid;
auto scale_name = inputs[3].pid;
auto zp_name = inputs.size() == 5 ? inputs[4].pid : "";

float zp_value = 0.0f;
bool has_scalar_zp = false;
Expand All @@ -47,7 +48,7 @@ static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::share
auto fc = cldnn::fully_connected(primitive_name,
cldnn::input_info(input_name),
weights_name,
"",
bias_name,
scale_name,
has_scalar_zp ? "" : zp_name,
cldnn::element_type_to_data_type(op->get_output_element_type(0)),
Expand All @@ -63,12 +64,13 @@ static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::share
}

static void CreateFullyConnectedOp(ProgramBuilder& p, const std::shared_ptr<op::FullyConnected>& op) {
validate_inputs_count(op, {2});
validate_inputs_count(op, {3});
auto inputs = p.GetInputInfo(op);
std::string layerName = layer_type_name_ID(op);

auto input_name = inputs[0].pid;
auto weights_name = inputs[1].pid;
auto bias_name = inputs[2].pid;

auto shape_a = op->get_input_partial_shape(0);
auto shape_b = op->get_input_partial_shape(1);
Expand All @@ -79,7 +81,7 @@ static void CreateFullyConnectedOp(ProgramBuilder& p, const std::shared_ptr<op::
auto fcPrim = cldnn::fully_connected(layerName,
cldnn::input_info(input_name),
weights_name,
"",
bias_name,
cldnn::element_type_to_data_type(op->get_output_element_type(0)),
cldnn::padding(),
rank_a,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,14 +59,16 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
auto transpose_m = wrap_type<ov::op::v1::Transpose>({transpose_input, transpose_const_m});

auto data_m = any_input();
auto bias_m = any_input();
auto weights_input_m = std::make_shared<ov::pass::pattern::op::Or>(ov::OutputVector{reshape_m, transpose_m, mul_m});
auto fully_connected_m = wrap_type<op::FullyConnected>({data_m, weights_input_m});
auto fully_connected_m = wrap_type<op::FullyConnected>({data_m, weights_input_m, bias_m});

ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) {
const auto& pattern_map = m.get_pattern_value_map();
OPENVINO_ASSERT(pattern_map.count(fully_connected_m));
OPENVINO_ASSERT(pattern_map.count(mul_const_m));
OPENVINO_ASSERT(pattern_map.count(weights_m));
OPENVINO_ASSERT(pattern_map.count(bias_m));
OPENVINO_ASSERT(pattern_map.count(convert_m));
auto fc = std::dynamic_pointer_cast<op::FullyConnected>(pattern_map.at(fully_connected_m).get_node_shared_ptr());
if (!fc || transformation_callback(fc)) {
Expand Down Expand Up @@ -103,6 +105,7 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
std::shared_ptr<ov::Node> fc_input_b = reshape_const_to_2d(pattern_map.at(weights_m).get_node_shared_ptr());
std::shared_ptr<ov::Node> fc_input_scale = scale;
std::shared_ptr<ov::Node> fc_input_zp = optional_zero_point;
std::shared_ptr<ov::Node> fc_input_bias = pattern_map.at(bias_m).get_node_shared_ptr();
std::vector<std::shared_ptr<ov::Node>> result_nodes = {};
if (has_transpose) {
const auto& transpose = pattern_map.at(transpose_m).get_node_shared_ptr();
Expand All @@ -128,12 +131,14 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
if (with_zero_point) {
new_fc = std::make_shared<op::FullyConnectedCompressed>(fc_input_a,
fc_input_b,
fc_input_bias,
fc_input_scale,
fc_input_zp,
fc->get_output_type());
} else {
new_fc = std::make_shared<op::FullyConnectedCompressed>(fc_input_a,
fc_input_b,
fc_input_bias,
fc_input_scale,
fc->get_output_type());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
//

#include "intel_gpu/op/fully_connected.hpp"
#include "intel_gpu/op/placeholder.hpp"
#include "convert_matmul_to_fc.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/convert.hpp"
Expand Down Expand Up @@ -177,8 +178,10 @@ ConvertMatMulToFullyConnected::ConvertMatMulToFullyConnected() {
fc_input_b = convert;
}

auto no_bias = std::make_shared<op::Placeholder>();

// Create FullyConnected
auto fc = std::make_shared<op::FullyConnected>(fc_input_a, fc_input_b, matmul->get_output_element_type(0));
auto fc = std::make_shared<op::FullyConnected>(fc_input_a, fc_input_b, no_bias, matmul->get_output_element_type(0));
fc->set_friendly_name(matmul->get_friendly_name());
new_ops.push_back(fc);
ov::copy_runtime_info(matmul, new_ops);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ FullyConnectedConvertFusion::FullyConnectedConvertFusion() {

auto data = any_input();
auto weights = any_input();
auto fully_connected = wrap_type<op::FullyConnected>({data, weights}, consumers_count(1));
auto fully_connected_compressed = wrap_type<op::FullyConnectedCompressed>({data, weights, any_input(), any_input()}, consumers_count(1));
auto bias = any_input();
auto fully_connected = wrap_type<op::FullyConnected>({data, weights, bias}, consumers_count(1));
auto fully_connected_compressed = wrap_type<op::FullyConnectedCompressed>({data, weights, bias, any_input(), any_input()}, consumers_count(1));
auto fc = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{fully_connected, fully_connected_compressed});
auto convert = wrap_type<ov::op::v0::Convert>({fc}, type_matches(element::f32));

Expand All @@ -30,6 +31,7 @@ FullyConnectedConvertFusion::FullyConnectedConvertFusion() {

const auto& m_data = pattern_map.at(data).get_node_shared_ptr();
const auto& m_weights = pattern_map.at(weights).get_node_shared_ptr();
const auto& m_bias = pattern_map.at(bias).get_node_shared_ptr();
const auto& m_convert = pattern_map.at(convert).get_node_shared_ptr();
auto output_type = m_convert->get_output_element_type(0);

Expand All @@ -38,13 +40,14 @@ FullyConnectedConvertFusion::FullyConnectedConvertFusion() {
auto it = pattern_map.find(fully_connected);
if (it != pattern_map.end()) {
m_fc = it->second.get_node_shared_ptr();
new_fc = std::make_shared<op::FullyConnected>(m_data, m_weights, output_type);
new_fc = std::make_shared<op::FullyConnected>(m_data, m_weights, m_bias, output_type);
} else {
m_fc = pattern_map.at(fully_connected_compressed).get_node_shared_ptr();
new_fc = std::make_shared<op::FullyConnectedCompressed>(m_data,
m_weights,
m_fc->input_value(2),
m_bias,
m_fc->input_value(3),
m_fc->input_value(4),
output_type);
}
new_fc->set_friendly_name(m_convert->get_friendly_name());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ MoveFCReshapeToWeights::MoveFCReshapeToWeights() {
auto weights_input_m = std::make_shared<ov::pass::pattern::op::Or>(ov::OutputVector{reshape_m, transpose_m});

auto data_m = any_input();
auto fully_connected_m = wrap_type<op::FullyConnected>({data_m, weights_input_m});
auto fully_connected_m = wrap_type<op::FullyConnected>({data_m, weights_input_m, any_input()});

ov::matcher_pass_callback callback = [&](ov::pass::pattern::Matcher& m) {
const auto fully_connected = m.get_match_root();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,25 @@ namespace op {

// Constructs a FullyConnected node from its inputs and an output element type, stores the
// output type in m_output_type, and then calls validate_and_infer_types().
//
// NOTE(review): this text is a rendered commit diff whose +/- markers were lost. The two
// initializer-list lines below appear to be the pre-change version (inputs {A, B}) followed by
// the post-change version (inputs {A, B, bias}) of the same line; only one of them can be
// present in real code. Confirm against the repository before reusing this snippet.
FullyConnected::FullyConnected(const ov::Output<Node>& A,
const ov::Output<Node>& B,
const ov::Output<Node>& bias,
const ov::element::Type output_type)
// Presumably the removed line: the node is built with two inputs only.
: Op({A, B}), m_output_type(output_type) {
// Presumably the added line: bias is passed to the base Op as the third input.
: Op({A, B, bias}), m_output_type(output_type) {
validate_and_infer_types();
}

// Creates a new FullyConnected node from `new_args`, carrying over m_output_type.
// check_new_args_count() is called on the argument list before anything is indexed.
//
// NOTE(review): this text is a rendered commit diff whose +/- markers were lost. The two
// return statements below appear to be the pre-change and post-change versions of one line;
// read literally, the second would be unreachable. Confirm against the repository.
std::shared_ptr<ov::Node> FullyConnected::clone_with_new_inputs(const ov::OutputVector& new_args) const {
check_new_args_count(this, new_args);

// Presumably the removed line: forwards only new_args 0 and 1.
return std::make_shared<FullyConnected>(new_args.at(0), new_args.at(1), m_output_type);
// Presumably the added line: also forwards new_args.at(2) as the bias argument.
return std::make_shared<FullyConnected>(new_args.at(0), new_args.at(1), new_args.at(2), m_output_type);
}

void FullyConnected::validate_and_infer_types() {
const auto input_size = get_input_size();
NODE_VALIDATION_CHECK(this,
input_size >= 2,
input_size >= 3,
"Number of inputs is incorrect. Current value is: ",
input_size,
", expected at least 2.");
", expected at least 3.");

ov::op::v0::MatMul op;
op.set_transpose_a(false);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,37 +10,41 @@ namespace op {

// Constructs a FullyConnectedCompressed node that has both a decompression scale and a
// decompression zero point. The base FullyConnected constructor is invoked first, the extra
// inputs are attached with set_argument(), and validate_and_infer_types() is called at the end.
//
// NOTE(review): this text is a rendered commit diff whose +/- markers were lost. The first
// initializer line with its two set_argument() calls (indices 2 and 3) appears to be the
// pre-change code; the second initializer line with indices 3 and 4 appears to be the
// post-change code, where bias is handed to the base constructor. Confirm against the repository.
FullyConnectedCompressed::FullyConnectedCompressed(const ov::Output<Node>& A,
const ov::Output<Node>& B,
const ov::Output<Node>& bias,
const ov::Output<Node>& decompression_scale,
const ov::Output<Node>& decompression_zero_point,
const ov::element::Type output_type)
// Presumably removed: base built without bias; scale at input 2, zero point at input 3.
: FullyConnected(A, B, output_type) {
set_argument(2, decompression_scale);
set_argument(3, decompression_zero_point);
// Presumably added: base built with bias; scale at input 3, zero point at input 4.
: FullyConnected(A, B, bias, output_type) {
set_argument(3, decompression_scale);
set_argument(4, decompression_zero_point);
validate_and_infer_types();
}

// Constructs a FullyConnectedCompressed node that has a decompression scale but no zero point.
// The base FullyConnected constructor is invoked first, the scale input is attached with
// set_argument(), and validate_and_infer_types() is called at the end.
//
// NOTE(review): this text is a rendered commit diff whose +/- markers were lost. The first
// initializer line with set_argument(2, ...) appears to be the pre-change code; the second
// initializer line with set_argument(3, ...) appears to be the post-change code, where bias is
// handed to the base constructor. Confirm against the repository.
FullyConnectedCompressed::FullyConnectedCompressed(const ov::Output<Node>& A,
const ov::Output<Node>& B,
const ov::Output<Node>& bias,
const ov::Output<Node>& decompression_scale,
const ov::element::Type output_type)
// Presumably removed: base built without bias; scale at input 2.
: FullyConnected(A, B, output_type) {
set_argument(2, decompression_scale);
// Presumably added: base built with bias; scale at input 3.
: FullyConnected(A, B, bias, output_type) {
set_argument(3, decompression_scale);
validate_and_infer_types();
}

std::shared_ptr<ov::Node> FullyConnectedCompressed::clone_with_new_inputs(const ov::OutputVector& new_args) const {
check_new_args_count(this, new_args);

if (new_args.size() == 3)
if (new_args.size() == 4)
return std::make_shared<FullyConnectedCompressed>(new_args.at(0),
new_args.at(1),
new_args.at(2),
new_args.at(3),
m_output_type);
else if (new_args.size() == 4)
else if (new_args.size() == 5)
return std::make_shared<FullyConnectedCompressed>(new_args.at(0),
new_args.at(1),
new_args.at(2),
new_args.at(3),
new_args.at(4),
m_output_type);
else
OPENVINO_THROW("Unexpected inputs count for FullyConnectedCompressed op: ", new_args.size());
Expand Down
Loading

0 comments on commit 20daf03

Please sign in to comment.