[GNA] Remove transposes around MatMul
elilobanova committed Jun 29, 2021
1 parent 7580bd3 commit b02adb0
Showing 16 changed files with 365 additions and 109 deletions.
Binary file modified docs/img/OV-diagram-step1.png
Binary file modified docs/img/OV-diagram-step4.png
7 changes: 4 additions & 3 deletions inference-engine/src/gna_plugin/backend/make_pwl.cpp
@@ -279,19 +279,20 @@ void make_gna_pwl(const DnnActivation fun,
gnalog() << "=========================== LeakyReLU Segments ======================\n";
int32_t x_lower = INT32_MIN;
int32_t x_upper = INT32_MAX;
- int16_t y_lower = y_min;
+ int32_t y_lower = y_min;
int16_t y_upper = y_max;
if (fun.fqParams.set) {
x_lower = std::max(FLOAT_TO_INT64(*fun.fqParams.input_low * 1.25 * in_scale), static_cast<int64_t>(x_lower));
x_upper = std::min(FLOAT_TO_INT64(*fun.fqParams.input_high * 1.25 * in_scale), static_cast<int64_t>(x_upper));
- y_lower = std::max(FLOAT_TO_INT32(*fun.fqParams.input_low * 1.25 * out_scale), static_cast<int32_t>(y_lower));
+ // y_lower can be reduced with negative slope
+ y_lower = *fun.fqParams.input_low * 1.25 * out_scale;
y_upper = std::min(FLOAT_TO_INT32(*fun.fqParams.input_high * 1.25 * out_scale), static_cast<int32_t>(y_upper));
} else {
if (x_lower < y_lower * in_scale / out_scale) x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale);
if (y_lower < x_lower * out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale);
}

- gna_pwl[0].yBase = y_lower * fun.args.lrelu.negative_slope;
+ gna_pwl[0].yBase = std::max(FLOAT_TO_INT32(y_lower * fun.args.lrelu.negative_slope), static_cast<int32_t>(y_min));
s = gna_slope(fun.args.lrelu.negative_slope, in_scale, out_scale);
gna_pwl[0].xBase = (x_lower & XBASEMASK) | s.slope_scale_index; // zero out the 2 lsb
gna_pwl[0].slope = FLOAT_TO_INT16(s.slope * s.slope_scale);
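
The widened y_lower above can exceed the int16 range once FakeQuantize statistics are applied, so the final yBase is now clamped back after the negative-slope multiply. A minimal sketch of that saturation step (standalone; float_to_int32 and the parameter names are stand-ins for the plugin's FLOAT_TO_INT32 helper and segment values, not the plugin code itself):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Stand-in for the plugin's FLOAT_TO_INT32 rounding helper.
static int32_t float_to_int32(double v) { return static_cast<int32_t>(std::lround(v)); }

int32_t saturated_ybase(int32_t y_lower, float negative_slope, int16_t y_min) {
    // Scale the lower bound by the LeakyReLU slope, then saturate at y_min
    // so the segment base never underflows the int16 output range.
    return std::max(float_to_int32(static_cast<double>(y_lower) * negative_slope),
                    static_cast<int32_t>(y_min));
}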
@@ -978,7 +978,7 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
gnalog() << "[UFS] from : " << concatLayer->name << " reached: " << layer->name;
// found that direct input to concat is a indirect parent of align filter - so no link required
auto info = LayerInfo(layer);
- if (!info.isWeightable() && !info.isActivation() && !info.isConst()) {
+ if (!info.isWeightable() && !info.isActivation() && !info.isConst() && !info.isMemory()) {
gnalog() << "... skipped\n";
return;
}
@@ -1030,8 +1030,8 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
}

quantDataForConCatInput->_dst_quant.SetScale(newScaleFactor);
- } else if (restarLayerInfo.isConst()) {
-     gnalog() << "... warning const layer will be requantized\n";
+ } else if (restarLayerInfo.isConst() || restarLayerInfo.isMemory()) {
+     gnalog() << "... warning " << restartedLayer->type << " layer will be requantized\n";
quantDataForConCatInput->_src_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
quantDataForConCatInput->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
} else {
7 changes: 5 additions & 2 deletions inference-engine/src/gna_plugin/gna_graph_compiler.cpp
@@ -145,12 +145,15 @@ void GNAGraphCompiler::fillSplitConnections(InferenceEngine::CNNLayerPtr layer)
size_t output_layer_size = 0;

for (int j = 0; j != getInputTo(layer->outData[i]).size(); j++) {
- auto outFunctionalLayer = CNNNetGetNextLayerSkipCertain(layer, i, j, [](CNNLayerPtr l) {
+ auto outFunctionalLayer = CNNNetCheckNextLayerSkipCertain(layer, i, j, true, [](CNNLayerPtr l) {
return LayerInfo(l).isNonFunctional();
});

if (!outFunctionalLayer.first) {
- THROW_GNA_LAYER_EXCEPTION(layer) << " outData["<< i << "]" << " connected by " << j <<" connection doesnt connect to functional layer";
+ output_layer_size =
+     InferenceEngine::details::product(begin(layer->outData[i]->getDims()),
+         end(layer->outData[i]->getDims())) * layer->outData[i]->getPrecision().size();
+ continue;
}

for (int idx : outFunctionalLayer.second) {
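With the replaced exception, a split output that reaches no functional layer is now sized from its own dimensions and skipped rather than failing the whole import. A self-contained sketch of that size computation, with plain C++ standing in for InferenceEngine::details::product and Precision::size():

#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

// Byte size of a blob: product of its dimensions times the element size.
size_t blob_byte_size(const std::vector<size_t>& dims, size_t precision_bytes) {
    return std::accumulate(dims.begin(), dims.end(), size_t{1},
                           std::multiplies<size_t>()) * precision_bytes;
}

// e.g. blob_byte_size({1, 64, 10}, 2) == 1280 for 16-bit data
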
4 changes: 2 additions & 2 deletions inference-engine/src/gna_plugin/gna_plugin.cpp
@@ -58,11 +58,11 @@

#include "transformations/remove_extra_reshapes.hpp"
#include "transformations/insert_transpose_after_convolution_or_pooling.hpp"
#include "transformations/insert_transpose_before_matmul.hpp"
#include "transformations/reorder_activation_and_pooling.hpp"
#include "transformations/swap_input_matmul_gna.hpp"
#include "transformations/convert_matmul_to_pointwise_convolution.hpp"
#include "transformations/split_convolution_with_large_buffer_size.hpp"
#include "transformations/handle_transposes_around_matmul.hpp"

#include <ngraph/opsets/opset7.hpp>

@@ -687,7 +687,7 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
manager.register_pass<SplitConvolutionWithFq>();
manager.register_pass<SplitConvolutionWithBias>();
manager.register_pass<SplitConvolution>();
- manager.register_pass<InsertTransposeBeforeMatmul>();
+ manager.register_pass<HandleTransposesAroundMatMul>();
manager.register_pass<SwapInputMatMul>();
manager.register_pass<InsertTransposeAfterConvOrPool>();
manager.register_pass<ReorderActivationAndPooling>();
@@ -2118,8 +2118,11 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << "Zero levels";
}

- // Before FQ layer is removed, the previous layer has to be updated with its quantization data
- auto quantParamsPrevLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(prevLayer);
+ // Before FQ layer is removed, the previous functional layer has to be updated with its quantization data
+ auto prevFuncLayer = CNNNetPrevLayerSkipCertain(*fqLayer, 0, [](CNNLayerPtr layer) {
+     return LayerInfo(layer).isNonFunctional();
+ });
+ auto quantParamsPrevLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(prevFuncLayer);
quantParamsPrevLayer->_dst_quant.SetLevels(fqLevels);
quantParamsPrevLayer->_dst_quant.SetMinValues({ inputRange.first[0] }, true);
quantParamsPrevLayer->_dst_quant.SetMaxValues({ inputRange.second[0] }, true);
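
The new lookup walks past non-functional layers so the FakeQuantize statistics land on the layer that actually produces the data. A generic sketch of that skip-predicate walk (the parent() accessor is hypothetical; the plugin itself uses CNNNetPrevLayerSkipCertain as shown above):

#include <functional>
#include <memory>

// Generic walk-back: follow a node's parent link while the predicate matches,
// mirroring how non-functional layers are skipped in the hunk above.
template <typename Node>
std::shared_ptr<Node> skip_while(std::shared_ptr<Node> node,
                                 const std::function<bool(std::shared_ptr<Node>)>& skip) {
    while (node && skip(node)) {
        node = node->parent();  // hypothetical parent accessor
    }
    return node;
}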
125 changes: 125 additions & 0 deletions inference-engine/src/gna_plugin/transformations/handle_transposes_around_matmul.cpp
@@ -0,0 +1,125 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "transformations/handle_transposes_around_matmul.hpp"

#include <numeric>

#include <ngraph/opsets/opset7.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/pattern/op/or.hpp>
#include <ngraph/rt_info.hpp>

#include "gna_plugin_log.hpp"

using namespace GNAPluginNS;

NGRAPH_RTTI_DEFINITION(HandleTransposesAroundMatMul, "HandleTransposesAroundMatMul", 0);
NGRAPH_RTTI_DEFINITION(HandleTransposeBeforeMatMul, "HandleTransposeBeforeMatMul", 0);
NGRAPH_RTTI_DEFINITION(HandleTransposeAfterMatMul, "HandleTransposeAfterMatMul", 0);

static void ReplaceTransposeWithReshape(std::shared_ptr<ngraph::Node> transpose_node) {
auto shape = transpose_node->get_output_shape(0);
auto reshape_const = std::make_shared<ngraph::opset7::Constant>(ngraph::element::Type_t::i64,
ngraph::Shape{shape.size()}, shape);
auto reshape_node = std::make_shared<ngraph::opset7::Reshape>(transpose_node->input_value(0), reshape_const, false);
reshape_node->set_friendly_name(transpose_node->get_friendly_name() + "/reshape");
ngraph::copy_runtime_info(transpose_node, reshape_node);
transpose_node->output(0).replace(reshape_node->output(0));
}
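
As an aside on the helper above: it grafts a Reshape carrying the Transpose's output shape onto the Transpose's input, so every consumer keeps its expected dimensions while the data reorder is dropped (valid on GNA where the matched pattern makes the reorder redundant). A hedged usage sketch with an assumed [1, 4, 1, 8] input:

#include <memory>
#include <ngraph/opsets/opset7.hpp>

void example() {
    // A [1, 4, 1, 8] -> [1, 8, 1, 4] Transpose; after the call, consumers
    // read the same [1, 8, 1, 4] shape from a Reshape with no data copy.
    auto param = std::make_shared<ngraph::opset7::Parameter>(
        ngraph::element::f32, ngraph::Shape{1, 4, 1, 8});
    auto order = ngraph::opset7::Constant::create(
        ngraph::element::i64, ngraph::Shape{4}, std::vector<int64_t>{0, 3, 2, 1});
    auto transpose = std::make_shared<ngraph::opset7::Transpose>(param, order);
    ReplaceTransposeWithReshape(transpose);
}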

static void InsertTranspose(std::shared_ptr<ngraph::Node> prev_node, const std::string& base_name) {
auto consumers = prev_node->output(0).get_target_inputs();
const auto orig_shape = prev_node->get_output_shape(0);
std::vector<size_t> transpose_ids;
for (size_t i = 0; i < orig_shape.size(); ++i) {
if (orig_shape[i] > 1) {
transpose_ids.push_back(i);
}
}
IE_ASSERT(transpose_ids.size() == 2);
std::vector<size_t> permute_order(orig_shape.size());
std::iota(std::begin(permute_order), std::end(permute_order), 0);
std::swap(permute_order[transpose_ids[0]], permute_order[transpose_ids[1]]);

auto transpose_order = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{permute_order.size()}, permute_order);
auto transpose = std::make_shared<ngraph::opset7::Transpose>(prev_node, transpose_order);
transpose->set_friendly_name(base_name + "/in_transpose");

auto reshapeConstAfter = std::make_shared<ngraph::opset7::Constant>(ngraph::element::Type_t::i64,
ngraph::Shape{orig_shape.size()}, orig_shape);
auto reshapeAfter = std::make_shared<ngraph::opset7::Reshape>(transpose, reshapeConstAfter, false);
reshapeAfter->set_friendly_name(base_name + "/reshape_after_transpose");
ngraph::copy_runtime_info(prev_node, ngraph::NodeVector{transpose, reshapeAfter});

for (auto input : consumers) {
input.replace_source_output(reshapeAfter);
}
}
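
InsertTranspose sandwiches the producer between a Transpose that swaps the two non-trivial axes and a Reshape back to the original dimensions, so the graph's shapes are unchanged while the data is physically reordered. A standalone sketch of the permutation it builds, under the same two-non-trivial-dims assumption:

#include <algorithm>
#include <cassert>
#include <numeric>
#include <vector>

// Identity order with the two non-trivial axes swapped; shapes are assumed
// to have exactly two dims > 1, as asserted in InsertTranspose above.
std::vector<size_t> make_permute_order(const std::vector<size_t>& shape) {
    std::vector<size_t> ids;
    for (size_t i = 0; i < shape.size(); ++i)
        if (shape[i] > 1) ids.push_back(i);
    assert(ids.size() == 2);
    std::vector<size_t> order(shape.size());
    std::iota(order.begin(), order.end(), 0);
    std::swap(order[ids[0]], order[ids[1]]);
    return order;  // e.g. {1, 4, 1, 8} -> {0, 3, 2, 1}
}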

static bool IsTransposeSupported(const ngraph::Shape& shape) {
auto shape_no_1 = shape;
shape_no_1.erase(std::remove(shape_no_1.begin(), shape_no_1.end(), 1), shape_no_1.end());
if (shape_no_1.size() != 2) return false;
size_t min, max;
std::tie(min, max) = std::minmax(shape_no_1[0], shape_no_1[1]);
return min <= 8 && max % 8 == 0;
}
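
Read off the code, IsTransposeSupported admits exactly the shapes this pass will transpose: squeezing out 1-dims must leave two axes, with the smaller at most 8 and the larger a multiple of 8 (the underlying hardware limits are not stated in this commit). Some illustrative cases:

// IsTransposeSupported({1, 4, 1, 8}) -> true:  squeezed {4, 8}: 4 <= 8 and 8 % 8 == 0
// IsTransposeSupported({8, 64})      -> true:  8 <= 8 and 64 % 8 == 0
// IsTransposeSupported({1, 3, 5})    -> false: squeezed {3, 5}: 5 % 8 != 0
// IsTransposeSupported({2, 3, 4})    -> false: three non-trivial dims remain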

HandleTransposeBeforeMatMul::HandleTransposeBeforeMatMul() {
auto reshape = ngraph::pattern::wrap_type<ngraph::opset7::Reshape>({ngraph::pattern::any_input(),
ngraph::pattern::any_input()}, VerifyReshape());
auto transpose = ngraph::pattern::wrap_type<ngraph::opset7::Transpose>({reshape,
ngraph::pattern::any_input()});
auto matmul_input = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{reshape, transpose});
auto matmul1 = ngraph::pattern::wrap_type<ngraph::opset7::MatMul>({matmul_input, ngraph::pattern::any_input()});
auto matmul2 = ngraph::pattern::wrap_type<ngraph::opset7::MatMul>({ngraph::pattern::any_input(), matmul_input});
auto matmul = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{matmul1, matmul2});

ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) {
const auto& pattern_map = m.get_pattern_value_map();
auto transpose_it = pattern_map.find(transpose);
if (transpose_it != std::end(pattern_map)) {
ReplaceTransposeWithReshape(transpose_it->second.get_node_shared_ptr());
} else {
auto reshape_node = pattern_map.at(reshape).get_node_shared_ptr();
if (!IsTransposeSupported(reshape_node->get_output_shape(0))) return false;
auto matmul_it = pattern_map.find(matmul1);
auto matmul_out = matmul_it != std::end(pattern_map) ? matmul_it->second : pattern_map.at(matmul2);
InsertTranspose(reshape_node, matmul_out.get_node_shared_ptr()->get_friendly_name());
}
return true;
};

auto m = std::make_shared<ngraph::pattern::Matcher>(matmul, "HandleTransposeBeforeMatMul");
this->register_matcher(m, callback);
}

HandleTransposeAfterMatMul::HandleTransposeAfterMatMul() {
auto matmul = ngraph::pattern::wrap_type<ngraph::opset7::MatMul>();
auto fq = ngraph::pattern::wrap_type<ngraph::opset7::FakeQuantize>({matmul, ngraph::pattern::any_input(),
ngraph::pattern::any_input(), ngraph::pattern::any_input(), ngraph::pattern::any_input()});
auto transpose_input = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{matmul, fq});
auto transpose = ngraph::pattern::wrap_type<ngraph::opset7::Transpose>({transpose_input, ngraph::pattern::any_input()});
auto reshape_input = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{transpose_input, transpose});
auto reshape = ngraph::pattern::wrap_type<ngraph::opset7::Reshape>({reshape_input,
ngraph::pattern::any_input()}, VerifyReshape());

ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) {
const auto& pattern_map = m.get_pattern_value_map();
auto transpose_it = pattern_map.find(transpose);
if (transpose_it != std::end(pattern_map)) {
ReplaceTransposeWithReshape(transpose_it->second.get_node_shared_ptr());
} else {
auto reshape_node = pattern_map.at(reshape).get_node_shared_ptr();
if (!IsTransposeSupported(reshape_node->get_input_shape(0))) return false;
auto matmul_node = pattern_map.at(matmul).get_node_shared_ptr();
InsertTranspose(matmul_node, matmul_node->get_friendly_name());
}
return true;
};

auto m = std::make_shared<ngraph::pattern::Matcher>(reshape, "HandleTransposeAfterMatMul");
this->register_matcher(m, callback);
}
74 changes: 74 additions & 0 deletions inference-engine/src/gna_plugin/transformations/handle_transposes_around_matmul.hpp
@@ -0,0 +1,74 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <ngraph/pass/graph_rewrite.hpp>

namespace GNAPluginNS {

struct VerifyReshape {
bool operator()(const ngraph::Output<ngraph::Node>& reshape_out) const {
auto in_shape = reshape_out.get_node_shared_ptr()->get_input_shape(0);
auto out_shape = reshape_out.get_node_shared_ptr()->get_output_shape(0);

// Check if Reshape changes the final 2d shape of Affine primitive
in_shape.erase(std::remove(in_shape.begin(), in_shape.end(), 1), in_shape.end());
out_shape.erase(std::remove(out_shape.begin(), out_shape.end(), 1), out_shape.end());
return in_shape != out_shape;
}
};
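
VerifyReshape gates both matchers: only Reshapes that alter the squeezed 2D geometry seen by the GNA Affine primitive are of interest; pure insertion or removal of 1-dims is ignored. A standalone restatement of the check with example values:

#include <algorithm>
#include <vector>

// Same predicate as VerifyReshape above: compare shapes with 1-dims removed.
bool changes_2d_geometry(std::vector<size_t> in, std::vector<size_t> out) {
    in.erase(std::remove(in.begin(), in.end(), size_t{1}), in.end());
    out.erase(std::remove(out.begin(), out.end(), size_t{1}), out.end());
    return in != out;
}

// changes_2d_geometry({1, 64}, {1, 1, 64})  == false  (both squeeze to {64})
// changes_2d_geometry({1, 64}, {1, 8, 1, 8}) == true  ({64} vs {8, 8})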

/**
* @brief Inserts Transpose before MatMul or removes it (if it exists) if there is Reshape
* before MatMul which changes the batch size:
* [1, A*B] [1, A*B]
* | |
* Reshape Reshape
* | |
* [1, A, 1, B] [1, A, 1, B]
* | |
* | Transpose
* | -> |
* | <- [1, B, 1, A]
* | |
* MatMul MatMul
*/
class HandleTransposeBeforeMatMul : public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
HandleTransposeBeforeMatMul();
};

/**
* @brief Inserts Transpose after MatMul or removes it (if it exists) if there is Reshape
* after MatMul which changes the batch size:
* MatMul MatMul
* | |
* [1, A, 1, B] [1, A, 1, B]
* | |
* | Transpose
* | -> |
* | <- [1, B, 1, A]
* | |
* Reshape Reshape
* | |
* [1, A*B] [1, A*B]
*/
class HandleTransposeAfterMatMul : public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
HandleTransposeAfterMatMul();
};

class HandleTransposesAroundMatMul: public ngraph::pass::GraphRewrite {
public:
NGRAPH_RTTI_DECLARATION;
HandleTransposesAroundMatMul() {
add_matcher<HandleTransposeBeforeMatMul>();
add_matcher<HandleTransposeAfterMatMul>();
}
};

} // namespace GNAPluginNS
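
For context, the composite pass is driven through an ngraph pass manager, matching the registration added in gna_plugin.cpp above; a minimal standalone sketch:

#include <memory>
#include <ngraph/pass/manager.hpp>
#include "transformations/handle_transposes_around_matmul.hpp"

void run_transpose_handling(std::shared_ptr<ngraph::Function> model) {
    ngraph::pass::Manager manager;
    // GraphRewrite registers both the before- and after-MatMul matchers.
    manager.register_pass<GNAPluginNS::HandleTransposesAroundMatMul>();
    manager.run_passes(model);
}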

This file was deleted (the superseded insert_transpose_before_matmul transformation, per the include and pass-registration removals above).

