From 33c3aeb867f74ceae64312be02d5a53193a51a01 Mon Sep 17 00:00:00 2001 From: Lukasz Debski Date: Sat, 5 Sep 2020 18:01:43 +0200 Subject: [PATCH 01/66] [IE CLDNN] Fixing blocked format opting for strided_slice (#2073) --- inference-engine/thirdparty/clDNN/src/program.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/inference-engine/thirdparty/clDNN/src/program.cpp b/inference-engine/thirdparty/clDNN/src/program.cpp index 0a0e596dde5883..203041871bc277 100644 --- a/inference-engine/thirdparty/clDNN/src/program.cpp +++ b/inference-engine/thirdparty/clDNN/src/program.cpp @@ -61,6 +61,7 @@ #include "split_inst.h" #include "mvn_inst.h" #include "reduce_inst.h" +#include "strided_slice_inst.h" #include "to_string_utils.h" #include "gpu/memory_gpu.h" @@ -1168,7 +1169,8 @@ void program_impl::set_layout_optimizer_attributes(layout_optimizer& lo) { || prim.as().get_primitive()->across_channels) && prim.type() != cldnn::arg_max_min::type_id() && prim.type() != cldnn::mutable_data::type_id() && - prim.type() != cldnn::reduce::type_id()) + prim.type() != cldnn::reduce::type_id() && + prim.type() != cldnn::strided_slice::type_id()) can_use_fsv16 = false; if (prim.type() == cldnn::quantize::type_id() && From 0cd0c1a551befbb19a503456c298d0c3b2e08dd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Do=C5=82bniak?= Date: Mon, 7 Sep 2020 04:48:32 +0200 Subject: [PATCH 02/66] Handle Split axes as i64 (#2079) --- ngraph/core/builder/include/ngraph/builder/split.hpp | 2 +- ngraph/core/builder/src/builder/split.cpp | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ngraph/core/builder/include/ngraph/builder/split.hpp b/ngraph/core/builder/include/ngraph/builder/split.hpp index bca0627d90fc0c..08604cec7cca87 100644 --- a/ngraph/core/builder/include/ngraph/builder/split.hpp +++ b/ngraph/core/builder/include/ngraph/builder/split.hpp @@ -34,7 +34,7 @@ namespace ngraph NGRAPH_DEPRECATED("This builder was deprecated.") OutputVector split(const 
Output& value, const std::vector& length_parts, - size_t axis = 0); + int64_t axis = 0); /// \brief Split node on specified axis into multiple parts. /// diff --git a/ngraph/core/builder/src/builder/split.cpp b/ngraph/core/builder/src/builder/split.cpp index 812b3e69ceb0ff..76293d69fd5c80 100644 --- a/ngraph/core/builder/src/builder/split.cpp +++ b/ngraph/core/builder/src/builder/split.cpp @@ -30,7 +30,7 @@ namespace } std::shared_ptr make_ng_slice(const Output& output, - const std::vector& axes, + const std::vector& axes, const std::vector& starts, const std::vector& ends) { @@ -38,7 +38,7 @@ namespace std::vector lower_bounds(upper_bounds.size()); for (size_t index{0}; index < axes.size(); ++index) { - size_t axis{axes.at(index)}; + int64_t axis{axes.at(index)}; lower_bounds.at(axis) = get_valid_array_index(starts.at(index), output.get_shape().at(axis)); upper_bounds.at(axis) = @@ -51,7 +51,7 @@ namespace } OutputVector - builder::split(const Output& value, const std::vector& length_parts, size_t axis) + builder::split(const Output& value, const std::vector& length_parts, int64_t axis) { size_t start_index{0}; OutputVector outputs; @@ -81,7 +81,7 @@ OutputVector builder::opset1::split(const Output& value, const std::vector& split_lengths, int64_t axis) { - const auto axis_node = ngraph::opset1::Constant::create(element::u64, Shape{}, {axis}); + const auto axis_node = ngraph::opset1::Constant::create(element::i64, Shape{}, {axis}); const auto split_lengths_node = ngraph::opset1::Constant::create(element::u64, Shape{split_lengths.size()}, split_lengths); const auto variadic_split = @@ -92,7 +92,7 @@ OutputVector builder::opset1::split(const Output& value, OutputVector builder::opset1::split(const Output& value, size_t num_splits, int64_t axis) { - const auto axis_node = ngraph::opset1::Constant::create(element::u64, Shape{}, {axis}); + const auto axis_node = ngraph::opset1::Constant::create(element::i64, Shape{}, {axis}); const auto split = std::make_shared(value, 
axis_node, num_splits); return split->outputs(); From 51564f415cae4cfacd2237f43af3887fa3c23ce2 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Mon, 7 Sep 2020 09:04:05 +0300 Subject: [PATCH 03/66] [IE CLDNN] Fixed fsv16 lrn kernel with fp16 input (#2086) --- ..._across_channel_multiple_features_fsv16.cl | 5 +- .../clDNN/tests/test_cases/lrn_gpu_test.cpp | 49 +++++++++++++++++++ 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lrn_gpu_across_channel_multiple_features_fsv16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lrn_gpu_across_channel_multiple_features_fsv16.cl index 7baa5adcd177cf..901e5c65797eac 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lrn_gpu_across_channel_multiple_features_fsv16.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lrn_gpu_across_channel_multiple_features_fsv16.cl @@ -41,14 +41,15 @@ KERNEL (lrn_gpu_across_channel_multiple_features_fsv16)( for (uint i = 0; i < LOCAL_SIZE; ++i, ++input_offset_f) { bool non_zero = input_offset_f >= 0 && input_offset_f < INPUT0_FEATURE_NUM; uint input_idx = INPUT0_GET_INDEX(batch_id, input_offset_f, y, x); - val[i] = (int)non_zero * TO_INPUT0_TYPE(input[input_idx]); + val[i] = (int)non_zero * TO_INPUT0_TYPE(ALPHA_VAL_FACTOR_DIV_BY_SIZE) * TO_INPUT0_TYPE(input[input_idx]); res = mad(val[i], val[i], res); } res = mad(res, TO_INPUT0_TYPE(ALPHA_DIV_BY_SIZE), TO_INPUT0_TYPE(K)); res = native_powr(res, -TO_INPUT0_TYPE(BETA)); uint output_idx = OUTPUT_GET_INDEX(batch_id, feature_id, y, x); - INPUT0_TYPE lrn_result = res * val[PADDING]; + uint input_idx = INPUT0_GET_INDEX(batch_id, feature_id, y, x); + INPUT0_TYPE lrn_result = res * input[input_idx]; #if HAS_FUSED_OPS FUSED_OPS; output[output_idx] = TO_OUTPUT_TYPE(FUSED_OPS_RESULT); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/lrn_gpu_test.cpp 
b/inference-engine/thirdparty/clDNN/tests/test_cases/lrn_gpu_test.cpp index 38579ffda3b9e1..7ca2850ff2b86b 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/lrn_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/lrn_gpu_test.cpp @@ -125,6 +125,55 @@ TEST(lrn_fp32_gpu, basic2) { } } +TEST(lrn_fp16_gpu, basic1) { + // input : 1x16x1x1 + // Output : 1x16x1x1 + const auto& engine = get_test_engine(); + + const size_t b = 1; + const size_t f = 16; + const size_t y = 1; + const size_t x = 1; + + auto input = memory::allocate(engine, { data_types::f16, format::b_fs_yx_fsv16, { b, f, x, y } }); + std::vector inputVals(b * f * y * x); + std::generate(inputVals.begin(), inputVals.end(), []() { + static float n = 0; + return half_t(n++); + }); + + set_values(input, inputVals); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + uint32_t size = 5; + float k = 0.5f; + float alpha = 9.9e-05f; + float beta = 1.f; + topology.add(lrn("lrn", "input", size, k, alpha, beta, cldnn::lrn_norm_region_across_channel)); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("lrn").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 0.f, 1.99889f, 3.99525f, 5.98696f, + 7.97159f, 9.94682f, 11.9104f, 13.86f, + 15.7936f, 17.709f, 19.6041f, 21.4769f, + 23.3257f, 25.1485f, 27.2091f, 29.3151f + }; + + ASSERT_EQ(output_ptr.size(), expected_results.size()); + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], half_to_float(output_ptr[i]))) << i; + } +} + TEST(lrn_fp32_gpu, basic3) { // input : 2x16x4x4 // Output : 2x16x4x4 From 6730cab19299d8cf8cc189e95c0c31ba2f4e634e Mon Sep 17 00:00:00 2001 From: Maxim Vafin Date: Mon, 7 Sep 2020 10:20:24 +0300 Subject: [PATCH 04/66] Move FakeOutput resolving to back phase (#2033) --- 
model-optimizer/automation/package_BOM.txt | 2 +- .../{front => back}/FakeOutputResolver.py | 11 ++- .../back/FakeOutputResolver_test.py | 93 +++++++++++++++++++ .../front/FakeOutputResolver_test.py | 92 ------------------ model-optimizer/extensions/ops/fake_output.py | 4 +- 5 files changed, 103 insertions(+), 99 deletions(-) rename model-optimizer/extensions/{front => back}/FakeOutputResolver.py (83%) create mode 100644 model-optimizer/extensions/back/FakeOutputResolver_test.py delete mode 100644 model-optimizer/extensions/front/FakeOutputResolver_test.py diff --git a/model-optimizer/automation/package_BOM.txt b/model-optimizer/automation/package_BOM.txt index 02e61f4bec9aa7..df8cdba97446dd 100644 --- a/model-optimizer/automation/package_BOM.txt +++ b/model-optimizer/automation/package_BOM.txt @@ -19,6 +19,7 @@ extensions/back/CropToStridedSlice.py extensions/back/CutMemory.py extensions/back/disable_unsupported_ND_operations.py extensions/back/EnableConstantStridedSlice.py +extensions/back/FakeOutputResolver.py extensions/back/ForceStrictPrecision.py extensions/back/fuse_sub_div_min.py extensions/back/FuseTransposesSequence.py @@ -120,7 +121,6 @@ extensions/front/disable_weights_quantize_value_propagation.py extensions/front/div.py extensions/front/eltwise_n.py extensions/front/ExpandDimsToUnsqueeze.py -extensions/front/FakeOutputResolver.py extensions/front/FillToBroadcast.py extensions/front/flatten_to_reshape.py extensions/front/freeze_placeholder_value.py diff --git a/model-optimizer/extensions/front/FakeOutputResolver.py b/model-optimizer/extensions/back/FakeOutputResolver.py similarity index 83% rename from model-optimizer/extensions/front/FakeOutputResolver.py rename to model-optimizer/extensions/back/FakeOutputResolver.py index d9b1d731085bb8..f6a27186fa1daa 100644 --- a/model-optimizer/extensions/front/FakeOutputResolver.py +++ b/model-optimizer/extensions/back/FakeOutputResolver.py @@ -15,18 +15,19 @@ """ from extensions.ops.elementwise import Add -from 
mo.front.common.replacement import FrontReplacementPattern +from mo.back.replacement import BackReplacementPattern from mo.front.common.partial_infer.utils import int64_array from mo.front.tf.graph_utils import create_op_with_const_inputs from mo.graph.graph import Graph, rename_nodes, rename_node -class FakeOutputResolver(FrontReplacementPattern): +class FakeOutputResolver(BackReplacementPattern): """ This transformation removes FakeOutput nodes. If producer of FakeOutput have only one consumer (FakeOutput itself) the name of FakeOutput is inherited by its producer, otherwise FakeOutput is replaced with op which does nothing. """ enabled = True + force_clean_up = True def find_and_replace_pattern(self, graph: Graph): for fake_output in graph.get_op_nodes(op='FakeOutput'): @@ -46,5 +47,7 @@ def find_and_replace_pattern(self, graph: Graph): fake_output.in_port(0).get_connection().set_destination(add.in_port(0)) fake_output.out_port(0).get_connection().set_source(add.out_port(0)) else: - graph.erase_node(fake_output) - rename_node(producer, name) + result_in_port = fake_output.out_port(0).get_destination() + result_in_port.disconnect() + fake_output.in_port(0).get_connection().set_destination(result_in_port) + rename_nodes([(fake_output, name + '/TBD'), (producer, name)]) diff --git a/model-optimizer/extensions/back/FakeOutputResolver_test.py b/model-optimizer/extensions/back/FakeOutputResolver_test.py new file mode 100644 index 00000000000000..85cab5eff27151 --- /dev/null +++ b/model-optimizer/extensions/back/FakeOutputResolver_test.py @@ -0,0 +1,93 @@ +""" + Copyright (C) 2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import unittest + +from extensions.back.FakeOutputResolver import FakeOutputResolver +from mo.front.common.partial_infer.utils import int64_array +from mo.utils.ir_engine.compare_graphs import compare_graphs +from mo.utils.unittest.graph import build_graph, result, regular_op_with_empty_data, const_with_data, connect, \ + empty_data + + +class FakeOutputResolverTest(unittest.TestCase): + def test_one(self): + nodes = { + **regular_op_with_empty_data('input', {'type': 'Parameter'}), + **regular_op_with_empty_data('some_op', {'type': 'SomeOp', 'name': 'some_op_name'}), + **regular_op_with_empty_data('fake_output', + {'type': None, 'kind': 'op', 'op': 'FakeOutput', 'name': 'my_output_name'}), + **result('result'), + } + edges = [*connect('input', 'some_op'), + *connect('some_op', 'fake_output'), + *connect('fake_output', 'result'), + ] + graph = build_graph(nodes, edges) + + edges_ref = [*connect('input', 'some_op'), + *connect('some_op', 'result'), + ] + + graph_ref = build_graph(nodes, edges_ref, {'some_op': {'name': 'my_output_name'}}) + + FakeOutputResolver().find_and_replace_pattern(graph) + + (flag, resp) = compare_graphs(graph, graph_ref, 'result') + self.assertTrue(flag, resp) + + def test_multi(self): + nodes = { + **regular_op_with_empty_data('input', {'type': 'Parameter'}), + **regular_op_with_empty_data('some_op', {'type': 'SomeOp', 'name': 'some_op_name'}), + **empty_data('some_op_d2'), + **regular_op_with_empty_data('fake_output1', + {'type': None, 'kind': 'op', 'op': 'FakeOutput', 'name': 'my_output_name1'}), + 
**regular_op_with_empty_data('fake_output2', + {'type': None, 'kind': 'op', 'op': 'FakeOutput', 'name': 'my_output_name2'}), + + **const_with_data('const1', int64_array(0)), + **const_with_data('const2', int64_array(0)), + **regular_op_with_empty_data('add1', {'type': None, 'kind': 'op', 'op': 'Add', 'name': 'my_output_name1'}), + **regular_op_with_empty_data('add2', {'type': None, 'kind': 'op', 'op': 'Add', 'name': 'my_output_name2'}), + **result('result1'), + **result('result2'), + } + edges = [*connect('input', 'some_op'), + *connect('some_op', 'fake_output1'), + ('some_op', 'some_op_d2'), + ('some_op_d2', 'fake_output2'), + *connect('fake_output1', 'result1'), + *connect('fake_output2', 'result2'), + ] + graph = build_graph(nodes, edges) + + edges_ref = [*connect('input', 'some_op'), + *connect('some_op', '0:add1'), + *connect('const1', '1:add1'), + ('some_op', 'some_op_d2'), + ('some_op_d2', 'add2', {'in': 0}), + *connect('const2', '1:add2'), + *connect('add1', 'result1'), + *connect('add2', 'result2'), + ] + + graph_ref = build_graph(nodes, edges_ref) + + FakeOutputResolver().find_and_replace_pattern(graph) + + (flag, resp) = compare_graphs(graph, graph_ref, 'result1') + self.assertTrue(flag, resp) diff --git a/model-optimizer/extensions/front/FakeOutputResolver_test.py b/model-optimizer/extensions/front/FakeOutputResolver_test.py deleted file mode 100644 index 8e57a0ef4fa26f..00000000000000 --- a/model-optimizer/extensions/front/FakeOutputResolver_test.py +++ /dev/null @@ -1,92 +0,0 @@ -""" - Copyright (C) 2020 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import unittest - -from extensions.front.FakeOutputResolver import FakeOutputResolver -from mo.front.common.partial_infer.utils import int64_array -from mo.utils.ir_engine.compare_graphs import compare_graphs -from mo.utils.unittest.graph import build_graph, result, regular_op, const - - -class FakeOutputResolverTest(unittest.TestCase): - def test_one(self): - nodes = { - **regular_op('input', {'type': 'Parameter'}), - **regular_op('some_op', {'type': 'SomeOp', 'name': 'some_op_name'}), - **regular_op('fake_output', {'type': None, 'kind': 'op', 'op': 'FakeOutput', 'name': 'my_output_name'}), - **result('result'), - } - edges = [('input', 'some_op'), - ('some_op', 'fake_output'), - ('fake_output', 'result'), - ] - graph = build_graph(nodes, edges) - - graph.graph['layout'] = 'NCHW' - graph.stage = 'front' - - edges_ref = [('input', 'some_op'), - ('some_op', 'result'), - ] - - graph_ref = build_graph(nodes, edges_ref, {'some_op': {'name': 'my_output_name'}}) - - FakeOutputResolver().find_and_replace_pattern(graph) - - (flag, resp) = compare_graphs(graph, graph_ref, 'result') - self.assertTrue(flag, resp) - - def test_multi(self): - nodes = { - **regular_op('input', {'type': 'Parameter'}), - **regular_op('some_op', {'type': 'SomeOp', 'name': 'some_op_name'}), - **regular_op('fake_output1', {'type': None, 'kind': 'op', 'op': 'FakeOutput', 'name': 'my_output_name1'}), - **regular_op('fake_output2', {'type': None, 'kind': 'op', 'op': 'FakeOutput', 'name': 'my_output_name2'}), - - **const('const1', int64_array(0)), - **const('const2', int64_array(0)), - 
**regular_op('add1', {'type': None, 'kind': 'op', 'op': 'Add', 'name': 'my_output_name1'}), - **regular_op('add2', {'type': None, 'kind': 'op', 'op': 'Add', 'name': 'my_output_name2'}), - **result('result1'), - **result('result2'), - } - edges = [('input', 'some_op'), - ('some_op', 'fake_output1'), - ('some_op', 'fake_output2'), - ('fake_output1', 'result1'), - ('fake_output2', 'result2'), - ] - graph = build_graph(nodes, edges) - - graph.graph['layout'] = 'NCHW' - graph.stage = 'front' - - edges_ref = [('input', 'some_op'), - ('some_op', 'add1'), - ('const1', 'add1'), - ('some_op', 'add2'), - ('const2', 'add2'), - ('add1', 'result1'), - ('add2', 'result2'), - ] - - graph_ref = build_graph(nodes, edges_ref) - - FakeOutputResolver().find_and_replace_pattern(graph) - - (flag, resp) = compare_graphs(graph, graph_ref, 'result1') - self.assertTrue(flag, resp) diff --git a/model-optimizer/extensions/ops/fake_output.py b/model-optimizer/extensions/ops/fake_output.py index 513e93b2b33664..ee184f9a2c9a87 100644 --- a/model-optimizer/extensions/ops/fake_output.py +++ b/model-optimizer/extensions/ops/fake_output.py @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
""" - +from mo.front.common.partial_infer.elemental import copy_shape_infer, copy_value from mo.graph.graph import Graph from mo.ops.op import Op @@ -31,7 +31,7 @@ def __init__(self, graph: Graph, attrs: dict): 'type': None, 'version': None, - 'infer': None, + 'infer': lambda n: copy_shape_infer(n, copy_value), 'type_infer': None, From 50c6f02a2e0ee2981fadfab7862de58b4cd05bba Mon Sep 17 00:00:00 2001 From: Vladislav Volkov Date: Mon, 7 Sep 2020 10:36:52 +0300 Subject: [PATCH 05/66] Ngraph improvements (#2058) --- ngraph/core/include/ngraph/partial_shape.hpp | 29 ++++------- ngraph/core/src/partial_shape.cpp | 54 +++++++++++++++----- ngraph/core/src/type/element_type.cpp | 14 +++-- 3 files changed, 64 insertions(+), 33 deletions(-) diff --git a/ngraph/core/include/ngraph/partial_shape.hpp b/ngraph/core/include/ngraph/partial_shape.hpp index e42b4f776dc123..8c89a7d57ddd81 100644 --- a/ngraph/core/include/ngraph/partial_shape.hpp +++ b/ngraph/core/include/ngraph/partial_shape.hpp @@ -55,28 +55,18 @@ namespace ngraph /// PartialShape s{}; // rank=0 /// PartialShape s{2,Dimension::dynamic(),3}; // rank=3, dimension 1 dynamic /// \endcode - PartialShape(std::initializer_list init) - : PartialShape(true, init) - { - } + PartialShape(std::initializer_list init); /// \brief Constructs a PartialShape with static rank from a vector of Dimension. /// \param dimensions The Dimension values for the constructed shape. - PartialShape(const std::vector& dimensions) - : m_rank_is_static(true) - , m_dimensions(dimensions) - { - } + PartialShape(const std::vector& dimensions); /// \brief Constructs a PartialShape with static rank from a vector of dimensions values. /// \param dimensions The Dimension values for the constructed shape. PartialShape(const std::vector& dimensions); /// \brief Constructs a static PartialShape with zero rank (the shape of a scalar). 
- PartialShape() - : PartialShape(std::initializer_list{}) - { - } + PartialShape(); /// \brief Constructs a static PartialShape from a Shape. /// \param shape The Shape to convert into PartialShape. @@ -235,15 +225,18 @@ namespace ngraph private: // Private constructor for PartialShape::dynamic(). - PartialShape(bool rank_is_static, std::vector dimensions) - : m_rank_is_static(rank_is_static) - , m_dimensions(dimensions) - { - } + PartialShape(bool rank_is_static, const std::vector& dimensions); // True if the shape's rank is static. bool m_rank_is_static; + // True if the shape is static. + mutable enum class ShapeType { + SHAPE_IS_UNKNOWN, + SHAPE_IS_STATIC, + SHAPE_IS_DYNAMIC + } m_shape_type{ShapeType::SHAPE_IS_UNKNOWN}; + // Shape dimensions. This has no meaning if m_rank_is_static is false. std::vector m_dimensions; }; diff --git a/ngraph/core/src/partial_shape.cpp b/ngraph/core/src/partial_shape.cpp index e31afb0174f4fd..3f0c38d19776cf 100644 --- a/ngraph/core/src/partial_shape.cpp +++ b/ngraph/core/src/partial_shape.cpp @@ -23,26 +23,53 @@ using namespace ngraph; +PartialShape::PartialShape() + : PartialShape(std::initializer_list{}) +{ +} + +PartialShape::PartialShape(std::initializer_list init) + : PartialShape(true, init) +{ +} + PartialShape::PartialShape(const std::vector& dimensions) : m_rank_is_static(true) + , m_dimensions(dimensions.begin(), dimensions.end()) { - std::transform(dimensions.cbegin(), - dimensions.cend(), - std::back_inserter(m_dimensions), - [](const Dimension::value_type& dimension) { return dimension; }); } PartialShape::PartialShape(const Shape& shape) - : PartialShape(true, {}) + : m_rank_is_static(true) + , m_shape_type(ShapeType::SHAPE_IS_STATIC) + , m_dimensions(shape.begin(), shape.end()) +{ +} + +PartialShape::PartialShape(bool rank_is_static, const std::vector& dimensions) + : m_rank_is_static(rank_is_static) + , m_dimensions(dimensions) +{ +} + +PartialShape::PartialShape(const std::vector& dimensions) + : 
m_rank_is_static(true) + , m_dimensions(dimensions) { - m_dimensions.assign(shape.begin(), shape.end()); } bool ngraph::PartialShape::is_static() const { - return m_rank_is_static && std::all_of(m_dimensions.begin(), - m_dimensions.end(), - [](const Dimension& d) { return d.is_static(); }); + if (m_shape_type == ShapeType::SHAPE_IS_UNKNOWN) + { + m_shape_type = + m_rank_is_static && std::all_of(m_dimensions.begin(), + m_dimensions.end(), + [](const Dimension& d) { return d.is_static(); }) + ? ShapeType::SHAPE_IS_STATIC + : ShapeType::SHAPE_IS_DYNAMIC; + } + return m_shape_type == ShapeType::SHAPE_IS_STATIC; } bool ngraph::PartialShape::operator==(const PartialShape& partial_shape) const @@ -282,6 +309,7 @@ bool PartialShape::merge_rank(Rank r) { m_rank_is_static = true; m_dimensions = std::vector(r.get_length(), Dimension::dynamic()); + m_shape_type = ShapeType::SHAPE_IS_UNKNOWN; return true; } else @@ -297,13 +325,13 @@ Shape PartialShape::to_shape() const throw std::invalid_argument("to_shape was called on a dynamic shape."); } - std::vector dimensions_to_shape(m_dimensions.size()); + std::vector shape_dimensions(m_dimensions.size()); std::transform(m_dimensions.begin(), m_dimensions.end(), - dimensions_to_shape.begin(), + shape_dimensions.begin(), [](const Dimension& d) { return d.get_length(); }); - return Shape(dimensions_to_shape.begin(), dimensions_to_shape.end()); + return shape_dimensions; } bool PartialShape::merge_into(PartialShape& dst, const PartialShape& src) @@ -444,6 +472,8 @@ Dimension& PartialShape::operator[](size_t i) { throw std::out_of_range("Accessing out-of-range dimension in Dimension[]"); } + m_shape_type = + ShapeType::SHAPE_IS_UNKNOWN; // We can't guarantee that the shape remains static or dynamic. 
return m_dimensions[i]; } diff --git a/ngraph/core/src/type/element_type.cpp b/ngraph/core/src/type/element_type.cpp index 588a14006aca88..a807bb56a7c197 100644 --- a/ngraph/core/src/type/element_type.cpp +++ b/ngraph/core/src/type/element_type.cpp @@ -15,8 +15,9 @@ //***************************************************************************** #include +#include #include -#include +#include #include "ngraph/log.hpp" #include "ngraph/type/element_type.hpp" @@ -69,9 +70,16 @@ class TypeInfo std::string m_type_name; }; -static const map& get_type_info_map() +struct element_type_hash { - static map s_type_info_map{ + size_t operator()(element::Type_t t) const { return static_cast(t); } +}; + +typedef unordered_map element_types_map_t; + +static const element_types_map_t& get_type_info_map() +{ + static element_types_map_t s_type_info_map{ {element::Type_t::undefined, TypeInfo( std::numeric_limits::max(), false, false, false, "undefined", "undefined")}, From b683b5501d204a7cb27e28c0219c4620f681e5d6 Mon Sep 17 00:00:00 2001 From: Andrey Chekhonin Date: Mon, 7 Sep 2020 10:38:07 +0300 Subject: [PATCH 06/66] [IE][VPU]: Fix behavior tests for MXpcie (#1879) * Moved mem type patching under USB protocol switch * Myriad beh tests fix --- .../myriad_tests/helpers/myriad_devices.cpp | 1 + .../myriad_tests/helpers/myriad_devices.hpp | 4 ++- .../helpers/myriad_load_network_case.cpp | 4 +-- .../vpu/myriad_tests/vpu_get_metric_tests.cpp | 8 ++--- .../vpu/myriad_tests/vpu_protocol_tests.cpp | 28 +++++++++++++++ .../vpu/myriad_tests/vpu_watchdog_tests.cpp | 35 +++++++++++++------ .../thirdparty/movidius/mvnc/src/mvnc_data.c | 24 +++++-------- 7 files changed, 70 insertions(+), 34 deletions(-) diff --git a/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/helpers/myriad_devices.cpp b/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/helpers/myriad_devices.cpp index a33a5837edfc1b..647871f84ee0d1 100644 --- 
a/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/helpers/myriad_devices.cpp +++ b/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/helpers/myriad_devices.cpp @@ -13,6 +13,7 @@ constexpr char MyriadDevicesInfo::kMyriadXName[]; constexpr char MyriadDevicesInfo::kMyriad2Name[]; +constexpr char MyriadDevicesInfo::kMyriadXPCIeName[]; MyriadDevicesInfo::MyriadDevicesInfo() { #if (defined(_WIN32) || defined(_WIN64)) diff --git a/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/helpers/myriad_devices.hpp b/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/helpers/myriad_devices.hpp index 00d2738e77bb8b..6e365440b13f24 100644 --- a/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/helpers/myriad_devices.hpp +++ b/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/helpers/myriad_devices.hpp @@ -19,6 +19,7 @@ class MyriadDevicesInfo { // Constants static constexpr char kMyriadXName[] = "ma2480"; static constexpr char kMyriad2Name[] = "ma2450"; + static constexpr char kMyriadXPCIeName[] = "mxl"; //Constructor MyriadDevicesInfo(); @@ -54,7 +55,8 @@ const std::string& MyriadDevicesInfo::firmwareDir() { } bool MyriadDevicesInfo::isMyriadXDevice(const std::string &device_name) { - return (device_name.find(kMyriadXName) != std::string::npos); + return ( (device_name.find(kMyriadXName) != std::string::npos) + || (device_name.find(kMyriadXPCIeName) != std::string::npos) ); } bool MyriadDevicesInfo::isMyriad2Device(const std::string &device_name) { diff --git a/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/helpers/myriad_load_network_case.cpp b/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/helpers/myriad_load_network_case.cpp index 7bf15980f9c490..20e8faa7452d08 100644 --- a/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/helpers/myriad_load_network_case.cpp +++ b/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/helpers/myriad_load_network_case.cpp @@ -26,6 +26,6 @@ void 
MyriadLoadNetworkTestCase::LoadNetwork() { } bool MyriadLoadNetworkTestCase::IsDeviceAvailable(std::string device_name) { - auto act_devices = getDevicesList(); + auto act_devices = getDevicesList(NC_ANY_PROTOCOL, NC_ANY_PLATFORM, X_LINK_UNBOOTED); return std::find(act_devices.begin(), act_devices.end(), device_name) != act_devices.end(); -} \ No newline at end of file +} diff --git a/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/vpu_get_metric_tests.cpp b/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/vpu_get_metric_tests.cpp index 6555a67709ebfd..5ffd53f5b7d109 100644 --- a/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/vpu_get_metric_tests.cpp +++ b/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/vpu_get_metric_tests.cpp @@ -83,18 +83,16 @@ TEST_F(VPUGetMetric, smoke_ThermalStatsFromPluginWithoutLoadedNetwork) { ASSERT_TRUE(result.empty()); } -TEST_F(VPUGetMetric, smoke_MyriadGetAvailableDevices) { +TEST_F(VPUGetMetric, smoke_MyriadGetFullDeviceName) { std::vector availableDevices; ASSERT_NO_THROW(availableDevices = getAvailableDevices()); ASSERT_TRUE(!availableDevices.empty()); auto result = Parameter{}; - auto deviceNames = std::vector(availableDevices.size()); for (size_t i = 0; i < availableDevices.size(); ++i) { const auto deviceName = "MYRIAD." 
+ availableDevices[i]; ASSERT_NO_THROW(result = ie.GetMetric(deviceName, METRIC_KEY(FULL_DEVICE_NAME))); - - deviceNames[i] = result.as(); - ASSERT_TRUE(deviceNames[i] != availableDevices[i]); + auto act_res = result.as(); + ASSERT_TRUE(!act_res.empty()); } } diff --git a/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/vpu_protocol_tests.cpp b/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/vpu_protocol_tests.cpp index d66b56ed36a359..a2c2fb48ba59ce 100644 --- a/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/vpu_protocol_tests.cpp +++ b/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/vpu_protocol_tests.cpp @@ -3,6 +3,7 @@ // #include "helpers/myriad_protocol_case.hpp" +#include "XLinkLog.h" std::shared_ptr MyriadProtocolTests::ie = nullptr; @@ -26,6 +27,33 @@ TEST_P(MyriadProtocolTests, CanInferenceWithProtocol) { ASSERT_EQ(statusCode, StatusCode::OK) << resp.msg; } + + +TEST_P(MyriadProtocolTests, NoErrorsMessagesWhenLoadNetworkSuccessful) { + if (protocol != NC_USB) { + GTEST_SKIP(); + } + + char buff[8192] = {}; + setbuf(stdout, buff); + + auto network = ie->ReadNetwork(FuncTestUtils::TestModel::convReluNormPoolFcModelFP16.model_xml_str, + FuncTestUtils::TestModel::convReluNormPoolFcModelFP16.weights_blob); + + std::map config = {{CONFIG_KEY(LOG_LEVEL), CONFIG_VALUE(LOG_WARNING)}}; + + InferenceEngine::IExecutableNetwork::Ptr exe_network = + ie->LoadNetwork(network, "MYRIAD", config); + setbuf(stdout, NULL); + + + std::string content(buff); + for (int i = MVLOG_WARN; i < MVLOG_LAST; i++) { + auto found = content.find(mvLogHeader[i]); + ASSERT_TRUE(found == std::string::npos); + } +} + INSTANTIATE_TEST_CASE_P(smoke_VPUConfigProtocolTests, MyriadProtocolTests, ::testing::ValuesIn(myriadProtocols), diff --git a/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/vpu_watchdog_tests.cpp b/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/vpu_watchdog_tests.cpp index bfb729903b6eeb..b9b6057fd55ff4 
100644 --- a/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/vpu_watchdog_tests.cpp +++ b/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/vpu_watchdog_tests.cpp @@ -60,10 +60,10 @@ class MYRIADWatchdog : public BehaviorPluginTest, int total() const {return booted + unbooted;} }; - DevicesState queryDevices() { + DevicesState queryDevices(ncDeviceProtocol_t protocol = NC_ANY_PROTOCOL) { DevicesState devicesState; - devicesState.booted = getAmountOfBootedDevices(NC_USB); - devicesState.unbooted = getAmountOfUnbootedDevices(NC_USB); + devicesState.booted = getAmountOfBootedDevices(protocol); + devicesState.unbooted = getAmountOfUnbootedDevices(protocol); return devicesState; } @@ -120,8 +120,11 @@ class MYRIADWatchdog : public BehaviorPluginTest, } TEST_P(MYRIADWatchdog, canDisableWatchdog) { - - auto startup_devices = queryDevices(); + auto startup_devices = queryDevices(NC_PCIE); + if (startup_devices.unbooted >= 1) { + GTEST_SKIP(); + } + startup_devices = queryDevices(NC_USB); ASSERT_GE(startup_devices.unbooted, 1); auto ctime = Time::now(); @@ -136,7 +139,7 @@ TEST_P(MYRIADWatchdog, canDisableWatchdog) { for (int j = 0; j != 20; j++) { std::this_thread::sleep_for(std::chrono::milliseconds(1000)); std::cout << "Time since boot:" << chrono::duration_cast(Time::now() - ctime).count() << std::endl; - if (queryDevices().booted == startup_devices.booted) { + if (queryDevices(NC_USB).booted == startup_devices.booted) { SUCCEED() << "All devices gets reset"; break; } @@ -149,7 +152,11 @@ TEST_P(MYRIADWatchdog, canDisableWatchdog) { } TEST_P(MYRIADWatchdog, canDetectWhenHostSiteStalled) { - auto startup_devices = queryDevices(); + auto startup_devices = queryDevices(NC_PCIE); + if (startup_devices.unbooted >= 1) { + GTEST_SKIP(); + } + startup_devices = queryDevices(NC_USB); ASSERT_GE(startup_devices.unbooted, 1); auto ctime = Time::now(); @@ -180,12 +187,13 @@ TEST_P(MYRIADWatchdog, canDetectWhenHostSiteStalled) { TEST_P(MYRIADWatchdog, 
watchDogIntervalDefault) { auto startup_devices = queryDevices(); + ASSERT_GE(startup_devices.unbooted, 1); + auto ctime = Time::now(); { InferenceEngine::Core core; auto model = FuncTestUtils::TestModel::convReluNormPoolFcModelFP16; CNNNetwork network = core.ReadNetwork(model.model_xml_str, model.weights_blob); - ASSERT_GE(startup_devices.unbooted, 1); ExecutableNetwork ret; ctime = Time::now(); @@ -212,13 +220,18 @@ TEST_P(MYRIADWatchdog, watchDogIntervalDefault) { } TEST_P(MYRIADWatchdog, canTurnoffWatchDogViaConfig) { - auto startup_devices = queryDevices(); + auto startup_devices = queryDevices(NC_PCIE); + if (startup_devices.unbooted >= 1) { + GTEST_SKIP(); + } + startup_devices = queryDevices(NC_USB); + ASSERT_GE(startup_devices.unbooted, 1); + auto ctime = Time::now(); { InferenceEngine::Core core; auto model = FuncTestUtils::TestModel::convReluNormPoolFcModelFP16; CNNNetwork network = core.ReadNetwork(model.model_xml_str, model.weights_blob); - ASSERT_GE(startup_devices.unbooted, 1); ExecutableNetwork ret; ctime = Time::now(); @@ -232,7 +245,7 @@ TEST_P(MYRIADWatchdog, canTurnoffWatchDogViaConfig) { for (int j = 0; j != 20; j++) { std::this_thread::sleep_for(std::chrono::milliseconds(1000)); std::cout << "Time since boot:" << chrono::duration_cast(Time::now() - ctime).count() << std::endl; - if (queryDevices().booted == startup_devices.booted) { + if (queryDevices(NC_USB).booted == startup_devices.booted) { SUCCEED() << "All devices gets reset"; break; } diff --git a/inference-engine/thirdparty/movidius/mvnc/src/mvnc_data.c b/inference-engine/thirdparty/movidius/mvnc/src/mvnc_data.c index 16004b2a97d4a9..bc199ec48c034d 100644 --- a/inference-engine/thirdparty/movidius/mvnc/src/mvnc_data.c +++ b/inference-engine/thirdparty/movidius/mvnc/src/mvnc_data.c @@ -202,10 +202,9 @@ static ncStatus_t patchSetWdSwitchCommand(char **firmware, size_t *length, const } if(!executeCommandFound) { - mvLog(MVLOG_ERROR, "Fail to find execute command"); + mvLog(MVLOG_WARN, 
"Fail to find execute command"); return NC_ERROR; } - return patchFirmware(firmware, length, executeCommandIdx, g_setWdSwitchCommandMX, sizeof(g_setWdSwitchCommandMX), wdEnable); } @@ -241,10 +240,9 @@ static ncStatus_t patchSetMemTypeCommand(char **firmware, size_t *length, const } if(!callCommandFound) { - mvLog(MVLOG_ERROR, "Fail to find call command"); + mvLog(MVLOG_WARN, "Fail to find call command"); return NC_ERROR; } - return patchFirmware(firmware, length, callCommandIdx, g_setMemTypeCommandMX, sizeof(g_setMemTypeCommandMX), memType); } @@ -266,18 +264,14 @@ ncStatus_t bootDevice(deviceDesc_t* deviceDescToBoot, if(deviceDescToBoot->protocol != X_LINK_PCIE) { sc = patchSetWdSwitchCommand(&firmware, &length, bootOptions.wdEnable); if(sc) { - mvLog(MVLOG_ERROR, "Fail to patch \"Set wd switch value\" command for firmware sc = %d", sc); - free(firmware); - return sc; + mvLog(MVLOG_WARN, "Fail to patch \"Set wd switch value\" command for firmware sc = %d", sc); } - } - - sc = patchSetMemTypeCommand(&firmware, &length, bootOptions.memType); - if(sc) { - mvLog(MVLOG_ERROR, "Fail to patch \"Set memory type\" command for firmware sc = %d", sc); - free(firmware); - return sc; - } + + sc = patchSetMemTypeCommand(&firmware, &length, bootOptions.memType); + if(sc) { + mvLog(MVLOG_WARN, "Fail to patch \"Set memory type\" command for firmware sc = %d", sc); + } + } } XLinkError_t rc = XLinkBootFirmware(deviceDescToBoot, firmware, length); From 9939253fed780c099963aa407f5bef14e50847d7 Mon Sep 17 00:00:00 2001 From: iliya mironov Date: Mon, 7 Sep 2020 10:41:47 +0300 Subject: [PATCH 07/66] Refactored legacy code for mean_scale_values transformations (#1936) * Remove move_to_preproc. Not actual. 
* Updated documentation --- .../convert_model/Converting_Model_General.md | 1 - model-optimizer/automation/package_BOM.txt | 1 - .../extensions/middle/preprocessing.py | 19 +- .../mo/middle/passes/mean_scale_values.py | 81 -------- .../middle/passes/mean_scale_values_test.py | 174 ------------------ model-optimizer/mo/utils/cli_parser.py | 2 +- 6 files changed, 2 insertions(+), 276 deletions(-) delete mode 100644 model-optimizer/mo/middle/passes/mean_scale_values.py delete mode 100644 model-optimizer/mo/middle/passes/mean_scale_values_test.py diff --git a/docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md b/docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md index 90091d5bb7bc22..5bdbf7b9dc4fe8 100644 --- a/docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md +++ b/docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md @@ -109,7 +109,6 @@ Framework-agnostic parameters: --disable_gfusing Turn off fusing of grouped convolutions --enable_concat_optimization Turn on Concat optimization. - --move_to_preprocess Move mean values to IR preprocess section --extensions EXTENSIONS Directory or a comma separated list of directories with extensions. 
To disable all extensions including diff --git a/model-optimizer/automation/package_BOM.txt b/model-optimizer/automation/package_BOM.txt index df8cdba97446dd..df126dd35d99d6 100644 --- a/model-optimizer/automation/package_BOM.txt +++ b/model-optimizer/automation/package_BOM.txt @@ -872,7 +872,6 @@ mo/middle/passes/fusing/mark_unfused_nodes.py mo/middle/passes/fusing/resnet_optimization.py mo/middle/passes/infer.py mo/middle/passes/leaky_relu.py -mo/middle/passes/mean_scale_values.py mo/middle/passes/tensor_names.py mo/middle/pattern_match.py mo/middle/replacement.py diff --git a/model-optimizer/extensions/middle/preprocessing.py b/model-optimizer/extensions/middle/preprocessing.py index 24ab8a5436fa1c..451ceaa8faa488 100644 --- a/model-optimizer/extensions/middle/preprocessing.py +++ b/model-optimizer/extensions/middle/preprocessing.py @@ -16,36 +16,19 @@ from extensions.middle.LeakyReluPattern import LeakyReLU from extensions.middle.pass_separator import PostMiddleStart from mo.graph.graph import Graph -from mo.middle.passes.mean_scale_values import move_scaleshift_to_preprocess from mo.middle.replacement import MiddleReplacementPattern from mo.utils.error import Error from mo.utils.find_inputs import find_inputs from mo.utils.utils import refer_to_faq_msg -class Preprocessing(MiddleReplacementPattern): - enabled = True - force_clean_up = True - - def run_after(self): - return [LeakyReLU] - - def run_before(self): - return [PostMiddleStart] - - def find_and_replace_pattern(self, graph: Graph): - argv = graph.graph['cmd_params'] - if argv.move_to_preprocess: - move_scaleshift_to_preprocess(graph) - - class CaffeMeanFileProcessing(MiddleReplacementPattern): enabled = True force_clean_up = True graph_condition = [lambda graph: graph.graph['fw'] == 'caffe'] def run_after(self): - return [Preprocessing] + return [LeakyReLU] def run_before(self): return [PostMiddleStart] diff --git a/model-optimizer/mo/middle/passes/mean_scale_values.py 
b/model-optimizer/mo/middle/passes/mean_scale_values.py deleted file mode 100644 index e6ffc79f2948c6..00000000000000 --- a/model-optimizer/mo/middle/passes/mean_scale_values.py +++ /dev/null @@ -1,81 +0,0 @@ -""" - Copyright (C) 2018-2020 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import numpy as np - -from mo.graph.graph import Graph -from mo.middle.pattern_match import apply_pattern - - -def move_scaleshift_to_preprocess_action(graph, match): - mean_values = {} - input_op = match['input_op'] - scale_shift = match['scale_shift'] - weights = np.squeeze(match['weights'].value) - biases = np.squeeze(match['biases'].value) - - if graph.graph['cmd_params'].reverse_input_channels: - biases = np.flip(biases) - - if any([x != 1 for x in weights]): - return - - # Keep biases (mean values) for current input as graph attr and remove ScaleShift layer - # Input->data->ScaleShift->scsh_data => Input->scsh_data - graph.remove_edge(input_op.id, input_op.out_node().id) - graph.add_edge(input_op.id, scale_shift.out_node().id, out=0) - graph.remove_edge(scale_shift.id, scale_shift.out_node().id) - - # If bias contains zeros we just remove it - if all([x == 0 for x in biases]): - return - - # In pre-process section, mean_values are subtracted - biases *= -1 - - mean_values.update({input_op.name: np.array(biases)}) - - # Add graph attribute 'mean_values' that stores mean_values per input if exists - if graph.graph.get('mean_values', None): - 
graph.graph['mean_values'].update(mean_values) - else: - graph.graph['mean_values'] = mean_values - - -def move_scaleshift_to_preprocess(graph: Graph): - """ - This function finds scaleshift layer after input layer and if it has weights with ones, it deletes scaleshift layer - and creates graph dict attribute : {'input':np.array(...), 'input2': ... } - """ - apply_pattern( - graph, - nodes=[ - ('weights', dict(kind='data')), - ('biases', dict(kind='data')), - ('input_output', dict(kind='data')), - ('scsh_output', dict(kind='data')), - ('input_op', dict(kind='op', type='Parameter')), - ('scale_shift', dict(kind='op', type='ScaleShift')), - ], - edges=[ - ('input_op', 'input_output'), - ('scale_shift', 'scsh_output'), - ('input_output', 'scale_shift', {'in': 0}), - ('weights', 'scale_shift', {'in': 1}), - ('biases', 'scale_shift', {'in': 2}), - ], - action=move_scaleshift_to_preprocess_action - ) diff --git a/model-optimizer/mo/middle/passes/mean_scale_values_test.py b/model-optimizer/mo/middle/passes/mean_scale_values_test.py deleted file mode 100644 index 3b4410d3f40157..00000000000000 --- a/model-optimizer/mo/middle/passes/mean_scale_values_test.py +++ /dev/null @@ -1,174 +0,0 @@ -""" - Copyright (C) 2018-2020 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" - -import unittest -from argparse import Namespace - -import numpy as np - -from mo.middle.passes.mean_scale_values import move_scaleshift_to_preprocess -from mo.utils.ir_engine.compare_graphs import compare_graphs -from mo.utils.unittest.graph import build_graph - -nodes_attributes = {'node_1': {'type': 'Identity', 'value': None, 'kind': 'op'}, - 'node_2': {'type': 'Identity', 'value': None, 'kind': 'op'}, - 'concat': {'type': 'Concat', 'value': None, 'kind': 'op'}, - 'node_3': {'type': 'Identity', 'value': None, 'kind': 'op'}, - # Placeholders - 'placeholder_1': {'value': None, 'shape': None, 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'}, - 'placeholder_1_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, - 'placeholder_2': {'value': None, 'shape': None, 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'}, - 'placeholder_2_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, - # ScaleShift layer - 'scaleshift_1': {'type': 'ScaleShift', 'value': None, 'kind': 'op', 'op': 'ScaleShift'}, - 'scaleshift_1_w': {'value': None, 'shape': None, 'kind': 'data'}, - 'scaleshift_1_b': {'value': None, 'shape': None, 'kind': 'data'}, - 'scaleshift_1_data': {'value': None, 'shape': None, 'kind': 'data'}, - 'op_output': { 'kind': 'op', 'op': 'Result'}, - 'op_output_1': { 'kind': 'op', 'op': 'Result'} - - } - - -class TestScaleShift_To_Preprocess(unittest.TestCase): - def test_move_scaleshift_to_preprocess_1(self): - graph = build_graph(nodes_attributes, - [('placeholder_1', 'placeholder_1_data'), - ('placeholder_1_data', 'scaleshift_1'), - ('scaleshift_1', 'scaleshift_1_data'), - ('scaleshift_1_w', 'scaleshift_1'), - ('scaleshift_1_b', 'scaleshift_1'), - ('scaleshift_1_data', 'op_output') - ], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, - 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.ones(3)}, - 'scaleshift_1_b': {'shape': np.array([3]), 'value': np.array([-1, -2, -3])}, - }) - 
graph.graph['cmd_params'] = Namespace(reverse_input_channels=False) - del graph['placeholder_1']['placeholder_1_data'][0]['in'] - del graph['scaleshift_1']['scaleshift_1_data'][0]['in'] - - graph_ref = build_graph(nodes_attributes, - [('placeholder_1', 'scaleshift_1_data'), - ('scaleshift_1_data', 'op_output') - ]) - - move_scaleshift_to_preprocess(graph) - self.assertTrue(graph.graph['mean_values'] is not None) - self.assertTrue(np.array_equal(graph.graph['mean_values']['placeholder_1'], np.array([1, 2, 3]))) - - (flag, resp) = compare_graphs(graph, graph_ref, 'scaleshift_1_data') - self.assertTrue(flag, resp) - - def test_move_scaleshift_to_preprocess_2(self): - graph = build_graph(nodes_attributes, - [('placeholder_1', 'placeholder_1_data'), - ('placeholder_1_data', 'scaleshift_1'), - ('scaleshift_1', 'scaleshift_1_data'), - ('scaleshift_1_w', 'scaleshift_1'), - ('scaleshift_1_b', 'scaleshift_1'), - ('scaleshift_1_data', 'op_output'), - ('placeholder_1_data', 'op_output_1') - ], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, - 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.array((1, 2, 3))}, - 'scaleshift_1_b': {'shape': np.array([3]), 'value': np.array([-1, -2, -3])}, - }) - graph.graph['cmd_params'] = Namespace(reverse_input_channels=False) - del graph['placeholder_1']['placeholder_1_data'][0]['in'] - del graph['scaleshift_1']['scaleshift_1_data'][0]['in'] - - graph_ref = build_graph(nodes_attributes, - [('placeholder_1', 'placeholder_1_data'), - ('placeholder_1_data', 'scaleshift_1'), - ('scaleshift_1', 'scaleshift_1_data'), - ('scaleshift_1_w', 'scaleshift_1'), - ('scaleshift_1_b', 'scaleshift_1'), - ('placeholder_1_data', 'op_output_1'), - ('scaleshift_1_data', 'op_output') - ], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, - 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.array((1, 2, 3))}, - 'scaleshift_1_b': {'shape': np.array([3]), 'value': np.array([-1, -2, -3])}, - }) - - 
move_scaleshift_to_preprocess(graph) - self.assertTrue(graph.graph.get('mean_values', None) is None) - - (flag, resp) = compare_graphs(graph, graph_ref, 'scaleshift_1_data') - self.assertTrue(flag, resp) - - def test_move_scaleshift_to_preprocess_3(self): - graph = build_graph(nodes_attributes, - [('placeholder_1', 'placeholder_1_data'), - ('placeholder_1_data', 'scaleshift_1'), - ('scaleshift_1', 'scaleshift_1_data'), - ('scaleshift_1_w', 'scaleshift_1'), - ('scaleshift_1_data', 'op_output'), - ('placeholder_1_data', 'op_output_1') - ], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, - 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.array((1, 2, 3))}, - }) - graph.graph['cmd_params'] = Namespace(reverse_input_channels=False) - del graph['placeholder_1']['placeholder_1_data'][0]['in'] - del graph['scaleshift_1']['scaleshift_1_data'][0]['in'] - - graph_ref = build_graph(nodes_attributes, - [('placeholder_1', 'placeholder_1_data'), - ('placeholder_1_data', 'scaleshift_1'), - ('scaleshift_1', 'scaleshift_1_data'), - ('scaleshift_1_w', 'scaleshift_1'), - ('scaleshift_1_data', 'op_output'), - ('placeholder_1_data', 'op_output_1') - ], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, - 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.array((1, 2, 3))}, - }) - - move_scaleshift_to_preprocess(graph) - self.assertTrue(graph.graph.get('mean_values', None) == None) - - (flag, resp) = compare_graphs(graph, graph_ref, 'scaleshift_1_data') - self.assertTrue(flag, resp) - - def test_move_scaleshift_to_preprocess_4(self): - graph = build_graph(nodes_attributes, - [('placeholder_1', 'placeholder_1_data'), - ('placeholder_1_data', 'scaleshift_1'), - ('scaleshift_1', 'scaleshift_1_data'), - ('scaleshift_1_w', 'scaleshift_1'), - ('scaleshift_1_b', 'scaleshift_1'), - ('scaleshift_1_data', 'op_output') - ], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, - 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.ones(3)}, - 
'scaleshift_1_b': {'shape': np.array([3]), 'value': np.zeros(3)}, - }) - graph.graph['cmd_params'] = Namespace(reverse_input_channels=False) - del graph['placeholder_1']['placeholder_1_data'][0]['in'] - del graph['scaleshift_1']['scaleshift_1_data'][0]['in'] - - graph_ref = build_graph(nodes_attributes, - [('placeholder_1', 'scaleshift_1_data'), - ('scaleshift_1_data', 'op_output') - ]) - - move_scaleshift_to_preprocess(graph) - self.assertTrue(graph.graph.get('mean_values', None) is None) - - (flag, resp) = compare_graphs(graph, graph_ref, 'scaleshift_1_data') - self.assertTrue(flag, resp) diff --git a/model-optimizer/mo/utils/cli_parser.py b/model-optimizer/mo/utils/cli_parser.py index cd001faa8c496b..e8b55a10e661f4 100644 --- a/model-optimizer/mo/utils/cli_parser.py +++ b/model-optimizer/mo/utils/cli_parser.py @@ -283,7 +283,7 @@ def get_common_cli_parser(parser: argparse.ArgumentParser = None): action='store_true') common_group.add_argument('--move_to_preprocess', help='Move mean values to IR preprocess section', - action='store_true') + action=DeprecatedStoreTrue) # we use CanonicalizeDirCheckExistenceAction instead of readable_dirs to handle empty strings common_group.add_argument("--extensions", help="Directory or a comma separated list of directories with extensions. 
To disable all " From 062a4e29003dbe3549ff43b48f42e49fd957dee3 Mon Sep 17 00:00:00 2001 From: Evgeny Latkin Date: Mon, 7 Sep 2020 10:58:36 +0300 Subject: [PATCH 08/66] VPU: update firmware version to 1354 (#2087) --- inference-engine/cmake/vpu_dependencies.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference-engine/cmake/vpu_dependencies.cmake b/inference-engine/cmake/vpu_dependencies.cmake index 3d5f6b35d2581d..262bcd21207bef 100644 --- a/inference-engine/cmake/vpu_dependencies.cmake +++ b/inference-engine/cmake/vpu_dependencies.cmake @@ -19,7 +19,7 @@ set(VPU_SUPPORTED_FIRMWARES usb-ma2450 usb-ma2x8x pcie-ma248x) # Default packages # -set(FIRMWARE_PACKAGE_VERSION 1349) +set(FIRMWARE_PACKAGE_VERSION 1354) set(VPU_CLC_MA2X8X_VERSION "movi-cltools-20.02.0") # From 75b4e193f79f4216ee666e0c214bacfc7cf20956 Mon Sep 17 00:00:00 2001 From: Mikhail Letavin Date: Mon, 7 Sep 2020 11:49:56 +0300 Subject: [PATCH 09/66] [IE CLDNN] Fix the order of dimensions in reshapes inserted by Select primitive (#2082) --- inference-engine/src/cldnn_engine/cldnn_program.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/inference-engine/src/cldnn_engine/cldnn_program.cpp b/inference-engine/src/cldnn_engine/cldnn_program.cpp index fa57a9d8f400bb..d6405ed4c277b8 100644 --- a/inference-engine/src/cldnn_engine/cldnn_program.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_program.cpp @@ -4844,9 +4844,9 @@ void Program::CreateSelectPrimitive(cldnn::topology& topology, InferenceEngine:: auto selectSpecificTensor = [](const InferenceEngine::SizeVector& dims, int def = 1) { switch (dims.size()) { case 0: return cldnn::tensor(cldnn::batch(def), cldnn::feature(def), cldnn::spatial(def, def)); - case 1: return cldnn::tensor(cldnn::batch(def), cldnn::feature(def), cldnn::spatial(dims[0], def)); - case 2: return cldnn::tensor(cldnn::batch(def), cldnn::feature(def), cldnn::spatial(dims[1], dims[0])); - case 3: return cldnn::tensor(cldnn::batch(def), 
cldnn::feature(dims[0]), cldnn::spatial(dims[2], dims[1])); + case 1: return cldnn::tensor(cldnn::batch(dims[0]), cldnn::feature(def), cldnn::spatial(def, def)); + case 2: return cldnn::tensor(cldnn::batch(dims[0]), cldnn::feature(dims[1]), cldnn::spatial(def, def)); + case 3: return cldnn::tensor(cldnn::batch(dims[0]), cldnn::feature(dims[1]), cldnn::spatial(def, dims[2])); case 4: return cldnn::tensor(cldnn::batch(dims[0]), cldnn::feature(dims[1]), cldnn::spatial(dims[3], dims[2])); case 5: return cldnn::tensor(cldnn::batch(dims[0]), cldnn::feature(dims[1]), cldnn::spatial(dims[4], dims[3], dims[2])); case 6: return cldnn::tensor(cldnn::batch(dims[0]), cldnn::feature(dims[1]), cldnn::spatial(dims[5], dims[4], dims[3], dims[2])); From a5389010a9189480779167fabef7c6cc67d82e40 Mon Sep 17 00:00:00 2001 From: Tomasz Socha Date: Mon, 7 Sep 2020 10:50:55 +0200 Subject: [PATCH 10/66] Remove list of the supported ops by ONNX Importer (#2061) --- ...grate_with_customer_application_new_API.md | 2 +- docs/IE_DG/Introduction.md | 2 +- docs/IE_DG/Migration_CoreAPI.md | 2 +- docs/IE_DG/ONNX_Support.md | 19 ++ docs/IE_DG/ONNX_Supported_Ops.md | 215 ------------------ docs/doxygen/ie_docs.xml | 2 +- 6 files changed, 23 insertions(+), 219 deletions(-) create mode 100644 docs/IE_DG/ONNX_Support.md delete mode 100644 docs/IE_DG/ONNX_Supported_Ops.md diff --git a/docs/IE_DG/Integrate_with_customer_application_new_API.md b/docs/IE_DG/Integrate_with_customer_application_new_API.md index 30a763d520aa47..519d6cd7a87d37 100644 --- a/docs/IE_DG/Integrate_with_customer_application_new_API.md +++ b/docs/IE_DG/Integrate_with_customer_application_new_API.md @@ -36,7 +36,7 @@ InferenceEngine::Core core; ```cpp auto network = core.ReadNetwork("Model.xml"); ``` -**Or read the model from ONNX format** (.onnx and .prototxt are supported formats). You can find more information about the ONNX format support in the document [ONNX format support in the OpenVINO™](./ONNX_Supported_Ops.md). 
+**Or read the model from ONNX format** (.onnx and .prototxt are supported formats). You can find more information about the ONNX format support in the document [ONNX format support in the OpenVINO™](./ONNX_Support.md). ```cpp auto network = core.ReadNetwork("model.onnx"); ``` diff --git a/docs/IE_DG/Introduction.md b/docs/IE_DG/Introduction.md index 32d5c733c2e2f4..7409fd4e5b9584 100644 --- a/docs/IE_DG/Introduction.md +++ b/docs/IE_DG/Introduction.md @@ -117,7 +117,7 @@ Please refer to the [Overview of nGraph Flow](nGraph_Flow.md) describing the det Inference Engine is a runtime that delivers a unified API to integrate the inference with application logic: -* Takes a model as an input. The model can be presented in [the native ONNX format](./ONNX_Supported_Ops.md) or in the specific form of [Intermediate Representation (IR)](../MO_DG/IR_and_opsets.md) +* Takes a model as an input. The model can be presented in [the native ONNX format](./ONNX_Support.md) or in the specific form of [Intermediate Representation (IR)](../MO_DG/IR_and_opsets.md) produced by Model Optimizer. * Optimizes inference execution for target hardware. * Delivers inference solution with reduced footprint on embedded inference platforms. 
diff --git a/docs/IE_DG/Migration_CoreAPI.md b/docs/IE_DG/Migration_CoreAPI.md index 1b05854560e59e..f1b68eb06acf57 100644 --- a/docs/IE_DG/Migration_CoreAPI.md +++ b/docs/IE_DG/Migration_CoreAPI.md @@ -45,7 +45,7 @@ read networks using the Core class: ```cpp CNNNetwork network = core.ReadNetwork(input_model); ``` -The Core class also allows reading models from the ONNX format (more information is [here](./ONNX_Supported_Ops.md)): +The Core class also allows reading models from the ONNX format (more information is [here](./ONNX_Support.md)): ```cpp CNNNetwork network = core.ReadNetwork("model.onnx"); ``` diff --git a/docs/IE_DG/ONNX_Support.md b/docs/IE_DG/ONNX_Support.md new file mode 100644 index 00000000000000..85500200046f30 --- /dev/null +++ b/docs/IE_DG/ONNX_Support.md @@ -0,0 +1,19 @@ +# ONNX format support in the OpenVINO™ {#openvino_docs_IE_DG_ONNX_Support} + +Starting from the 2020.4 release, OpenVINO™ supports reading native ONNX models. +`Core::ReadNetwork()` method provides a uniform way to read models from IR or ONNX format, it is a recommended approach to reading models. +Example: + +```cpp +InferenceEngine::Core core; +auto network = core.ReadNetwork("model.onnx"); +``` + +OpenVINO™ doesn't provide a mechanism to specify pre-processing (like mean values subtraction, reverse input channels) for the ONNX format. +If an ONNX model contains dynamic shapes for input, please use the `CNNNetwork::reshape` method for shape specialization. + +Unsupported types of tensors: + +* `string`, +* `complex64`, +* `complex128`. diff --git a/docs/IE_DG/ONNX_Supported_Ops.md b/docs/IE_DG/ONNX_Supported_Ops.md deleted file mode 100644 index 60c2c92e3a93ba..00000000000000 --- a/docs/IE_DG/ONNX_Supported_Ops.md +++ /dev/null @@ -1,215 +0,0 @@ -# ONNX format support in the OpenVINO™ {#openvino_docs_IE_DG_ONNX_Supported_Ops} - -Starting from the 2020.4 release, OpenVINO™ supports reading native ONNX models. 
-`Core::ReadNetwork()` method provides a uniform way to read models from IR or ONNX format, it is a recommended approach to reading models. -Example: - -```cpp -InferenceEngine::Core core; -auto network = core.ReadNetwork("model.onnx"); -``` - -This document describes the list of supported ONNX operations and known limitations of this functionality. - -OpenVINO™ doesn't provide a mechanism to specify pre-processing (like mean values subtraction, reverse input channels) for the ONNX format. -If an ONNX model contains dynamic shapes for input, please use the `CNNNetwork::reshape` method for shape specialization. - -Generally nGraph doesn't support tensors of types: - -* `string`, -* `complex64`, -* `complex128`. - -Value in `()` _parenthesis_ indicates that this op was introduced since the specific -ONNX Standard opset version. -Values seperated by `-` _dash_ indicate the changes were made to that op definition -in the ONNX Standard. If there were minor changes they are usually supported by single -implementation, otherwise there are multiple versions, each appropriate for specific opset -version range. -For example, with the schema represented below the operator `Abs` is supported in all -opset versions starting from `1` to `6` and to the latest opset version. 
- -## Supported Ops: - -| Name | ONNX Opset supported | nGraph opset support | Comment | -|------|----------------------------|---------|-----| -| Abs | 1-6- | 0,1 | -| Acos | 7- | 0,1 | -| Acosh | 9- | 0, | Have to change to only v1 ops (NGONNX-1015) -| Add | 1-6-7- | 0,1 | -| And | 1-7- | 0,1 | -| ArgMax | 1- | 0,1 | -| ArgMin | 1- | 0,1 | -| Asin | 7- | 0,1 | -| Asinh | 9- | 0, | Have to change to only v1 ops (NGONNX-1015) -| Atan | 7 - | 0,1 | -| Atanh | 9- | 0, | Have to change to only v1 ops (NGONNX-1015) -| AveragePool | 1-7- | 0,1 | -| BatchNormalization | 1-6-7- | 0,1 | -| Cast | 1-6-9- | 0,1 | -| Ceil | 1-6- | 0,1 | -| Clip | 1-6- | 0,1 | -| Concat | 1-4- | 0,1 | -| Constant | 1- | 0,1 | -| Conv | 1- | 0,1 | -| ConvInteger | 10- | 0, | -| ConvTranspose | 1- | 0,1 | -| Cos | 7- | 0,1 | -| Cosh | 9- | 0,1 | Have to change to only v1 ops (NGONNX-1015) -| CumSum | 11- | 0, | NGONNX-944 -| DepthToSpace | 1-11- | 0,1 | -| DequantizeLinear | 10- | 0, | -| Div | 1-6-7- | 0,1 | -| Dropout | (1-6-7)-10- | 0,1 | Only for inference. 
-| Elu | 1-6- | 0,1 | -| Equal | 1-7 | 0,1 | -| Erf | 9- | 0,1 | -| Exp | 1-6- | 0,1 | -| Expand | 8- | 0,1 | Only static version -| EyeLike | 9- | 0,1 | -| Flatten | 1-9- | 0,1 | -| Floor | 1-6- | 0,1 | -| Gather | 11- | 0,1 | -| GatherND | 11- | 0, | -| Gemm | 1-6-7-9-11 | 0, | Some tests failing (NGONNX-494), Have to change to only v1 ops (NGONNX-1015) -| GlobalAveragePool | 1- | 0,1 | -| GlobalLpPool | 1-2- | 0, (1) | Not fully v1, need `lp_norm` expressed with v1 ops (NGONNX-1018) -| GlobalMaxPool | 1- | 0,1 | -| Greater | 1-7-9 | 0,1 | -| HardSigmoid | 1-6- | 0,1 | -| Hardmax | 11- | 0, (1) | GOE -| Identity | 1- | 0,1 | -| InstanceNormalization | 1- | 0, (1) | Have to change to only v1 ops (NGONNX-1015) -| LRN | 1- | 0,1 | -| LSTM | 1-7- | 0,(1) | -| LeakyRelu | 1-6- | 0,(1) | (NGONNX-1015) -| Less | 1-7-9 | 0,1 | -| Log | 1-6- | 0,1 | -| LogSoftmax | 1- | 0,1 | -| LpNormalization | 1- | 0, | (NGONNX-1018) need to update some builders -| MatMul | 1-9 | 0,(1) | Uses `v0::Dot`, v0 broadcasts and reshapes, update builders -| MatMulInteger | 10- | 0, | `v0::QuantizedDot` -| Max | 1-6-8- | 0, 1 | -| MaxPool | 1-8- | 0, 1 | -| Mean | 1-6-8- | 0, 1 | -| Min | 1-6-8- | 0,1 | -| Mod | 10- | 1 | -| Mul | (1-6-)7- | 0,1 | -| Neg | 1-6- | 0,1 | -| NonMaxSuppression | 11- | 1 | -| Not | 1- | 0,1 | (aka `v1::LogicalNot`) -| OneHot | (9) | 0, (1) | (NGONNX-1015) -| Or | 1-7- | 0,1 | (aka `v1::LogicalOr`) -| PRelu | 1-6-7-9 | 0, 1 | fused op uses arithmetic and broadcasting from v0 -| Pad | 1-2-11- | 0, (1) | (NGONNX-1015) -| Pow | 1-7- | 0,1 | -| QLinearConv | 10- | 0 | `opset0::QuantizedConvolution` -| QLinearMatMul | 10- | 0 | `v0::QuantizedDot` -| QuantizeLinear | 10- | 0 | `opset0::Quantize` -| Reciprocal | 1-6- | 0, 1| -| ReduceL1 | 1- | 0, | (NGONNX-1018) -| ReduceL2 | 1- | 0,1 | -| ReduceLogSum | 1- | 0,1 | -| ReduceLogSumExp | 1- | 0,1 | -| ReduceMax | 1- | 0,1 | -| ReduceMean | 1- | 0,1 | -| ReduceMin | 1- | 0,1 | -| ReduceProd | 1- | 0,1 | -| ReduceSum | 1- | 0,1 
| -| ReduceSumSquare | 1- | 0,1 | -| Relu | 1-6- | 0,1 | -| Reshape | 1-5- | (0,1) | v1 supports dynamic target shape, but only as Constant? -| ReverseSequence | 10- | 0,1 | -| ScatterND | 11- | 0, | -| Selu | 1-6- | 0, 1 | -| Shape | 1- | 0,1 | -| Shrink | 1- | 0,1 | -| Sigmoid | 1-6- | 0,1 | -| Sign | 9- | 0,1 | -| Sin | 7- | 0,1 | -| Sinh | 9- | 0,1 | -| Size | 1- | 0,1 | -| Slice | 1- | 0,1 | -| Softmax | 1- | 0,1 | -| Softplus | 1- | 0,1 | -| Softsign | 1- | 0,(1) | (NGONNX-1015) -| SpaceToDepth | 1- | 0,1 | -| Split | 1-2- | 0,1 | -| Sqrt | 1-6- | 0,1 | -| Squeeze | 1- | 0,(1) | -| Sub | (1-6-)7- | 0,1 | -| Sum | 1-6-8- | 0,1 | -| Tan | 7- | 0,1 | -| Tanh | 1-6- | 0,1 | -| ThresholdedRelu | 10- | 0,1 | -| TopK | 1- | 0,(1) | Need v0::GOE -| Transpose | 1- | 0,1 | -| Unsqueeze | 1- | 0,1 | -| Xor | 1-7- | 0,1 | -| Where | 9- | 0,1 | - -### Able to implement or WIP -| Name | Opset supported | NGCORE | NGONNX | Comment | -|------|-----------------|--------|--------|---------| -| BitShift | (11)- | | 1014 | -| ConstantOfShape | (9) | 286 | 445 | Dynamic shape input. WIP | -| DynamicQuantizeLinear | (11) | | 786 | -| GRU | - | | 325, 177 | There is no `GRUCell` nor `GRU` in v1 | -| RNN | - | | 323, 287 | `v1::RNNCell`| -| Round | (11)- | | 1008 | `v0::Round` -| Tile | - | NGRAPH-3292 | 368 | Dynamic op. WIP | -| Cast | 1-6- | 290 | 452 | Float16 unsupported. | - -## Unsupported Ops: - -### Lack of support in nGraph -| Name | Opset supported | NGCORE | NGONNX | Comment | -|------|-----------------|--------|--------|---------| -| MaxUnpool | (9) | 286, 289 | 447 | | -| LpPool | - | 291 | 488 | Unsupported by nGraph - only max/avg pooling ops. Need separate kernel. | -| Multinomial | - | 199 | 435 | Lack of PRNG in nGraph. | -| RandomNormal | - | 199 | 434 | Lack of PRNG in nGraph. | -| RandomNormalLike | - | 199 | 434 | Lack of PRNG in nGraph. | -| IsInf | (10) | | 528 | -| StringNormalizer | (10) | | 600 | Need support for `string` data type. 
-| TfIdfVectorizer | (9) | | 523 | -| Det | (11) | | 754 | | - -### Futher analysis needed -| Name | Opset supported | NGCORE | NGONNX | Comment | -|------|-----------------|--------|--------|---------| -| If | - | | 432 | At this moment probably impossible. | -| IsNaN | (9) | | 440 | Hacky way is to generate constant nodes with representations of NaN and compare with them. | -| Loop | - | | 432 | Static loops with some preconditions may be possible, however no idea how to pass graph (proto?) as a _body_ attribute. (what about graph contains `Loop`?) | -| Scan | - | | 433 | Further analysis needed. - determine whether it is possible to import graph passed by op attribute. | -| Einsum | (12) | | | User can define in a language the operation to perform | -| NonZero | (9) | | 472 | Maybe we can leverage TopK here? First count NonZero elements with logic ops and reduction and then TopK. -| Resize | (10-11)- | | 782 | Look like Interpolation over ROIs. Very specialized types of interpolation. -| ScatterElements | (11) | | 977 | -| ScatterND | (11) | | 1020 | -| Unique | (11) | | 761 | - -### Dynamic operators -| Name | Opset supported | NGCORE | NGONNX | Comment | -|------|-----------------|--------|--------|---------| -| Compress | (9-11) | 285 | 438 | Dynamically selected indices | -| Expand | - | NGRAPH-3289 | 367 | Dynamic op. | -| GatherElements | - | | 757 | | -| OneHot | (9) | NGCORE-339 | 486 | Dynamic output shape -| Upsample | (7-9-10-) | 287 | 441 | Dynamic op. **Deprecated** from opset 10 | -| MaxRoiPool | - | 288 | 487 | Dynamic op - Need dynamic slicing. Beside just use _slice/op/concat_ pattern. | -| Reshape | 1-5- | NGRAPH-3290 | 357 | Lack of support for dynamic shape input. Only as a Constant or as an Initializer. | -| Scatter | (9) | 289 | 446 | Dynamic indices input. 
**Deprecated** in ONNX standard | -| RoiAlign | (10) | | 601 | - -### Sequence* ops -| Name | Opset supported | NGCORE | NGONNX | Comment | -|------|-----------------|--------|--------|---------| -| ConcatFromSequence | (11)- | | 1016 | -| SequenceAt | (11) | | 1021 | need further analysis | -| SequenceConstruct | (11) | | 1021 | need further analysis | -| SequenceEmpty | (11) | | 1021 | need further analysis | -| SequenceErase | (11) | | 1021 | need further analysis | -| SequenceInsert | (11) | | 1021 | need further analysis | -| SequenceLength | (11) | | 1021 | need further analysis | -| SplitToSequence | (11) | | 1021 | need further analysis | diff --git a/docs/doxygen/ie_docs.xml b/docs/doxygen/ie_docs.xml index 08344c65c64b28..c474141f86c3b6 100644 --- a/docs/doxygen/ie_docs.xml +++ b/docs/doxygen/ie_docs.xml @@ -267,7 +267,7 @@ - + From ab6d3a52277b2ec5ee45b749b2c5b849d5623302 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20=C5=BByczy=C5=84ski?= Date: Mon, 7 Sep 2020 10:52:06 +0200 Subject: [PATCH 11/66] [IE CLDNN] Grouped convolution kernel improvement (#2063) --- .../convolution/convolution_kernel_imad.cpp | 4 +++- .../cl_kernels/fused_conv_eltwise_gpu_imad.cl | 18 ++++++++++++++++++ .../tests/test_cases/convolution_gpu_test.cpp | 11 ++++++++++- 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad.cpp index dd193c2e14d2e7..bb4158f8fb27b4 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad.cpp @@ -124,6 +124,7 @@ JitConstants ConvolutionKernel_imad::GetJitConstants(const convolution_params& p MakeJitConstant("OWPAD", output.X().pad.Total()), 
MakeJitConstant("OHPAD", output.Y().pad.Total()), MakeJitConstant("SIMD_SIZE", SIMD_SIZE), + MakeJitConstant("FSV", in_fsv), }); if (params.filterSize.x != 3 || params.filterSize.y != 3) { @@ -193,7 +194,8 @@ bool ConvolutionKernel_imad::Validate(const Params& params, const optional_param } auto& newParams = static_cast(params); - if (newParams.groups > 1 && newParams.weights.IFM().v % 4 != 0) + if (newParams.groups > 1 && newParams.weights.IFM().v % 4 != 0 && + newParams.inputs[0].GetLayout() != DataLayout::b_fs_yx_fsv16) return false; size_t min_block_size_x = (newParams.weights.X().v - 1) * newParams.dilation.x + 1; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_imad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_imad.cl index f470f17d424bbc..b4f39ea8922213 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_imad.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_imad.cl @@ -98,6 +98,10 @@ KERNEL (fused_convolution_eltwise_gpu_imad)( int w[NUM_FILTERS]; int in_addr; +#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0)) + int in_start_addr = INPUT0_GET_INDEX(batch, 0, input_y, input_x + sglid); +#endif + #ifdef BLOCK_LOAD_WEIGHTS int weight_addr = (ofmg * CEIL_DIV(FILTER_IFM_NUM, PACK) * FILTER_SIZE_Y * FILTER_SIZE_X * SIMD_SIZE) + (g * FILTER_GROUPS_PITCH / 4); #else @@ -110,7 +114,11 @@ KERNEL (fused_convolution_eltwise_gpu_imad)( for(int kd = 0; kd < CEIL_DIV(FILTER_IFM_NUM, PACK); kd++) { #if INPUT0_LAYOUT_B_FS_YX_FSV16 + #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0)) + int feature_location = kd * PACK + g * FILTER_IFM_NUM; + #else in_addr = INPUT0_GET_INDEX(batch, (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * PACK, input_y, input_x + sglid); + #endif #else #ifdef BLOCK_LOAD_INPUTS in_addr = INPUT0_OFFSET + (kd + g * CEIL_DIV(FILTER_IFM_NUM, 
PACK)) * INPUT0_FEATURE_PITCH + input_y * INPUT0_Y_PITCH + input_x; @@ -119,10 +127,20 @@ KERNEL (fused_convolution_eltwise_gpu_imad)( #endif in_addr += batch * input_size; // adjust for batching #endif + for(uint reg = 0; reg < IN_BLOCK_HEIGHT; reg++) { #if INPUT0_LAYOUT_B_FS_YX_FSV16 + #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0)) + INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &in[reg]; + in_addr = in_start_addr + reg * INPUT0_Y_PITCH * FSV; + for (uint v = 0; v < PACK; v++) { + int f_addr = ((feature_location + v) / FSV + INPUT0_PAD_BEFORE_FEATURE_NUM / FSV) * INPUT0_FEATURE_PITCH * FSV + (feature_location + v) % FSV; + input_int8_arr[v] = conv_input[in_addr + f_addr]; + } + #else in[reg] = *(__global PACKED_TYPE*)(conv_input + in_addr); in_addr += (INPUT0_SIZE_X + IWPAD) * 16; + #endif #else #ifdef BLOCK_LOAD_INPUTS in[reg] = AS_PACKED_TYPE(intel_sub_group_block_read(&conv_input[in_addr])); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp index 5cc8c4b06df066..04ca03f9fa69a0 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp @@ -7168,6 +7168,15 @@ INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16, // Input X size, Input Y size, Input Z size, Input features, Output features, // Kernel size X, Kernel size Y, Kernel size Z, Groups number, Stride, Batch, // Input data format, Implementation name + // Format: b_fs_yx_fsv16 + TestParamType_grouped_convolution_gpu(12, 12, 1, 96, 96, 3, 3, 1, 32, 1, 1, format::b_fs_yx_fsv16, ""), + TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 16, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""), + TestParamType_grouped_convolution_gpu(7, 7, 1, 8, 4, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""), + TestParamType_grouped_convolution_gpu(5, 5, 1, 34, 12, 3, 3, 1, 2, 1, 1, 
format::b_fs_yx_fsv16, ""), + TestParamType_grouped_convolution_gpu(8, 8, 1, 34, 24, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""), + TestParamType_grouped_convolution_gpu(2, 2, 1, 12, 12, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""), + TestParamType_grouped_convolution_gpu(3, 3, 1, 8, 8, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""), + TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 2, 4, format::b_fs_yx_fsv16, ""), // Format: b_fs_yx_fsv4 TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv4, ""), @@ -7188,7 +7197,7 @@ INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16, TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, format::b_fs_yx_fsv16, ""), TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, format::b_fs_yx_fsv16, ""), TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, format::b_fs_yx_fsv16, ""), - + // Format: b_fs_zyx_fsv16 TestParamType_grouped_convolution_gpu(4, 4, 4, 16, 17, 3, 3, 3, 1, 1, 1, format::b_fs_zyx_fsv16, ""), TestParamType_grouped_convolution_gpu(4, 4, 4, 16, 16, 3, 3, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""), From cba089283200112843248a3be38bb2a531dd6a96 Mon Sep 17 00:00:00 2001 From: Chen Xu Date: Mon, 7 Sep 2020 20:35:11 +0800 Subject: [PATCH 12/66] * [CPU] Extend Reduce node to support blocked layouts nC[d]hw8/16C (#580) --- .../src/mkldnn_plugin/CMakeLists.txt | 2 +- .../src/mkldnn_plugin/mkldnn_node.cpp | 13 + .../src/mkldnn_plugin/mkldnn_node.h | 38 +- .../src/mkldnn_plugin/nodes/base.hpp | 2 +- .../src/mkldnn_plugin/nodes/list_tbl.hpp | 12 - .../nodes/mkldnn_reduce_node.cpp | 1943 +++++++++++++++++ .../mkldnn_plugin/nodes/mkldnn_reduce_node.h | 126 ++ .../src/mkldnn_plugin/nodes/reduce.cpp | 406 ---- .../single_layer_tests/reduce_ops.cpp | 104 +- inference-engine/thirdparty/mkl-dnn | 2 +- 10 files changed, 2213 insertions(+), 435 deletions(-) create mode 100644 
inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.h delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/reduce.cpp diff --git a/inference-engine/src/mkldnn_plugin/CMakeLists.txt b/inference-engine/src/mkldnn_plugin/CMakeLists.txt index d33688a57d4133..166818cda371c4 100644 --- a/inference-engine/src/mkldnn_plugin/CMakeLists.txt +++ b/inference-engine/src/mkldnn_plugin/CMakeLists.txt @@ -46,6 +46,7 @@ set(LAYERS ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_normalize_node.cpp ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_scatter_update_node.cpp ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_interpolate_node.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_reduce_node.cpp ${CMAKE_CURRENT_SOURCE_DIR}/nodes/list.cpp ${CMAKE_CURRENT_SOURCE_DIR}/nodes/batch_to_space.cpp @@ -77,7 +78,6 @@ set(LAYERS ${CMAKE_CURRENT_SOURCE_DIR}/nodes/proposal_onnx.cpp ${CMAKE_CURRENT_SOURCE_DIR}/nodes/psroi.cpp ${CMAKE_CURRENT_SOURCE_DIR}/nodes/range.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/nodes/reduce.cpp ${CMAKE_CURRENT_SOURCE_DIR}/nodes/region_yolo.cpp ${CMAKE_CURRENT_SOURCE_DIR}/nodes/reorg_yolo.cpp ${CMAKE_CURRENT_SOURCE_DIR}/nodes/reverse_sequence.cpp diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp index 5f4b45c64b9611..6f877538cdda81 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -124,6 +125,18 @@ static const InferenceEngine::details::caseless_unordered_map { "ScatterElementsUpdate", ScatterElementsUpdate}, { "ScatterNDUpdate", ScatterNDUpdate}, { "Interpolate", Interpolate}, + { "ReduceAnd", ReduceAnd}, + { "ReduceL1", ReduceL1}, + { "ReduceL2", ReduceL2}, + { "ReduceLogSum", ReduceLogSum}, + { "ReduceLogSumExp", ReduceLogSumExp}, + { "ReduceMax", ReduceMax}, + { "ReduceMean", 
ReduceMean}, + { "ReduceMin", ReduceMin}, + { "ReduceOr", ReduceOr}, + { "ReduceProd", ReduceProd}, + { "ReduceSum", ReduceSum}, + { "ReduceSumSquare", ReduceSumSquare}, }; Type TypeFromName(const std::string type) { diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.h b/inference-engine/src/mkldnn_plugin/mkldnn_node.h index 0e5dc7b3b2d5c6..be994f7b46fdee 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.h @@ -77,7 +77,19 @@ enum Type { ScatterUpdate, ScatterElementsUpdate, ScatterNDUpdate, - Interpolate + Interpolate, + ReduceAnd, + ReduceL1, + ReduceL2, + ReduceLogSum, + ReduceLogSumExp, + ReduceMax, + ReduceMean, + ReduceMin, + ReduceOr, + ReduceProd, + ReduceSum, + ReduceSumSquare }; Type TypeFromName(const std::string type); @@ -168,6 +180,30 @@ static std::string NameFromType(Type type) { return "ScatterNDUpdate"; case Interpolate: return "Interpolate"; + case ReduceAnd: + return "ReduceAnd"; + case ReduceL1: + return "ReduceL1"; + case ReduceL2: + return "ReduceL2"; + case ReduceLogSum: + return "ReduceLogSum"; + case ReduceLogSumExp: + return "ReduceLogSumExp"; + case ReduceMax: + return "ReduceMax"; + case ReduceMean: + return "ReduceMean"; + case ReduceMin: + return "ReduceMin"; + case ReduceOr: + return "ReduceOr"; + case ReduceProd: + return "ReduceProd"; + case ReduceSum: + return "ReduceSum"; + case ReduceSumSquare: + return "ReduceSumSquare"; default: return "Unknown"; } diff --git a/inference-engine/src/mkldnn_plugin/nodes/base.hpp b/inference-engine/src/mkldnn_plugin/nodes/base.hpp index 6c7732b40346d6..c0adec071985d0 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/base.hpp +++ b/inference-engine/src/mkldnn_plugin/nodes/base.hpp @@ -102,7 +102,7 @@ class ExtLayerBase: public ILayerExecImpl { const bool isInt8 = (data->getPrecision() == Precision::I8 || data->getPrecision() == Precision::U8); if (conf.layout == ConfLayout::BLK8 || conf.layout == ConfLayout::BLK16) { - if 
(data_dims.size() < 4 && data_dims.size() > 5) + if (data_dims.size() < 4 || data_dims.size() > 5) THROW_IE_EXCEPTION << "Inapplicable blocking layout." << "Tensor should be 4D or 5D."; diff --git a/inference-engine/src/mkldnn_plugin/nodes/list_tbl.hpp b/inference-engine/src/mkldnn_plugin/nodes/list_tbl.hpp index a6d3501251f688..ec6d86cd3a0f85 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/list_tbl.hpp +++ b/inference-engine/src/mkldnn_plugin/nodes/list_tbl.hpp @@ -76,18 +76,6 @@ MKLDNN_EXTENSION_NODE(GatherImpl, Gather); MKLDNN_EXTENSION_NODE(ProposalImpl, Proposal); MKLDNN_EXTENSION_NODE(RangeImpl, Range); MKLDNN_EXTENSION_NODE(SelectImpl, Select); -MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceAnd); -MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceL1); -MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceL2); -MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceLogSum); -MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceLogSumExp); -MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceMax); -MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceMean); -MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceMin); -MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceOr); -MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceProd); -MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceSum); -MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceSumSquare); MKLDNN_EXTENSION_NODE(GatherTreeImpl, GatherTree); MKLDNN_EXTENSION_NODE(PriorBoxClusteredImpl, PriorBoxClustered); MKLDNN_EXTENSION_NODE(SpaceToBatchImpl, SpaceToBatch); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp new file mode 100644 index 00000000000000..23eac1b5ee6638 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp @@ -0,0 +1,1943 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "mkldnn_reduce_node.h" +#include "desc_iterator.hpp" +#include "mkldnn_quantize_node.h" +#include "mkldnn_depthwise_node.h" +#include "mkldnn_activation_node.h" +#include 
+#include +#include +#include +#include +#include +#include +#include "ie_parallel.hpp" +#include + +#include "jit_generator.hpp" +#include "jit_uni_eltwise.hpp" +#include "jit_uni_depthwise.hpp" +#include "jit_uni_quantization.hpp" + +using namespace mkldnn; +using namespace MKLDNNPlugin; +using namespace InferenceEngine; +using namespace mkldnn::impl; +using namespace mkldnn::impl::cpu; +using namespace mkldnn::impl::utils; +using namespace Xbyak; + +#define SET_SRC_DIM_VALUE(batch, channel, depth, height, width) IB = batch; \ + IC = channel; \ + ID = depth; \ + IH = height; \ + IW = width; +#define SET_DST_DIM_VALUE(batch, channel, depth, height, width) OB = batch; \ + OC = channel; \ + OD = depth; \ + OH = height; \ + OW = width; + +#define GET_OFF(field) offsetof(jit_reduce_call_args, field) + +#define GET_PTR_N_PLN const uint8_t *in_ptr_n = in_ptr + src_data_size * ib * IC * ID * IH * IW; \ + uint8_t *out_ptr_n = out_ptr + dst_data_size * ob * OC * OD * OH * OW; +#define GET_PTR_NC_PLN const uint8_t *in_ptr_nc = in_ptr_n + src_data_size * ic * ID * IH * IW; \ + uint8_t *out_ptr_nc = out_ptr_n + dst_data_size * oc * OD * OH * OW; +#define GET_PTR_NCD_PLN const uint8_t *in_ptr_ncd = in_ptr_nc + src_data_size * id * IH * IW; \ + uint8_t *out_ptr_ncd = out_ptr_nc + dst_data_size * od * OH * OW; +#define GET_PTR_NCDH_PLN const uint8_t *in_ptr_ncdh = in_ptr_ncd + src_data_size * ih * IW; \ + uint8_t *out_ptr_ncdh = out_ptr_ncd + dst_data_size * oh * OW; +#define GET_PTR_NCD_BASE_PTR_N_PLN const uint8_t *in_ptr_ncd = in_ptr_n + src_data_size * (ic * ID + id) * IH * IW; \ + uint8_t *out_ptr_ncd = out_ptr_n + dst_data_size * (oc * OD + od) * OH * OW; +#define GET_PTR_N_BLK const uint8_t *in_ptr_n = in_ptr + src_data_size * ib * ICB * ID * IH * IW * blk_size; \ + uint8_t *out_ptr_n = out_ptr + dst_data_size * ob * OCB * OD * OH * OW * blk_size; +#define GET_PTR_NC_BLK const uint8_t *in_ptr_nc = in_ptr_n + src_data_size * icb * ID * IH * IW * blk_size; \ + uint8_t 
*out_ptr_nc = out_ptr_n + dst_data_size * ocb * OD * OH * OW * blk_size; +#define GET_PTR_NCD_BLK const uint8_t *in_ptr_ncd = in_ptr_nc + src_data_size * id * IH * IW * blk_size; \ + uint8_t *out_ptr_ncd = out_ptr_nc + dst_data_size * od * OH * OW * blk_size; +#define GET_PTR_NCDH_BLK const uint8_t *in_ptr_ncdh = in_ptr_ncd + src_data_size * ih * IW * blk_size; \ + uint8_t *out_ptr_ncdh = out_ptr_ncd + dst_data_size * oh * OW * blk_size; +#define GET_PTR_NCDHW_BLK const uint8_t *in_ptr_ncdhw = in_ptr_ncdh + src_data_size * iw * blk_size; \ + uint8_t *out_ptr_ncdhw = out_ptr_ncdh + dst_data_size * ow * blk_size; +#define GET_PTR_NCD_BASE_PTR_N_BLK const uint8_t *in_ptr_ncd = in_ptr_n + src_data_size * (icb * ID + id) * IH * IW * blk_size; \ + uint8_t *out_ptr_ncd = out_ptr_n + dst_data_size * (ocb * OD + od) * OH * OW * blk_size; + +template +struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_generator { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_reduce_kernel_f32) + + explicit jit_uni_reduce_kernel_f32(jit_reduce_config_params jcp) + : jit_uni_reduce_kernel(jcp), jit_generator() { + exp_injector.reset(new jit_uni_eltwise_injector_f32(this, alg_kind::eltwise_exp, 0.f, 0.f)); + + this->preamble(); + + mov(reg_src, ptr[reg_params + GET_OFF(src)]); + mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); + mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); + if (jcp_.planar_layout) + mov(reg_reduce_w, ptr[reg_params + GET_OFF(reduce_w)]); + + if (jcp_.reduce_mode == Reduce::And || jcp_.reduce_mode == Reduce::L1 || jcp_.reduce_mode == Reduce::Max || + jcp_.reduce_mode == Reduce::Min || jcp_.reduce_mode == Reduce::Prod || jcp_.reduce_mode == Reduce::Or) { + mov(reg_table, l_table); + } + + if (isa == cpu::avx512_common || jcp_.reduce_mode == Reduce::And || jcp_.reduce_mode == Reduce::Or) + uni_vpxor(vmm_zero, vmm_zero, vmm_zero); + + if ((isa == cpu::avx512_common && jcp_.reduce_mode == Reduce::And) || jcp_.reduce_mode == Reduce::Or) { + 
uni_vmovups(vmm_aux, table_val(0)); + } + + reduce_main(); + reduce_tail(); + + this->postamble(); + + if (jcp_.reduce_mode == Reduce::And || jcp_.reduce_mode == Reduce::L1 || jcp_.reduce_mode == Reduce::Max || + jcp_.reduce_mode == Reduce::Min || jcp_.reduce_mode == Reduce::Prod || jcp_.reduce_mode == Reduce::Or) { + prepare_aux_table(); + } else if (jcp_.reduce_mode == Reduce::LogSumExp) { + exp_injector->prepare_table(); + } + + ker_ = (decltype(ker_)) this->getCode(); + } + +private: + using Vmm = typename conditional3::type; + size_t vlen = cpu_isa_traits::vlen; + + Xbyak::Address table_val(int index) { return ptr[reg_table + index * vlen]; } + + Xbyak::Reg64 reg_src = r8; + Xbyak::Reg64 reg_dst = r9; + Xbyak::Reg64 reg_work_amount = r10; + Xbyak::Reg64 reg_reduce_w = r11; + Xbyak::Reg64 reg_table = r12; + Xbyak::Reg64 reg_params = abi_param1; + + Xbyak::Reg8 reg_tmp_8 = r13b; + Xbyak::Reg32 reg_tmp_32 = r13d; + Xbyak::Reg64 reg_tmp_64 = r13; + + Vmm vmm_aux = Vmm(0); + Xmm xmm_aux = Xmm(0); + Vmm vmm_src = Vmm(1); + Xmm xmm_src = Xmm(1); + Vmm vmm_dst = Vmm(2); + Xmm xmm_dst = Xmm(2); + Vmm vmm_zero = Vmm(3); + Xmm xmm_zero = Xmm(3); + Vmm vmm_dst_aux = Vmm(4); + Xbyak::Xmm xmm_aux1 = Xbyak::Xmm(5); + Xbyak::Xmm xmm_aux2 = Xbyak::Xmm(6); + Xbyak::Xmm xmm_aux3 = Xbyak::Xmm(7); + + const Xbyak::Opmask k_mask = Xbyak::Opmask(1); + + Xbyak::Label l_table; + + std::shared_ptr> exp_injector; + + inline void reduce_main() { + // ================================================================ + // ***isa: AVX512*** + // Reduce::And (Logical And) + // step 1: init dst 0x3f800000 (1.0f) + // aux 0x3f800000 (1.0f) + // zero 0x00000000 (0.0f) + // step 2: if src equals 0, set mask bit 0, else set mask bit 1 + // step 3: src = mask bit == 0 ? 
zero : aux + // step 4: dst = dst & src + // src mask_bit new_src dst new_dst + // case 1 ~0 1 1.0f 1.0f 1.0f + // case 2 0 0 0.0f 1.0f 0.0f + // case 3 ~0 1 1.0f 0.0f 0.0f + // case 4 0 0 0.0f 0.0f 0.0f + // step 5: loop: offset src, and do step 2 and step 3 + // + // Reduce::Or (Logical Or) + // step 1: init dst 0x00000000 (0.0f) + // aux 0x3f800000 (1.0f) + // zero 0x00000000 (0.0f) + // step 2: if src equals 0, set mask bit 0, else set mask bit 1 + // step 3: src = mask bit == 0 ? zero : aux + // step 4: dst = dst | src + // src mask_bit new_src dst new_dst + // case 1 0 0 0.0f 0.0f 0.0f + // case 2 ~0 1 1.0f 0.0f 1.0f + // case 3 0 0 0.0f 1.0f 1.0f + // case 4 ~0 1 1.0f 1.0f 1.0f + // step 5: loop: offset src, and do step 2 and step 3 + // ================================================================ + // ***isa: OTHER*** + // Reduce::And (Logical And) + // step 1: init dst 0x3f800000 (1.0f) + // step 2: if src equals 0, set it 0x00000000, else set 0xffffffff + // step 3: dst = dst & src + // 0x3f800000 = 0x3f800000 & 0xffffffff (result: 1.0f) + // 0x00000000 = 0x3f800000 & 0x00000000 (result: 0.0f) + // 0x00000000 = 0x00000000 & 0xffffffff (result: 0.0f) + // 0x00000000 = 0x00000000 & 0x00000000 (result: 0.0f) + // step 4: loop: offset src, and do step 2 and step 3 + // + // Reduce::Or (Logical Or) + // step 1: init dst 0x00000000 (0.0f) + // aux 0x3f800000 (1.0f) + // step 2: dst = dst | src + // 0x00000000 = 0x00000000 | 0x00000000 + // A = 0x00000000 | A + // A = A | 0x00000000 + // C = A | B + // (A, B stand for number other than 0x00000000) + // step 3: loop: offset src, and do step 2 + // step 4: if dst equals 0, set it 0x00000000, else set 0xffffffff + // step 5: dst = dst & aux + // 0x00000000 = 0x00000000 & 0x3f800000 (result: 0.0f) + // 0x3f800000 = 0xffffffff & 0x3f800000 (result: 1.0f) + // ================================================================ + Xbyak::Label reduce_to_vector_label; + Xbyak::Label reduce_to_scalar_label; + 
Xbyak::Label reduce_main_end_label; + if (jcp_.planar_layout) { + cmp(reg_reduce_w, 1); // planar layout reducing W + je(reduce_to_scalar_label, T_NEAR); + } + + // store vmm_dst directly into memory after reducing + // cases: [planar layout reducing other dimensions but W] [blocked layout] + L(reduce_to_vector_label); + { + int step = vlen / sizeof(float) < 8 ? 8 : vlen / sizeof(float); + cmp(reg_work_amount, step); + jl(reduce_main_end_label, T_NEAR); //avoid illegal loading and storing + + if (jcp_.reduce_mode == Reduce::L1) { + uni_vmovups(vmm_aux, table_val(1)); + } + + // load + load_dst_vector(); + + Xbyak::Label reduce_loop_label; + Xbyak::Label reduce_loop_end_label; + + // reduce + L(reduce_loop_label); + { + cmp(reg_work_amount, step); + jl(reduce_loop_end_label, T_NEAR); + + load_vector(vmm_src, ptr[reg_src], jcp_.src_dt); + reduce_kernel(vmm_src, vmm_dst); + + if (isa == cpu::sse42) { + load_vector(vmm_src, ptr[reg_src + 4 * jcp_.src_data_size], jcp_.src_dt); + reduce_kernel(vmm_src, vmm_dst_aux); + } + + add(reg_src, step * jcp_.src_data_size); + sub(reg_work_amount, step); + + jmp(reduce_loop_label, T_NEAR); + } + L(reduce_loop_end_label); + + // store + store_dst_vector(); + + jmp(reduce_main_end_label, T_NEAR); + } + + // reduce vector in vmm_dst to be a scalar before store into memory + // cases: [planar layout reducing W] + L(reduce_to_scalar_label); + { + // init dst, dst loading is embedded in horiz_reduce_store + switch (jcp_.reduce_mode) { + case Reduce::And: + case Reduce::Prod: + uni_vmovups(vmm_dst, table_val(0)); + break; + case Reduce::L1: + uni_vmovups(vmm_aux, table_val(1)); + uni_vpxor(vmm_dst, vmm_dst, vmm_dst); + break; + case Reduce::L2: + case Reduce::LogSum: + case Reduce::LogSumExp: + case Reduce::Mean: + case Reduce::Or: + case Reduce::Sum: + case Reduce::SumSquare: + uni_vpxor(vmm_dst, vmm_dst, vmm_dst); + break; + case Reduce::Max: + if (jcp_.dst_dt == memory::f32) + uni_vmovups(vmm_dst, table_val(2)); + else + 
uni_vmovups(vmm_dst, table_val(4)); + break; + case Reduce::Min: + if (jcp_.dst_dt == memory::f32) + uni_vmovups(vmm_dst, table_val(3)); + else + uni_vmovups(vmm_dst, table_val(5)); + break; + default: + assert(!"unsupported reduce mode"); + } + // reduce + reduce_main_loop(); + if (jcp_.reduce_mode == Reduce::Or && isa != avx512_common) { + vcmpneqps(vmm_dst, vmm_dst, vmm_zero); + uni_vandps(vmm_dst, vmm_dst, vmm_aux); + } + // store + // store after horizontal calculation and calculation with loaded original ptr[reg_dst] + load_embedded_horiz_reduce_store(vmm_dst, jcp_.dst_dt); + } + + L(reduce_main_end_label); + } + + inline void reduce_tail() { + if (jcp_.reduce_mode == Reduce::L1) { + uni_vmovups(xmm_aux, table_val(1)); + } + + Xbyak::Label tail_dst_shifted_label; + Xbyak::Label tail_dst_fixed_label; + Xbyak::Label reduce_tail_end_label; + if (jcp_.planar_layout) { + cmp(reg_reduce_w, 1); // planar layout reducing W + je(tail_dst_fixed_label, T_NEAR); + } + + // each src scalar reduce to each dst scalar (X1, X2, X3, ...) -> (Y1, Y2, Y3, ...) 
+ // cases: [planar layout reducing other dimensions but W] [blocked layout concern padding] + L(tail_dst_shifted_label); + { + Xbyak::Label reduce_loop_label; + Xbyak::Label reduce_loop_end_label; + + int step = 1; + L(reduce_loop_label); + { + cmp(reg_work_amount, step); + jl(reduce_loop_end_label, T_NEAR); + + // load + load_scalar(xmm_dst, ptr[reg_dst], jcp_.dst_dt); + load_scalar(xmm_src, ptr[reg_src], jcp_.src_dt); + + // reduce + reduce_kernel_scalar(xmm_src, xmm_dst); + if (jcp_.reduce_mode == Reduce::Or) { + vcmpneqps(xmm_dst, xmm_dst, xmm_zero); + uni_vandps(xmm_dst, xmm_dst, xmm_aux); + } + + // store + store_scalar(ptr[reg_dst], xmm_dst, jcp_.dst_dt); + + add(reg_dst, step * jcp_.dst_data_size); + add(reg_src, step * jcp_.src_data_size); + sub(reg_work_amount, step); + + jmp(reduce_loop_label, T_NEAR); + } + L(reduce_loop_end_label); + + jmp(reduce_tail_end_label, T_NEAR); + } + + // each src scalar reduce to the same dst scalar (X1, X2, X3, ...) -> (Y1) + // cases: [planar layout reducing W] + L(tail_dst_fixed_label); + { + // load + load_scalar(xmm_dst, ptr[reg_dst], jcp_.dst_dt); + + Xbyak::Label reduce_loop_label; + Xbyak::Label reduce_loop_end_label; + + // reduce + int step = 1; + L(reduce_loop_label); + { + cmp(reg_work_amount, step); + jl(reduce_loop_end_label, T_NEAR); + + load_scalar(xmm_src, ptr[reg_src], jcp_.src_dt); + + reduce_kernel_scalar(xmm_src, xmm_dst); + if (jcp_.reduce_mode == Reduce::Or) { + vcmpneqps(xmm_dst, xmm_dst, xmm_zero); + uni_vandps(xmm_dst, xmm_dst, xmm_aux); + } + + add(reg_src, step * jcp_.src_data_size); + sub(reg_work_amount, step); + + jmp(reduce_loop_label, T_NEAR); + } + L(reduce_loop_end_label); + + // store + store_scalar(ptr[reg_dst], xmm_dst, jcp_.dst_dt); + add(reg_dst, step * jcp_.dst_data_size); + } + + L(reduce_tail_end_label); + } + + inline void reduce_main_loop() { + Xbyak::Label reduce_loop_label; + Xbyak::Label reduce_loop_end_label; + + int step = vlen / sizeof(float) < 8 ? 
8 : vlen / sizeof(float); + L(reduce_loop_label); + { + cmp(reg_work_amount, step); + jl(reduce_loop_end_label, T_NEAR); + + load_vector(vmm_src, ptr[reg_src], jcp_.src_dt); + reduce_kernel(vmm_src, vmm_dst); + + if (isa == cpu::sse42) { + load_vector(vmm_src, ptr[reg_src + 4 * jcp_.src_data_size], jcp_.src_dt); + reduce_kernel(vmm_src, vmm_dst); + } + + add(reg_src, step * jcp_.src_data_size); + sub(reg_work_amount, step); + + jmp(reduce_loop_label, T_NEAR); + } + L(reduce_loop_end_label); + } + + inline void reduce_kernel(Vmm vmm_src, Vmm vmm_dst) { + switch (jcp_.reduce_mode) { + case Reduce::And: + if (isa == avx512_common) { + vcmpps(k_mask, vmm_src, vmm_zero, _cmp_neq_uq); + vblendmps(vmm_src | k_mask, vmm_zero, vmm_aux); + } else { + vcmpneqps(vmm_src, vmm_src, vmm_zero); + } + uni_vandps(vmm_dst, vmm_dst, vmm_src); + break; + case Reduce::L1: + uni_vandps(vmm_src, vmm_src, vmm_aux); + uni_vaddps(vmm_dst, vmm_dst, vmm_src); + break; + case Reduce::LogSum: + case Reduce::Mean: + case Reduce::Sum: + uni_vaddps(vmm_dst, vmm_dst, vmm_src); + break; + case Reduce::Max: + uni_vmaxps(vmm_dst, vmm_dst, vmm_src); + break; + case Reduce::Min: + uni_vminps(vmm_dst, vmm_dst, vmm_src); + break; + case Reduce::L2: + case Reduce::SumSquare: + uni_vmulps(vmm_src, vmm_src, vmm_src); + uni_vaddps(vmm_dst, vmm_dst, vmm_src); + break; + case Reduce::LogSumExp: + exp_injector->compute_vector_range(vmm_src.getIdx(), vmm_src.getIdx() + 1); + uni_vaddps(vmm_dst, vmm_dst, vmm_src); + break; + case Reduce::Or: + if (isa == avx512_common) { + vcmpps(k_mask, vmm_src, vmm_zero, _cmp_neq_uq); + vblendmps(vmm_src | k_mask, vmm_zero, vmm_aux); + } + uni_vorps(vmm_dst, vmm_dst, vmm_src); + break; + case Reduce::Prod: + uni_vmulps(vmm_dst, vmm_dst, vmm_src); + break; + default: + assert(!"unsupported reduce mode"); + } + } + + inline void reduce_kernel_scalar(Xmm xmm_src, Xmm xmm_dst) { + switch (jcp_.reduce_mode) { + case Reduce::And: + vcmpneqps(xmm_src, xmm_src, xmm_zero); + 
uni_vandps(xmm_dst, xmm_dst, xmm_src); + break; + case Reduce::L1: + uni_vandps(xmm_src, xmm_src, xmm_aux); + uni_vaddps(xmm_dst, xmm_dst, xmm_src); + break; + case Reduce::LogSum: + case Reduce::Mean: + case Reduce::Sum: + uni_vaddps(xmm_dst, xmm_dst, xmm_src); + break; + case Reduce::Max: + uni_vmaxps(xmm_dst, xmm_dst, xmm_src); + break; + case Reduce::Min: + uni_vminps(xmm_dst, xmm_dst, xmm_src); + break; + case Reduce::L2: + case Reduce::SumSquare: + uni_vmulps(xmm_src, xmm_src, xmm_src); + uni_vaddps(xmm_dst, xmm_dst, xmm_src); + break; + case Reduce::LogSumExp: + exp_injector->compute_vector_range(xmm_src.getIdx(), xmm_src.getIdx() + 1); + uni_vaddps(xmm_dst, xmm_dst, xmm_src); + break; + case Reduce::Or: + uni_vorps(xmm_dst, xmm_dst, xmm_src); + break; + case Reduce::Prod: + uni_vmulps(xmm_dst, xmm_dst, xmm_src); + break; + default: + assert(!"unsupported reduce mode"); + } + } + + inline void load_dst_vector() { + load_vector(vmm_dst, ptr[reg_dst], jcp_.dst_dt); + if (isa == cpu::sse42) + load_vector(vmm_dst_aux, ptr[reg_dst + 4 * jcp_.dst_data_size], jcp_.dst_dt); + } + + inline void store_dst_vector() { + if (jcp_.reduce_mode == Reduce::Or && isa != avx512_common) { + vcmpneqps(vmm_dst, vmm_dst, vmm_zero); + uni_vandps(vmm_dst, vmm_dst, vmm_aux); + if (isa == cpu::sse42) { + vcmpneqps(vmm_dst_aux, vmm_dst_aux, vmm_zero); + uni_vandps(vmm_dst_aux, vmm_dst_aux, vmm_aux); + } + } + store_vector(ptr[reg_dst], vmm_dst, jcp_.dst_dt); + if (isa == cpu::sse42) + store_vector(ptr[reg_dst + 4 * jcp_.dst_data_size], vmm_dst_aux, jcp_.dst_dt); + } + + inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) { + switch (src_dt) { + case memory::f32: + case memory::s32: + uni_vmovups(vmm_src, op); + break; + case memory::s8: + uni_vpmovsxbd(vmm_src, op); + break; + case memory::u8: + uni_vpmovzxbd(vmm_src, op); + break; + default: + assert(!"unknown src_dt"); + } + + if (src_dt != memory::f32) + uni_vcvtdq2ps(vmm_src, vmm_src); + } + + 
inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, memory::data_type src_dt) {  // emits a one-element load from `op`, converted to float unless src_dt is f32
+        switch (src_dt) {
+            case memory::f32:
+            case memory::s32:
+                movss(xmm_src, op);
+                break;
+            case memory::s8:
+                movsx(reg_tmp_32, op);  // sign-extend through the GPR scratch register
+                movq(xmm_src, reg_tmp_64);
+                break;
+            case memory::u8:
+                movzx(reg_tmp_32, op);  // zero-extend through the GPR scratch register
+                movq(xmm_src, reg_tmp_64);
+                break;
+            default:
+                assert(!"unknown src_dt");
+        }
+
+        if (src_dt != data_type::f32) {
+            uni_vcvtdq2ps(xmm_src, xmm_src);
+        }
+    }
+    /**
+     * Emits a full-vector store of `vmm_dst` to `op` as `dst_dt`.
+     * Non-f32 destinations are first converted to int32 in place (so vmm_dst
+     * is clobbered); s8/u8 are then narrowed, via vpmovsdb/vpmovusdb on
+     * avx512 or via pack instructions (plus vpermq when isa is not sse42)
+     * otherwise.
+     */
+    inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, memory::data_type dst_dt) {
+        Xmm xmm_dst = Xmm(vmm_dst.getIdx());
+        Ymm ymm_dst = Ymm(vmm_dst.getIdx());
+
+        if (dst_dt != memory::f32) {
+            uni_vcvtps2dq(vmm_dst, vmm_dst);
+        }
+
+        switch (dst_dt) {
+            case memory::f32:
+            case memory::s32:
+                uni_vmovups(op, vmm_dst);
+                break;
+            case memory::s8:
+                if (isa == avx512_common) {
+                    vmaxps(vmm_dst, vmm_zero, vmm_dst);  // NOTE(review): float max applied after the int32 conversion above; looks like it would also affect negative s8 results - confirm intent
+                    vpmovsdb(op, vmm_dst);
+                } else {
+                    uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
+                    if (isa != cpu::sse42)
+                        vpermq(ymm_dst, ymm_dst, 0x08);
+                    uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst);
+                    if (isa != cpu::sse42)
+                        vmovq(op, xmm_dst);
+                    else
+                        movd(op, xmm_dst);
+                }
+                break;
+            case memory::u8:
+                if (isa == avx512_common) {
+                    vpmovusdb(op, vmm_dst);
+                } else {
+                    uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst);
+                    if (isa != cpu::sse42)
+                        vpermq(ymm_dst, ymm_dst, 0x08);
+                    uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst);
+                    if (isa != cpu::sse42)
+                        vmovq(op, xmm_dst);
+                    else
+                        movd(op, xmm_dst);
+                }
+                break;
+            default:
+                assert(!"unknown dst_dt");
+        }
+    }
+    /**
+     * Emits a one-element store of the low lane of `xmm_dst` to `op` as
+     * `dst_dt`. Non-f32 values are converted to int32 first; s8/u8 are packed
+     * down and written through reg_tmp_8. xmm_dst is clobbered.
+     */
+    inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, memory::data_type dst_dt) {
+        if (dst_dt != memory::f32) {
+            uni_vcvtps2dq(xmm_dst, xmm_dst);
+        }
+
+        switch (dst_dt) {
+            case memory::f32:
+            case memory::s32:
+                movss(op, xmm_dst);
+                break;
+            case memory::s8:
+                uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
+                uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
+                movq(reg_tmp_64, xmm_dst);
+                mov(op, reg_tmp_8);
+                break;
+            case memory::u8:
+                uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
+                uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
+                movq(reg_tmp_64, xmm_dst);
+                mov(op, reg_tmp_8);
+                break;
+            default:
+                assert(!"unknown dst_dt");
+        }
+    }
+    /**
+     * Folds `vmm_dst` down to a single 128-bit register with horiz_ps
+     * (2 halves on avx2, 4 quarters otherwise) and hands it to
+     * load_embedded_horiz_store. On sse42 the register is passed through
+     * unchanged. Uses xmm_aux1..xmm_aux3 as scratch.
+     */
+    inline void load_embedded_horiz_reduce_store(Vmm vmm_dst, memory::data_type dst_dt) {
+        if (isa == cpu::sse42) {
+            load_embedded_horiz_store(vmm_dst, dst_dt);
+        } else if (isa == cpu::avx2) {
+            Xbyak::Ymm ymm_dst = Xbyak::Ymm(vmm_dst.getIdx());
+            vextractf128(xmm_aux1, ymm_dst, 0);
+            vextractf128(xmm_aux2, ymm_dst, 1);
+            horiz_ps(xmm_aux1, xmm_aux2);
+            load_embedded_horiz_store(xmm_aux1, dst_dt);
+        } else {
+            Xbyak::Zmm zmm_dst = Xbyak::Zmm(vmm_dst.getIdx());
+            vextractf32x4(xmm_aux1, zmm_dst, 0);
+            vextractf32x4(xmm_aux2, zmm_dst, 1);
+            horiz_ps(xmm_aux1, xmm_aux2);
+            vextractf32x4(xmm_aux2, zmm_dst, 2);
+            vextractf32x4(xmm_aux3, zmm_dst, 3);
+            horiz_ps(xmm_aux2, xmm_aux3);
+            horiz_ps(xmm_aux1, xmm_aux2);
+            load_embedded_horiz_store(xmm_aux1, dst_dt);
+        }
+    }
+    /**
+     * Reduces the four lanes of `xmm_dst` into lane 0, combines that with the
+     * value currently stored at [reg_dst] and writes the result back there.
+     */
+    inline void load_embedded_horiz_store(Xbyak::Xmm xmm_dst, memory::data_type dst_dt) {
+        movshdup(xmm_aux3, xmm_dst);          // dst:1,2,3,4; aux3:2,2,4,4
+        horiz_ps(xmm_dst, xmm_aux3);          // dst:f(1,2),f(2,2),f(3,4),f(4,4)
+        movhlps(xmm_aux3, xmm_dst);           // aux3:f(3,4),f(4,4),4,4
+        horiz_ps(xmm_dst, xmm_aux3);          // dst:f(1,2,3,4),...
+        switch (dst_dt) {  // per type: load the stored value, merge it with horiz_ps, convert/narrow, store
+            case memory::f32:
+                movss(xmm_aux3, ptr[reg_dst]);
+                horiz_ps(xmm_dst, xmm_aux3);
+                movss(ptr[reg_dst], xmm_dst);
+                break;
+            case memory::s32:
+                movss(xmm_aux3, ptr[reg_dst]);
+                uni_vcvtdq2ps(xmm_aux3, xmm_aux3);
+                horiz_ps(xmm_dst, xmm_aux3);
+                uni_vcvtps2dq(xmm_dst, xmm_dst);
+                movss(ptr[reg_dst], xmm_dst);
+                break;
+            case memory::u8:
+                vpbroadcastb(xmm_aux3, ptr[reg_dst]);  // NOTE(review): VEX-only mnemonic, but this helper is also reached when isa == cpu::sse42 - confirm
+                uni_vpmovzxbd(xmm_aux3, xmm_aux3);
+                uni_vcvtdq2ps(xmm_aux3, xmm_aux3);
+                horiz_ps(xmm_dst, xmm_aux3);
+                uni_vcvtps2dq(xmm_dst, xmm_dst);
+                uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
+                uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
+                pextrb(ptr[reg_dst], xmm_dst, 0);
+                break;
+            case memory::s8:
+                vpbroadcastb(xmm_aux3, ptr[reg_dst]);  // NOTE(review): same sse42 concern as the u8 case
+                uni_vpmovsxbd(xmm_aux3, xmm_aux3);
+                uni_vcvtdq2ps(xmm_aux3, xmm_aux3);
+                horiz_ps(xmm_dst, xmm_aux3);
+                uni_vcvtps2dq(xmm_dst, xmm_dst);
+                uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
+                uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
+                pextrb(ptr[reg_dst], xmm_dst, 0);
+                break;
+            default:
+                assert(!"unknown dst_dt");
+        }
+    }
+    /**
+     * Emits the 128-bit combining instruction for jcp_.reduce_mode:
+     * xmm = xmm (op) op, where the operation is and/add/max/min/or/mul.
+     * All sum-like modes (L1, L2, LogSum, Mean, Sum, SumSquare, LogSumExp)
+     * combine with addps.
+     */
+    inline void horiz_ps(const Xmm& xmm, const Operand& op) {
+        switch (jcp_.reduce_mode) {
+            case Reduce::And:
+                andps(xmm, op);
+                break;
+            case Reduce::L1:
+            case Reduce::L2:
+            case Reduce::LogSum:
+            case Reduce::Mean:
+            case Reduce::Sum:
+            case Reduce::SumSquare:
+            case Reduce::LogSumExp:
+                addps(xmm, op);
+                break;
+            case Reduce::Max:
+                maxps(xmm, op);
+                break;
+            case Reduce::Min:
+                minps(xmm, op);
+                break;
+            case Reduce::Or:
+                orps(xmm, op);
+                break;
+            case Reduce::Prod:
+                mulps(xmm, op);
+                break;
+            default:
+                assert(!"unsupported reduce mode");
+        }
+    }
+    /**
+     * Emits the constant table at label l_table (64-byte aligned): each
+     * aux_vals entry is repeated vlen / sizeof(float) times, in the order
+     * float_one, float_abs, float_min, float_max, int32_min, int32_max.
+     */
+    void prepare_aux_table() {
+        auto broadcast_int = [&](int val) {
+            for (size_t d = 0; d < vlen / sizeof(float); ++d) {
+                dd(val);
+            }
+        };
+
+        align(64);
+        L(l_table);
+
+        broadcast_int(aux_vals.float_one);
+        broadcast_int(aux_vals.float_abs);
+        broadcast_int(aux_vals.float_min);
+        broadcast_int(aux_vals.float_max);
+        broadcast_int(aux_vals.int32_min);
+        broadcast_int(aux_vals.int32_max);
+    }
+
+    const struct aux_vals_type {  // bit patterns written into the table by prepare_aux_table()
+        int float_one = 0x3f800000; // 1.0f
+        int float_abs = 0x7fffffff; // mask to make positive
+        int float_min = 0xff7fffff; // float minimum
+        int float_max = 0x7f7fffff; // float maximum
+        int int32_min = 0xcf000000; // -2^31 presented in float
+        int int32_max = 0x4effffff; // 2^31-1 presented in float
+    } aux_vals;
+};
+/**
+ * JIT generator for the post-processing pass over the destination buffer.
+ * The constructor emits the whole kernel: it loads dst / work_amount /
+ * divisor (and reduce_c for non-planar layouts) from the call-args struct,
+ * runs reduce_post_main(), adds reduce_post_tail() for planar layouts, and
+ * emits the log injector table for LogSum / LogSumExp.
+ * NOTE(review): the template parameter list is missing in this copy of the
+ * patch (extraction artefact) - restore it from the upstream source.
+ */
+template
+struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, public jit_generator {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_reduce_post_kernel_f32)
+
+    explicit jit_uni_reduce_post_kernel_f32(jit_reduce_config_params jcp)
+    : jit_uni_reduce_post_kernel(jcp), jit_generator() {
+        log_injector.reset(new jit_uni_eltwise_injector_f32(this, alg_kind::eltwise_log, 0.f, 0.f));
+
+        this->preamble();
+
+        mov(reg_dst, ptr[reg_params + GET_OFF(dst)]);
+        mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]);
+        mov(reg_divisor, ptr[reg_params + GET_OFF(divisor)]);
+        if (!jcp_.planar_layout)
+            mov(reg_reduce_c, ptr[reg_params + GET_OFF(reduce_c)]);
+
+        if (isa == cpu::avx512_common)
+            uni_vpxor(vmm_zero, vmm_zero, vmm_zero);  // vmm_zero is read by the avx512 s8 path of store_vector
+
+        reduce_post_main();
+        if (jcp_.planar_layout)
+            reduce_post_tail();
+
+        this->postamble();
+
+        if (jcp_.reduce_mode == Reduce::LogSum || jcp_.reduce_mode == Reduce::LogSumExp) {
+            log_injector->prepare_table();
+        }
+
+        ker_ = (decltype(ker_)) this->getCode();
+    }
+
+private:
+    using Vmm = typename conditional3::type;
+    size_t vlen = cpu_isa_traits::vlen;
+
+    Xbyak::Reg64 reg_dst = r8;  // current position in the destination buffer
+    Xbyak::Reg64 reg_work_amount = r9;  // remaining element count
+    Xbyak::Reg64 reg_divisor = r10;  // address of the Mean divisor (dereferenced by uni_vbroadcastss)
+    Xbyak::Reg64 reg_reduce_c = r11;  // compared against 1 in reduce_post_main()
+    Xbyak::Reg64 reg_params = abi_param1;
+
+    Xbyak::Reg8 reg_tmp_8 = r12b;  // three views of the same scratch GPR
+    Xbyak::Reg32 reg_tmp_32 = r12d;
+    Xbyak::Reg64 reg_tmp_64 = r12;
+
+    Vmm vmm_aux = Vmm(0);
+    Xmm xmm_aux = Xmm(0);
+    Vmm vmm_dst = Vmm(1);
+    Xmm xmm_dst = Xmm(1);
+    Vmm vmm_zero = Vmm(2);
+    Vmm vmm_dst_aux = Vmm(3);
+    Xbyak::Xmm xmm_aux1 = Xbyak::Xmm(4);
+    Xbyak::Xmm xmm_aux2
= Xbyak::Xmm(5);
+    Xbyak::Xmm xmm_aux3 = Xbyak::Xmm(6);
+
+    std::shared_ptr> log_injector;
+    /**
+     * Emits the vectorised part of the post-processing kernel in two stages.
+     * Stage 1 (skipped for planar layouts, and unless reduce_c == 1):
+     * horizontally reduces each block of `step` elements at [reg_dst] to one
+     * value, then reloads reg_dst / reg_work_amount from the call args.
+     * Stage 2 (only for L2, Mean, LogSum, LogSumExp): applies
+     * reduce_map_kernel to the destination in chunks of `step` elements.
+     * `step` is the lane count of one vector, but never less than 8.
+     */
+    inline void reduce_post_main() {
+        Xbyak::Label reduce_channel_label;
+        Xbyak::Label reduce_map_label;
+        if (jcp_.planar_layout) {
+            jmp(reduce_map_label, T_NEAR);
+        } else {
+            cmp(reg_reduce_c, 1);
+            jne(reduce_map_label, T_NEAR);
+        }
+
+        // further reduce channel block since reduce channel batch has already been reduced
+        // (X1, X2, X3, X4, X5, X6, X7, X8) -> (Y1, N/A, N/A, N/A, N/A, N/A, N/A, N/A)
+        // cases: [blocked layout reducing channel dimensions]
+        L(reduce_channel_label);
+        {
+            Xbyak::Label reduce_loop_label;
+            Xbyak::Label reduce_loop_end_label;
+
+            int step = vlen / sizeof(float) < 8 ? 8 : vlen / sizeof(float);
+            L(reduce_loop_label);
+            {
+                cmp(reg_work_amount, step);
+                jl(reduce_loop_end_label, T_NEAR);
+
+                // load
+                load_vector(vmm_dst, ptr[reg_dst], jcp_.dst_dt);
+                if (isa == cpu::sse42)
+                    load_vector(vmm_dst_aux, ptr[reg_dst + 4 * jcp_.dst_data_size], jcp_.dst_dt);
+
+                // reduce and store
+                horiz_reduce_store(vmm_dst, jcp_.dst_dt);
+                if (isa == cpu::sse42)
+                    load_embedded_horiz_reduce_store(vmm_dst_aux, jcp_.dst_dt);  // merges the second half into the value just stored
+
+                add(reg_dst, step * jcp_.dst_data_size);
+                sub(reg_work_amount, step);
+
+                jmp(reduce_loop_label, T_NEAR);
+            }
+            L(reduce_loop_end_label);
+
+            mov(reg_dst, ptr[reg_params + GET_OFF(dst)]);  // rewind for the map stage below
+            mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]);
+        }
+
+        // reduce map for value in dst memory
+        // cases: [ReduceL2] [ReduceLogSum] [ReduceLogSumExp] [ReduceMean]
+        L(reduce_map_label);
+        {
+            if (jcp_.reduce_mode == Reduce::L2 || jcp_.reduce_mode == Reduce::Mean ||
+                jcp_.reduce_mode == Reduce::LogSum || jcp_.reduce_mode == Reduce::LogSumExp) {
+                if (jcp_.reduce_mode == Reduce::Mean)
+                    uni_vbroadcastss(vmm_aux, ptr[reg_divisor]);
+
+                Xbyak::Label reduce_loop_label;
+                Xbyak::Label reduce_loop_end_label;
+
+                int step = vlen / sizeof(float) < 8 ? 8 : vlen / sizeof(float);
+                L(reduce_loop_label);
+                {
+                    cmp(reg_work_amount, step);
+                    jl(reduce_loop_end_label, T_NEAR);
+
+                    // load
+                    load_vector(vmm_dst, ptr[reg_dst], jcp_.dst_dt);
+                    if (isa == cpu::sse42)
+                        load_vector(vmm_dst_aux, ptr[reg_dst + 4 * jcp_.dst_data_size], jcp_.dst_dt);
+
+                    // reduce
+                    reduce_map_kernel(vmm_dst);
+                    if (isa == cpu::sse42)
+                        reduce_map_kernel(vmm_dst_aux);
+
+                    // store
+                    store_vector(ptr[reg_dst], vmm_dst, jcp_.dst_dt);
+                    if (isa == cpu::sse42)
+                        store_vector(ptr[reg_dst + 4 * jcp_.dst_data_size], vmm_dst_aux, jcp_.dst_dt);
+
+                    add(reg_dst, step * jcp_.dst_data_size);
+                    sub(reg_work_amount, step);
+
+                    jmp(reduce_loop_label, T_NEAR);
+                }
+                L(reduce_loop_end_label);
+            }
+        }
+    }
+    /**
+     * Emits the scalar tail loop (one element per iteration) that applies
+     * reduce_map_kernel_scalar to whatever reg_work_amount has left. Only
+     * emits code for L2, Mean, LogSum and LogSumExp; the constructor calls it
+     * for planar layouts only.
+     */
+    inline void reduce_post_tail() {
+        // reduce map for tail in dst memory
+        // cases: [ReduceL2] [ReduceLogSum] [ReduceLogSumExp] [ReduceMean] in planar layout
+        if (jcp_.reduce_mode == Reduce::L2 || jcp_.reduce_mode == Reduce::Mean ||
+            jcp_.reduce_mode == Reduce::LogSum || jcp_.reduce_mode == Reduce::LogSumExp) {
+            if (jcp_.reduce_mode == Reduce::Mean)
+                uni_vbroadcastss(xmm_aux, ptr[reg_divisor]);
+
+            Xbyak::Label reduce_loop_label;
+            Xbyak::Label reduce_loop_end_label;
+
+            int step = 1;
+            L(reduce_loop_label);
+            {
+                cmp(reg_work_amount, step);
+                jl(reduce_loop_end_label, T_NEAR);
+
+                // load
+                load_scalar(xmm_dst, ptr[reg_dst], jcp_.dst_dt);
+
+                // reduce
+                reduce_map_kernel_scalar(xmm_dst);
+
+                // store
+                store_scalar(ptr[reg_dst], xmm_dst, jcp_.dst_dt);
+
+                add(reg_dst, step * jcp_.dst_data_size);
+                sub(reg_work_amount, step);
+
+                jmp(reduce_loop_label, T_NEAR);
+            }
+            L(reduce_loop_end_label);
+        }
+    }
+    /**
+     * Emits the element-wise finishing step on a full vector: divide by
+     * vmm_aux for Mean, square root for L2, log injector for LogSum and
+     * LogSumExp. Emits nothing for other modes.
+     */
+    inline void reduce_map_kernel(Vmm vmm_dst) {
+        if (jcp_.reduce_mode == Reduce::Mean)
+            uni_vdivps(vmm_dst, vmm_dst, vmm_aux);
+        else if (jcp_.reduce_mode == Reduce::L2)
+            uni_vsqrtps(vmm_dst, vmm_dst);
+        else if (jcp_.reduce_mode == Reduce::LogSum || jcp_.reduce_mode == Reduce::LogSumExp)
+            log_injector->compute_vector_range(vmm_dst.getIdx(),
vmm_dst.getIdx() + 1);
+    }
+    /**
+     * 128-bit counterpart of reduce_map_kernel, used by the scalar tail loop:
+     * divide by xmm_aux for Mean, square root for L2, log injector for
+     * LogSum and LogSumExp.
+     */
+    inline void reduce_map_kernel_scalar(Xmm xmm_dst) {
+        if (jcp_.reduce_mode == Reduce::Mean)
+            uni_vdivps(xmm_dst, xmm_dst, xmm_aux);
+        else if (jcp_.reduce_mode == Reduce::L2)
+            uni_vsqrtps(xmm_dst, xmm_dst);
+        else if (jcp_.reduce_mode == Reduce::LogSum || jcp_.reduce_mode == Reduce::LogSumExp)
+            log_injector->compute_vector_range(xmm_dst.getIdx(), xmm_dst.getIdx() + 1);
+    }
+    /**
+     * Emits a full-vector load from `op` into `vmm_src`.
+     * f32/s32 are moved as-is, s8 is sign-extended and u8 zero-extended to
+     * 32-bit lanes; every type except f32 is then converted to float.
+     */
+    inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
+        switch (src_dt) {
+            case memory::f32:
+            case memory::s32:
+                uni_vmovups(vmm_src, op);
+                break;
+            case memory::s8:
+                uni_vpmovsxbd(vmm_src, op);
+                break;
+            case memory::u8:
+                uni_vpmovzxbd(vmm_src, op);
+                break;
+            default:
+                assert(!"unknown src_dt");
+        }
+
+        if (src_dt != memory::f32)
+            uni_vcvtdq2ps(vmm_src, vmm_src);
+    }
+    /**
+     * Emits a one-element load from `op` into `xmm_src`; s8/u8 are extended
+     * through the reg_tmp_* scratch register. Every type except f32 is then
+     * converted to float.
+     */
+    inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
+        switch (src_dt) {
+            case memory::f32:
+            case memory::s32:
+                movss(xmm_src, op);
+                break;
+            case memory::s8:
+                movsx(reg_tmp_32, op);
+                movq(xmm_src, reg_tmp_64);
+                break;
+            case memory::u8:
+                movzx(reg_tmp_32, op);
+                movq(xmm_src, reg_tmp_64);
+                break;
+            default:
+                assert(!"unknown src_dt");
+        }
+
+        if (src_dt != data_type::f32) {
+            uni_vcvtdq2ps(xmm_src, xmm_src);
+        }
+    }
+    /**
+     * Emits a full-vector store of `vmm_dst` to `op` as `dst_dt`.
+     * Non-f32 destinations are first converted to int32 in place (so vmm_dst
+     * is clobbered); s8/u8 are then narrowed, via vpmovsdb/vpmovusdb on
+     * avx512 or via pack instructions (plus vpermq when isa is not sse42)
+     * otherwise.
+     */
+    inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, memory::data_type dst_dt) {
+        Xmm xmm_dst = Xmm(vmm_dst.getIdx());
+        Ymm ymm_dst = Ymm(vmm_dst.getIdx());
+
+        if (dst_dt != memory::f32) {
+            uni_vcvtps2dq(vmm_dst, vmm_dst);
+        }
+
+        switch (dst_dt) {
+            case memory::f32:
+            case memory::s32:
+                uni_vmovups(op, vmm_dst);
+                break;
+            case memory::s8:
+                if (isa == avx512_common) {
+                    vmaxps(vmm_dst, vmm_zero, vmm_dst);  // NOTE(review): float max applied after the int32 conversion above; looks like it would also affect negative s8 results - confirm intent
+                    vpmovsdb(op, vmm_dst);
+                } else {
+                    uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
+                    if (isa != cpu::sse42)
+                        vpermq(ymm_dst, ymm_dst, 0x08);
+                    uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst);
+                    if (isa != cpu::sse42)
+                        vmovq(op, xmm_dst);
+                    else
+                        movd(op, xmm_dst);
+                }
+                break;
+            case memory::u8:
+                if (isa == avx512_common) {
+                    vpmovusdb(op, vmm_dst);
+                } else {
+                    uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst);
+                    if (isa != cpu::sse42)
+                        vpermq(ymm_dst, ymm_dst, 0x08);
+                    uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst);
+                    if (isa != cpu::sse42)
+                        vmovq(op, xmm_dst);
+                    else
+                        movd(op, xmm_dst);
+                }
+                break;
+            default:
+                assert(!"unknown dst_dt");
+        }
+    }
+    /**
+     * Emits a one-element store of the low lane of `xmm_dst` to `op` as
+     * `dst_dt`. Non-f32 values are converted to int32 first; s8/u8 are packed
+     * down and written through reg_tmp_8. xmm_dst is clobbered.
+     */
+    inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, memory::data_type dst_dt) {
+        if (dst_dt != memory::f32) {
+            uni_vcvtps2dq(xmm_dst, xmm_dst);
+        }
+
+        switch (dst_dt) {
+            case memory::f32:
+            case memory::s32:
+                movss(op, xmm_dst);
+                break;
+            case memory::s8:
+                uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
+                uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
+                movq(reg_tmp_64, xmm_dst);
+                mov(op, reg_tmp_8);
+                break;
+            case memory::u8:
+                uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
+                uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
+                movq(reg_tmp_64, xmm_dst);
+                mov(op, reg_tmp_8);
+                break;
+            default:
+                assert(!"unknown dst_dt");
+        }
+    }
+    /**
+     * Folds `vmm_dst` down to a single 128-bit register with horiz_ps
+     * (2 halves on avx2, 4 quarters otherwise) and hands it to horize_store,
+     * which overwrites [reg_dst]. On sse42 the register is passed through
+     * unchanged. Uses xmm_aux1..xmm_aux3 as scratch.
+     */
+    inline void horiz_reduce_store(Vmm vmm_dst, memory::data_type dst_dt) {
+        if (isa == cpu::sse42) {
+            horize_store(vmm_dst, dst_dt);
+        } else if (isa == cpu::avx2) {
+            Xbyak::Ymm ymm_dst = Xbyak::Ymm(vmm_dst.getIdx());
+            vextractf128(xmm_aux1, ymm_dst, 0);
+            vextractf128(xmm_aux2, ymm_dst, 1);
+            horiz_ps(xmm_aux1, xmm_aux2);
+            horize_store(xmm_aux1, dst_dt);
+        } else {
+            Xbyak::Zmm zmm_dst = Xbyak::Zmm(vmm_dst.getIdx());
+            vextractf32x4(xmm_aux1, zmm_dst, 0);
+            vextractf32x4(xmm_aux2, zmm_dst, 1);
+            horiz_ps(xmm_aux1, xmm_aux2);
+            vextractf32x4(xmm_aux2, zmm_dst, 2);
+            vextractf32x4(xmm_aux3, zmm_dst, 3);
+            horiz_ps(xmm_aux2, xmm_aux3);
+            horiz_ps(xmm_aux1, xmm_aux2);
+            horize_store(xmm_aux1, dst_dt);
+        }
+    }
+    /**
+     * Reduces the four lanes of `xmm_dst` into lane 0 and stores that single
+     * value to [reg_dst] as `dst_dt`, overwriting what was there.
+     */
+    inline void horize_store(Xbyak::Xmm xmm_dst, memory::data_type dst_dt) {
+        movshdup(xmm_aux3, xmm_dst);          // dst:1,2,3,4; aux3:2,2,4,4
+        horiz_ps(xmm_dst, xmm_aux3);          // dst:f(1,2),f(2,2),f(3,4),f(4,4)
+        movhlps(xmm_aux3, xmm_dst);           // aux3:f(3,4),f(4,4),4,4
+        horiz_ps(xmm_dst, xmm_aux3);          // dst:f(1,2,3,4),...
+        switch (dst_dt) {  // lane 0 now holds the reduced value; convert/narrow and store it
+            case memory::f32:
+                movss(ptr[reg_dst], xmm_dst);
+                break;
+            case memory::s32:
+                uni_vcvtps2dq(xmm_dst, xmm_dst);
+                movss(ptr[reg_dst], xmm_dst);
+                break;
+            case memory::u8:
+                uni_vcvtps2dq(xmm_dst, xmm_dst);
+                uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
+                uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
+                pextrb(ptr[reg_dst], xmm_dst, 0);
+                break;
+            case memory::s8:
+                uni_vcvtps2dq(xmm_dst, xmm_dst);
+                uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
+                uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
+                pextrb(ptr[reg_dst], xmm_dst, 0);
+                break;
+            default:
+                assert(!"unknown dst_dt");
+        }
+    }
+    /**
+     * Same folding as horiz_reduce_store, but the final step goes through
+     * load_embedded_horiz_store, which merges the result with the value
+     * already stored at [reg_dst] instead of overwriting it.
+     */
+    inline void load_embedded_horiz_reduce_store(Vmm vmm_dst, memory::data_type dst_dt) {
+        if (isa == cpu::sse42) {
+            load_embedded_horiz_store(vmm_dst, dst_dt);
+        } else if (isa == cpu::avx2) {
+            Xbyak::Ymm ymm_dst = Xbyak::Ymm(vmm_dst.getIdx());
+            vextractf128(xmm_aux1, ymm_dst, 0);
+            vextractf128(xmm_aux2, ymm_dst, 1);
+            horiz_ps(xmm_aux1, xmm_aux2);
+            load_embedded_horiz_store(xmm_aux1, dst_dt);
+        } else {
+            Xbyak::Zmm zmm_dst = Xbyak::Zmm(vmm_dst.getIdx());
+            vextractf32x4(xmm_aux1, zmm_dst, 0);
+            vextractf32x4(xmm_aux2, zmm_dst, 1);
+            horiz_ps(xmm_aux1, xmm_aux2);
+            vextractf32x4(xmm_aux2, zmm_dst, 2);
+            vextractf32x4(xmm_aux3, zmm_dst, 3);
+            horiz_ps(xmm_aux2, xmm_aux3);
+            horiz_ps(xmm_aux1, xmm_aux2);
+            load_embedded_horiz_store(xmm_aux1, dst_dt);
+        }
+    }
+    /**
+     * Reduces the four lanes of `xmm_dst` into lane 0, combines that with the
+     * value currently stored at [reg_dst] and writes the result back there.
+     */
+    inline void load_embedded_horiz_store(Xbyak::Xmm xmm_dst, memory::data_type dst_dt) {
+        movshdup(xmm_aux3, xmm_dst);          // dst:1,2,3,4; aux3:2,2,4,4
+        horiz_ps(xmm_dst, xmm_aux3);          // dst:f(1,2),f(2,2),f(3,4),f(4,4)
+        movhlps(xmm_aux3, xmm_dst);           // aux3:f(3,4),f(4,4),4,4
+        horiz_ps(xmm_dst, xmm_aux3);          // dst:f(1,2,3,4),...
+ switch (dst_dt) { + case memory::f32: + movss(xmm_aux3, ptr[reg_dst]); + horiz_ps(xmm_dst, xmm_aux3); + movss(ptr[reg_dst], xmm_dst); + break; + case memory::s32: + movss(xmm_aux3, ptr[reg_dst]); + uni_vcvtdq2ps(xmm_aux3, xmm_aux3); + horiz_ps(xmm_dst, xmm_aux3); + uni_vcvtps2dq(xmm_dst, xmm_dst); + movss(ptr[reg_dst], xmm_dst); + break; + case memory::u8: + vpbroadcastb(xmm_aux3, ptr[reg_dst]); + uni_vpmovzxbd(xmm_aux3, xmm_aux3); + uni_vcvtdq2ps(xmm_aux3, xmm_aux3); + horiz_ps(xmm_dst, xmm_aux3); + uni_vcvtps2dq(xmm_dst, xmm_dst); + uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); + uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); + pextrb(ptr[reg_dst], xmm_dst, 0); + break; + case memory::s8: + vpbroadcastb(xmm_aux3, ptr[reg_dst]); + uni_vpmovsxbd(xmm_aux3, xmm_aux3); + uni_vcvtdq2ps(xmm_aux3, xmm_aux3); + horiz_ps(xmm_dst, xmm_aux3); + uni_vcvtps2dq(xmm_dst, xmm_dst); + uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); + uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); + pextrb(ptr[reg_dst], xmm_dst, 0); + break; + default: + assert(!"unknown dst_dt"); + } + } + + inline void horiz_ps(const Xmm& xmm, const Operand& op) { + switch (jcp_.reduce_mode) { + case Reduce::And: + andps(xmm, op); + break; + case Reduce::L1: + case Reduce::L2: + case Reduce::LogSum: + case Reduce::Mean: + case Reduce::Sum: + case Reduce::SumSquare: + case Reduce::LogSumExp: + addps(xmm, op); + break; + case Reduce::Max: + maxps(xmm, op); + break; + case Reduce::Min: + minps(xmm, op); + break; + case Reduce::Or: + orps(xmm, op); + break; + case Reduce::Prod: + mulps(xmm, op); + break; + default: + assert(!"unsupported reduce mode"); + } + } +}; + +MKLDNNReduceNode::MKLDNNReduceNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) + : MKLDNNNode(layer, eng, cache) {} + +void MKLDNNReduceNode::getSupportedDescriptors() { + if (!descs.empty()) + return; + + if (getParentEdges().size() != 2) + THROW_IE_EXCEPTION << "Reduce layer with name " << getName() << " gets 
incorrect number of input edges!"; + if (getChildEdges().empty()) + THROW_IE_EXCEPTION << "Reduce layer with name " << getName() << " gets incorrect number of output edges!"; + + if (getParentEdgeAt(REDUCE_INDEXES)->getDims().ndims() != 1) { + THROW_IE_EXCEPTION << "Reduce layer with name " << getName() << " gets incorrect index vector dimension! Index vector should be 1 dimension."; + } + + auto *layer = getCnnLayer().get(); + keep_dims = layer->GetParamAsBool("keep_dims", false); + + if (keep_dims) { + if (getParentEdgeAt(REDUCE_DATA)->getDims().ndims() != getChildEdgeAt(0)->getDims().ndims()) + THROW_IE_EXCEPTION << "Reduce layer with name " << getName() << "gets incorrect number of input/output dimensions!"; + } else { + if (getParentEdgeAt(REDUCE_DATA)->getDims().ndims() <= getChildEdgeAt(0)->getDims().ndims()) + THROW_IE_EXCEPTION << "Reduce layer with name " << getName() << "gets incorrect number of input/output dimensions!"; + } + + Type reduce_mode = getType(); + if (reduce_mode == ReduceAnd) reduceMode = Reduce::And; + else if (reduce_mode == ReduceL1) reduceMode = Reduce::L1; + else if (reduce_mode == ReduceL2) reduceMode = Reduce::L2; + else if (reduce_mode == ReduceLogSum) reduceMode = Reduce::LogSum; + else if (reduce_mode == ReduceLogSumExp) reduceMode = Reduce::LogSumExp; + else if (reduce_mode == ReduceMax) reduceMode = Reduce::Max; + else if (reduce_mode == ReduceMean) reduceMode = Reduce::Mean; + else if (reduce_mode == ReduceMin) reduceMode = Reduce::Min; + else if (reduce_mode == ReduceOr) reduceMode = Reduce::Or; + else if (reduce_mode == ReduceProd) reduceMode = Reduce::Prod; + else if (reduce_mode == ReduceSum) reduceMode = Reduce::Sum; + else if (reduce_mode == ReduceSumSquare) reduceMode = Reduce::SumSquare; + else + THROW_IE_EXCEPTION << "Reduce layer with name " << getName() << " gets unsupported Reduce layer type!"; +} + +void MKLDNNReduceNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; 
+ + Precision inputPrecision = getCnnLayer()->insData[REDUCE_DATA].lock()->getPrecision(); + Precision outputPrecision = getCnnLayer()->outData[0]->getPrecision(); + + if (inputPrecision == Precision::BF16) inputPrecision = Precision::FP32; + if (outputPrecision == Precision::BF16) outputPrecision = Precision::FP32; + + auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(inputPrecision); + auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(outputPrecision); + + input_prec = inputPrecision; + output_prec = outputPrecision; + src_data_size = MKLDNNExtensionUtils::sizeOfDataType(inputDataType); + dst_data_size = MKLDNNExtensionUtils::sizeOfDataType(outputDataType); + + InferenceEngine::LayerConfig config; + config.dynBatchSupport = false; + config.inConfs.resize(2); + config.outConfs.resize(1); + config.inConfs[REDUCE_DATA].constant = false; + config.inConfs[REDUCE_INDEXES].constant = false; + config.outConfs[0].constant = false; + config.inConfs[REDUCE_DATA].inPlace = -1; + config.inConfs[REDUCE_INDEXES].inPlace = -1; + config.outConfs[0].inPlace = -1; + + auto pushDesc = [&](memory::format inFormat, memory::format outFormat, memory::data_type inDataType, memory::data_type outDataType) { + config.inConfs[REDUCE_DATA].desc = MKLDNNMemoryDesc(getParentEdgeAt(REDUCE_DATA)->getDims(), inDataType, inFormat); + config.inConfs[REDUCE_INDEXES].desc = MKLDNNMemoryDesc(getParentEdgeAt(REDUCE_INDEXES)->getDims(), memory::s32, memory::x); + config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outDataType, outFormat); + supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, outFormat}); + }; + + jit_mode = (mayiuse(cpu::sse42)) && getParentEdgeAt(REDUCE_DATA)->getDims().ndims() <= 5 && + (inputPrecision == Precision::FP32 || inputPrecision == Precision::I32 || inputPrecision == Precision::U8 || inputPrecision == Precision::I8) && + (outputPrecision == Precision::FP32 || outputPrecision == Precision::I32 || 
outputPrecision == Precision::U8 || outputPrecision == Precision::I8); + if (jit_mode) { + pushDesc(MKLDNNMemory::GetPlainFormat(memory::dims(getParentEdgeAt(REDUCE_DATA)->getDims().ndims())), + MKLDNNMemory::GetPlainFormat(memory::dims(getChildEdgeAt(0)->getDims().ndims())), inputDataType, outputDataType); + if (keep_dims) { + if (getParentEdgeAt(REDUCE_DATA)->getDims().ndims() == 4 && getParentEdgeAt(REDUCE_DATA)->getDims().ToSizeVector()[1] > 1) { + if (mayiuse(cpu::avx512_common)) { + pushDesc(memory::nChw16c, memory::nChw16c, inputDataType, outputDataType); + } else if (mayiuse(cpu::avx2) || mayiuse(cpu::sse42)) { + pushDesc(memory::nChw8c, memory::nChw8c, inputDataType, outputDataType); + } + } else if (getParentEdgeAt(REDUCE_DATA)->getDims().ndims() == 5 && getParentEdgeAt(REDUCE_DATA)->getDims().ToSizeVector()[1] > 1) { + if (mayiuse(cpu::avx512_common)) { + pushDesc(memory::nCdhw16c, memory::nCdhw16c, inputDataType, outputDataType); + } else if (mayiuse(cpu::avx2) || mayiuse(cpu::sse42)) { + pushDesc(memory::nCdhw8c, memory::nCdhw8c, inputDataType, outputDataType); + } + } + } + } else { + pushDesc(MKLDNNMemory::GetPlainFormat(memory::dims(getParentEdgeAt(REDUCE_DATA)->getDims().ndims())), + MKLDNNMemory::GetPlainFormat(memory::dims(getChildEdgeAt(0)->getDims().ndims())), memory::f32, memory::f32); + } +} + +void MKLDNNReduceNode::createPrimitive() { + auto &dstMemPtr = getChildEdgeAt(0)->getMemoryPtr(); + auto &srcDataMemPtr = getParentEdgeAt(REDUCE_DATA)->getMemoryPtr(); + auto &srcIndexesMemPtr = getParentEdgeAt(REDUCE_INDEXES)->getMemoryPtr(); + if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr()) + THROW_IE_EXCEPTION << "Reduce layer with name " << getName() << "didn't allocate destination memory."; + if (!srcDataMemPtr || !srcDataMemPtr->GetPrimitivePtr() || !srcIndexesMemPtr || !srcIndexesMemPtr->GetPrimitivePtr()) + THROW_IE_EXCEPTION << "Reduce layer with name " << getName() << "didn't allocate input memory."; + if (getSelectedPrimitiveDescriptor() == 
nullptr) + THROW_IE_EXCEPTION << "Reduce layer with name " << getName() << "didn't set preferable primitive descriptor."; + + auto selectedPD = getSelectedPrimitiveDescriptor(); + Layout selected_layout = selectedPD->getConfig().inConfs[REDUCE_DATA].desc.getLayout(); + planar_layout = MKLDNNMemory::GetPlainLayout(getParentEdgeAt(REDUCE_DATA)->getDims()) == selected_layout; + + auto jcp = jit_reduce_config_params(); + jcp.src_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(selectedPD->getConfig().inConfs[REDUCE_DATA].desc.getPrecision()); + jcp.dst_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(selectedPD->getConfig().outConfs[0].desc.getPrecision()); + jcp.src_data_size = MKLDNNExtensionUtils::sizeOfDataType(jcp.src_dt); + jcp.dst_data_size = MKLDNNExtensionUtils::sizeOfDataType(jcp.dst_dt); + jcp.planar_layout = planar_layout; + jcp.reduce_mode = reduceMode; + + if (mayiuse(cpu::avx512_common)) { + reduce_kernel.reset(new jit_uni_reduce_kernel_f32(jcp)); + reduce_post_kernel.reset(new jit_uni_reduce_post_kernel_f32(jcp)); + blk_size = 16; + } else if (mayiuse(cpu::avx2)) { + reduce_kernel.reset(new jit_uni_reduce_kernel_f32(jcp)); + reduce_post_kernel.reset(new jit_uni_reduce_post_kernel_f32(jcp)); + blk_size = 8; + } else if (mayiuse(cpu::sse42)) { + reduce_kernel.reset(new jit_uni_reduce_kernel_f32(jcp)); + reduce_post_kernel.reset(new jit_uni_reduce_post_kernel_f32(jcp)); + blk_size = 8; + } + + jit_mode = jit_mode && reduce_kernel; +} + +void MKLDNNReduceNode::execute(mkldnn::stream strm) { + auto &dstMemPtr = getChildEdgeAt(0)->getMemoryPtr(); + auto &srcMemPtr = getParentEdgeAt(REDUCE_DATA)->getMemoryPtr(); + auto &srcIndexesMemPtr = getParentEdgeAt(REDUCE_INDEXES)->getMemoryPtr(); + + const auto idx_data = reinterpret_cast(srcIndexesMemPtr->GetData()); + size_t dst_size = dstMemPtr->GetSize(); + src_dims = getParentEdgeAt(REDUCE_DATA)->getDesc().getDims(); + src_strides = getParentEdgeAt(REDUCE_DATA)->getDesc().getBlockingDesc().getStrides(); + dims_size 
= src_dims.size(); + calc_process_dst_dims(idx_data); + + if (dims_size <= 5) { + if (dims_size == 5) { + SET_SRC_DIM_VALUE(src_dims[0], src_dims[1], src_dims[2], src_dims[3], src_dims[4]); + SET_DST_DIM_VALUE(process_dst_dims[0], process_dst_dims[1], process_dst_dims[2], process_dst_dims[3], process_dst_dims[4]); + } else if (dims_size == 4) { + SET_SRC_DIM_VALUE(src_dims[0], src_dims[1], 1, src_dims[2], src_dims[3]); + SET_DST_DIM_VALUE(process_dst_dims[0], process_dst_dims[1], 1, process_dst_dims[2], process_dst_dims[3]); + } else if (dims_size == 3) { + SET_SRC_DIM_VALUE(1, src_dims[0], 1, src_dims[1], src_dims[2]); + SET_DST_DIM_VALUE(1, process_dst_dims[0], 1, process_dst_dims[1], process_dst_dims[2]); + } else if (dims_size == 2) { + SET_SRC_DIM_VALUE(1, 1, 1, src_dims[0], src_dims[1]); + SET_DST_DIM_VALUE(1, 1, 1, process_dst_dims[0], process_dst_dims[1]); + } else { + SET_SRC_DIM_VALUE(1, src_dims[0], 1, 1, 1); + SET_DST_DIM_VALUE(1, process_dst_dims[0], 1, 1, 1); + } + + ReduceN = IB != OB && OB == 1; + ReduceC = IC != OC && OC == 1; + ReduceD = ID != OD && OD == 1; + ReduceH = IH != OH && OH == 1; + ReduceW = IW != OW && OW == 1; + } + + const uint8_t *src_data = reinterpret_cast(srcMemPtr->GetData()) + + srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding * + MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(srcMemPtr->GetDescriptor().data.data_type)); + uint8_t *dst_data = reinterpret_cast(dstMemPtr->GetData()) + + dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding * + MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(dstMemPtr->GetDescriptor().data.data_type)); + if (jit_mode) { + reduce_type(src_data, dst_data, dst_size); + } else { + if (planar_layout) { + auto in_ptr = reinterpret_cast(src_data); + auto out_ptr = reinterpret_cast(dst_data); + reduce_ref(in_ptr, out_ptr); + } else { + THROW_IE_EXCEPTION << "Reduce layer with name " << getName() << "only supports plain layout on machine w/o 
sse42."; + } + } +} + +void MKLDNNReduceNode::reduce_type(const uint8_t *in_ptr, uint8_t *out_ptr, size_t dst_size) { + init_dst_data(out_ptr, dst_size); + + if (planar_layout) { + reduce_PLN(in_ptr, out_ptr); + } else { + if ((reduceMode == Reduce::And || reduceMode == Reduce::LogSumExp || reduceMode == Reduce::Max || + reduceMode == Reduce::Min || reduceMode == Reduce::Prod) && ReduceC) { + reduce_BLK_concern_padding(in_ptr, out_ptr); + } else { + reduce_BLK(in_ptr, out_ptr); + } + } +} + +void MKLDNNReduceNode::reduce_PLN(const uint8_t *in_ptr, uint8_t *out_ptr) { + for (size_t ib = 0; ib < IB; ib++) { + size_t ob = ReduceN ? 0 : ib; GET_PTR_N_PLN; + if (!ReduceC && !ReduceD && ReduceH && ReduceW) { + parallel_for2d(IC, ID, [&](size_t ic, size_t id) { + size_t oc = ic, od = id; GET_PTR_NCD_BASE_PTR_N_PLN; + reduce_kernel_process(in_ptr_ncd, out_ptr_ncd, IH * IW, 1); + }); + } else if (ReduceH && ReduceW) { + for (size_t ic = 0; ic < IC; ic++) { + size_t oc = ReduceC ? 0 : ic; GET_PTR_NC_PLN; + for (size_t id = 0; id < ID; id++) { + size_t od = ReduceD ? 0 : id; GET_PTR_NCD_PLN; + reduce_kernel_process(in_ptr_ncd, out_ptr_ncd, IH * IW, 1); + } + } + } else if (!ReduceH && ReduceW) { + for (size_t ic = 0; ic < IC; ic++) { + size_t oc = ReduceC ? 0 : ic; GET_PTR_NC_PLN; + for (size_t id = 0; id < ID; id++) { + size_t od = ReduceD ? 0 : id; GET_PTR_NCD_PLN; + parallel_for(IH, [&](size_t ih){ + size_t oh = ih; GET_PTR_NCDH_PLN; + reduce_kernel_process(in_ptr_ncdh, out_ptr_ncdh, IW, 1); + }); + } + } + } else if (ReduceW) { + for (size_t ic = 0; ic < IC; ic++) { + size_t oc = ReduceC ? 0 : ic; GET_PTR_NC_PLN; + for (size_t id = 0; id < ID; id++) { + size_t od = ReduceD ? 0 : id; GET_PTR_NCD_PLN; + for (size_t ih = 0; ih < IH; ih++) { + size_t oh = ReduceH ? 0 : ih; GET_PTR_NCDH_PLN; + reduce_kernel_process(in_ptr_ncdh, out_ptr_ncdh, IW, 1); + } + } + } + } else { + for (size_t ic = 0; ic < IC; ic++) { + size_t oc = ReduceC ? 
0 : ic; GET_PTR_NC_PLN;
+                for (size_t id = 0; id < ID; id++) {
+                    size_t od = ReduceD ? 0 : id; GET_PTR_NCD_PLN;
+                    for (size_t ih = 0; ih < IH; ih++) {
+                        size_t oh = ReduceH ? 0 : ih; GET_PTR_NCDH_PLN;
+                        for (size_t ibw = 0; ibw < IW / blk_size; ibw++) {  // W is not reduced here: process whole blk_size chunks of the row
+                            size_t obw = ibw;
+                            reduce_kernel_process(in_ptr_ncdh + ibw * blk_size * src_data_size,
+                                                  out_ptr_ncdh + obw * blk_size * dst_data_size, blk_size, 0);
+                        }
+                        size_t tail_start = IW / blk_size * blk_size;  // first element not covered by a whole chunk
+                        reduce_kernel_process(in_ptr_ncdh + tail_start * src_data_size, out_ptr_ncdh + tail_start * dst_data_size, IW - tail_start, 0);
+                    }
+                }
+            }
+        }
+    }
+
+    reduce_kernel_post_process(out_ptr);
+}
+/**
+ * Blocked-layout traversal that treats every channel block as full.
+ * For each batch it picks a loop nest from the ReduceC/D/H/W flags and calls
+ * reduce_kernel_process on runs of IH * IW * blk_size, IW * blk_size or
+ * blk_size elements. Ends with reduce_kernel_post_process.
+ */
+void MKLDNNReduceNode::reduce_BLK(const uint8_t *in_ptr, uint8_t *out_ptr) {
+    size_t ICB = div_up(IC, blk_size);
+    size_t OCB = div_up(OC, blk_size);
+
+    for (size_t ib = 0; ib < IB; ib++) {
+        size_t ob = ReduceN ? 0 : ib; GET_PTR_N_BLK;
+        if (!ReduceC && !ReduceD && ReduceH && ReduceW) {
+            parallel_for2d(ICB, ID, [&](size_t icb, size_t id) {
+                size_t ocb = icb, od = id; GET_PTR_NCD_BASE_PTR_N_BLK;
+                reduce_kernel_process(in_ptr_ncd, out_ptr_ncd, IH * IW * blk_size);
+            });
+        } else if (ReduceH && ReduceW) {
+            for (size_t icb = 0; icb < ICB; icb++) {
+                size_t ocb = ReduceC ? 0 : icb; GET_PTR_NC_BLK;
+                for (size_t id = 0; id < ID; id++) {
+                    size_t od = ReduceD ? 0 : id; GET_PTR_NCD_BLK;
+                    reduce_kernel_process(in_ptr_ncd, out_ptr_ncd, IH * IW * blk_size);
+                }
+            }
+        } else if (ReduceW) {
+            for (size_t icb = 0; icb < ICB; icb++) {
+                size_t ocb = ReduceC ? 0 : icb; GET_PTR_NC_BLK;
+                for (size_t id = 0; id < ID; id++) {
+                    size_t od = ReduceD ? 0 : id; GET_PTR_NCD_BLK;
+                    for (size_t ih = 0; ih < IH; ih++) {
+                        size_t oh = ReduceH ? 0 : ih; GET_PTR_NCDH_BLK;
+                        reduce_kernel_process(in_ptr_ncdh, out_ptr_ncdh, IW * blk_size);
+                    }
+                }
+            }
+        } else {
+            for (size_t icb = 0; icb < ICB; icb++) {
+                size_t ocb = ReduceC ? 0 : icb; GET_PTR_NC_BLK;
+                for (size_t id = 0; id < ID; id++) {
+                    size_t od = ReduceD ? 0 : id; GET_PTR_NCD_BLK;
+                    for (size_t ih = 0; ih < IH; ih++) {
+                        size_t oh = ReduceH ? 0 : ih; GET_PTR_NCDH_BLK;
+                        parallel_for(IW, [&](size_t iw) {
+                            size_t ow = iw; GET_PTR_NCDHW_BLK;
+                            reduce_kernel_process(in_ptr_ncdhw, out_ptr_ncdhw, blk_size);
+                        });
+                    }
+                }
+            }
+        }
+    }
+
+    reduce_kernel_post_process(out_ptr);
+}
+/**
+ * Blocked-layout traversal used when channels are reduced (ocb is always 0)
+ * and the last channel block may be only partly valid. Full blocks
+ * (ic + blk_size <= IC) are processed as in reduce_BLK; a partial block goes
+ * through reduceSkipPadding, which feeds only the IC - ic valid channels of
+ * each (h, w) position to the kernel. Ends with reduce_kernel_post_process.
+ */
+void MKLDNNReduceNode::reduce_BLK_concern_padding(const uint8_t *in_ptr, uint8_t *out_ptr) {
+    size_t ICB = div_up(IC, blk_size);
+    size_t OCB = div_up(OC, blk_size);
+
+    auto reduceSkipPadding = [&](const uint8_t *in_ptr_ncd, uint8_t *out_ptr_ncd, size_t ic) {
+        size_t blk_valid_size = IC - ic;  // channels of this block that hold real data
+        for (size_t ih = 0; ih < IH; ih++) {
+            size_t oh = ReduceH ? 0 : ih; GET_PTR_NCDH_BLK;
+            for (size_t iw = 0; iw < IW; iw++) {
+                size_t ow = ReduceW ? 0 : iw; GET_PTR_NCDHW_BLK;
+                reduce_kernel_process(in_ptr_ncdhw, out_ptr_ncdhw, blk_valid_size);
+            }
+        }
+    };
+
+    for (size_t ib = 0; ib < IB; ib++) {
+        size_t ob = ReduceN ? 0 : ib; GET_PTR_N_BLK;
+        if (!ReduceD && ReduceH && ReduceW) {
+            for (size_t icb = 0; icb < ICB; icb++) {
+                size_t ocb = 0; GET_PTR_NC_BLK;
+                size_t ic = icb * blk_size;
+                parallel_for(ID, [&](size_t id) {
+                    size_t od = id; GET_PTR_NCD_BASE_PTR_N_BLK;
+                    if (ic + blk_size <= IC) {
+                        reduce_kernel_process(in_ptr_ncd, out_ptr_ncd, IH * IW * blk_size);
+                    } else {
+                        reduceSkipPadding(in_ptr_ncd, out_ptr_ncd, ic);
+                    }
+                });
+            }
+        } else if (ReduceD && ReduceH && ReduceW) {
+            for (size_t icb = 0; icb < ICB; icb++) {
+                size_t ocb = 0; GET_PTR_NC_BLK;
+                size_t ic = icb * blk_size;
+                if (ic + blk_size <= IC) {
+                    reduce_kernel_process(in_ptr_nc, out_ptr_nc, ID * IH * IW * blk_size);
+                } else {
+                    for (size_t id = 0; id < ID; id++) {
+                        size_t od = 0; GET_PTR_NCD_BLK;
+                        reduceSkipPadding(in_ptr_ncd, out_ptr_ncd, ic);
+                    }
+                }
+            }
+        } else if (ReduceW) {
+            for (size_t icb = 0; icb < ICB; icb++) {
+                size_t ocb = 0; GET_PTR_NC_BLK;
+                size_t ic = icb * blk_size;
+                for (size_t id = 0; id < ID; id++) {
+                    size_t od = ReduceD ? 0 : id; GET_PTR_NCD_BLK;
+                    if (ic + blk_size <= IC) {
+                        for (size_t ih = 0; ih < IH; ih++) {
+                            size_t oh = ReduceH ? 0 : ih; GET_PTR_NCDH_BLK;
+                            reduce_kernel_process(in_ptr_ncdh, out_ptr_ncdh, IW * blk_size);
+                        }
+                    } else {
+                        reduceSkipPadding(in_ptr_ncd, out_ptr_ncd, ic);
+                    }
+                }
+            }
+        } else {
+            for (size_t icb = 0; icb < ICB; icb++) {
+                size_t ocb = 0; GET_PTR_NC_BLK;
+                size_t ic = icb * blk_size;
+                for (size_t id = 0; id < ID; id++) {
+                    size_t od = ReduceD ? 0 : id; GET_PTR_NCD_BLK;
+                    if (ic + blk_size <= IC) {
+                        for (size_t ih = 0; ih < IH; ih++) {
+                            size_t oh = ReduceH ? 0 : ih; GET_PTR_NCDH_BLK;
+                            parallel_for(IW, [&](size_t iw) {
+                                size_t ow = iw; GET_PTR_NCDHW_BLK;
+                                reduce_kernel_process(in_ptr_ncdhw, out_ptr_ncdhw, blk_size);
+                            });
+                        }
+                    } else {
+                        reduceSkipPadding(in_ptr_ncd, out_ptr_ncd, ic);
+                    }
+                }
+            }
+        }
+    }
+
+    reduce_kernel_post_process(out_ptr);
+}
+/**
+ * Packs src, dst, work_amount and reduce_w into a jit_reduce_call_args and
+ * invokes the JIT reduce kernel once.
+ */
+inline void MKLDNNReduceNode::reduce_kernel_process(const uint8_t *in_p, uint8_t *out_p, size_t work_amount, size_t reduce_w) {
+    auto arg = jit_reduce_call_args();
+    arg.src = static_cast(in_p);
+    arg.dst = static_cast(out_p);
+    arg.work_amount = work_amount;
+    arg.reduce_w = reduce_w;
+    (*reduce_kernel)(&arg);
+}
+
+inline void MKLDNNReduceNode::reduce_kernel_post_process(uint8_t *out_ptr) {
+    const float divisor = static_cast(IB * IC * ID * IH * IW / (OB * OC * OD * OH * OW));  // integer division, then converted to float
+    if (planar_layout) {
+        size_t parallel_amount = OB * OC * OD;
+        parallel_for(parallel_amount, [&](size_t i) {
+            uint8_t *out_p = out_ptr + i * OH * OW * dst_data_size;
+            auto arg = jit_reduce_call_args();
+            arg.dst = static_cast(out_p);
+            arg.reduce_c = 2;
+            arg.work_amount = OH * OW;
+            arg.divisor = &divisor;
+            (*reduce_post_kernel)(&arg);
+        });
+    } else {
+        size_t OCB = div_up(OC, blk_size);
+        size_t parallel_amount = OB * OCB * OD;
+        parallel_for(parallel_amount, [&](size_t i) {
+            uint8_t *out_p = out_ptr + i * OH * OW * blk_size * dst_data_size;
+            auto arg = jit_reduce_call_args();
+            arg.dst = static_cast(out_p);
+
arg.reduce_c = ReduceC ? 1 : 0; + arg.work_amount = OH * OW * blk_size; + arg.divisor = &divisor; + (*reduce_post_kernel)(&arg); + }); + } +} + +inline void MKLDNNReduceNode::init_dst_data(uint8_t *out_ptr, size_t dst_size) { + switch (reduceMode) { + case Reduce::L1: + case Reduce::L2: + case Reduce::LogSum: + case Reduce::LogSumExp: + case Reduce::Mean: + case Reduce::Or: + case Reduce::Sum: + case Reduce::SumSquare: + memset(out_ptr, 0, dst_size); + break; + case Reduce::And: + case Reduce::Prod: + if (output_prec == Precision::FP32) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast(1); }); + } else if (output_prec == Precision::I32) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast(1); }); + } else if (output_prec == Precision::U8) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast(1); }); + } else if (output_prec == Precision::I8) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast(1); }); + } + break; + case Reduce::Max: /* seed with lowest(): for FP32, min() is the smallest positive value, not the most negative one */ + if (output_prec == Precision::FP32) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits::lowest(); }); + } else if (output_prec == Precision::I32) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits::lowest(); }); + } else if (output_prec == Precision::U8) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits::lowest(); }); + } else if (output_prec == Precision::I8) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits::lowest(); }); + } + break; + case 
Reduce::Min: + if (output_prec == Precision::FP32) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits::max(); }); + } else if (output_prec == Precision::I32) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits::max(); }); + } else if (output_prec == Precision::U8) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits::max(); }); + } else if (output_prec == Precision::I8) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits::max(); }); + } + break; + default: + THROW_IE_EXCEPTION << "Reduce layer with name " << getName() << "gets unsupported reduce mode."; + } +} + +inline void MKLDNNReduceNode::calc_process_dst_dims(const int32_t *idx_data) { + SizeVector out_dims; + SizeVector dst_dims = getChildEdgeAt(0)->getDesc().getDims(); + std::set axes; + for (size_t i = 0; i < getParentEdgeAt(REDUCE_INDEXES)->getDims()[0]; i++) { + int32_t axis = idx_data[i]; + if (axis < 0) + axis += src_dims.size(); + if (static_cast(axis) >= src_dims.size()) /* valid axes are [0, rank); axis == rank used to pass and was then silently ignored */ + THROW_IE_EXCEPTION << "Reduce layer with name " << getName() << "exceeds data tensor dimension on index to reduce"; + axes.insert(static_cast(axis)); + } + for (size_t i = 0; i < src_dims.size(); i++) { + bool found = false; + for (auto axis : axes) { + if (i == axis) { + found = true; + break; + } + } + if (found) { + if (keep_dims) out_dims.push_back(1); + process_dst_dims.push_back(1); + axes_for_reduction.push_back(i); + } else { + out_dims.push_back(src_dims[i]); + process_dst_dims.push_back(src_dims[i]); + } + } + for (size_t i = 0; i < std::min(out_dims.size(), dst_dims.size()); i++) { + if (out_dims[i] != dst_dims[i]) + THROW_IE_EXCEPTION << "Reduce layer with name " << getName() << "gets incorrect number of 
output dimensions!"; + } +} + +inline void MKLDNNReduceNode::reduce_ref(const float *in_ptr, float *out_ptr) { + switch (reduceMode) { + case Reduce::And: + reduce_ref_process(in_ptr, out_ptr, 1, [](float x, float y)->float { return x && y; }); + break; + case Reduce::L1: + reduce_ref_process(in_ptr, out_ptr, 0, [](float old, float y)->float { return old + (y >= 0 ? y : -y); }); + break; + case Reduce::L2: + reduce_ref_process(in_ptr, out_ptr, 0, [](float old, float y)->float { return old + y * y; }); + break; + case Reduce::LogSum: + reduce_ref_process(in_ptr, out_ptr, 0, [](float x, float y)->float { return x + y; }); + break; + case Reduce::LogSumExp: + reduce_ref_process(in_ptr, out_ptr, 0, [](float old, float y)->float { return old + expf(y); }); + break; + case Reduce::Max: /* start from lowest(): float min() is the smallest positive value and would win over all-negative input */ + reduce_ref_process(in_ptr, out_ptr, std::numeric_limits::lowest(), + [](float x, float y)->float { return x > y ? x : y; }); + break; + case Reduce::Mean: + reduce_ref_process(in_ptr, out_ptr, 0, [](float x, float y)->float { return x + y; }); + break; + case Reduce::Min: + reduce_ref_process(in_ptr, out_ptr, std::numeric_limits::max(), + [](float x, float y)->float { return x < y ? 
x : y; }); + break; + case Reduce::Or: + reduce_ref_process(in_ptr, out_ptr, 0, [](float x, float y)->float { return x || y; }); + break; + case Reduce::Prod: + reduce_ref_process(in_ptr, out_ptr, 1, [](float x, float y)->float { return x * y; }); + break; + case Reduce::Sum: + reduce_ref_process(in_ptr, out_ptr, 0, [](float x, float y)->float { return x + y; }); + break; + case Reduce::SumSquare: + reduce_ref_process(in_ptr, out_ptr, 0, [](float old, float y)->float { return old + y * y; }); + break; + default: + THROW_IE_EXCEPTION << "Reduce layer with name " << getName() << "gets unsupported reduce mode."; + } +} + +void MKLDNNReduceNode::reduce_ref_process(const float *in_ptr, float *out_ptr, float init_value, std::function func) { + size_t work_amount_dst = 1, reduced_dims_work_amount = 1; + for (size_t i = 0; i < process_dst_dims.size(); i++) + work_amount_dst *= process_dst_dims[i]; + for (size_t i = 0; i < src_dims.size(); i++) + reduced_dims_work_amount *= src_dims[i]; + reduced_dims_work_amount /= work_amount_dst; + + parallel_nt(0, [&](const int ithr, const int nthr) { + int j; + size_t i, start = 0, end = 0; + SizeVector dst_counters(process_dst_dims.size(), 0); + splitter(work_amount_dst, nthr, ithr, start, end); + for (j = process_dst_dims.size() - 1, i = start; j >= 0; j--) { + dst_counters[j] = i % process_dst_dims[j]; + i /= process_dst_dims[j]; + } + for (size_t src_idx = 0, dst_idx = start; dst_idx < end; ++dst_idx) { + float reduce_prod = init_value; + bool update_idx = true; + SizeVector src_counters = dst_counters; + for (i = 0; i < reduced_dims_work_amount; ++i) { + if (update_idx) { + src_idx = 0; + for (j = 0; j < static_cast(src_dims.size()); ++j) + src_idx += (src_counters[j] % src_dims[j]) * src_strides[j]; + update_idx = false; + } + reduce_prod = func(reduce_prod, in_ptr[src_idx]); + for (j = axes_for_reduction.size() - 1; j >= 0; j--) { + src_counters[axes_for_reduction[j]]++; + if (src_counters[axes_for_reduction[j]] < 
src_dims[axes_for_reduction[j]]) { + src_idx += src_strides[axes_for_reduction[j]]; + break; + } else { + src_counters[axes_for_reduction[j]] = 0; + update_idx = true; + } + } + } + out_ptr[dst_idx] = reduce_prod; + for (j = process_dst_dims.size() - 1; j >= 0; j--) { + dst_counters[j]++; + if (dst_counters[j] < process_dst_dims[j]) + break; + else + dst_counters[j] = 0; + } + } + }); + + reduce_ref_map(out_ptr, work_amount_dst, reduced_dims_work_amount); +} + +inline void MKLDNNReduceNode::reduce_ref_map(float *out_ptr, size_t work_amount_dst, size_t reduced_dims_work_amount) { + switch (reduceMode) { + case Reduce::And: + case Reduce::L1: + case Reduce::Max: + case Reduce::Min: + case Reduce::Or: + case Reduce::Prod: + case Reduce::Sum: + case Reduce::SumSquare: + break; + case Reduce::L2: + parallel_for(work_amount_dst, [&](size_t i) { + out_ptr[i] = std::sqrt(out_ptr[i]); + }); + break; + case Reduce::LogSum: + case Reduce::LogSumExp: + parallel_for(work_amount_dst, [&](size_t i) { + out_ptr[i] = logf(out_ptr[i]); + }); + break; + case Reduce::Mean: + parallel_for(work_amount_dst, [&](size_t i) { + out_ptr[i] /= reduced_dims_work_amount; + }); + break; + default: + THROW_IE_EXCEPTION << "Reduce layer with name " << getName() << "gets unsupported reduce mode."; + } +} + +bool MKLDNNReduceNode::created() const { + return getType() == ReduceAnd || getType() == ReduceL1 || getType() == ReduceL2 || + getType() == ReduceLogSum || getType() == ReduceLogSumExp || getType() == ReduceMax || + getType() == ReduceMean || getType() == ReduceMin || getType() == ReduceOr || + getType() == ReduceProd || getType() == ReduceSum || getType() == ReduceSumSquare; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNReduceNode, ReduceAnd); +REG_MKLDNN_PRIM_FOR(MKLDNNReduceNode, ReduceL1); +REG_MKLDNN_PRIM_FOR(MKLDNNReduceNode, ReduceL2); +REG_MKLDNN_PRIM_FOR(MKLDNNReduceNode, ReduceLogSum); +REG_MKLDNN_PRIM_FOR(MKLDNNReduceNode, ReduceLogSumExp); +REG_MKLDNN_PRIM_FOR(MKLDNNReduceNode, ReduceMax); 
+REG_MKLDNN_PRIM_FOR(MKLDNNReduceNode, ReduceMean); +REG_MKLDNN_PRIM_FOR(MKLDNNReduceNode, ReduceMin); +REG_MKLDNN_PRIM_FOR(MKLDNNReduceNode, ReduceOr); +REG_MKLDNN_PRIM_FOR(MKLDNNReduceNode, ReduceProd); +REG_MKLDNN_PRIM_FOR(MKLDNNReduceNode, ReduceSum); +REG_MKLDNN_PRIM_FOR(MKLDNNReduceNode, ReduceSumSquare); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.h new file mode 100644 index 00000000000000..c6e6eb64d3a60a --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.h @@ -0,0 +1,126 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include + +namespace MKLDNNPlugin { + +enum class Reduce { + And, + L1, + L2, + LogSum, + LogSumExp, + Max, + Mean, + Min, + Or, + Prod, + Sum, + SumSquare +}; + +struct jit_reduce_config_params { + bool planar_layout; + Reduce reduce_mode; + mkldnn::memory::data_type src_dt; + mkldnn::memory::data_type dst_dt; + int src_data_size; + int dst_data_size; +}; + +struct jit_reduce_call_args { + const void *src; + void *dst; + size_t work_amount; + size_t reduce_w = 2; // only used in planar layout [1: reduce width dimension] [0: reduce other dimension] [other value: N/A] + size_t reduce_c = 2; // only used in blocked layout [1: reduce channel dimension] [0: reduce other dimension] [other value: N/A] + const float *divisor; // mean = sum / divisor +}; + +struct jit_uni_reduce_kernel { + void (*ker_)(const jit_reduce_call_args *); + + void operator()(const jit_reduce_call_args *args) { + assert(ker_); + ker_(args); + } + + explicit jit_uni_reduce_kernel(jit_reduce_config_params jcp) : ker_(nullptr), jcp_(jcp) {} + virtual ~jit_uni_reduce_kernel() {} + + jit_reduce_config_params jcp_; +}; + +struct jit_uni_reduce_post_kernel { + void (*ker_)(const jit_reduce_call_args *); + + void operator()(const jit_reduce_call_args *args) { 
+ assert(ker_); + ker_(args); + } + + explicit jit_uni_reduce_post_kernel(jit_reduce_config_params jcp) : ker_(nullptr), jcp_(jcp) {} + virtual ~jit_uni_reduce_post_kernel() {} + + jit_reduce_config_params jcp_; +}; + +class MKLDNNReduceNode : public MKLDNNNode { +public: + MKLDNNReduceNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + ~MKLDNNReduceNode() override = default; + + void getSupportedDescriptors() override; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override; + bool created() const override; + void execute(mkldnn::stream strm) override; + bool canBeInPlace() const override { + return false; + } + +private: + void reduce_type(const uint8_t *in_ptr, uint8_t *out_ptr, size_t dst_size); + void reduce_PLN(const uint8_t *in_ptr, uint8_t *out_ptr); + void reduce_BLK(const uint8_t *in_ptr, uint8_t *out_ptr); + void reduce_BLK_concern_padding(const uint8_t *in_ptr, uint8_t *out_ptr); + inline void reduce_kernel_process(const uint8_t *in_p, uint8_t *out_p, size_t work_amount, size_t reduce_w = 2); + inline void reduce_kernel_post_process(uint8_t *out_ptr); + inline void init_dst_data(uint8_t *out_ptr, size_t dst_size); + inline void calc_process_dst_dims(const int32_t *idx_data); + inline void reduce_ref(const float *in_ptr, float *out_ptr); + void reduce_ref_process(const float *in_ptr, float *out_ptr, float init_value, std::function func); + inline void reduce_ref_map(float *out_ptr, size_t work_amount_dst, size_t reduced_dims_work_amount); + + Reduce reduceMode = Reduce::Sum; + size_t blk_size; + size_t dims_size; + const size_t REDUCE_DATA = 0; + const size_t REDUCE_INDEXES = 1; + bool planar_layout = true; + bool jit_mode = true; + bool keep_dims = true; + bool ReduceN, ReduceC, ReduceD, ReduceH, ReduceW; + size_t IB, IC, ID, IH, IW; + size_t OB, OC, OD, OH, OW; + size_t src_data_size, dst_data_size; + InferenceEngine::Precision input_prec, output_prec; + 
InferenceEngine::SizeVector src_dims; + InferenceEngine::SizeVector src_strides; + InferenceEngine::SizeVector process_dst_dims; + InferenceEngine::SizeVector axes_for_reduction; + + std::shared_ptr reduce_kernel; + std::shared_ptr reduce_post_kernel; +}; + +} // namespace MKLDNNPlugin + diff --git a/inference-engine/src/mkldnn_plugin/nodes/reduce.cpp b/inference-engine/src/mkldnn_plugin/nodes/reduce.cpp deleted file mode 100644 index 869732dc6fe709..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/reduce.cpp +++ /dev/null @@ -1,406 +0,0 @@ -// Copyright (C) 2018-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include "ie_parallel.hpp" - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -class ReduceImpl: public ExtLayerBase { -public: - explicit ReduceImpl(const CNNLayer* layer) { - try { - if (layer->insData.empty() || layer->outData.empty()) - THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!"; - - if (layer->insData.size() != 2) - THROW_IE_EXCEPTION << layer->name << " Incorrect number of input edges!"; - - idx_dims = layer->insData[REDUCE_INDEXES].lock()->getTensorDesc().getDims(); - if (idx_dims.size() > 1) - THROW_IE_EXCEPTION << layer->name << " Index vector should be 1 dimension"; - - if (layer->insData[REDUCE_DATA].lock()->getTensorDesc().getPrecision() != Precision::FP32 && - layer->insData[REDUCE_DATA].lock()->getTensorDesc().getPrecision() != Precision::I32 && - layer->insData[REDUCE_DATA].lock()->getTensorDesc().getPrecision() != Precision::U8) - THROW_IE_EXCEPTION << layer->name << " Incorrect input data tensor precision. Only FP32/I32/U8 are supported!"; - - if (layer->insData[REDUCE_INDEXES].lock()->getTensorDesc().getPrecision() != Precision::I32) - THROW_IE_EXCEPTION << layer->name << " Incorrect 'axes_to_reduction' input precision. 
Only I32 is supported!"; - - data_dims = layer->insData[REDUCE_DATA].lock()->getTensorDesc().getDims(); - SizeVector dst_dims = layer->outData[0]->getTensorDesc().getDims(); - - keep_dims = layer->GetParamAsBool("keep_dims", true); - if (keep_dims) { - if (data_dims.size() != dst_dims.size()) - THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output dimensions!"; - } else { - if (data_dims.size() <= dst_dims.size()) - THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output dimensions!"; - } - - std::string reduce_mode = layer->type; - if (reduce_mode == "ReduceAnd") reduceMode = Reduce::And; - else if (reduce_mode == "ReduceL1") reduceMode = Reduce::L1; - else if (reduce_mode == "ReduceL2") reduceMode = Reduce::L2; - else if (reduce_mode == "ReduceLogSum") reduceMode = Reduce::LogSum; - else if (reduce_mode == "ReduceLogSumExp") reduceMode = Reduce::LogSumExp; - else if (reduce_mode == "ReduceMax") reduceMode = Reduce::Max; - else if (reduce_mode == "ReduceMean") reduceMode = Reduce::Mean; - else if (reduce_mode == "ReduceMin") reduceMode = Reduce::Min; - else if (reduce_mode == "ReduceOr") reduceMode = Reduce::Or; - else if (reduce_mode == "ReduceProd") reduceMode = Reduce::Prod; - else if (reduce_mode == "ReduceSum") reduceMode = Reduce::Sum; - else if (reduce_mode == "ReduceSumSquare") reduceMode = Reduce::SumSquare; - else - THROW_IE_EXCEPTION << layer->name << " Incorrect Reduce layer type!"; - - src_dims = layer->insData[REDUCE_DATA].lock()->getTensorDesc().getDims(); - srcStrides = layer->insData[REDUCE_DATA].lock()->getTensorDesc().getBlockingDesc().getStrides(); - - addConfig(layer, { { ConfLayout::PLN, false }, { ConfLayout::PLN, false } }, { { ConfLayout::PLN, false } }); - } catch (InferenceEngine::details::InferenceEngineException &ex) { - errorMsg = ex.what(); - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { - int32_t *idx_data = 
inputs[REDUCE_INDEXES]->cbuffer().as() + - inputs[REDUCE_INDEXES]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - SizeVector axes; - const size_t axesIter = idx_dims.empty() ? 1 : idx_dims[0]; - for (size_t i = 0; i < axesIter; i++) { - int32_t axis = idx_data[i]; - if (axis < 0) - axis += data_dims.size(); - - if (static_cast(axis) > data_dims.size()) { - if (resp) { - std::string errorMsg = "Index to reduce exceeds data tensor dimension"; - errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); - } - return PARAMETER_MISMATCH; - } - axes.push_back(static_cast(axis)); - } - - size_t reduced_dims_work_amount = 1; - InferenceEngine::SizeVector our_dims, out_dims, axes_for_reduction; - for (size_t i = 0; i < src_dims.size(); i++) { - bool found = false; - for (size_t axis : axes) - if (i == axis) found = true; - - if (found) { - axes_for_reduction.push_back(i); - reduced_dims_work_amount *= src_dims[i]; - if (keep_dims) out_dims.push_back(1); - our_dims.push_back(1); - } else { - out_dims.push_back(src_dims[i]); - our_dims.push_back(src_dims[i]); - } - } - - if (!our_dims.size()) - our_dims = InferenceEngine::SizeVector(1, 1); - - InferenceEngine::SizeVector dst_dims = outputs[0]->getTensorDesc().getDims(); - for (size_t i = 0; i < (std::min)(out_dims.size(), dst_dims.size()); i++) { - if (out_dims[i] != dst_dims[i]) { - if (resp) { - std::string errorMsg = "Incorrect number of output dimensions!"; - errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); - } - return PARAMETER_MISMATCH; - } - } - - size_t work_amount_dst; - if (!dst_dims.size()) { - work_amount_dst = 1; - } else { - size_t stride = !outputs[0]->getTensorDesc().getBlockingDesc().getStrides().empty() - ? 
outputs[0]->getTensorDesc().getBlockingDesc().getStrides()[0] - : 1; - work_amount_dst = stride * dst_dims[0]; - } - - auto compare = getPrecisionMask(inputs[REDUCE_DATA]->getTensorDesc().getPrecision(), outputs[0]->getTensorDesc().getPrecision()); - switch (compare) { - case getPrecisionMask(Precision::FP32, Precision::FP32): - return reduce_type(inputs, outputs, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims); - case getPrecisionMask(Precision::I32, Precision::I64): - return reduce_type(inputs, outputs, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims); - case getPrecisionMask(Precision::I32, Precision::U64): - return reduce_type(inputs, outputs, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims); - case getPrecisionMask(Precision::I32, Precision::FP32): - return reduce_type(inputs, outputs, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims); - case getPrecisionMask(Precision::I32, Precision::I32): - return reduce_type(inputs, outputs, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims); - case getPrecisionMask(Precision::U8, Precision::U8): - return reduce_type(inputs, outputs, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims); - case getPrecisionMask(Precision::FP32, Precision::U8): - return reduce_type(inputs, outputs, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims); - default: - if (resp) { - std::string errorMsg = "Incorrect Reduce layer type"; - errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); - } - return GENERAL_ERROR; - } - } - -private: - template - void reduce(const src_d *src_data, dst_t* dst_data, size_t work_amount_dst, size_t reduced_dims_work_amount, - SizeVector axes_for_reduction, SizeVector dst_dims, dst_t init_value, F1 func1, F2 func2); - template - StatusCode reduce_type(std::vector& inputs, std::vector& outputs, size_t work_amount_dst, size_t reduced_dims_work_amount, - SizeVector 
axes_for_reduction, SizeVector dst_dims); - enum class Reduce { And, L1, L2, LogSum, LogSumExp, Max, Mean, Min, Or, Prod, Sum, SumSquare }; - - const size_t REDUCE_DATA = 0; - const size_t REDUCE_INDEXES = 1; - bool keep_dims = true; - Reduce reduceMode = Reduce::Sum; - SizeVector data_dims; - SizeVector idx_dims; - SizeVector src_dims; - SizeVector srcStrides; -}; - -template -StatusCode ReduceImpl::reduce_type( - std::vector& inputs, - std::vector& outputs, - size_t work_amount_dst, - size_t reduced_dims_work_amount, - SizeVector axes_for_reduction, - SizeVector our_dims -) { - const src_d *src_data = inputs[REDUCE_DATA]->cbuffer().as() + - inputs[REDUCE_DATA]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - dst_t* dst_data = outputs[0]->cbuffer().as() + - outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - - switch (reduceMode) { - case Reduce::And: - reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast(1), - [](dst_t x, src_d y)->dst_t { return x && y; }, - [](dst_t x, src_d y)->dst_t { return x && y; }); - break; - case Reduce::L1: - reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast(0), - [](dst_t old, src_d y)->dst_t { return old + (std::abs)(y); }, - [](dst_t x, src_d y)->dst_t { return x + y; }); - break; - case Reduce::L2: - reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast(0), - [](dst_t old, src_d y)->dst_t { return old + y * y;}, - [](dst_t x, src_d y)->dst_t { return x + y; }); - - parallel_for(work_amount_dst, [&](size_t i) { - dst_data[i] = sqrt(dst_data[i]); - }); - break; - case Reduce::LogSum: - reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast(0), - [](dst_t x, src_d y)->dst_t { return x + y; }, - [](dst_t x, src_d y)->dst_t { return x + y; }); - - parallel_for(work_amount_dst, 
[&](size_t i) { - dst_data[i] = logf(dst_data[i]); - }); - break; - case Reduce::LogSumExp: - reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast(0), - [](dst_t old, src_d y)->dst_t { return old + expf(y); }, - [](dst_t x, src_d y)->dst_t { return x + y; }); - - parallel_for(work_amount_dst, [&](size_t i) { - dst_data[i] = logf(dst_data[i]); - }); - break; - case Reduce::Max: - reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, - (std::numeric_limits::min)(), - [](dst_t x, src_d y)->dst_t { return x > y ? x : y; }, - [](dst_t x, src_d y)->dst_t { return x > y ? x : y; }); - break; - case Reduce::Mean: - reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast(0), - [](dst_t x, src_d y)->dst_t { return x + y; }, - [](dst_t x, src_d y)->dst_t { return x + y; }); - - parallel_for(work_amount_dst, [&](size_t i) { - dst_data[i] /= static_cast(reduced_dims_work_amount); - }); - break; - case Reduce::Min: - reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, - (std::numeric_limits::max)(), - [](dst_t x, src_d y)->dst_t { return x < y ? x : y; }, - [](dst_t x, src_d y)->dst_t { return x < y ? 
x : y; }); - break; - case Reduce::Or: - reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast(0), - [](dst_t x, src_d y)->dst_t { return x || y; }, - [](dst_t x, src_d y)->dst_t { return x || y; }); - break; - case Reduce::Prod: - reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast(1), - [](dst_t x, src_d y)->dst_t { return x * y; }, - [](dst_t x, src_d y)->dst_t { return x * y; }); - break; - case Reduce::Sum: - reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast(0), - [](dst_t x, src_d y)->dst_t { return x + y; }, - [](dst_t x, src_d y)->dst_t { return x + y; }); - break; - case Reduce::SumSquare: - reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast(0), - [](dst_t old, src_d y)->dst_t { return old + y * y; }, - [](dst_t x, src_d y)->dst_t { return x + y; }); - break; - default: - return GENERAL_ERROR; - } - return OK; -} - -template -void ReduceImpl::reduce( - const src_d *src_data, - dst_t *dst_data, - size_t work_amount_dst, - size_t reduced_dims_work_amount, - SizeVector axes_for_reduction, - SizeVector dst_dims, - dst_t init_value, - F1 func1, - F2 func2 -) { - unsigned int nthr = parallel_get_max_threads(); - if ((work_amount_dst + 1) >= nthr) { - parallel_nt(0, [&](const int ithr, const int nthr) { - int j; - size_t i, start = 0, end = 0; - SizeVector dst_counters(dst_dims.size(), 0); - splitter(work_amount_dst, nthr, ithr, start, end); - for (j = dst_dims.size() - 1, i = start; j >= 0; j--) { - dst_counters[j] = i % dst_dims[j]; - i /= dst_dims[j]; - } - for (size_t src_idx = 0, dst_idx = start; dst_idx < end; ++dst_idx) { - dst_t reduce_prod = init_value; - bool update_idx = true; - SizeVector src_counters = dst_counters; - for (i = 0; i < reduced_dims_work_amount; ++i) { - if (update_idx) { - src_idx = 0; - for (j = 0; 
j < static_cast(src_dims.size()); ++j) - src_idx += (src_counters[j] % src_dims[j]) * srcStrides[j]; - update_idx = false; - } - reduce_prod = func1(reduce_prod, src_data[src_idx]); - for (j = axes_for_reduction.size() - 1; j >= 0; j--) { - src_counters[axes_for_reduction[j]]++; - if (src_counters[axes_for_reduction[j]] < src_dims[axes_for_reduction[j]]) { - src_idx += srcStrides[axes_for_reduction[j]]; - break; - } else { - src_counters[axes_for_reduction[j]] = 0; - update_idx = true; - } - } - } - dst_data[dst_idx] = reduce_prod; - for (j = dst_dims.size() - 1; j >= 0; j--) { - dst_counters[j]++; - if (dst_counters[j] < dst_dims[j]) - break; - else - dst_counters[j] = 0; - } - } - }); - } else { - std::vector reduce_prod((nthr * work_amount_dst), init_value); - if (work_amount_dst == 1) { - parallel_nt(nthr, [&](const int ithr, const int nthr) { - size_t i, start = 0, end = 0; - splitter((srcStrides[0] * src_dims[0]), nthr, ithr, start, end); - for (i = start; i < end; ++i) - reduce_prod[ithr] = func1(reduce_prod[ithr], src_data[i]); - }); - } else { - SizeVector dstStrides(dst_dims.size(), 1); - for (int j = dst_dims.size() - 1; j >= 1; --j) - dstStrides[j - 1] = dstStrides[j] * dst_dims[j]; - parallel_nt(nthr, [&](const int ithr, const int nthr) { - int j; - bool update_idx = true; - size_t i, src_idx, dst_idx = 0, start = 0, end = 0; - splitter((srcStrides[0] * src_dims[0]), nthr, ithr, start, end); - SizeVector src_counters(src_dims.size(), 0); - for (j = src_dims.size() - 1, src_idx = start; j >= 0; j--) { - src_counters[j] = src_idx % src_dims[j]; - src_idx /= src_dims[j]; - } - for (src_idx = start; src_idx < end; ++src_idx) { - if (update_idx) { - for (i = 0, dst_idx = 0; i < dst_dims.size(); ++i) - dst_idx += (src_counters[i] % dst_dims[i]) * dstStrides[i]; - update_idx = false; - } - reduce_prod[ithr * work_amount_dst + dst_idx] = func1(reduce_prod[ithr * work_amount_dst + dst_idx], src_data[src_idx]); - for (j = src_dims.size() - 1; j >= 0; j--) { - 
src_counters[j]++; - if (src_counters[j] < src_dims[j]) { - if (dst_dims[j] > 1) dst_idx += dstStrides[j]; - break; - } else { - src_counters[j] = 0; - update_idx = true; - } - } - } - }); - } - for (size_t dst_idx = 0; dst_idx < work_amount_dst; dst_idx++) { - for (size_t ithr = work_amount_dst; ithr < (nthr * work_amount_dst); ithr += work_amount_dst) - reduce_prod[dst_idx] = func2(reduce_prod[dst_idx], reduce_prod[dst_idx + ithr]); - dst_data[dst_idx] = reduce_prod[dst_idx]; - } - } -} - -REG_FACTORY_FOR(ReduceImpl, ReduceAnd); -REG_FACTORY_FOR(ReduceImpl, ReduceL1); -REG_FACTORY_FOR(ReduceImpl, ReduceL2); -REG_FACTORY_FOR(ReduceImpl, ReduceLogSum); -REG_FACTORY_FOR(ReduceImpl, ReduceLogSumExp); -REG_FACTORY_FOR(ReduceImpl, ReduceMax); -REG_FACTORY_FOR(ReduceImpl, ReduceMean); -REG_FACTORY_FOR(ReduceImpl, ReduceMin); -REG_FACTORY_FOR(ReduceImpl, ReduceOr); -REG_FACTORY_FOR(ReduceImpl, ReduceProd); -REG_FACTORY_FOR(ReduceImpl, ReduceSum); -REG_FACTORY_FOR(ReduceImpl, ReduceSumSquare); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/reduce_ops.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/reduce_ops.cpp index 268ad6b7c800c7..370879808efe8f 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/reduce_ops.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/reduce_ops.cpp @@ -12,15 +12,37 @@ using namespace LayerTestsDefinitions; namespace { const std::vector netPrecisions = { InferenceEngine::Precision::FP32, + InferenceEngine::Precision::I32, + InferenceEngine::Precision::U8, + InferenceEngine::Precision::I8, +}; + +const std::vector keepDims = { + true, + false, }; const std::vector> inputShapes = { - std::vector{10, 20, 40}, - std::vector{5, 6, 10, 11}, + std::vector{10, 20, 30, 40}, + 
std::vector{3, 5, 7, 9}, }; const std::vector> axes = { + {0}, + {1}, + {2}, + {3}, + {0, 1}, {0, 2}, + {0, 3}, + {1, 2}, + {1, 3}, + {2, 3}, + {0, 1, 2}, + {0, 1, 3}, + {0, 2, 3}, + {1, 2, 3}, + {0, 1, 2, 3}, {1, -1} }; @@ -45,11 +67,57 @@ const auto paramsOneAxis = testing::Combine( testing::ValuesIn(opTypes), testing::Values(true, false), testing::ValuesIn(reductionTypes), - testing::ValuesIn(netPrecisions), + testing::Values(InferenceEngine::Precision::FP32), + testing::ValuesIn(inputShapes), + testing::Values(CommonTestUtils::DEVICE_CPU) +); + +const auto params_Precisions = testing::Combine( + testing::Values(std::vector{1, 3}), + testing::Values(opTypes[1]), + testing::ValuesIn(keepDims), + testing::Values(ngraph::helpers::ReductionType::Sum), + testing::Values(InferenceEngine::Precision::FP32, + InferenceEngine::Precision::I32), + testing::Values(std::vector{2, 2, 2, 2}), + testing::Values(CommonTestUtils::DEVICE_CPU) +); + +const auto params_InputShapes = testing::Combine( + testing::Values(std::vector{0}), + testing::Values(opTypes[1]), + testing::ValuesIn(keepDims), + testing::Values(ngraph::helpers::ReductionType::Mean), + testing::Values(InferenceEngine::Precision::FP32), + testing::Values(std::vector{3}, + std::vector{3, 5}, + std::vector{2, 4, 6}, + std::vector{2, 4, 6, 8}, + std::vector{2, 2, 2, 2, 2}, + std::vector{2, 2, 2, 2, 2, 2}), + testing::Values(CommonTestUtils::DEVICE_CPU) +); + +const auto params_Axes = testing::Combine( + testing::ValuesIn(axes), + testing::Values(opTypes[1]), + testing::ValuesIn(keepDims), + testing::Values(ngraph::helpers::ReductionType::Mean), + testing::Values(InferenceEngine::Precision::FP32), testing::ValuesIn(inputShapes), testing::Values(CommonTestUtils::DEVICE_CPU) ); +const auto params_ReductionTypes = testing::Combine( + testing::Values(std::vector{0, 1, 3}), + testing::Values(opTypes[1]), + testing::ValuesIn(keepDims), + testing::ValuesIn(reductionTypes), + testing::Values(InferenceEngine::Precision::FP32), + 
testing::Values(std::vector{2, 9, 2, 9}), + testing::Values(CommonTestUtils::DEVICE_CPU) +); + INSTANTIATE_TEST_CASE_P( ReduceOneAxis, ReduceOpsLayerTest, @@ -57,21 +125,31 @@ INSTANTIATE_TEST_CASE_P( ReduceOpsLayerTest::getTestCaseName ); -const auto params = testing::Combine( - testing::ValuesIn(axes), - testing::Values(opTypes[1]), - testing::Values(true, false), - testing::ValuesIn(reductionTypes), - testing::ValuesIn(netPrecisions), - testing::ValuesIn(inputShapes), - testing::Values(CommonTestUtils::DEVICE_CPU) +INSTANTIATE_TEST_CASE_P( + Reduce_Precisions, + ReduceOpsLayerTest, + params_Precisions, + ReduceOpsLayerTest::getTestCaseName +); + +INSTANTIATE_TEST_CASE_P( + Reduce_InputShapes, + ReduceOpsLayerTest, + params_InputShapes, + ReduceOpsLayerTest::getTestCaseName ); INSTANTIATE_TEST_CASE_P( - Reduce, + Reduce_Axes, ReduceOpsLayerTest, - params, + params_Axes, ReduceOpsLayerTest::getTestCaseName ); +INSTANTIATE_TEST_CASE_P( + Reduce_ReductionTypes, + ReduceOpsLayerTest, + params_ReductionTypes, + ReduceOpsLayerTest::getTestCaseName +); } // namespace diff --git a/inference-engine/thirdparty/mkl-dnn b/inference-engine/thirdparty/mkl-dnn index e759306d9500a9..ae3c03550796c2 160000 --- a/inference-engine/thirdparty/mkl-dnn +++ b/inference-engine/thirdparty/mkl-dnn @@ -1 +1 @@ -Subproject commit e759306d9500a958033954390be0faeac5e31f99 +Subproject commit ae3c03550796c2131dfb683a8eefb286cf7e8db3 From 7c95e8f8ffd50a07fe37ee0fd628f4b9efe27ed5 Mon Sep 17 00:00:00 2001 From: Anton Voronov Date: Mon, 7 Sep 2020 15:39:19 +0300 Subject: [PATCH 13/66] [CPU] fix: supported Tile with more than 5 dims (#2062) --- .../mkldnn_plugin/nodes/mkldnn_tile_node.cpp | 16 +----- .../single_layer_tests/tile.cpp | 40 +++++++++++++++ .../include/single_layer_tests/tile.hpp | 34 +++++++++++++ .../shared/src/single_layer_tests/tile.cpp | 49 +++++++++++++++++++ .../include/ngraph_functions/builders.hpp | 4 ++ .../tests/ngraph_functions/src/tile.cpp | 18 +++++++ 6 files changed, 146 
insertions(+), 15 deletions(-) create mode 100644 inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/tile.cpp create mode 100644 inference-engine/tests/functional/plugin/shared/include/single_layer_tests/tile.hpp create mode 100644 inference-engine/tests/functional/plugin/shared/src/single_layer_tests/tile.cpp create mode 100644 inference-engine/tests/ngraph_functions/src/tile.cpp diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp index e8d7356f4ac4c6..23cd681ae875c3 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp @@ -45,21 +45,7 @@ void MKLDNNTileNode::initSupportedPrimitiveDescriptors() { auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); auto& inDims = getParentEdgeAt(0)->getDims(); - memory::format fmt = memory::format::any; - if (inDims.ndims() == 1) { - fmt = memory::format::x; - } else if (inDims.ndims() == 2) { - fmt = memory::format::nc; - } else if (inDims.ndims() == 3) { - fmt = memory::format::tnc; - } else if (inDims.ndims() == 4) { - fmt = memory::format::nchw; - } else if (inDims.ndims() == 5) { - fmt = memory::format::ncdhw; - } - if (fmt == memory::format::any) { - THROW_IE_EXCEPTION << "Tile " << getName() << " supports only 2D, 4D and 5D dimensions!"; - } + memory::format fmt = MKLDNNMemory::GetPlainFormat(inDims); InferenceEngine::LayerConfig config; config.dynBatchSupport = true; diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/tile.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/tile.cpp new file mode 100644 index 00000000000000..008c44b3afd6a6 --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/tile.cpp @@ -0,0 +1,40 @@ +// Copyright (C) 2020 Intel 
Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "single_layer_tests/tile.hpp" + +using namespace LayerTestsDefinitions; + +namespace { + +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32 +}; + +const std::vector> repeats = { + {1, 2, 3}, + {2, 1, 1}, + {2, 3, 1}, + {2, 2, 2}, +}; + +INSTANTIATE_TEST_CASE_P(Tile, TileLayerTest, + ::testing::Combine( + ::testing::ValuesIn(repeats), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(std::vector({2, 3, 4})), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + TileLayerTest::getTestCaseName); + +INSTANTIATE_TEST_CASE_P(Tile6d, TileLayerTest, + ::testing::Combine( + ::testing::Values(std::vector({1, 1, 1, 2, 1, 2})), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(std::vector({1, 4, 3, 1, 3, 1})), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + TileLayerTest::getTestCaseName); + +} // namespace diff --git a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/tile.hpp b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/tile.hpp new file mode 100644 index 00000000000000..5ab4d6944ec7da --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/tile.hpp @@ -0,0 +1,34 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + +#include "functional_test_utils/layer_test_utils.hpp" +#include "ngraph_functions/builders.hpp" + +typedef std::vector TileSpecificParams; +typedef std::tuple< + TileSpecificParams, + InferenceEngine::Precision, // Net precision + InferenceEngine::SizeVector, // Input shapes + LayerTestsUtils::TargetDevice // Device name +> TileLayerTestParamsSet; + +namespace LayerTestsDefinitions { + +class TileLayerTest : public testing::WithParamInterface, + public LayerTestsUtils::LayerTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo 
obj); + +protected: + void SetUp() override; +}; + +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/tile.cpp b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/tile.cpp new file mode 100644 index 00000000000000..8cb5da3e09228f --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/tile.cpp @@ -0,0 +1,49 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include + +#include "single_layer_tests/tile.hpp" + + +namespace LayerTestsDefinitions { + +std::string TileLayerTest::getTestCaseName(testing::TestParamInfo obj) { + TileSpecificParams tileParams; + InferenceEngine::Precision netPrecision; + InferenceEngine::SizeVector inputShapes; + std::string targetDevice; + std::tie(tileParams, netPrecision, inputShapes, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_"; + result << "Repeats=" << CommonTestUtils::vec2str(tileParams) << "_"; + result << "netPRC=" << netPrecision.name() << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void TileLayerTest::SetUp() { + TileSpecificParams tileParams; + std::vector inputShape; + auto netPrecision = InferenceEngine::Precision::UNSPECIFIED; + std::tie(tileParams, netPrecision, inputShape, targetDevice) = this->GetParam(); + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + auto params = ngraph::builder::makeParams(ngPrc, {inputShape}); + auto paramOuts = ngraph::helpers::convert2OutputVector( + ngraph::helpers::castOps2Nodes(params)); + auto tile = ngraph::builder::makeTile(paramOuts[0], tileParams); + ngraph::ResultVector results{std::make_shared(tile)}; + function = std::make_shared(results, params, "tile"); +} + +TEST_P(TileLayerTest, CompareWithRefs) { + Run(); +} + +} // namespace 
LayerTestsDefinitions diff --git a/inference-engine/tests/ngraph_functions/include/ngraph_functions/builders.hpp b/inference-engine/tests/ngraph_functions/include/ngraph_functions/builders.hpp index 42854679d5367e..1ce7d4e3123075 100644 --- a/inference-engine/tests/ngraph_functions/include/ngraph_functions/builders.hpp +++ b/inference-engine/tests/ngraph_functions/include/ngraph_functions/builders.hpp @@ -415,5 +415,9 @@ std::shared_ptr makeRNNCell(const OutputVector& in, const std::vector& activations_alpha = {}, const std::vector& activations_beta = {}, float clip = 0.f); + +std::shared_ptr makeTile(const ngraph::Output& in, + const std::vector& repeats); + } // namespace builder } // namespace ngraph diff --git a/inference-engine/tests/ngraph_functions/src/tile.cpp b/inference-engine/tests/ngraph_functions/src/tile.cpp new file mode 100644 index 00000000000000..3e8e58c9fb980e --- /dev/null +++ b/inference-engine/tests/ngraph_functions/src/tile.cpp @@ -0,0 +1,18 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ngraph_functions/builders.hpp" + +namespace ngraph { +namespace builder { + +std::shared_ptr makeTile(const ngraph::Output& in, + const std::vector& repeats) { + auto repeatsNode = std::make_shared(ngraph::element::i64, std::vector{repeats.size()}, repeats); + auto tileNode = std::make_shared(in, repeatsNode); + return tileNode; +} + +} // namespace builder +} // namespace ngraph From 52ebe68cc742624485575a56c9a4460eb00cf739 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Do=C5=82bniak?= Date: Mon, 7 Sep 2020 16:05:19 +0200 Subject: [PATCH 14/66] v1::AvgPooling type prop unit tests (#2013) --- ngraph/test/type_prop/avg_pool.cpp | 452 +++++++++++++++++++++++++++++ 1 file changed, 452 insertions(+) diff --git a/ngraph/test/type_prop/avg_pool.cpp b/ngraph/test/type_prop/avg_pool.cpp index 427e1ba5b20bac..a08c58a2139d91 100644 --- a/ngraph/test/type_prop/avg_pool.cpp +++ b/ngraph/test/type_prop/avg_pool.cpp 
@@ -103,3 +103,455 @@ TEST(type_prop, avg_pool_auto_padding_spatial_dims_dynamic) ASSERT_EQ(mp->get_pads_begin(), (Shape{})); ASSERT_EQ(mp->get_pads_end(), (Shape{})); } + +TEST(type_prop, avg_pool_1d_deduce) +{ + const auto param = make_shared(element::f32, Shape{64, 3, 100}); + const Shape kernel{10}; + const auto avg_pool = make_shared( + param, Strides{1}, Shape{}, Shape{}, kernel, true, op::RoundingType::FLOOR); + + EXPECT_EQ(avg_pool->get_output_element_type(0), element::f32); + EXPECT_EQ(avg_pool->get_output_shape(0), (Shape{64, 3, 91})); + + EXPECT_EQ(avg_pool->get_strides(), Strides{1}); + EXPECT_EQ(avg_pool->get_kernel(), Shape{10}); + EXPECT_EQ(avg_pool->get_pads_begin(), Shape{0}); + EXPECT_EQ(avg_pool->get_pads_end(), Shape{0}); +} + +TEST(type_prop, avg_pool_1d_deduce_strided) +{ + const auto param = make_shared(element::f32, Shape{64, 3, 100}); + const Shape kernel{10}; + const auto move_strides = Strides{2}; + const auto avg_pool = make_shared( + param, move_strides, Shape{}, Shape{}, kernel, true, op::RoundingType::FLOOR); + + EXPECT_EQ(avg_pool->get_output_element_type(0), element::f32); + EXPECT_EQ(avg_pool->get_output_shape(0), (Shape{64, 3, 46})); + + EXPECT_EQ(avg_pool->get_strides(), Strides{2}); + EXPECT_EQ(avg_pool->get_kernel(), Shape{10}); + EXPECT_EQ(avg_pool->get_pads_begin(), Shape{0}); + EXPECT_EQ(avg_pool->get_pads_end(), Shape{0}); +} + +TEST(type_prop, avg_pool_1d_deduce_strided_small_uneven) +{ + const auto param = make_shared(element::f32, Shape{64, 3, 5}); + const Shape kernel{2}; + const auto move_strides = Strides{2}; + const auto avg_pool = make_shared( + param, move_strides, Shape{}, Shape{}, kernel, true, op::RoundingType::FLOOR); + + EXPECT_EQ(avg_pool->get_output_element_type(0), element::f32); + EXPECT_EQ(avg_pool->get_output_shape(0), (Shape{64, 3, 2})); + + EXPECT_EQ(avg_pool->get_strides(), Strides{2}); + EXPECT_EQ(avg_pool->get_kernel(), Shape{2}); + EXPECT_EQ(avg_pool->get_pads_begin(), Shape{0}); + 
EXPECT_EQ(avg_pool->get_pads_end(), Shape{0}); +} + +TEST(type_prop, avg_pool_1d_deduce_strided_small_even) +{ + const auto param = make_shared(element::f32, Shape{64, 3, 6}); + const Shape kernel{2}; + const auto move_strides = Strides{2}; + const auto avg_pool = make_shared( + param, move_strides, Shape{}, Shape{}, kernel, true, op::RoundingType::FLOOR); + + EXPECT_EQ(avg_pool->get_output_element_type(0), element::f32); + EXPECT_EQ(avg_pool->get_output_shape(0), (Shape{64, 3, 3})); + + EXPECT_EQ(avg_pool->get_strides(), Strides{2}); + EXPECT_EQ(avg_pool->get_kernel(), Shape{2}); + EXPECT_EQ(avg_pool->get_pads_begin(), Shape{0}); + EXPECT_EQ(avg_pool->get_pads_end(), Shape{0}); +} + +TEST(type_prop, avg_pool_2d_deduce) +{ + const auto param = make_shared(element::f32, Shape{64, 3, 100, 150}); + const Shape kernel{10, 20}; + const auto avg_pool = make_shared( + param, Strides{1, 1}, Shape{0, 0}, Shape{0, 0}, kernel, true, op::RoundingType::FLOOR); + + EXPECT_EQ(avg_pool->get_output_element_type(0), element::f32); + EXPECT_EQ(avg_pool->get_output_shape(0), (Shape{64, 3, 91, 131})); + + EXPECT_EQ(avg_pool->get_strides(), (Strides{1, 1})); + EXPECT_EQ(avg_pool->get_kernel(), (Shape{10, 20})); + EXPECT_EQ(avg_pool->get_pads_begin(), (Shape{0, 0})); + EXPECT_EQ(avg_pool->get_pads_end(), (Shape{0, 0})); +} + +TEST(type_prop, avg_pool_2d_deduce_strided) +{ + const auto param = make_shared(element::f32, Shape{64, 3, 100, 150}); + const Shape kernel{10, 20}; + const auto move_strides = Strides{2, 3}; + const auto avg_pool = make_shared( + param, move_strides, Shape{0, 0}, Shape{0, 0}, kernel, true, op::RoundingType::FLOOR); + + EXPECT_EQ(avg_pool->get_output_element_type(0), element::f32); + EXPECT_EQ(avg_pool->get_output_shape(0), (Shape{64, 3, 46, 44})); + + EXPECT_EQ(avg_pool->get_strides(), (Strides{2, 3})); + EXPECT_EQ(avg_pool->get_kernel(), (Shape{10, 20})); + EXPECT_EQ(avg_pool->get_pads_begin(), (Shape{0, 0})); + EXPECT_EQ(avg_pool->get_pads_end(), (Shape{0, 0})); 
+} + +TEST(type_prop, avg_pool_3d_deduce_strided_small) +{ + const auto param = make_shared(element::f32, Shape{64, 3, 7, 8, 10}); + const Shape kernel{2, 3, 2}; + const auto move_strides = Strides{2, 3, 4}; + const auto avg_pool = make_shared( + param, move_strides, Shape{0, 0, 0}, Shape{0, 0, 0}, kernel, true, op::RoundingType::FLOOR); + + EXPECT_EQ(avg_pool->get_output_element_type(0), element::f32); + EXPECT_EQ(avg_pool->get_output_shape(0), (Shape{64, 3, 3, 2, 3})); + + EXPECT_EQ(avg_pool->get_strides(), (Strides{2, 3, 4})); + EXPECT_EQ(avg_pool->get_kernel(), (Shape{2, 3, 2})); + EXPECT_EQ(avg_pool->get_pads_begin(), (Shape{0, 0, 0})); + EXPECT_EQ(avg_pool->get_pads_end(), (Shape{0, 0, 0})); +} + +TEST(type_prop, avg_pool_3d_deduce_strided_padded_small) +{ + const auto param = make_shared(element::f32, Shape{64, 3, 7, 8, 10}); + const Shape kernel{2, 3, 2}; + const auto move_strides = Strides{2, 3, 4}; + const Shape pads_begin{5, 6, 4}; + const Shape pads_end{6, 4, 5}; + const auto avg_pool = make_shared( + param, move_strides, pads_begin, pads_end, kernel, false, op::RoundingType::FLOOR); + + EXPECT_EQ(avg_pool->get_output_element_type(0), element::f32); + EXPECT_EQ(avg_pool->get_output_shape(0), (Shape{64, 3, 9, 6, 5})); + + EXPECT_EQ(avg_pool->get_strides(), (Strides{2, 3, 4})); + EXPECT_EQ(avg_pool->get_kernel(), (Shape{2, 3, 2})); + EXPECT_EQ(avg_pool->get_pads_begin(), (Shape{5, 6, 4})); + EXPECT_EQ(avg_pool->get_pads_end(), (Shape{6, 4, 5})); +} + +TEST(type_prop, avg_pool_invalid_0d_input) +{ + const auto param = make_shared(element::f32, Shape{}); + const Shape kernel{}; + EXPECT_THROW(make_shared( + param, Strides{1}, Shape{}, Shape{}, kernel, true, op::RoundingType::FLOOR), + NodeValidationFailure); +} + +TEST(type_prop, avg_pool_invalid_1d_input) +{ + const auto param = make_shared(element::f32, Shape{2}); + const Shape kernel{}; + EXPECT_THROW(make_shared( + param, Strides{1}, Shape{}, Shape{}, kernel, true, op::RoundingType::FLOOR), + 
NodeValidationFailure); +} + +TEST(type_prop, avg_pool_invalid_2d_input) +{ + const auto param = make_shared(element::f32, Shape{2, 6}); + const Shape kernel{}; + EXPECT_THROW(make_shared( + param, Strides{1}, Shape{}, Shape{}, kernel, true, op::RoundingType::FLOOR), + NodeValidationFailure); +} + +TEST(type_prop, avg_pool_invalid_0_batch_size) +{ + const auto param = make_shared(element::f32, Shape{0, 6, 1}); + const Shape kernel{1}; + EXPECT_THROW(make_shared( + param, Strides{1}, Shape{}, Shape{}, kernel, true, op::RoundingType::FLOOR), + NodeValidationFailure); +} + +TEST(type_prop, avg_pool_invalid_0_channels) +{ + const auto param = make_shared(element::f32, Shape{6, 0, 1}); + const Shape kernel{1}; + EXPECT_THROW(make_shared( + param, Strides{1}, Shape{}, Shape{}, kernel, true, op::RoundingType::FLOOR), + NodeValidationFailure); +} + +TEST(type_prop, avg_pool_invalid_wrong_number_of_window_dimensions_too_many) +{ + const auto param = make_shared(element::f32, Shape{6, 2, 10, 10}); + const Shape kernel{3, 3, 3}; + EXPECT_THROW(make_shared( + param, Strides{1}, Shape{}, Shape{}, kernel, true, op::RoundingType::FLOOR), + NodeValidationFailure); +} + +TEST(type_prop, avg_pool_invalid_wrong_number_of_window_dimensions_too_few) +{ + const auto param = make_shared(element::f32, Shape{6, 2, 10, 10}); + const Shape kernel{3}; + EXPECT_THROW(make_shared( + param, Strides{1}, Shape{}, Shape{}, kernel, true, op::RoundingType::FLOOR), + NodeValidationFailure); +} + +TEST(type_prop, avg_pool_invalid_movement_stride_rank) +{ + const auto param = make_shared(element::f32, Shape{6, 2, 10, 10}); + const Shape kernel{3, 3}; + const auto move_strides = Strides{2, 3, 8}; + EXPECT_THROW(make_shared( + param, move_strides, Shape{}, Shape{}, kernel, true, op::RoundingType::FLOOR), + NodeValidationFailure); +} + +TEST(type_prop, avg_pool_invalid_padding_below_rank) +{ + const auto param = make_shared(element::f32, Shape{6, 2, 10, 10}); + const Shape kernel{3, 3}; + const auto 
move_strides = Strides{2, 3}; + const Shape pads_begin{1, 2, 3}; + const Shape pads_end{1, 2}; + EXPECT_THROW( + make_shared( + param, move_strides, pads_begin, pads_end, kernel, true, op::RoundingType::FLOOR), + NodeValidationFailure); +} + +TEST(type_prop, avg_pool_invalid_padding_above_rank) +{ + const auto param = make_shared(element::f32, Shape{6, 2, 10, 10}); + const Shape kernel{3, 3}; + const auto move_strides = Strides{2, 3}; + const Shape pads_begin{1, 2}; + const Shape pads_end{1, 2, 3}; + EXPECT_THROW( + make_shared( + param, move_strides, pads_begin, pads_end, kernel, true, op::RoundingType::FLOOR), + NodeValidationFailure); +} + +TEST(type_prop, avg_pool_invalid_input_item_size_0) +{ + const auto param = make_shared(element::f32, Shape{6, 2, 0, 10}); + const Shape kernel{3, 3}; + EXPECT_THROW(make_shared( + param, Strides{1}, Shape{}, Shape{}, kernel, true, op::RoundingType::FLOOR), + NodeValidationFailure); +} + +TEST(type_prop, avg_pool_invalid_window_size_0) +{ + const auto param = make_shared(element::f32, Shape{6, 2, 10, 10}); + const Shape kernel{3, 0}; + EXPECT_THROW(make_shared( + param, Strides{1}, Shape{}, Shape{}, kernel, true, op::RoundingType::FLOOR), + NodeValidationFailure); +} + +TEST(type_prop, avg_pool_invalid_dilated_too_large) +{ + const auto param = make_shared(element::f32, Shape{6, 2, 8, 8}); + const Shape kernel{9, 9}; + EXPECT_THROW(make_shared( + param, Strides{1}, Shape{}, Shape{}, kernel, true, op::RoundingType::FLOOR), + NodeValidationFailure); +} + +TEST(type_prop, avg_pool_larger_than_pre_padding_but_fits_in_post_padding) +{ + const auto param = make_shared(element::f32, Shape{6, 2, 8, 8}); + const Shape kernel{9, 9}; + const Strides window_strides{1, 1}; + const Shape pads_begin{0, 0}; + const Shape pads_end{1, 1}; + const auto avg_pool = make_shared( + param, window_strides, pads_begin, pads_end, kernel, true, op::RoundingType::FLOOR); + + ASSERT_EQ(avg_pool->get_output_element_type(0), element::f32); + 
ASSERT_EQ(avg_pool->get_output_shape(0), (Shape{6, 2, 1, 1})); +} + +TEST(type_prop, avg_pool_invalid_movement_stride_0) +{ + const auto param = make_shared(element::f32, Shape{6, 2, 10, 10}); + const Shape kernel{3, 3}; + const auto move_strides = Strides{0, 1}; + EXPECT_THROW(make_shared( + param, move_strides, Shape{}, Shape{}, kernel, true, op::RoundingType::FLOOR), + NodeValidationFailure); +} + +TEST(type_prop, avg_pool_partial_rank_dynamic_ok) +{ + const PartialShape arg_shape{PartialShape::dynamic()}; + const Shape kernel{2, 3, 4, 5}; + const Strides window_movement_strides{1, 1, 1, 1}; + const Shape pads_begin{0, 0, 0, 0}; + const Shape pads_end{0, 0, 0, 0}; + + const auto param = make_shared(element::f32, arg_shape); + auto ap = make_shared(param, + window_movement_strides, + pads_begin, + pads_end, + kernel, + false, + op::RoundingType::FLOOR); + + ASSERT_EQ(ap->get_output_element_type(0), element::f32); + ASSERT_TRUE(ap->get_output_partial_shape(0).same_scheme(PartialShape::dynamic(6))); +} + +TEST(type_prop, avg_pool_partial_rank_dynamic_attrib_rank_mismatch) +{ + const PartialShape arg_shape{PartialShape::dynamic()}; + const Shape kernel{2, 3, 4, 5}; + const Strides window_movement_strides{1, 1, 1, 1, 1}; + const Shape pads_begin{0, 0, 0, 0}; + const Shape pads_end{0, 0, 0, 0}; + + const auto param = make_shared(element::f32, arg_shape); + + EXPECT_THROW(make_shared(param, + window_movement_strides, + pads_begin, + pads_end, + kernel, + false, + op::RoundingType::FLOOR), + NodeValidationFailure); +} + +TEST(type_prop, avg_pool_partial_rank_static_dynamic_ok) +{ + const PartialShape arg_shape{PartialShape::dynamic(6)}; + const Shape kernel{2, 3, 4, 5}; + const Strides window_movement_strides{1, 1, 1, 1}; + const Shape pads_begin{0, 0, 0, 0}; + const Shape pads_end{0, 0, 0, 0}; + + const auto param = make_shared(element::f32, arg_shape); + auto ap = make_shared(param, + window_movement_strides, + pads_begin, + pads_end, + kernel, + false, + 
op::RoundingType::FLOOR); + + ASSERT_EQ(ap->get_output_element_type(0), element::f32); + ASSERT_TRUE(ap->get_output_partial_shape(0).same_scheme(PartialShape::dynamic(6))); +} + +TEST(type_prop, avg_pool_partial_rank_static_dynamic_some_dims_known_ok) +{ + const PartialShape arg_shape{5, Dimension::dynamic(), 8, Dimension::dynamic(), 4, 7}; + const Shape kernel{2, 3, 4, 5}; + const Strides window_movement_strides{1, 1, 1, 1}; + const Shape pads_begin{0, 0, 0, 0}; + const Shape pads_end{0, 0, 0, 0}; + + const auto param = make_shared(element::f32, arg_shape); + auto ap = make_shared(param, + window_movement_strides, + pads_begin, + pads_end, + kernel, + false, + op::RoundingType::FLOOR); + + ASSERT_EQ(ap->get_output_element_type(0), element::f32); + ASSERT_TRUE(ap->get_output_partial_shape(0).same_scheme( + PartialShape{5, Dimension::dynamic(), 7, Dimension::dynamic(), 1, 3})); +} + +TEST(type_prop, avg_pool_partial_rank_static_dynamic_attrib_rank_mismatch) +{ + const PartialShape arg_shape{5, Dimension::dynamic(), 8, Dimension::dynamic(), 4, 7}; + const Shape kernel{2, 3, 4, 5, 6}; + const Strides window_movement_strides{1, 1, 1, 1}; + const Shape pads_begin{0, 0, 0, 0}; + const Shape pads_end{0, 0, 0, 0}; + + const auto param = make_shared(element::f32, arg_shape); + + EXPECT_THROW(make_shared(param, + window_movement_strides, + pads_begin, + pads_end, + kernel, + true, + op::RoundingType::FLOOR), + NodeValidationFailure); +} + +TEST(type_prop, avg_pool_partial_rank_static_dynamic_window_not_too_big) +{ + const PartialShape arg_shape{5, Dimension::dynamic(), 8, Dimension::dynamic(), 4, 7}; + const Shape kernel{9, 3, 4, 5}; + const Strides window_movement_strides{1, 1, 1, 1}; + const Shape pads_begin{0, 0, 0, 0}; + const Shape pads_end{0, 0, 0, 0}; + + const auto param = make_shared(element::f32, arg_shape); + + EXPECT_THROW(make_shared(param, + window_movement_strides, + pads_begin, + pads_end, + kernel, + true, + op::RoundingType::FLOOR), + 
NodeValidationFailure); +} + +TEST(type_prop, avg_pool_partial_rank_static_dynamic_padded_window_not_too_big) +{ + const PartialShape arg_shape{5, Dimension::dynamic(), 8, Dimension::dynamic(), 4, 7}; + const Shape kernel{9, 3, 4, 5}; + const Strides window_movement_strides{1, 1, 1, 1}; + const Shape pads_begin{0, 0, 0, 0}; + const Shape pads_end{1, 0, 0, 0}; + + const auto param = make_shared(element::f32, arg_shape); + auto ap = make_shared(param, + window_movement_strides, + pads_begin, + pads_end, + kernel, + true, + op::RoundingType::FLOOR); + + ASSERT_EQ(ap->get_output_element_type(0), element::f32); + ASSERT_TRUE(ap->get_output_partial_shape(0).same_scheme( + PartialShape{5, Dimension::dynamic(), 1, Dimension::dynamic(), 1, 3})); +} + +TEST(type_prop, avg_pool_partial_rank_static_dynamic_window_in_padding) +{ + const PartialShape arg_shape{5, Dimension::dynamic(), 8, Dimension::dynamic(), 4, 7}; + const Shape kernel{9, 3, 4, 3}; + const Strides window_movement_strides{1, 1, 1, 1}; + const Shape pads_begin{0, 0, 0, 4}; + const Shape pads_end{0, 0, 0, 0}; + + const auto param = make_shared(element::f32, arg_shape); + + EXPECT_THROW(make_shared(param, + window_movement_strides, + pads_begin, + pads_end, + kernel, + true, + op::RoundingType::FLOOR), + NodeValidationFailure); +} From 2c7f06e08f83934e2bc5b065605ab1943e96d740 Mon Sep 17 00:00:00 2001 From: Maksim Kutakov Date: Mon, 7 Sep 2020 17:17:14 +0300 Subject: [PATCH 15/66] [IE TESTS] GatherTree single layer test has been created. (#2006) * [IE TESTS] GatherTree op ref function has been created. * [IE TESTS] Added GatherTree single layer test * [IE TESTS] Fixed code styles. * [IE TESTS] GatherTree test FP32 precion was enabled. * [IE TESTS] Refactoring of Builder::makeConstatn procedure The refactoring is aimed at managing the range of random data for the constants initialization procedure. 
* [IE TESTS] GatherTree test was extended with constants * [IE TESTS] GatherTree ref rewritten to non-templated function. * [IE TESTS] GatherTree test inp shape indx enum removed. * Revert "[IE TESTS] Refactoring of Builder::makeConstatn procedure" This reverts commit 2648172e00ccca266d39e8775b890b8a8395f57c. * [IE TESTS] makeConstant was augmented with random data range parameters. * [IE TESTS] GatherTree test was rewritten using makeConstant function. * [IE TESTS] GaterTree test call templated makeConstant * [IE TESTS] GaterTree test code style fix --- .../single_layer_tests/gather_tree.cpp | 34 ++++ .../single_layer_tests/gather_tree.hpp | 34 ++++ .../src/single_layer_tests/gather_tree.cpp | 87 +++++++++++ .../include/ngraph_functions/builders.hpp | 9 +- .../ngraph_functions/utils/data_utils.hpp | 12 +- .../ngraph/runtime/reference/gather_tree.hpp | 39 +++++ .../src/runtime/reference/gather_tree.cpp | 145 ++++++++++++++++++ .../runtime/interpreter/int_executable.hpp | 16 ++ .../runtime/interpreter/opset_int_tbl.hpp | 1 + 9 files changed, 367 insertions(+), 10 deletions(-) create mode 100644 inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/gather_tree.cpp create mode 100644 inference-engine/tests/functional/plugin/shared/include/single_layer_tests/gather_tree.hpp create mode 100644 inference-engine/tests/functional/plugin/shared/src/single_layer_tests/gather_tree.cpp create mode 100644 ngraph/core/reference/include/ngraph/runtime/reference/gather_tree.hpp create mode 100644 ngraph/core/reference/src/runtime/reference/gather_tree.cpp diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/gather_tree.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/gather_tree.cpp new file mode 100644 index 00000000000000..9c510449601783 --- /dev/null +++ 
b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/gather_tree.cpp @@ -0,0 +1,34 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "single_layer_tests/gather_tree.hpp" +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; + +namespace { + +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::I32 +}; + +const std::vector> inputShapes = { {5, 1, 10}, {1, 1, 10}, {20, 1, 10}, {20, 20, 10} }; + +const std::vector secondaryInputTypes = { + ngraph::helpers::InputLayerType::CONSTANT, + ngraph::helpers::InputLayerType::PARAMETER +}; + +INSTANTIATE_TEST_CASE_P(Basic_smoke, GatherTreeLayerTest, + ::testing::Combine( + ::testing::ValuesIn(inputShapes), + ::testing::ValuesIn(secondaryInputTypes), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + GatherTreeLayerTest::getTestCaseName); + +} // namespace diff --git a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/gather_tree.hpp b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/gather_tree.hpp new file mode 100644 index 00000000000000..8042b648772f42 --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/gather_tree.hpp @@ -0,0 +1,34 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + +#include "functional_test_utils/layer_test_utils.hpp" +#include "ngraph_functions/builders.hpp" +#include "ngraph_functions/utils/ngraph_helpers.hpp" + +namespace LayerTestsDefinitions { + +using GatherTreeParamsTuple = typename std::tuple< + std::vector, // Input tensors shape + ngraph::helpers::InputLayerType, // Secondary input type + InferenceEngine::Precision, // Network precision + std::string>; // Device name + +class 
GatherTreeLayerTest : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon { +public: + static std::string getTestCaseName(const testing::TestParamInfo &obj); + InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo &info) const override; + +protected: + void SetUp() override; +}; + +} // namespace LayerTestsDefinitions \ No newline at end of file diff --git a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/gather_tree.cpp b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/gather_tree.cpp new file mode 100644 index 00000000000000..172539bbf43af6 --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/gather_tree.cpp @@ -0,0 +1,87 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include "common_test_utils/common_utils.hpp" +#include "functional_test_utils/precision_utils.hpp" +#include "functional_test_utils/skip_tests_config.hpp" + +#include "single_layer_tests/gather_tree.hpp" + +namespace LayerTestsDefinitions { +std::string GatherTreeLayerTest::getTestCaseName(const testing::TestParamInfo &obj) { + std::vector inputShape; + InferenceEngine::Precision netPrecision; + ngraph::helpers::InputLayerType secondaryInputType; + std::string targetName; + + std::tie(inputShape, secondaryInputType, netPrecision, targetName) = obj.param; + + std::ostringstream result; + result << "IS=" << CommonTestUtils::vec2str(inputShape) << "_"; + result << "secondaryInputType=" << secondaryInputType << "_"; + result << "netPRC=" << netPrecision.name() << "_"; + result << "targetDevice=" << targetName; + return result.str(); +} + +void GatherTreeLayerTest::SetUp() { + std::vector inputShape; + InferenceEngine::Precision netPrecision; + ngraph::helpers::InputLayerType secondaryInputType; + + std::tie(inputShape, secondaryInputType, netPrecision, targetDevice) = GetParam(); + 
+ auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + + std::shared_ptr inp2; + std::shared_ptr inp3; + std::shared_ptr inp4; + + auto paramsIn = ngraph::builder::makeParams(ngPrc, { inputShape }); + if (ngraph::helpers::InputLayerType::PARAMETER == secondaryInputType) { + auto paramsSecond = ngraph::builder::makeParams(ngPrc, { inputShape, {inputShape.at(1)}, {}}); + paramsIn.insert(paramsIn.end(), paramsSecond.begin(), paramsSecond.end()); + + inp2 = paramsIn.at(1); + inp3 = paramsIn.at(2); + inp4 = paramsIn.at(3); + } else if (ngraph::helpers::InputLayerType::CONSTANT == secondaryInputType) { + auto maxBeamIndex = inputShape.at(2) - 1; + + inp2 = ngraph::builder::makeConstant(ngPrc, inputShape, {}, true, maxBeamIndex); + inp3 = ngraph::builder::makeConstant(ngPrc, {inputShape.at(1)}, {}, true, maxBeamIndex); + inp4 = ngraph::builder::makeConstant(ngPrc, {}, {}, true, maxBeamIndex); + } else { + throw std::runtime_error("Unsupported inputType"); + } + + auto operationResult = std::make_shared(paramsIn.front(), inp2, inp3, inp4); + + ngraph::ResultVector results{std::make_shared(operationResult)}; + function = std::make_shared(results, paramsIn, "GatherTree"); +} + +InferenceEngine::Blob::Ptr GatherTreeLayerTest::GenerateInput(const InferenceEngine::InputInfo &info) const { + auto& shape = function->get_parameters()[0]->get_output_shape(0); + auto& vecDims = info.getTensorDesc().getDims(); + + auto maxBeamIndx = shape.at(2) - 1; + + if (vecDims.size() == 1 || vecDims.size() == 0) { //max_seq_len vector || end_token + return FuncTestUtils::createAndFillBlob(info.getTensorDesc(), maxBeamIndx, maxBeamIndx / 2); + } + + return FuncTestUtils::createAndFillBlob(info.getTensorDesc(), maxBeamIndx); +} + +TEST_P(GatherTreeLayerTest, CompareWithRefs) { + Run(); +}; + +} // namespace LayerTestsDefinitions \ No newline at end of file diff --git a/inference-engine/tests/ngraph_functions/include/ngraph_functions/builders.hpp 
b/inference-engine/tests/ngraph_functions/include/ngraph_functions/builders.hpp index 1ce7d4e3123075..afc245c3bd5299 100644 --- a/inference-engine/tests/ngraph_functions/include/ngraph_functions/builders.hpp +++ b/inference-engine/tests/ngraph_functions/include/ngraph_functions/builders.hpp @@ -24,27 +24,28 @@ makeParams(const element::Type &type, const std::vector std::shared_ptr makeConstant(const element::Type &type, const std::vector &shape, - const std::vector &data, bool random = false) { + const std::vector &data, bool random = false, + uint32_t upTo = 10, uint32_t startFrom = 1) { std::shared_ptr weightsNode; #define makeNode(TYPE) \ case TYPE: \ weightsNode = std::make_shared( \ type, shape, \ - random ? NGraphFunctions::Utils::generateVector(ngraph::shape_size(shape)) : \ + random ? NGraphFunctions::Utils::generateVector(ngraph::shape_size(shape), upTo, startFrom) : \ NGraphFunctions::Utils::castVector::value_type >(data)); \ break; switch (type) { case ngraph::element::Type_t::bf16: weightsNode = std::make_shared( type, shape, - random ? NGraphFunctions::Utils::generateBF16Vector(ngraph::shape_size(shape)) : + random ? NGraphFunctions::Utils::generateBF16Vector(ngraph::shape_size(shape), upTo, startFrom) : NGraphFunctions::Utils::castVector(data)); break; case ngraph::element::Type_t::f16: weightsNode = std::make_shared( type, shape, - random ? NGraphFunctions::Utils::generateF16Vector(ngraph::shape_size(shape)) : + random ? 
NGraphFunctions::Utils::generateF16Vector(ngraph::shape_size(shape), upTo, startFrom) : NGraphFunctions::Utils::castVector(data)); break; makeNode(ngraph::element::Type_t::f32); diff --git a/inference-engine/tests/ngraph_functions/include/ngraph_functions/utils/data_utils.hpp b/inference-engine/tests/ngraph_functions/include/ngraph_functions/utils/data_utils.hpp index 46ddd7f0a59c99..ebcf456d1e3816 100644 --- a/inference-engine/tests/ngraph_functions/include/ngraph_functions/utils/data_utils.hpp +++ b/inference-engine/tests/ngraph_functions/include/ngraph_functions/utils/data_utils.hpp @@ -17,13 +17,13 @@ namespace Utils { template std::vector::value_type> inline -generateVector(size_t vec_len) { +generateVector(size_t vec_len, uint32_t upTo = 10, uint32_t startFrom = 1) { std::vector::value_type> res; std::mt19937 gen( static_cast(std::chrono::high_resolution_clock::now().time_since_epoch().count())); // chose values between this range to avoid type overrun (e.g. in case of I8 precision) - std::uniform_int_distribution dist(1, 10); + std::uniform_int_distribution dist(startFrom, upTo); for (int i = 0; i < vec_len; i++) { res.push_back( @@ -32,13 +32,13 @@ generateVector(size_t vec_len) { return res; } -std::vector inline generateF16Vector(size_t vec_len) { +std::vector inline generateF16Vector(size_t vec_len, uint32_t upTo = 10, uint32_t startFrom = 1) { std::vector res; std::mt19937 gen( static_cast(std::chrono::high_resolution_clock::now().time_since_epoch().count())); // chose values between this range to avoid type overrun (e.g. 
in case of I8 precision) - std::uniform_int_distribution dist(1, 10); + std::uniform_int_distribution dist(startFrom, upTo); for (int i = 0; i < vec_len; i++) { res.emplace_back(ngraph::float16(static_cast(dist(gen)))); @@ -46,13 +46,13 @@ std::vector inline generateF16Vector(size_t vec_len) { return res; } -std::vector inline generateBF16Vector(size_t vec_len) { +std::vector inline generateBF16Vector(size_t vec_len, uint32_t upTo = 10, uint32_t startFrom = 1) { std::vector res; std::mt19937 gen( static_cast(std::chrono::high_resolution_clock::now().time_since_epoch().count())); // chose values between this range to avoid type overrun (e.g. in case of I8 precision) - std::uniform_int_distribution dist(1, 10); + std::uniform_int_distribution dist(startFrom, upTo); for (int i = 0; i < vec_len; i++) { res.emplace_back(ngraph::bfloat16(static_cast(dist(gen)))); diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/gather_tree.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/gather_tree.hpp new file mode 100644 index 00000000000000..665f471622d957 --- /dev/null +++ b/ngraph/core/reference/include/ngraph/runtime/reference/gather_tree.hpp @@ -0,0 +1,39 @@ +//***************************************************************************** +// Copyright 2017-2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** + +#pragma once + +#include "ngraph/type/element_type.hpp" + +namespace ngraph +{ + namespace runtime + { + namespace reference + { + void gather_tree(const char* step_ids, + const char* parent_ids, + const char* max_seq_len, + const char* end_token, + char* out, + const Shape& step_ids_shape, + const Shape& parent_ids_shape, + const Shape& max_seq_len_shape, + const Shape& end_token_shape, + const element::Type& type); + } + } +} \ No newline at end of file diff --git a/ngraph/core/reference/src/runtime/reference/gather_tree.cpp b/ngraph/core/reference/src/runtime/reference/gather_tree.cpp new file mode 100644 index 00000000000000..0008f77515c738 --- /dev/null +++ b/ngraph/core/reference/src/runtime/reference/gather_tree.cpp @@ -0,0 +1,145 @@ +//***************************************************************************** +// Copyright 2017-2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +#include +#include +#include + +#include "ngraph/check.hpp" +#include "ngraph/coordinate_transform.hpp" +#include "ngraph/runtime/reference/gather_tree.hpp" + +using namespace ngraph; + +static size_t _asIndex(const char* source, const element::Type& element_type) +{ + // According to the GatherTree op specification only I32 and FP32 precisions are supported. 
+ switch (element_type) + { + case element::Type_t::f32: + { + float tmpBuff = 0.f; + memcpy(&tmpBuff, source, sizeof(float)); + return tmpBuff; + } + case element::Type_t::i32: + { + int32_t tmpBuff = 0; + memcpy(&tmpBuff, source, sizeof(int32_t)); + return tmpBuff; + } + default: + { + throw ngraph_error(std::string("Unsupported input data type: ") + + element_type.get_type_name()); + } + } +} + +// This is an implementation of the algorithm from the tensorflow 1.5 sources. +void runtime::reference::gather_tree(const char* step_ids, + const char* parent_ids, + const char* max_seq_len, + const char* end_token, + char* out, + const Shape& step_ids_shape, + const Shape& parent_ids_shape, + const Shape& max_seq_len_shape, + const Shape& end_token_shape, + const element::Type& element_type) +{ + if (step_ids_shape != parent_ids_shape) + { + throw ngraph_error("step_ids shape and parent_ids shape must be the same"); + } + if (step_ids_shape.size() != 3) + { + throw ngraph_error("step_ids must be a 3-tensor"); + } + if (!is_vector(max_seq_len_shape)) + { + throw ngraph_error("max_seq_len must be a vector"); + } + if (!is_scalar(end_token_shape)) + { + throw ngraph_error("end_token must be a scalar"); + } + + const size_t max_time = step_ids_shape.at(0); + const size_t batch_size = step_ids_shape.at(1); + const size_t beam_width = step_ids_shape.at(2); + + const size_t elem_size = element_type.size(); + + if (max_seq_len_shape.front() != batch_size) + { + throw ngraph_error("max_seq_len must have size of BATCH_SIZE"); + } + + ngraph::CoordinateTransform cordinate_transform(step_ids_shape); + + for (const auto& coord : cordinate_transform) + { + memcpy(out + cordinate_transform.index(coord) * elem_size, end_token, elem_size); + } + + for (size_t batch = 0; batch < batch_size; ++batch) + { + for (size_t beam = 0; beam < beam_width; ++beam) + { + const size_t max_seq_in_beam = + std::min(max_time, _asIndex(max_seq_len + batch * elem_size, element_type)); + + if 
(max_seq_in_beam == 0) + { + continue; + } + + auto offset = cordinate_transform.index({max_seq_in_beam - 1, batch, beam}) * elem_size; + + memcpy(out + offset, step_ids + offset, elem_size); + + size_t parent = _asIndex(parent_ids + offset, element_type); + + for (size_t level = max_seq_in_beam - 1; level-- > 0;) + { + memcpy(out + cordinate_transform.index({level, batch, beam}) * elem_size, + step_ids + cordinate_transform.index({level, batch, parent}) * elem_size, + elem_size); + + parent = _asIndex(parent_ids + + cordinate_transform.index({level, batch, parent}) * elem_size, + element_type); + } + + bool finished = false; + for (size_t time = 0; time < max_seq_in_beam; ++time) + { + if (finished) + { + memcpy(out + cordinate_transform.index({time, batch, beam}) * elem_size, + end_token, + elem_size); + } + else if (_asIndex(out + cordinate_transform.index({time, batch, beam}) * elem_size, + element_type) == _asIndex(end_token, element_type)) + { + finished = true; + } + } + } + } +} \ No newline at end of file diff --git a/ngraph/test/runtime/interpreter/int_executable.hpp b/ngraph/test/runtime/interpreter/int_executable.hpp index 0f4c4eac8b0d0c..26e35ef2f7b8ea 100644 --- a/ngraph/test/runtime/interpreter/int_executable.hpp +++ b/ngraph/test/runtime/interpreter/int_executable.hpp @@ -59,6 +59,8 @@ #include "ngraph/runtime/reference/floor.hpp" #include "ngraph/runtime/reference/gather.hpp" #include "ngraph/runtime/reference/gather_nd.hpp" +#include "ngraph/runtime/reference/gather_tree.hpp" +#include "ngraph/runtime/reference/gather_tree.hpp" #include "ngraph/runtime/reference/gru_cell.hpp" #include "ngraph/runtime/reference/log.hpp" #include "ngraph/runtime/reference/lrn.hpp" @@ -1258,6 +1260,20 @@ class INTERPRETER_BACKEND_API ngraph::runtime::interpreter::INTExecutable : publ break; } + case OP_TYPEID::GatherTree_v1: + { + reference::gather_tree(args[0]->get_data_ptr(), + args[1]->get_data_ptr(), + args[2]->get_data_ptr(), + args[3]->get_data_ptr(), + 
out[0]->get_data_ptr(), + node.get_input_shape(0), + node.get_input_shape(1), + node.get_input_shape(2), + node.get_input_shape(3), + args[1]->get_element_type()); + break; + } // Fused Ops are not supported in interpreter. They need to be decomposed before execution case OP_TYPEID::DepthToSpace: diff --git a/ngraph/test/runtime/interpreter/opset_int_tbl.hpp b/ngraph/test/runtime/interpreter/opset_int_tbl.hpp index 1dadbfa35e2085..f44775d4010fb3 100644 --- a/ngraph/test/runtime/interpreter/opset_int_tbl.hpp +++ b/ngraph/test/runtime/interpreter/opset_int_tbl.hpp @@ -29,6 +29,7 @@ NGRAPH_OP(LogicalAnd, op::v1) NGRAPH_OP(LogicalOr, op::v1) NGRAPH_OP(LogicalXor, op::v1) NGRAPH_OP(LogicalNot, op::v1) +NGRAPH_OP(GatherTree, op::v1) #undef ID_SUFFIX #define ID_SUFFIX(NAME) NAME##_v3 From b225ddf4148bcb0e652b8884f382665a1eb65c23 Mon Sep 17 00:00:00 2001 From: Dmitrii Denisov Date: Mon, 7 Sep 2020 19:54:56 +0300 Subject: [PATCH 16/66] Refactoring: install_openvino_dependencies.sh script (#2112) --- .../install_openvino_dependencies.sh | 100 ++---------------- 1 file changed, 8 insertions(+), 92 deletions(-) diff --git a/scripts/install_dependencies/install_openvino_dependencies.sh b/scripts/install_dependencies/install_openvino_dependencies.sh index fe47c5b89d980f..e62152f1b0b912 100755 --- a/scripts/install_dependencies/install_openvino_dependencies.sh +++ b/scripts/install_dependencies/install_openvino_dependencies.sh @@ -71,6 +71,7 @@ if [ -f /etc/lsb-release ]; then gstreamer1.0-plugins-base gstreamer1.0-plugins-good gstreamer1.0-plugins-bad + gstreamer1.0-vaapi ffmpeg ) system_ver=$(cat /etc/lsb-release | grep -i "DISTRIB_RELEASE" | cut -d "=" -f2) @@ -78,12 +79,7 @@ if [ -f /etc/lsb-release ]; then PKGS+=( libgtk2.0-0 ) else if [ "$system_ver" = "20.04" ]; then - PKGS+=( libglib2.0-0 - libgstreamer1.0-0 - gstreamer1.0-plugins-base - gstreamer1.0-plugins-good - gstreamer1.0-plugins-bad - gstreamer1.0-plugins-ugly + PKGS+=( gstreamer1.0-plugins-ugly gstreamer1.0-libav 
libgstreamer-plugins-base1.0-dev gstreamer1.0-doc @@ -103,115 +99,35 @@ if [ -f /etc/lsb-release ]; then libpython3.8 ) elif [ "$system_ver" = "18.04" ]; then - PKGS+=( libglib2.0 - libfluidsynth1 + PKGS+=( libfluidsynth1 libnettle6 libopenexr22 - python3.6 - libpython3.6 + gstreamer1.0-plugins-ugly + gstreamer1.0-alsa + gstreamer1.0-gtk3 ) fi - PKGS+=( libgtk-3-0 - flex + PKGS+=( flex bison - libgmp10 libgsl23 gobject-introspection - libcap2 - libcap2-bin - gettext - libgirepository-1.0-1 - libx11-6 - iso-codes - libgl1-mesa-dri - libgles2 libgl-dev - gir1.2-gudev-1.0 - libtheora0 - libcdparanoia0 - libpango-1.0-0 - libgbm1 - libasound2 - libjpeg8 - libvisual-0.4-0 - libxv1 - libopus0 - libgraphene-1.0-0 - libvorbis0a - libbz2-1.0 - libv4l-0 - libaa1 - libflac8 - libgdk-pixbuf2.0-0 - libmp3lame0 - libcaca0 - libdv4 - libmpg123-0 - libraw1394-11 - libavc1394-0 - libiec61883-0 - libpulse0 - libsoup2.4-1 - libspeex1 libtag-extras1 - libtwolame0 - libwavpack1 - libbluetooth3 libusb-1.0-0-dev - libass9 - libbs2b0 - libchromaprint1 - liblcms2-2 - libssh2-1 - libdc1394-22 libdirectfb-1.7-7 - libssh-4 - libdca0 libfaac0 libfdk-aac1 - libflite1 - libgme0 - libgsm1 - libkate1 liblrdf0 - libde265-0 libmjpegtools-dev - libmms0 - libmodplug1 - libmpcdec6 - libneon27 - libopenal1 - libopenjp2-7 - libopenmpt0 libopenni2-0 - libdvdnav4 - librtmp1 - librsvg2-2 - libsbc1 - libsndfile1 - libsoundtouch1 - libspandsp2 - libsrtp2-1 - libzvbi0 - libvo-aacenc0 - libvo-amrwbenc0 - libwebrtc-audio-processing1 - libwebp6 - libwildmidi2 - libzbar0 - libnice10 - libxkbcommon0 libmpeg2-4 libopencore-amrnb0 libopencore-amrwb0 liba52-0.7.4 - libva2 - libxrandr2 - libudev1 - python3-gi ) fi apt update + # shellcheck disable=SC2068 apt install -y ${PKGS[@]} else # CentOS From dc8bbd930f93cabf8f882e745636c656a0f51980 Mon Sep 17 00:00:00 2001 From: Edward Shogulin Date: Mon, 7 Sep 2020 20:31:45 +0300 Subject: [PATCH 17/66] [LPT] Multiinput with one parent and FQ with three Constant (#2066) * [LPT] 
FakeQuantize with three constants * [LPT] Dequantization ops on thw inputs with one parent --- .../network_helper.hpp | 7 +- .../src/activation.cpp | 12 +- .../src/concat.cpp | 16 +- .../src/layer_transformation.cpp | 14 +- .../src/network_helper.cpp | 151 +++++++++++------- ..._constant_fake_quantize_transformation.cpp | 32 ++++ ...ultiply_with_one_parent_transformation.cpp | 31 ++++ ..._constant_fake_quantize_transformation.cpp | 31 ++++ ...ultiply_with_one_parent_transformation.cpp | 31 ++++ ..._constant_fake_quantize_transformation.hpp | 38 +++++ ...ultiply_with_one_parent_transformation.hpp | 40 +++++ ..._constant_fake_quantize_transformation.cpp | 63 ++++++++ ...multiply_with_one_parent_transformaion.cpp | 84 ++++++++++ ...imized_constant_fake_quantize_function.hpp | 26 +++ .../multiply_with_one_parent_function.hpp | 25 +++ ...imized_constant_fake_quantize_function.cpp | 51 ++++++ .../multiply_with_one_parent_function.cpp | 32 ++++ 17 files changed, 613 insertions(+), 71 deletions(-) create mode 100644 inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp create mode 100644 inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/multiply_with_one_parent_transformation.cpp create mode 100644 inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp create mode 100644 inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/multiply_with_one_parent_transformation.cpp create mode 100644 inference-engine/tests/functional/plugin/shared/include/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.hpp create mode 100644 
inference-engine/tests/functional/plugin/shared/include/low_precision_transformations/multiply_with_one_parent_transformation.hpp create mode 100644 inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp create mode 100644 inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/multiply_with_one_parent_transformaion.cpp create mode 100644 inference-engine/tests/ngraph_functions/include/ngraph_functions/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_function.hpp create mode 100644 inference-engine/tests/ngraph_functions/include/ngraph_functions/low_precision_transformations/multiply_with_one_parent_function.hpp create mode 100644 inference-engine/tests/ngraph_functions/src/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_function.cpp create mode 100644 inference-engine/tests/ngraph_functions/src/low_precision_transformations/multiply_with_one_parent_function.cpp diff --git a/inference-engine/src/low_precision_transformations/include/low_precision_transformations/network_helper.hpp b/inference-engine/src/low_precision_transformations/include/low_precision_transformations/network_helper.hpp index 643688af6dad6b..1cc3af0838193c 100644 --- a/inference-engine/src/low_precision_transformations/include/low_precision_transformations/network_helper.hpp +++ b/inference-engine/src/low_precision_transformations/include/low_precision_transformations/network_helper.hpp @@ -140,7 +140,9 @@ class INFERENCE_ENGINE_API_CLASS(CNNNetworkHelper) { static void replaceLayer(TransformationContext& context, const CNNLayerPtr source, const CNNLayerPtr target); - static CNNLayerPtr addScaleShiftBetween( + // Add ScaleShift beween parent and child layers. Affected edges (output and input ports) are not specified. + // As result ScaleShift will be added for all edges between parent and children. 
+ static std::vector addScaleShiftBetween( TransformationContext& context, const CNNLayerPtr parent, const CNNLayerPtr child, @@ -158,7 +160,8 @@ class INFERENCE_ENGINE_API_CLASS(CNNNetworkHelper) { DataPtr parentOutData, CNNLayer::Ptr layer, const std::string& nextLayerName, - ICNNNetwork& net); + ICNNNetwork& net, + const int childInsDataIndex = -1); IE_SUPPRESS_DEPRECATED_START static void fillInScaleShift(ScaleShiftLayer* layer, const size_t channels, const float* scales, const float* shifts); diff --git a/inference-engine/src/low_precision_transformations/src/activation.cpp b/inference-engine/src/low_precision_transformations/src/activation.cpp index 34e88303c03338..450aaaaa258c8a 100644 --- a/inference-engine/src/low_precision_transformations/src/activation.cpp +++ b/inference-engine/src/low_precision_transformations/src/activation.cpp @@ -105,8 +105,14 @@ void ActivationTransformation::transform(TransformationContext& context, CNNLaye const std::vector children = CNNNetworkHelper::getChildren(*activationLayer); for (const CNNLayerPtr& child : children) { - CNNLayerPtr dequantizationLayer = CNNNetworkHelper::addScaleShiftBetween(context, activationLayer, child, - DequantizationDetails(scales, shifts)); - context.dequantizationLayersNames.insert(dequantizationLayer->name); + const std::vector dequantizationLayers = CNNNetworkHelper::addScaleShiftBetween( + context, + activationLayer, + child, + DequantizationDetails(scales, shifts)); + + for (const auto& dequantizationLayer : dequantizationLayers) { + context.dequantizationLayersNames.insert(dequantizationLayer->name); + } } } diff --git a/inference-engine/src/low_precision_transformations/src/concat.cpp b/inference-engine/src/low_precision_transformations/src/concat.cpp index b883af8075bc57..378b00b0ea5fdb 100644 --- a/inference-engine/src/low_precision_transformations/src/concat.cpp +++ b/inference-engine/src/low_precision_transformations/src/concat.cpp @@ -253,12 +253,15 @@ void 
ConcatTransformation::addDequantizationLayers( getLayerDequantizationCallback(*layer, layer->name, layerDequantizationScales, layerDequantizationShifts); } - CNNLayerPtr dequantizationLayer = CNNNetworkHelper::addScaleShiftBetween( + const std::vector dequantizationLayers = CNNNetworkHelper::addScaleShiftBetween( context, std::make_shared(*layer), child, DequantizationDetails(layerDequantizationScales, layerDequantizationShifts, layerDequantizationScales.size())); - context.dequantizationLayersNames.insert(dequantizationLayer->name); + + for (const CNNLayerPtr& dequantizationLayer : dequantizationLayers) { + context.dequantizationLayersNames.insert(dequantizationLayer->name); + } } } @@ -275,14 +278,17 @@ void ConcatTransformation::addDequantizationLayers( getLayerDequantizationCallback(*layer, originalName, layerDequantizationScales, layerDequantizationShifts); } - CNNLayerPtr dequantizationLayer = CNNNetworkHelper::addScaleShiftBetween( + const std::vector dequantizationLayers = CNNNetworkHelper::addScaleShiftBetween( context, std::make_shared(*layer), nullptr, DequantizationDetails(layerDequantizationScales, layerDequantizationShifts, layerDequantizationScales.size()), originalName); - context.dequantizationLayersNames.insert(dequantizationLayer->name); - subgraph.layers[dequantizationLayer->name] = dequantizationLayer.get(); + + for (const CNNLayerPtr& dequantizationLayer : dequantizationLayers) { + context.dequantizationLayersNames.insert(dequantizationLayer->name); + subgraph.layers[dequantizationLayer->name] = dequantizationLayer.get(); + } } } } diff --git a/inference-engine/src/low_precision_transformations/src/layer_transformation.cpp b/inference-engine/src/low_precision_transformations/src/layer_transformation.cpp index 9db8490d8a3e99..c4fda19a728658 100644 --- a/inference-engine/src/low_precision_transformations/src/layer_transformation.cpp +++ b/inference-engine/src/low_precision_transformations/src/layer_transformation.cpp @@ -254,12 +254,15 @@ void 
LayerTransformation::addDequantizationLayer( const std::vector children = CNNNetworkHelper::getChildren(layer); for (const CNNLayerPtr& child : children) { - const CNNLayerPtr dequantizationLayer = CNNNetworkHelper::addScaleShiftBetween( + const std::vector dequantizationLayers = CNNNetworkHelper::addScaleShiftBetween( context, std::make_shared(layer), child, DequantizationDetails(dequantizationScales, dequantizationShifts, outputChannelsCount)); - context.dequantizationLayersNames.insert(dequantizationLayer->name); + + for (const auto& dequantizationLayer : dequantizationLayers) { + context.dequantizationLayersNames.insert(dequantizationLayer->name); + } } OutputsDataMap outputs; @@ -269,13 +272,16 @@ void LayerTransformation::addDequantizationLayer( const std::string dequantizationLayerName = layer.name; CNNNetworkHelper::renameLayer(context.network, layer.name, layer.name + LayerTransformation::lastLayerPostfix); - const CNNLayerPtr dequantizationLayer = CNNNetworkHelper::addScaleShiftBetween( + const std::vector dequantizationLayers = CNNNetworkHelper::addScaleShiftBetween( context, std::make_shared(layer), nullptr, DequantizationDetails(dequantizationScales, dequantizationShifts, outputChannelsCount), dequantizationLayerName); - context.dequantizationLayersNames.insert(dequantizationLayer->name); + + for (const auto& dequantizationLayer : dequantizationLayers) { + context.dequantizationLayersNames.insert(dequantizationLayer->name); + } } } diff --git a/inference-engine/src/low_precision_transformations/src/network_helper.cpp b/inference-engine/src/low_precision_transformations/src/network_helper.cpp index b8c0b09c916f82..8556e23402f355 100644 --- a/inference-engine/src/low_precision_transformations/src/network_helper.cpp +++ b/inference-engine/src/low_precision_transformations/src/network_helper.cpp @@ -439,8 +439,7 @@ std::vector CNNNetworkHelper::transformFakeQuantizeToConst(Transfor const CNNLayerPtr fakeQuantize, const Blob::Ptr weights, const std::string& 
constLayerName) { - std::vector constLayersToRemove; - constLayersToRemove.reserve(fakeQuantize->insData.size()); + std::set constLayersToRemove; for (const DataWeakPtr& insDataWeak : fakeQuantize->insData) { const DataPtr insData = insDataWeak.lock(); @@ -456,7 +455,7 @@ std::vector CNNNetworkHelper::transformFakeQuantizeToConst(Transfor << fakeQuantize->name << "' is nullable"; } - constLayersToRemove.push_back(parent); + constLayersToRemove.insert(parent); } for (const CNNLayerPtr& parent : constLayersToRemove) { @@ -1049,7 +1048,7 @@ void CNNNetworkHelper::replaceLayer(TransformationContext& context, const CNNLay networkImpl->addLayer(target); } -CNNLayerPtr CNNNetworkHelper::addScaleShiftBetween(TransformationContext& context, const CNNLayerPtr parent, +std::vector CNNNetworkHelper::addScaleShiftBetween(TransformationContext& context, const CNNLayerPtr parent, const CNNLayerPtr child, const DequantizationDetails& dequantizationDetails, const std::string& name) { @@ -1078,66 +1077,92 @@ CNNLayerPtr CNNNetworkHelper::addScaleShiftBetween(TransformationContext& contex CNNNetworkHelper::updateBlobs(*child, "biases", updatedShifts); } - return child; + return { child }; } // Searching the connection between the layers - int l1_out_i = 0; + + // specify parent/child edges here and manipulate with them below + std::vector parentOutDataIndexes; + std::vector childInsDataIndexes; if (child != nullptr) { - for (; l1_out_i < parent->outData.size(); l1_out_i++) { - if (getInputTo(parent->outData[l1_out_i]).find(child->name) != - getInputTo(parent->outData[l1_out_i]).end()) { - break; + for (int l1_out_i = 0; l1_out_i < parent->outData.size(); l1_out_i++) { + auto& inputTo = getInputTo(parent->outData[l1_out_i]); + if (inputTo.find(child->name) != inputTo.end()) { + parentOutDataIndexes.push_back(l1_out_i); + } + } + + for (size_t i = 0; i < child->insData.size(); ++i) { + const auto& insData = child->insData[i]; + const CNNLayerPtr& creatorLayer = 
getCreatorLayer(insData.lock()).lock(); + if (creatorLayer->name == parent->name) { + childInsDataIndexes.push_back(i); } } + } else { + parentOutDataIndexes.push_back(0); + childInsDataIndexes.push_back(0); } - if (l1_out_i == parent->outData.size()) { + + if (childInsDataIndexes.empty()) { if (child != nullptr) THROW_IE_EXCEPTION << "Can't find layer " << child->name << " among layer " << parent->name << " outputs"; else THROW_IE_EXCEPTION << "Layer '" << parent->name << "' has invalid output"; } - DataPtr outData = parent->outData[l1_out_i]; - - std::string layerName = name.empty() ? (child != nullptr ? (parent->name + "_ScaleShift_" + child->name) - : (parent->name + "_ScaleShift")) - : name; + std::vector ssCnnLayers; + ssCnnLayers.reserve(childInsDataIndexes.size()); + for (int l1_out_i : parentOutDataIndexes) { + DataPtr outData = parent->outData[l1_out_i]; + + for (int i = 0; i < childInsDataIndexes.size(); ++i) { + const int childInsDataIndex = childInsDataIndexes[i]; + std::string layerName = name.empty() ? + (child != nullptr ? + (parent->name + "_ScaleShift" + (childInsDataIndexes.size() == 1 ? "" : std::to_string(childInsDataIndex)) + "_" + child->name) : + (parent->name + "_ScaleShift" + (childInsDataIndexes.size() == 1 ? 
"" : std::to_string(childInsDataIndex)))) + : name; + + Precision ssPrecision = context.getOriginalLayerPrecision(parent->name, outData->getName()); + if (ssPrecision == Precision::UNSPECIFIED) { + if (child != nullptr) + ssPrecision = child->precision; + else + ssPrecision = Precision::FP32; + } - Precision ssPrecision = context.getOriginalLayerPrecision(parent->name, outData->getName()); - if (ssPrecision == Precision::UNSPECIFIED) { - if (child != nullptr) - ssPrecision = child->precision; - else - ssPrecision = Precision::FP32; - } + LayerParams ssCnnLayerParams{ layerName, "ScaleShift", ssPrecision }; + CNNLayerPtr ssCnnLayer(new ScaleShiftLayer(ssCnnLayerParams)); - LayerParams ssCnnLayerParams {layerName, "ScaleShift", ssPrecision}; - CNNLayerPtr ssCnnLayer(new ScaleShiftLayer(ssCnnLayerParams)); + const std::vector dims = outData->getDims(); - const std::vector dims = outData->getDims(); + if ((dims.size() != 2ul) || ((dims.size() == 2ul) && (dims[0] != dequantizationDetails.channelsCount))) { + if ((dims.size() > 1) && (dims[1] != dequantizationDetails.channelsCount)) { + THROW_IE_EXCEPTION << "unexpected parent channels count " << dims[1]; + } + } + addLayerToCNNNetworkAfterData(outData, ssCnnLayer, child != nullptr ? child->name : "", context.network, childInsDataIndex); - if ((dims.size() != 2ul) || ((dims.size() == 2ul) && (dims[0] != dequantizationDetails.channelsCount))) { - if ((dims.size() > 1) && (dims[1] != dequantizationDetails.channelsCount)) { - THROW_IE_EXCEPTION << "unexpected parent channels count " << dims[1]; - } - } - addLayerToCNNNetworkAfterData(outData, ssCnnLayer, child != nullptr ? 
child->name : "", context.network); + { + ScaleShiftLayer* scshLayer = dynamic_cast(ssCnnLayer.get()); + if (scshLayer == nullptr) { + THROW_IE_EXCEPTION << "Layer " << ssCnnLayer->name << " is not instance of ScaleShiftLayer class"; + } + fillInScaleShift( + scshLayer, + dequantizationDetails.channelsCount, + dequantizationDetails.scales.data(), + dequantizationDetails.shifts.data()); + } - { - ScaleShiftLayer* scshLayer = dynamic_cast(ssCnnLayer.get()); - if (scshLayer == nullptr) { - THROW_IE_EXCEPTION << "Layer " << ssCnnLayer->name << " is not instance of ScaleShiftLayer class"; + CNNNetworkHelper::setOutDataPrecision(*ssCnnLayer, ssPrecision); + ssCnnLayers.push_back(ssCnnLayer); } - fillInScaleShift( - scshLayer, - dequantizationDetails.channelsCount, - dequantizationDetails.scales.data(), - dequantizationDetails.shifts.data()); } - CNNNetworkHelper::setOutDataPrecision(*ssCnnLayer, ssPrecision); - return ssCnnLayer; + return ssCnnLayers; } CNNLayerPtr CNNNetworkHelper::addConstBetween(ICNNNetwork& net, const CNNLayerPtr layer1, const CNNLayerPtr layer2, @@ -1177,7 +1202,8 @@ void CNNNetworkHelper::addLayerToCNNNetworkAfterData( DataPtr parentOutData, CNNLayer::Ptr layer, const std::string& nextLayerName, - ICNNNetwork& net) { + ICNNNetwork& net, + const int childInsDataIndex) { CNNNetworkImpl* netImpl = dynamic_cast(&net); if (netImpl == nullptr) { THROW_IE_EXCEPTION << "unexpected network type"; @@ -1188,7 +1214,7 @@ void CNNNetworkHelper::addLayerToCNNNetworkAfterData( netImpl->getLayerByName(nextLayerName.c_str(), nextLayer, nullptr); } - if (layer && (nextLayerName.empty() || (parentOutData == nullptr) || + if (layer && (nextLayerName.empty() || (parentOutData == nullptr) || (childInsDataIndex != -1) || (getInputTo(parentOutData).find(nextLayerName) != getInputTo(parentOutData).end()))) { auto getTensorDesc = [](CNNLayerPtr& nextLayer) { const DataPtr insData = nextLayer->insData[0].lock(); @@ -1222,12 +1248,18 @@ void 
CNNNetworkHelper::addLayerToCNNNetworkAfterData( if (!nextLayerName.empty()) { // CNNLayerPtr nextLayer = getInputTo(parentOutData)[nextLayerName]; getInputTo(newEdgeAfterLayer)[nextLayerName] = nextLayer; + if (parentOutData != nullptr) { getInputTo(parentOutData).erase(nextLayerName); - for (size_t i = 0; i < nextLayer->insData.size(); i++) { - if (nextLayer->insData[i].lock() == parentOutData) { - nextLayer->insData[i] = newEdgeAfterLayer; + + if (childInsDataIndex == -1) { + for (size_t i = 0; i < nextLayer->insData.size(); i++) { + if (nextLayer->insData[i].lock() == parentOutData) { + nextLayer->insData[i] = newEdgeAfterLayer; + } } + } else { + nextLayer->insData[childInsDataIndex] = newEdgeAfterLayer; } } else { // TODO: why new? @@ -1348,20 +1380,21 @@ size_t CNNNetworkHelper::disconnectLayers(CNNNetworkImpl* network, const CNNLaye bool wasFound = false; for (auto dataIt = parentLayer->outData.begin(); dataIt != parentLayer->outData.end(); ++dataIt) { auto data = *dataIt; - for (auto inputIt = getInputTo(data).begin(); inputIt != getInputTo(data).end(); ++inputIt) { + + auto inputIt = getInputTo(data).begin(); + while (inputIt != getInputTo(data).end()) { auto currentChildLayer = inputIt->second; if (currentChildLayer == nullptr) { THROW_IE_EXCEPTION << "Output layer for '" << parentLayer->name << "'is absent"; } + if (currentChildLayer->name == childLayer->name) { - getInputTo(data).erase(inputIt); + inputIt = getInputTo(data).erase(inputIt); wasFound = true; - break; + continue; } - } - if (wasFound) { - break; + ++inputIt; } } if (!wasFound) { @@ -1370,7 +1403,8 @@ size_t CNNNetworkHelper::disconnectLayers(CNNNetworkImpl* network, const CNNLaye } wasFound = false; - for (auto it = childLayer->insData.begin(); it != childLayer->insData.end(); ++it) { + auto it = childLayer->insData.begin(); + while (it != childLayer->insData.end()) { auto data = it->lock(); if (data == nullptr) { THROW_IE_EXCEPTION << "Input layer data for '" << childLayer->name << "'is 
absent"; @@ -1379,11 +1413,14 @@ size_t CNNNetworkHelper::disconnectLayers(CNNNetworkImpl* network, const CNNLaye if (currentParentLayer == nullptr) { THROW_IE_EXCEPTION << "Input layer for '" << childLayer->name << "'is absent"; } + if (currentParentLayer->name == parentLayer->name) { - childLayer->insData.erase(it); + it = childLayer->insData.erase(it); wasFound = true; - break; + continue; } + + ++it; } if (!wasFound) { THROW_IE_EXCEPTION << "Input layer '" << parentLayer->name << "' was not found for '" << childLayer->name diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp new file mode 100644 index 00000000000000..5af9d5558918e1 --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp @@ -0,0 +1,32 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.hpp" +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; + +namespace { +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16 +}; + +const std::vector params = { + { + { 256ul, ngraph::Shape { 1 }, { 0.f }, { 25.5f }, { 0.f }, { 25.5f } }, + { 255ul, ngraph::Shape { 1 }, { -12.8f }, { 12.7f }, { -12.8f }, { 12.7f } } + }, +}; + +INSTANTIATE_TEST_CASE_P(LPT, MatMulWithOptimizedConstantFakeQuantizeTransformation, + ::testing::Combine( + ::testing::ValuesIn(netPrecisions), + ::testing::Values(InferenceEngine::SizeVector({ 1, 16 })), + 
::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::ValuesIn(params)), + MatMulWithOptimizedConstantFakeQuantizeTransformation::getTestCaseName); +} // namespace diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/multiply_with_one_parent_transformation.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/multiply_with_one_parent_transformation.cpp new file mode 100644 index 00000000000000..e5f55dcbbf0c54 --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/multiply_with_one_parent_transformation.cpp @@ -0,0 +1,31 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "low_precision_transformations/multiply_with_one_parent_transformation.hpp" +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; + +namespace { +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16 +}; + +const std::vector values = { + { + { 256ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 255.f }, { 0.f }, { 255.f } } + } +}; + +INSTANTIATE_TEST_CASE_P(LPT, MultiplyWithOneParentTransformation, + ::testing::Combine( + ::testing::ValuesIn(netPrecisions), + ::testing::Values(InferenceEngine::SizeVector({ 1, 3, 16, 16 })), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::ValuesIn(values)), + MultiplyWithOneParentTransformation::getTestCaseName); +} // namespace diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp new file mode 100644 index 00000000000000..7af10777ac5d1b --- 
/dev/null +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp @@ -0,0 +1,31 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.hpp" +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; + +namespace { +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32 +}; + +const std::vector params = { + { + { 256ul, ngraph::Shape { 1 }, { 0.f }, { 25.5f }, { 0.f }, { 25.5f } }, + { 255ul, ngraph::Shape { 1 }, { -12.8f }, { 12.7f }, { -12.8f }, { 12.7f } } + } +}; + +INSTANTIATE_TEST_CASE_P(LPT, MatMulWithOptimizedConstantFakeQuantizeTransformation, + ::testing::Combine( + ::testing::ValuesIn(netPrecisions), + ::testing::Values(InferenceEngine::SizeVector({ 1, 16 })), + ::testing::Values(CommonTestUtils::DEVICE_GPU), + ::testing::ValuesIn(params)), + MatMulWithOptimizedConstantFakeQuantizeTransformation::getTestCaseName); +} // namespace diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/multiply_with_one_parent_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/multiply_with_one_parent_transformation.cpp new file mode 100644 index 00000000000000..6c0e1f0a443a2c --- /dev/null +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/multiply_with_one_parent_transformation.cpp @@ -0,0 +1,31 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "low_precision_transformations/multiply_with_one_parent_transformation.hpp" +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; + +namespace { +const 
std::vector netPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16 +}; + +const std::vector values = { + { + { 256ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 255.f }, { 0.f }, { 255.f } } + } +}; + +INSTANTIATE_TEST_CASE_P(LPT, MultiplyWithOneParentTransformation, + ::testing::Combine( + ::testing::ValuesIn(netPrecisions), + ::testing::Values(InferenceEngine::SizeVector({ 1, 3, 16, 16 })), + ::testing::Values(CommonTestUtils::DEVICE_GPU), + ::testing::ValuesIn(values)), + MultiplyWithOneParentTransformation::getTestCaseName); +} // namespace diff --git a/inference-engine/tests/functional/plugin/shared/include/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.hpp b/inference-engine/tests/functional/plugin/shared/include/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.hpp new file mode 100644 index 00000000000000..e76243a83614f9 --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/include/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.hpp @@ -0,0 +1,38 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "functional_test_utils/low_precision_transformations/layer_transformation.hpp" +#include "ngraph_functions/low_precision_transformations/common/fake_quantize_on_data.hpp" + +namespace LayerTestsDefinitions { + +class MatMulWithOptimizedConstantFakeQuantizeTransformationTestValues { +public: + ngraph::builder::subgraph::FakeQuantizeOnData fqOnData; + ngraph::builder::subgraph::FakeQuantizeOnData fqOnWeights; +}; + +typedef std::tuple< + InferenceEngine::Precision, + InferenceEngine::SizeVector, + std::string, + MatMulWithOptimizedConstantFakeQuantizeTransformationTestValues +> MatMulWithOptimizedConstantFakeQuantizeTransformationTransformationParams; + +class 
MatMulWithOptimizedConstantFakeQuantizeTransformation : + public testing::WithParamInterface, + public LayerTestsUtils::LayerTransformation { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/shared/include/low_precision_transformations/multiply_with_one_parent_transformation.hpp b/inference-engine/tests/functional/plugin/shared/include/low_precision_transformations/multiply_with_one_parent_transformation.hpp new file mode 100644 index 00000000000000..3cf0a227fd3732 --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/include/low_precision_transformations/multiply_with_one_parent_transformation.hpp @@ -0,0 +1,40 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "functional_test_utils/low_precision_transformations/layer_transformation.hpp" +#include "ngraph_functions/low_precision_transformations/common/fake_quantize_on_data.hpp" + +namespace LayerTestsDefinitions { + +class MultiplyWithOneParentTransformationValues { +public: + ngraph::builder::subgraph::FakeQuantizeOnData fakeQuantize; +}; + +typedef std::tuple< + InferenceEngine::Precision, + InferenceEngine::SizeVector, + std::string, + MultiplyWithOneParentTransformationValues +> MultiplyWithOneParentTransformationParams; + +class MultiplyWithOneParentTransformation : + public testing::WithParamInterface, + public LayerTestsUtils::LayerTransformation { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; + +private: + void validate(); +}; + +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp 
b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp new file mode 100644 index 00000000000000..6e39ca31f50fa6 --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp @@ -0,0 +1,63 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.hpp" + +#include +#include +#include +#include + +#include + +#include "common_test_utils/common_utils.hpp" +#include "functional_test_utils/plugin_cache.hpp" +#include "functional_test_utils/layer_test_utils.hpp" +#include "functional_test_utils/blob_utils.hpp" +#include "ngraph_functions/pass/convert_prc.hpp" +#include "ngraph_functions/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_function.hpp" + +namespace LayerTestsDefinitions { + +std::string MatMulWithOptimizedConstantFakeQuantizeTransformation::getTestCaseName( + testing::TestParamInfo obj) { + InferenceEngine::Precision netPrecision; + InferenceEngine::SizeVector inputShape; + std::string targetDevice; + InferenceEngine::details::LayerTransformation::Params params; + MatMulWithOptimizedConstantFakeQuantizeTransformationTestValues param; + + std::tie(netPrecision, inputShape, targetDevice, param) = obj.param; + + std::ostringstream result; + result << netPrecision.name() << "_" << + CommonTestUtils::vec2str(inputShape) << "_" << + targetDevice << "_" << + param.fqOnData << "_" << + param.fqOnWeights; + return result.str(); +} + +void MatMulWithOptimizedConstantFakeQuantizeTransformation::SetUp() { + threshold = 0.01f; + + InferenceEngine::Precision netPrecision; + InferenceEngine::SizeVector inputShape; + InferenceEngine::details::LayerTransformation::Params params; + 
MatMulWithOptimizedConstantFakeQuantizeTransformationTestValues param; + std::tie(netPrecision, inputShape, targetDevice, param) = this->GetParam(); + auto precision = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + + function = ngraph::builder::subgraph::MatMulWithOptimizedConstantFakeQuantizeFunction::getOriginal( + precision, + inputShape, + param.fqOnData, + param.fqOnWeights); +} + +TEST_P(MatMulWithOptimizedConstantFakeQuantizeTransformation, CompareWithRefImpl) { + Run(); +}; + +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/multiply_with_one_parent_transformaion.cpp b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/multiply_with_one_parent_transformaion.cpp new file mode 100644 index 00000000000000..d3f348e981d1b0 --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/multiply_with_one_parent_transformaion.cpp @@ -0,0 +1,84 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "low_precision_transformations/multiply_with_one_parent_transformation.hpp" + +#include +#include +#include +#include + +#include +#include "common_test_utils/common_utils.hpp" +#include "ngraph_functions/low_precision_transformations/multiply_with_one_parent_function.hpp" + +namespace LayerTestsDefinitions { + +std::string MultiplyWithOneParentTransformation::getTestCaseName(testing::TestParamInfo obj) { + InferenceEngine::Precision netPrecision; + InferenceEngine::SizeVector inputShape; + std::string targetDevice; + MultiplyWithOneParentTransformationValues values; + + std::tie(netPrecision, inputShape, targetDevice, values) = obj.param; + + std::ostringstream result; + result << netPrecision.name() << "_" << CommonTestUtils::vec2str(inputShape); + return result.str(); +} + +void MultiplyWithOneParentTransformation::SetUp() { + threshold = 0.01f; + + 
InferenceEngine::Precision netPrecision; + InferenceEngine::SizeVector inputShape; + InferenceEngine::details::LayerTransformation::Params params; + MultiplyWithOneParentTransformationValues values; + std::tie(netPrecision, inputShape, targetDevice, values) = this->GetParam(); + auto precision = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + + function = ngraph::builder::subgraph::MultiplyWithOneParentFunction::getOriginal(precision, inputShape, values.fakeQuantize); + + validate(); +} + +void MultiplyWithOneParentTransformation::validate() { + InferenceEngine::Precision netPrecision; + InferenceEngine::SizeVector inputShape; + std::string targetDevice; + InferenceEngine::details::LayerTransformation::Params params = LayerTestsUtils::LayerTransformationParamsFactory::createParams(); + MultiplyWithOneParentTransformationValues values; + std::tie(netPrecision, inputShape, targetDevice, values) = this->GetParam(); + + const InferenceEngine::CNNNetwork network = transform(params); + + IE_SUPPRESS_DEPRECATED_START + + InferenceEngine::OutputsDataMap outputs = network.getOutputsInfo(); + EXPECT_EQ(1, outputs.size()); + + std::map::iterator it = outputs.begin(); + const InferenceEngine::CNNLayerPtr outputLayer = getCreatorLayer(it->second).lock(); + EXPECT_TRUE(outputLayer != nullptr); + EXPECT_EQ("Eltwise", outputLayer->type); + + // check #1: successful transformation execution + EXPECT_EQ(2ul, outputLayer->insData.size()); + const auto parents = InferenceEngine::details::CNNNetworkHelper::getParents(*outputLayer); + EXPECT_EQ(2ul, parents.size()); + EXPECT_EQ("ScaleShift", parents[0]->type); + + // check #2: successful graph handling + EXPECT_EQ("FakeQuantize", parents[1]->type); + EXPECT_EQ(1ul, InferenceEngine::details::CNNNetworkHelper::getParents(*parents[0]).size()); + EXPECT_EQ("FakeQuantize", InferenceEngine::details::CNNNetworkHelper::getParents(*parents[0])[0]->type); + + IE_SUPPRESS_DEPRECATED_END +} + 
+TEST_P(MultiplyWithOneParentTransformation, CompareWithRefImpl) { + Run(); +}; + +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/ngraph_functions/include/ngraph_functions/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_function.hpp b/inference-engine/tests/ngraph_functions/include/ngraph_functions/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_function.hpp new file mode 100644 index 00000000000000..83983c9be0f20f --- /dev/null +++ b/inference-engine/tests/ngraph_functions/include/ngraph_functions/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_function.hpp @@ -0,0 +1,26 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include "ngraph_functions/low_precision_transformations/common/fake_quantize_on_data.hpp" + +namespace ngraph { +namespace builder { +namespace subgraph { + +class MatMulWithOptimizedConstantFakeQuantizeFunction { +public: + static std::shared_ptr getOriginal( + const ngraph::element::Type precision, + const ngraph::Shape& inputShape, + const FakeQuantizeOnData& fqOnData, + const FakeQuantizeOnData& fqOnWeights); +}; + +} // namespace subgraph +} // namespace builder +} // namespace ngraph diff --git a/inference-engine/tests/ngraph_functions/include/ngraph_functions/low_precision_transformations/multiply_with_one_parent_function.hpp b/inference-engine/tests/ngraph_functions/include/ngraph_functions/low_precision_transformations/multiply_with_one_parent_function.hpp new file mode 100644 index 00000000000000..504ee84ca048cb --- /dev/null +++ b/inference-engine/tests/ngraph_functions/include/ngraph_functions/low_precision_transformations/multiply_with_one_parent_function.hpp @@ -0,0 +1,25 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include 
"ngraph_functions/low_precision_transformations/common/fake_quantize_on_data.hpp" + +namespace ngraph { +namespace builder { +namespace subgraph { + +class MultiplyWithOneParentFunction { +public: + static std::shared_ptr getOriginal( + const ngraph::element::Type precision, + const ngraph::Shape& inputShape, + const FakeQuantizeOnData& fakeQuantize); +}; + +} // namespace subgraph +} // namespace builder +} // namespace ngraph diff --git a/inference-engine/tests/ngraph_functions/src/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_function.cpp b/inference-engine/tests/ngraph_functions/src/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_function.cpp new file mode 100644 index 00000000000000..ad571d24e09a4e --- /dev/null +++ b/inference-engine/tests/ngraph_functions/src/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_function.cpp @@ -0,0 +1,51 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ngraph_functions/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_function.hpp" + +#include +#include "ngraph_functions/builders.hpp" + +namespace ngraph { +namespace builder { +namespace subgraph { + +std::shared_ptr MatMulWithOptimizedConstantFakeQuantizeFunction::getOriginal( + const ngraph::element::Type precision, + const ngraph::Shape& inputShape, + const FakeQuantizeOnData& fqOnData, + const FakeQuantizeOnData& fqOnWeights) { + const auto input = std::make_shared(precision, ngraph::Shape(inputShape)); + const auto fakeQuantizeOnActivations = fqOnData.empty() ? 
+ nullptr : + ngraph::builder::makeFakeQuantize( + input, precision, fqOnData.quantizationLevel, fqOnData.constantShape, + fqOnData.inputLowValues, fqOnData.inputHighValues, fqOnData.outputLowValues, fqOnData.outputHighValues); + + const ngraph::Shape weightsShape = { inputShape[1], 10 }; + const std::vector weigths(weightsShape[0] * weightsShape[1], 10.f); + const auto weightsConst = std::make_shared(precision, weightsShape, weigths); + const auto lowConstant = std::make_shared(precision, fqOnWeights.constantShape, fqOnWeights.inputLowValues); + const auto highConstant = std::make_shared(precision, fqOnWeights.constantShape, fqOnWeights.inputHighValues); + const auto fakeQuantizeOnWeights = std::make_shared( + weightsConst, + lowConstant, + highConstant, + lowConstant, + highConstant, + fqOnWeights.quantizationLevel); + + const auto matMul = std::make_shared( + fqOnData.empty() ? input : fakeQuantizeOnActivations, + fakeQuantizeOnWeights, + false, + false); + + ngraph::ResultVector results{ std::make_shared(matMul) }; + return std::make_shared(results, ngraph::ParameterVector{ input }, "MatMulWithOptimizedConstantFakeQuantizeFunction"); +} + +} // namespace subgraph +} // namespace builder +} // namespace ngraph diff --git a/inference-engine/tests/ngraph_functions/src/low_precision_transformations/multiply_with_one_parent_function.cpp b/inference-engine/tests/ngraph_functions/src/low_precision_transformations/multiply_with_one_parent_function.cpp new file mode 100644 index 00000000000000..66487d0bf2516c --- /dev/null +++ b/inference-engine/tests/ngraph_functions/src/low_precision_transformations/multiply_with_one_parent_function.cpp @@ -0,0 +1,32 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ngraph_functions/low_precision_transformations/multiply_with_one_parent_function.hpp" + +#include +#include "ngraph_functions/builders.hpp" + +namespace ngraph { +namespace builder { +namespace subgraph { + 
+std::shared_ptr MultiplyWithOneParentFunction::getOriginal( + const ngraph::element::Type precision, + const ngraph::Shape& inputShape, + const FakeQuantizeOnData& fqOnData) { + const auto input = std::make_shared(precision, ngraph::Shape(inputShape)); + + const auto fakeQuantize = ngraph::builder::makeFakeQuantize( + input, precision, fqOnData.quantizationLevel, fqOnData.constantShape, + fqOnData.inputLowValues, fqOnData.inputHighValues, fqOnData.outputLowValues, fqOnData.outputHighValues); + + const auto multiply = std::make_shared(fakeQuantize->output(0), fakeQuantize->output(0)); + + ngraph::ResultVector results{ std::make_shared(multiply) }; + return std::make_shared(results, ngraph::ParameterVector{ input }, "MultiplyWithOneParentFunction"); +} + +} // namespace subgraph +} // namespace builder +} // namespace ngraph From 928eed9a51f8acdf0b9e66d90a7f2768e0582bee Mon Sep 17 00:00:00 2001 From: Andrew Bakalin Date: Mon, 7 Sep 2020 23:23:47 +0300 Subject: [PATCH 18/66] [IE][VPU][GT][IE Transformations]: Fixes for post processing model (#2041) * Disable StridedSlice to crop convertion in dynamic cases * NMS: increase shave requirements for some cases * Update firmware * Add test cases --- inference-engine/cmake/vpu_dependencies.cmake | 2 +- .../convert_strided_slice_to_crop.cpp | 4 +- .../include/vpu/stages/nms.hpp | 34 ++++++ .../src/frontend/frontend.cpp | 14 ++- .../vpu/graph_transformer/src/stages/nms.cpp | 107 +++++++++--------- .../src/stages/static_shape_nms.cpp | 29 +---- .../single_layer_tests/static_shape_nms.cpp | 3 +- .../subgraph_tests/dsr_strided_slice.cpp | 1 + 8 files changed, 107 insertions(+), 87 deletions(-) create mode 100644 inference-engine/src/vpu/graph_transformer/include/vpu/stages/nms.hpp diff --git a/inference-engine/cmake/vpu_dependencies.cmake b/inference-engine/cmake/vpu_dependencies.cmake index 262bcd21207bef..e17ada43d53a83 100644 --- a/inference-engine/cmake/vpu_dependencies.cmake +++ 
b/inference-engine/cmake/vpu_dependencies.cmake @@ -19,7 +19,7 @@ set(VPU_SUPPORTED_FIRMWARES usb-ma2450 usb-ma2x8x pcie-ma248x) # Default packages # -set(FIRMWARE_PACKAGE_VERSION 1354) +set(FIRMWARE_PACKAGE_VERSION 1360) set(VPU_CLC_MA2X8X_VERSION "movi-cltools-20.02.0") # diff --git a/inference-engine/src/transformations/src/transformations/convert_opset1_to_legacy/convert_strided_slice_to_crop.cpp b/inference-engine/src/transformations/src/transformations/convert_opset1_to_legacy/convert_strided_slice_to_crop.cpp index 69ee25ed43e324..54f292571f5578 100644 --- a/inference-engine/src/transformations/src/transformations/convert_opset1_to_legacy/convert_strided_slice_to_crop.cpp +++ b/inference-engine/src/transformations/src/transformations/convert_opset1_to_legacy/convert_strided_slice_to_crop.cpp @@ -22,9 +22,9 @@ ngraph::pass::ConvertStridedSliceToCropMatcher::ConvertStridedSliceToCropMatcher std::vector end_mask = {0, 0, 0, 0}; auto m_slice = std::make_shared(data, m_begin, m_end, m_stride, begin_mask, end_mask); - ngraph::matcher_pass_callback callback = [](pattern::Matcher& m) { + ngraph::matcher_pass_callback callback = [this](pattern::Matcher& m) { auto slice = std::dynamic_pointer_cast (m.get_match_root()); - if (!slice) { + if (!slice || m_transformation_callback(slice)) { return false; } diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/stages/nms.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/stages/nms.hpp new file mode 100644 index 00000000000000..e76410a60231f9 --- /dev/null +++ b/inference-engine/src/vpu/graph_transformer/include/vpu/stages/nms.hpp @@ -0,0 +1,34 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace vpu { + +class NonMaxSuppression : public StageNode { +protected: + StagePtr cloneImpl() const override; + + void propagateDataOrderImpl(StageDataInfo& orderInfo) override; + + void getDataStridesRequirementsImpl(StageDataInfo& 
stridesInfo) override; + + void finalizeDataLayoutImpl() override; + + void getBatchSupportInfoImpl(StageDataInfo& batchInfo) override; + + StageSHAVEsRequirements getSHAVEsRequirementsImpl() const override; + + void initialCheckImpl() const override; + + void finalCheckImpl() const override; + + void serializeParamsImpl(BlobSerializer& serializer) const override; + + void serializeDataImpl(BlobSerializer& serializer) const override; +}; + +} // namespace vpu diff --git a/inference-engine/src/vpu/graph_transformer/src/frontend/frontend.cpp b/inference-engine/src/vpu/graph_transformer/src/frontend/frontend.cpp index b4b48be2bc0f7a..b1b2cdaf36f5f4 100644 --- a/inference-engine/src/vpu/graph_transformer/src/frontend/frontend.cpp +++ b/inference-engine/src/vpu/graph_transformer/src/frontend/frontend.cpp @@ -384,12 +384,16 @@ ModelPtr FrontEnd::runCommonPasses(ie::ICNNNetwork& network, const UnsupportedLa VPU_LOGGER_SECTION(env.log); auto convertNetwork = [&convertedNetwork, &originalOrConvertNetwork]() { - // disable GeLU decomposition + // disable transformations for some cases const auto transformationsPredicate = [](const std::shared_ptr &node) -> bool { - return std::dynamic_pointer_cast(node) || - (std::dynamic_pointer_cast(node) && - std::dynamic_pointer_cast(node->input_value(0).get_node_shared_ptr())) || - std::dynamic_pointer_cast(node); + const bool casesWithDynamicOrStaticUsage = std::dynamic_pointer_cast(node) || + std::dynamic_pointer_cast(node); + + const bool casesWithOnlyDynamicUsage = (std::dynamic_pointer_cast(node) || + std::dynamic_pointer_cast(node)) && + std::dynamic_pointer_cast(node->input_value(0).get_node_shared_ptr()); + + return casesWithDynamicOrStaticUsage || casesWithOnlyDynamicUsage; }; auto nGraphFunc = originalOrConvertNetwork->getFunction(); diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/nms.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/nms.cpp index b56e783f4c5330..698126371fabec 100644 --- 
a/inference-engine/src/vpu/graph_transformer/src/stages/nms.cpp +++ b/inference-engine/src/vpu/graph_transformer/src/stages/nms.cpp @@ -2,76 +2,81 @@ // SPDX-License-Identifier: Apache-2.0 // +#include + #include #include #include namespace vpu { -namespace { +StagePtr NonMaxSuppression::cloneImpl() const { + return std::make_shared(*this); +} -class NonMaxSuppression final : public StageNode { -private: - StagePtr cloneImpl() const override { - return std::make_shared(*this); - } +void NonMaxSuppression::propagateDataOrderImpl(StageDataInfo& orderInfo) { +} - void propagateDataOrderImpl(StageDataInfo& orderInfo) override { - } +void NonMaxSuppression::getDataStridesRequirementsImpl(StageDataInfo& stridesInfo) { +} - void getDataStridesRequirementsImpl(StageDataInfo& stridesInfo) override { - } +void NonMaxSuppression::finalizeDataLayoutImpl() { +} - void finalizeDataLayoutImpl() override { - } +void NonMaxSuppression::getBatchSupportInfoImpl(StageDataInfo& batchInfo) { +} - void getBatchSupportInfoImpl(StageDataInfo& batchInfo) override { - } +StageSHAVEsRequirements NonMaxSuppression::getSHAVEsRequirementsImpl() const { + // Current NMS implementation doesn't allow calculation of `> boxesThreshold` boxes using one SHAVE + constexpr int boxesThreshold = 3650; - StageSHAVEsRequirements getSHAVEsRequirementsImpl() const override { - return StageSHAVEsRequirements::OnlyOne; - } + const auto& inDesc = input(0)->desc(); + const auto& maxBoxesNum = inDesc.dim(Dim::H); - void initialCheckImpl() const override { - assertInputsOutputsTypes(this, - {{DataType::FP16}, - {DataType::FP16}, - {DataType::S32}, - {DataType::FP16}, - {DataType::FP16}}, - {{DataType::S32}}); + if (maxBoxesNum > boxesThreshold) { + return StageSHAVEsRequirements::NeedMax; + } else { + return StageSHAVEsRequirements::OnlyOne; } +} - void finalCheckImpl() const override { - } +void NonMaxSuppression::initialCheckImpl() const { + assertInputsOutputsTypes(this, + {{DataType::FP16}, + {DataType::FP16}, + 
{DataType::S32}, + {DataType::FP16}, + {DataType::FP16}}, + {{DataType::S32}}); +} - void serializeParamsImpl(BlobSerializer& serializer) const override { - bool center_point_box = attrs().get("center_point_box"); +void NonMaxSuppression::finalCheckImpl() const { +} - serializer.append(static_cast(center_point_box)); - } +void NonMaxSuppression::serializeParamsImpl(BlobSerializer& serializer) const { + bool center_point_box = attrs().get("center_point_box"); - void serializeDataImpl(BlobSerializer& serializer) const override { - IE_ASSERT(inputEdges().size() >= 2 && inputEdges().size() <= 5); - IE_ASSERT(outputEdges().size() == 1); - - auto input1 = inputEdges()[0]->input(); - auto input2 = inputEdges()[1]->input(); - auto input3 = inputEdges()[2]->input(); - auto input4 = inputEdges()[3]->input(); - auto input5 = inputEdges()[4]->input(); - auto output = outputEdges()[0]->output(); - - input1->serializeBuffer(serializer); - input2->serializeBuffer(serializer); - output->serializeBuffer(serializer); - input3->serializeBuffer(serializer); - input4->serializeBuffer(serializer); - input5->serializeBuffer(serializer); - } -}; + serializer.append(static_cast(center_point_box)); +} -} // namespace +void NonMaxSuppression::serializeDataImpl(BlobSerializer& serializer) const { + IE_ASSERT(inputEdges().size() >= 2 && inputEdges().size() <= 5); + IE_ASSERT(outputEdges().size() == 1); + + auto input1 = inputEdges()[0]->input(); + auto input2 = inputEdges()[1]->input(); + auto input3 = inputEdges()[2]->input(); + auto input4 = inputEdges()[3]->input(); + auto input5 = inputEdges()[4]->input(); + auto output = outputEdges()[0]->output(); + + input1->serializeBuffer(serializer); + input2->serializeBuffer(serializer); + output->serializeBuffer(serializer); + input3->serializeBuffer(serializer); + input4->serializeBuffer(serializer); + input5->serializeBuffer(serializer); +} void FrontEnd::parseNonMaxSuppression(const Model& model, const ie::CNNLayerPtr& _layer, const DataVector& 
inputs, const DataVector& outputs) const { auto layer = std::dynamic_pointer_cast(_layer); diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/static_shape_nms.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/static_shape_nms.cpp index 68af204852d5a4..dda8008663fad7 100644 --- a/inference-engine/src/vpu/graph_transformer/src/stages/static_shape_nms.cpp +++ b/inference-engine/src/vpu/graph_transformer/src/stages/static_shape_nms.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#include #include #include @@ -13,28 +14,12 @@ namespace vpu { namespace { -class StaticShapeNMS final : public StageNode { +class StaticShapeNMS final : public NonMaxSuppression { private: StagePtr cloneImpl() const override { return std::make_shared(*this); } - void propagateDataOrderImpl(StageDataInfo& orderInfo) override { - } - - void getDataStridesRequirementsImpl(StageDataInfo& stridesInfo) override { - } - - void finalizeDataLayoutImpl() override { - } - - void getBatchSupportInfoImpl(StageDataInfo& batchInfo) override { - } - - StageSHAVEsRequirements getSHAVEsRequirementsImpl() const override { - return StageSHAVEsRequirements::OnlyOne; - } - void initialCheckImpl() const override { assertInputsOutputsTypes(this, {{DataType::FP16}, @@ -46,16 +31,6 @@ class StaticShapeNMS final : public StageNode { {DataType::S32}}); } - void finalCheckImpl() const override { - initialCheckImpl(); - } - - void serializeParamsImpl(BlobSerializer& serializer) const override { - bool center_point_box = attrs().get("center_point_box"); - - serializer.append(static_cast(center_point_box)); - } - void serializeDataImpl(BlobSerializer& serializer) const override { auto input1 = inputEdges()[0]->input(); auto input2 = inputEdges()[1]->input(); diff --git a/inference-engine/tests/functional/plugin/myriad/single_layer_tests/static_shape_nms.cpp b/inference-engine/tests/functional/plugin/myriad/single_layer_tests/static_shape_nms.cpp index de53a037754fd8..eafb984e689b53 
100644 --- a/inference-engine/tests/functional/plugin/myriad/single_layer_tests/static_shape_nms.cpp +++ b/inference-engine/tests/functional/plugin/myriad/single_layer_tests/static_shape_nms.cpp @@ -98,7 +98,8 @@ std::vector NMSParams = { std::make_tuple(1, 10, 5, 10, 0., 0.), std::make_tuple(2, 100, 5, 10, 0., 0.), std::make_tuple(3, 10, 5, 2, 0.5, 0.), - std::make_tuple(1, 1000, 1, 2000, 0.5, 0.) + std::make_tuple(1, 1000, 1, 2000, 0.5, 0.), + std::make_tuple(1, 8200, 1, 8200, 0.5, 0.), }; std::vector NMSPrecisions = { diff --git a/inference-engine/tests/functional/plugin/myriad/subgraph_tests/dsr_strided_slice.cpp b/inference-engine/tests/functional/plugin/myriad/subgraph_tests/dsr_strided_slice.cpp index 953788eed310b7..450543effe3244 100644 --- a/inference-engine/tests/functional/plugin/myriad/subgraph_tests/dsr_strided_slice.cpp +++ b/inference-engine/tests/functional/plugin/myriad/subgraph_tests/dsr_strided_slice.cpp @@ -57,6 +57,7 @@ std::vector testCases = { { { { 1, 10, 70 }, { 1, 12, 100 } }, { 0, 4, 0 }, { 0, 9, 0 }, { 1, 2, 1 }, { 1, 0, 1 }, { 1, 0, 1 }, {}, {}, {} }, { { { 1, 10, 60 }, { 1, 12, 100 } }, { 0, -8, 0 }, { 0, -6, 0 }, { 1, 2, 1 }, { 1, 0, 1 }, { 1, 0, 1 }, {}, {}, {} }, { { { 1, 2, 2, 2 }, { 1, 3, 3, 3 } }, { 0, 0, 0, 0 }, { 1, -1, -1, -1 }, { 1, 2, 1, 1 }, {0, 0, 0, 0}, {1, 1, 1, 1}, {}, {}, {} }, + { { { 4000, 2 }, { 8232, 2 } }, { 0, 1 }, { -1, 2 }, { 1, 1 }, {0, 0 }, {1, 1 }, {}, {}, {} }, }; std::vector precisions = { From f42921ce473982cc943101cf4313e2afba3330e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Do=C5=82bniak?= Date: Tue, 8 Sep 2020 08:51:30 +0200 Subject: [PATCH 19/66] ONNX Resize fix (#2103) --- ngraph/frontend/onnx_import/src/op/resize.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ngraph/frontend/onnx_import/src/op/resize.cpp b/ngraph/frontend/onnx_import/src/op/resize.cpp index 601f507fc41e8b..60547402d9882e 100644 --- a/ngraph/frontend/onnx_import/src/op/resize.cpp +++ 
b/ngraph/frontend/onnx_import/src/op/resize.cpp @@ -217,7 +217,7 @@ namespace ngraph const auto converted_sizes = std::make_shared(sizes, ngraph::element::f32); const auto divide = - std::make_shared(sizes, shape_of_data); + std::make_shared(converted_sizes, shape_of_data); const auto eps_node = std::make_shared( ngraph::element::f32, Shape{}, epsilon); const auto scales = std::make_shared(divide, eps_node); @@ -262,7 +262,7 @@ namespace ngraph return { std::make_shared(data, output_shape, attrs)}; } - } + } // namespace namespace set_11 { @@ -336,4 +336,4 @@ namespace ngraph } // namespace onnx_import -} // namespace ngraph \ No newline at end of file +} // namespace ngraph From 8cc7eb7a171c885c7dd998eaa80fcb5313e38938 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Tue, 8 Sep 2020 09:56:54 +0300 Subject: [PATCH 20/66] [IE CLDNN] Added is_discrete flag into device info and FULL_DEVICE_NAME flag (#2089) --- inference-engine/src/cldnn_engine/cldnn_engine.cpp | 4 +++- inference-engine/thirdparty/clDNN/api/device.hpp | 8 ++++++++ .../thirdparty/clDNN/src/gpu/device_info.cpp | 13 +++++++++---- .../thirdparty/clDNN/src/gpu/device_info.h | 4 ++-- .../thirdparty/clDNN/src/gpu/ocl_toolkit.cpp | 1 - 5 files changed, 22 insertions(+), 8 deletions(-) diff --git a/inference-engine/src/cldnn_engine/cldnn_engine.cpp b/inference-engine/src/cldnn_engine/cldnn_engine.cpp index 954d49f22c2f2b..14effa2a00f4dc 100644 --- a/inference-engine/src/cldnn_engine/cldnn_engine.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_engine.cpp @@ -578,7 +578,9 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::map configKeys; for (auto opt : _impl->m_config.key_config_map) diff --git a/inference-engine/thirdparty/clDNN/api/device.hpp b/inference-engine/thirdparty/clDNN/api/device.hpp index 7e49aa03ac944e..4789324f4f5e47 100644 --- a/inference-engine/thirdparty/clDNN/api/device.hpp +++ b/inference-engine/thirdparty/clDNN/api/device.hpp @@ -29,6 +29,12 @@ namespace cldnn { 
/// @defgroup cpp_device GPU Device /// @{ +/// @brief Enumeration of supported device types +enum class device_type { + integrated_gpu = 0, + discrete_gpu = 1 +}; + /// @brief Information about the device properties and capabilities. struct device_info { uint32_t cores_count; ///< Number of available HW cores. @@ -55,6 +61,8 @@ struct device_info { std::string dev_name; ///< Device ID string std::string driver_version; ///< Version of OpenCL driver + + device_type dev_type; ///< Defines type of current GPU device (integrated or discrete) }; struct device_impl; diff --git a/inference-engine/thirdparty/clDNN/src/gpu/device_info.cpp b/inference-engine/thirdparty/clDNN/src/gpu/device_info.cpp index 1fc851d931ca68..8383fdf4eeac97 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/device_info.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/device_info.cpp @@ -117,15 +117,20 @@ int driver_dev_id() return result.back(); } -bool get_imad_support(const cl::Device& device) { +static device_type get_device_type(const cl::Device& device) { + auto unified_mem = device.getInfo(); + + return unified_mem ? 
device_type::integrated_gpu : device_type::discrete_gpu; +} + +static bool get_imad_support(const cl::Device& device) { std::string dev_name = device.getInfo(); if (dev_name.find("Gen12") != std::string::npos || dev_name.find("Xe") != std::string::npos) return true; - auto flag = device.getInfo(); - if (flag != 0) { + if (get_device_type(device) == device_type::integrated_gpu) { const std::vector imad_ids = { 0x9A40, 0x9A49, 0x9A59, 0x9AD9, 0x9A60, 0x9A68, 0x9A70, 0x9A78, @@ -189,6 +194,7 @@ bool is_local_block_io_supported(const cl::Device& device) { device_info_internal::device_info_internal(const cl::Device& device) { dev_name = device.getInfo(); driver_version = device.getInfo(); + dev_type = get_device_type(device); compute_units_count = device.getInfo(); @@ -220,7 +226,6 @@ device_info_internal::device_info_internal(const cl::Device& device) { supports_imad = get_imad_support(device); supports_immad = false; - dev_type = static_cast(device.getInfo()); vendor_id = static_cast(device.getInfo()); supports_usm = extensions.find("cl_intel_unified_shared_memory") != std::string::npos; diff --git a/inference-engine/thirdparty/clDNN/src/gpu/device_info.h b/inference-engine/thirdparty/clDNN/src/gpu/device_info.h index 076bf76b034e87..9fb804b51ee5e0 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/device_info.h +++ b/inference-engine/thirdparty/clDNN/src/gpu/device_info.h @@ -26,7 +26,6 @@ namespace gpu { struct device_info_internal : cldnn::device_info { std::uint32_t compute_units_count; - uint32_t dev_type; uint32_t vendor_id; uint8_t supports_usm; bool supports_optimization_hints; @@ -51,7 +50,8 @@ struct device_info_internal : cldnn::device_info { supports_immad, supports_usm, dev_name, - driver_version + driver_version, + dev_type }; } }; diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp b/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp index dc8ea532467b56..0d1f2d37e11c42 100644 --- 
a/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp @@ -117,7 +117,6 @@ gpu_toolkit::gpu_toolkit(const device_impl& device_impl, const configuration& co << " profiling: " << std::boolalpha << _configuration.enable_profiling << "\n" << " meaningful names: " << std::boolalpha << _configuration.meaningful_kernels_names << "\n" << " dump custom program: " << std::boolalpha << _configuration.dump_custom_program << "\n" - << " device type: " << std::to_string(device_info.dev_type) << "\n" << " vendor type: " << std::hex << std::setfill('0') << std::setw(4) << std::right << std::to_string(device_info.vendor_id) << "\n" << std::dec << std::setfill(' ') << std::right From 063c7ef6b9800f356887c3a62c894c90579fd974 Mon Sep 17 00:00:00 2001 From: Ivan Tikhonov Date: Tue, 8 Sep 2020 10:31:44 +0300 Subject: [PATCH 21/66] GRU/RNN/LSTM sequence ops, reference implementations, single layer tests (#1594) * gru/rnn sequences * update gru/rnn sequences ops, add unit tests * enable sequence transformations for cpu plugin * ngraph codestyle * update tensor iterator to rnn/gru/lstm sequence transformations, add unit tests * ngraph codestyle * add visitors for ngraph ie ops, fix a bug with incorrect axis, fix ngraph to ngraph ie conversion * update GRUSequence/GRUSequenceIE according to plugin format * fix ngraph ie implementations according to plugins restrictions * fix naming issue * adapt unit tests to accordance to new changes * strict checks, additional unit tests * add descriptions for transformations, fix unit tests * enable ti to sequence and unroll transformations in plugins for testing * disable tensor iterator ngraph reader tests * delete unnecessary cmake file * fix includes * clean up, resolve review comments * move ti to sequence transformation to ti folder * validate_and_infer_types() implementation * input parameter validation for LSTM, GRU and RNN * style-check applied * Add LSTMSequence dynamic 
shape validation and test props for RNNCell, GRUCell, LSTMCell and LSTMSequence. * recurrent_sequence.hpp moved to ngraph/core/include/ngraph/op/util/ * style check applied * removed unused variable from LSTMSequence::validate_and_infer_types * Add missing newline mark at the end of file. * Add suppression macro for FusedOp deprecation. * Add element type initialization * transpose,rnn cell reference implementations * Apply PR review remarks * reference implementations for cells op, single layer tests, align lstm cell/sequence according to the spec * lstm/gru/rnn cell decomposition transformations * ngraph codestyle * clean up * ngraph code style * change inheritance of Cells, fix build * fix build * fix build again * remove Peepholes from LSTMSeq, fix copy_runtime_info in transformations * Rewrite tests to use gtest exception assertions. * resolve tests issues * ngraph codestyle * add missed files * fix typeprop tests * fix lstm sequence checks * fix arm build * fix arm again * delete unnecessary file * add convert weights format function, enable lstm test, resolve review comments * add ngraph builders * ngraph codestyle * fix unit tests * revert transpose reference implementation * move ti to sequences transformation to another branch, resolve review comments * resolve review comments * revert fix in ie_layer_validators * revert LSTM Cell v0, add LSTMCell v1, update transformation lstm_cell_to_cell_ie * v1 version of LSTMCell op * LSTMSequence v1 operation, exclude LSTMSeq from opset4 * fix python api tests * resolve review comments, tests for decomposition transformations, switch lstm cell to opset4 in mo * references impl for RNN/GRU/LSTM Sequences, single layer tests, bidirectional transformation * fix unit tests * process dynamic ranks of rnn/gru/lstm ops * remove sequences specifications from opset4 * resolve review comments * fix validate_and_infer_types of GRU/RNN sequences Co-authored-by: Szymon Durawa --- docs/ops/opset4.md | 3 - 
docs/ops/sequence/GRUSequence_4.md | 136 ----- docs/ops/sequence/RNNSequence_4.md | 128 ----- .../src/convert_function_to_cnn_network.cpp | 123 +++- .../include/ngraph_ops/gru_sequence_ie.hpp | 59 ++ .../include/ngraph_ops/lstm_sequence_ie.hpp | 49 ++ .../include/ngraph_ops/rnn_sequence_ie.hpp | 56 ++ .../bidirectional_sequences_decomposition.hpp | 56 ++ .../convert_sequences_to_sequences_ie.hpp | 59 ++ .../src/ngraph_ops/gru_sequence_ie.cpp | 85 +++ .../src/ngraph_ops/lstm_sequence_ie.cpp | 87 +++ .../src/ngraph_ops/rnn_sequence_ie.cpp | 81 +++ .../bidirectional_sequences_decomposition.cpp | 196 +++++++ .../convert_opset1_to_legacy.cpp | 4 + .../convert_sequences_to_sequences_ie.cpp | 195 +++++++ .../convert_ngraph_to_cnn_network_tests.cpp | 6 +- ...convert_sequences_to_sequences_ie_test.cpp | 271 +++++++++ .../single_layer_tests/gru_sequence.cpp | 59 ++ .../single_layer_tests/lstm_sequence.cpp | 57 ++ .../single_layer_tests/rnn_sequence.cpp | 55 ++ .../single_layer_tests/gru_sequence.hpp | 40 ++ .../single_layer_tests/lstm_sequence.hpp | 39 ++ .../single_layer_tests/rnn_sequence.hpp | 39 ++ .../src/single_layer_tests/gru_cell.cpp | 3 +- .../src/single_layer_tests/gru_sequence.cpp | 98 ++++ .../src/single_layer_tests/lstm_cell.cpp | 2 +- .../src/single_layer_tests/lstm_sequence.cpp | 96 ++++ .../src/single_layer_tests/rnn_cell.cpp | 3 +- .../src/single_layer_tests/rnn_sequence.cpp | 96 ++++ .../include/ngraph_functions/builders.hpp | 49 +- .../tests/ngraph_functions/src/gru_cell.cpp | 37 +- .../tests/ngraph_functions/src/lstm_cell.cpp | 34 +- .../tests/ngraph_functions/src/rnn_cell.cpp | 34 +- .../core/include/ngraph/op/gru_sequence.hpp | 67 +++ .../core/include/ngraph/op/lstm_sequence.hpp | 30 +- .../core/include/ngraph/op/rnn_sequence.hpp | 66 +++ ngraph/core/include/ngraph/ops.hpp | 2 + .../core/include/ngraph/opsets/opset4_tbl.hpp | 2 +- .../ngraph/runtime/reference/sequences.hpp | 539 ++++++++++++++++++ ngraph/core/src/op/gru_cell.cpp | 8 + 
ngraph/core/src/op/gru_sequence.cpp | 199 +++++++ ngraph/core/src/op/lstm_cell.cpp | 24 +- ngraph/core/src/op/lstm_sequence.cpp | 40 +- ngraph/core/src/op/rnn_cell.cpp | 8 + ngraph/core/src/op/rnn_sequence.cpp | 192 +++++++ ngraph/test/CMakeLists.txt | 2 + ngraph/test/attributes.cpp | 6 +- .../runtime/interpreter/int_executable.hpp | 78 ++- .../runtime/interpreter/opset_int_tbl.hpp | 6 + ngraph/test/type_prop/gru_cell.cpp | 30 +- ngraph/test/type_prop/gru_sequence.cpp | 64 +++ ngraph/test/type_prop/lstm_cell.cpp | 36 +- ngraph/test/type_prop/lstm_sequence.cpp | 24 +- ngraph/test/type_prop/rnn_cell.cpp | 29 +- ngraph/test/type_prop/rnn_sequence.cpp | 62 ++ 55 files changed, 3413 insertions(+), 436 deletions(-) delete mode 100644 docs/ops/sequence/GRUSequence_4.md delete mode 100644 docs/ops/sequence/RNNSequence_4.md create mode 100644 inference-engine/src/transformations/include/ngraph_ops/gru_sequence_ie.hpp create mode 100644 inference-engine/src/transformations/include/ngraph_ops/lstm_sequence_ie.hpp create mode 100644 inference-engine/src/transformations/include/ngraph_ops/rnn_sequence_ie.hpp create mode 100644 inference-engine/src/transformations/include/transformations/bidirectional_sequences_decomposition.hpp create mode 100644 inference-engine/src/transformations/include/transformations/convert_opset1_to_legacy/convert_sequences_to_sequences_ie.hpp create mode 100644 inference-engine/src/transformations/src/ngraph_ops/gru_sequence_ie.cpp create mode 100644 inference-engine/src/transformations/src/ngraph_ops/lstm_sequence_ie.cpp create mode 100644 inference-engine/src/transformations/src/ngraph_ops/rnn_sequence_ie.cpp create mode 100644 inference-engine/src/transformations/src/transformations/bidirectional_sequences_decomposition.cpp create mode 100644 inference-engine/src/transformations/src/transformations/convert_opset1_to_legacy/convert_sequences_to_sequences_ie.cpp create mode 100644 
inference-engine/tests/functional/inference_engine/transformations/convert_sequences_to_sequences_ie_test.cpp create mode 100644 inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/gru_sequence.cpp create mode 100644 inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/lstm_sequence.cpp create mode 100644 inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/rnn_sequence.cpp create mode 100644 inference-engine/tests/functional/plugin/shared/include/single_layer_tests/gru_sequence.hpp create mode 100644 inference-engine/tests/functional/plugin/shared/include/single_layer_tests/lstm_sequence.hpp create mode 100644 inference-engine/tests/functional/plugin/shared/include/single_layer_tests/rnn_sequence.hpp create mode 100644 inference-engine/tests/functional/plugin/shared/src/single_layer_tests/gru_sequence.cpp create mode 100644 inference-engine/tests/functional/plugin/shared/src/single_layer_tests/lstm_sequence.cpp create mode 100644 inference-engine/tests/functional/plugin/shared/src/single_layer_tests/rnn_sequence.cpp create mode 100644 ngraph/core/include/ngraph/op/gru_sequence.hpp create mode 100644 ngraph/core/include/ngraph/op/rnn_sequence.hpp create mode 100644 ngraph/core/reference/include/ngraph/runtime/reference/sequences.hpp create mode 100644 ngraph/core/src/op/gru_sequence.cpp create mode 100644 ngraph/core/src/op/rnn_sequence.cpp create mode 100644 ngraph/test/type_prop/gru_sequence.cpp create mode 100644 ngraph/test/type_prop/rnn_sequence.cpp diff --git a/docs/ops/opset4.md b/docs/ops/opset4.md index 16a857d43fc30f..709319f0640d16 100644 --- a/docs/ops/opset4.md +++ b/docs/ops/opset4.md @@ -62,7 +62,6 @@ declared in `namespace opset4`. 
* [GroupConvolution](convolution/GroupConvolution_1.md) * [GroupConvolutionBackpropData](convolution/GroupConvolutionBackpropData_1.md) * [GRUCell](sequence/GRUCell_3.md) -* [GRUSequence](sequence/GRUSequence_4.md) * [HardSigmoid](activation/HardSigmoid_1.md) * [HSwish](activation/HSwish_4.md) * [Interpolate](image/Interpolate_4.md) @@ -75,7 +74,6 @@ declared in `namespace opset4`. * [LogicalXor](logical/LogicalXor_1.md) * [LRN](normalization/LRN_1.md) * [LSTMCell](sequence/LSTMCell_1.md) -* [LSTMSequence](sequence/LSTMSequence_1.md) * [MatMul](matrix/MatMul_1.md) * [MaxPool](pooling/MaxPool_1.md) * [Maximum](arithmetic/Maximum_1.md) @@ -117,7 +115,6 @@ declared in `namespace opset4`. * [Reverse](movement/Reverse_1.md) * [ReverseSequence](movement/ReverseSequence_1.md) * [RNNCell](sequence/RNNCell_3.md) -* [RNNSequence](sequence/RNNSequence_4.md) * [ROIAlign](detection/ROIAlign_3.md) * [ROIPooling](detection/ROIPooling_1.md) * [ScatterElementsUpdate](movement/ScatterElementsUpdate_3.md) diff --git a/docs/ops/sequence/GRUSequence_4.md b/docs/ops/sequence/GRUSequence_4.md deleted file mode 100644 index 64ab6a990a9103..00000000000000 --- a/docs/ops/sequence/GRUSequence_4.md +++ /dev/null @@ -1,136 +0,0 @@ -## GRUSequence {#openvino_docs_ops_sequence_GRUSequence_4} - -**Versioned name**: *GRUSequence-4* - -**Category**: *Sequence processing* - -**Short description**: *GRUSequence* operation represents a series of GRU cells. Each cell is implemented as GRUCell operation. - -**Detailed description** - -A single cell in the sequence is implemented in the same way as in GRUCell operation. *GRUSequence* represents a sequence of GRU cells. The sequence can be connected differently depending on `direction` attribute that specifies the direction of traversing of input data along sequence dimension or specifies whether it should be a bidirectional sequence. The most of the attributes are in sync with the specification of ONNX GRU operator defined GRUCell. 
- - -**Attributes** - -* *hidden_size* - - * **Description**: *hidden_size* specifies hidden state size. - * **Range of values**: a positive integer - * **Type**: `int` - * **Default value**: None - * **Required**: *yes* - -* *activations* - - * **Description**: *activations* specifies activation functions for gates, there are two gates, so two activation functions should be specified as a value for this attributes - * **Range of values**: any combination of *relu*, *sigmoid*, *tanh* - * **Type**: a list of strings - * **Default value**: *sigmoid,tanh* - * **Required**: *no* - -* *activations_alpha, activations_beta* - - * **Description**: *activations_alpha, activations_beta* attributes of functions; applicability and meaning of these attributes depends on choosen activation functions - * **Range of values**: a list of floating-point numbers - * **Type**: `float[]` - * **Default value**: None - * **Required**: *no* - -* *clip* - - * **Description**: *clip* specifies bound values *[-C, C]* for tensor clipping. Clipping is performed before activations. - * **Range of values**: a positive floating-point number - * **Type**: `float` - * **Default value**: *infinity* that means that the clipping is not applied - * **Required**: *no* - -* *direction* - - * **Description**: Specify if the RNN is forward, reverse, or bidirectional. If it is one of *forward* or *reverse* then `num_directions = 1`, if it is *bidirectional*, then `num_directions = 2`. This `num_directions` value specifies input/output shape requirements. - * **Range of values**: *forward*, *reverse*, *bidirectional* - * **Type**: `string` - * **Default value**: None - * **Required**: *Yes* - -* *linear_before_reset* - - * **Description**: *linear_before_reset* flag denotes if the layer behaves according to the modification of *GRUCell* described in the formula in the [ONNX documentation](https://github.com/onnx/onnx/blob/master/docs/Operators.md#GRU). 
- * **Range of values**: True or False - * **Type**: `boolean` - * **Default value**: False - * **Required**: *no* - -**Inputs** - -* **1**: `X` - 3D tensor of type *T1* `[batch_size, seq_length, input_size]`, input data. It differs from GRUCell 1st input only by additional axis with size `seq_length`. **Required.** - -* **2**: `initial_hidden_state` - 3D tensor of type *T1* `[batch_size, num_directions, hidden_size]`, input hidden state data. **Required.** - -* **3**: `sequence_lengths` - 1D tensor of type *T2* `[batch_size]`, specifies real sequence lengths for each batch element. **Required.** - -* **4**: `W` - 3D tensor of type *T1* `[num_directions, 3 * hidden_size, input_size]`, the weights for matrix multiplication, gate order: zrh. **Required.** - -* **5**: `R` - 3D tensor of type *T1* `[num_directions, 3 * hidden_size, hidden_size]`, the recurrence weights for matrix multiplication, gate order: zrh. **Required.** - -* **6**: `B` - 2D tensor of type *T*. If *linear_before_reset* is set to 1, then the shape is `[num_directions, 4 * hidden_size]` - the sum of biases for z and r gates (weights and recurrence weights), the biases for h gate are placed separately. Otherwise the shape is `[num_directions, 3 * hidden_size]`, the sum of biases (weights and recurrence weights). **Required.** - -**Outputs** - -* **1**: `Y` – 3D tensor of type *T1* `[batch_size, num_directions, seq_len, hidden_size]`, concatenation of all the intermediate output values of the hidden. - -* **2**: `Ho` - 3D tensor of type *T1* `[batch_size, num_directions, hidden_size]`, the last output value of hidden state. - -**Types** - -* *T1*: any supported floating point type. -* *T2*: any supported integer type. 
- -**Example** -```xml - - - - - 1 - 4 - 16 - - - 1 - 1 - 128 - - - 1 - - - 1 - 384 - 16 - - - 1 - 384 - 128 - - - 1 - 384 - - - - - 1 - 1 - 4 - 128 - - - 1 - 1 - 128 - - - -``` \ No newline at end of file diff --git a/docs/ops/sequence/RNNSequence_4.md b/docs/ops/sequence/RNNSequence_4.md deleted file mode 100644 index 4b135b3ff2f33f..00000000000000 --- a/docs/ops/sequence/RNNSequence_4.md +++ /dev/null @@ -1,128 +0,0 @@ -## RNNSequence {#openvino_docs_ops_sequence_RNNSequence_4} - -**Versioned name**: *RNNSequence-4* - -**Category**: *Sequence processing* - -**Short description**: *RNNSequence* operation represents a series of RNN cells. Each cell is implemented as RNNCell operation. - -**Detailed description** - -A single cell in the sequence is implemented in the same way as in RNNCell operation. *RNNSequence* represents a sequence of RNN cells. The sequence can be connected differently depending on `direction` attribute that specifies the direction of traversing of input data along sequence dimension or specifies whether it should be a bidirectional sequence. The most of the attributes are in sync with the specification of ONNX RNN operator defined RNNCell. - - -**Attributes** - -* *hidden_size* - - * **Description**: *hidden_size* specifies hidden state size. 
- * **Range of values**: a positive integer - * **Type**: `int` - * **Default value**: None - * **Required**: *yes* - -* *activations* - - * **Description**: activation functions for gates - * **Range of values**: any combination of *relu*, *sigmoid*, *tanh* - * **Type**: a list of strings - * **Default value**: *tanh* - * **Required**: *no* - -* *activations_alpha, activations_beta* - - * **Description**: *activations_alpha, activations_beta* attributes of functions; applicability and meaning of these attributes depends on choosen activation functions - * **Range of values**: a list of floating-point numbers - * **Type**: `float[]` - * **Default value**: None - * **Required**: *no* - -* *clip* - - * **Description**: *clip* specifies bound values *[-C, C]* for tensor clipping. Clipping is performed before activations. - * **Range of values**: a positive floating-point number - * **Type**: `float` - * **Default value**: *infinity* that means that the clipping is not applied - * **Required**: *no* - -* *direction* - - * **Description**: Specify if the RNN is forward, reverse, or bidirectional. If it is one of *forward* or *reverse* then `num_directions = 1`, if it is *bidirectional*, then `num_directions = 2`. This `num_directions` value specifies input/output shape requirements. - * **Range of values**: *forward*, *reverse*, *bidirectional* - * **Type**: `string` - * **Default value**: None - * **Required**: *Yes* - -**Inputs** - -* **1**: `X` - 3D tensor of type *T1* `[batch_size, seq_length, input_size]`, input data. It differs from RNNCell 1st input only by additional axis with size `seq_length`. **Required.** - -* **2**: `initial_hidden_state` - 3D tensor of type *T1* `[batch_size, num_directions, hidden_size]`, input hidden state data. **Required.** - -* **3**: `sequence_lengths` - 1D tensor of type *T2* `[batch_size]`, specifies real sequence lengths for each batch element. 
**Required.** - -* **4**: `W` - 3D tensor of type *T1* `[num_directions, hidden_size, input_size]`, the weights for matrix multiplication. **Required.** - -* **5**: `R` - 3D tensor of type *T1* `[num_directions, hidden_size, hidden_size]`, the recurrence weights for matrix multiplication. **Required.** - -* **6**: `B` - 2D tensor of type *T1* `[num_directions, hidden_size]`, the sum of biases (weights and recurrence weights). **Required.** - -**Outputs** - -* **1**: `Y` – 3D tensor of type *T1* `[batch_size, num_directions, seq_len, hidden_size]`, concatenation of all the intermediate output values of the hidden. - -* **2**: `Ho` - 3D tensor of type *T1* `[batch_size, num_directions, hidden_size]`, the last output value of hidden state. - -**Types** - -* *T1*: any supported floating point type. -* *T2*: any supported integer type. - -**Example** -```xml - - - - - 1 - 4 - 16 - - - 1 - 1 - 128 - - - 1 - - - 1 - 128 - 16 - - - 1 - 128 - 128 - - - 1 - 128 - - - - - 1 - 1 - 4 - 128 - - - 1 - 1 - 128 - - - -``` diff --git a/inference-engine/src/legacy_api/src/convert_function_to_cnn_network.cpp b/inference-engine/src/legacy_api/src/convert_function_to_cnn_network.cpp index fe86357340a3d4..f0401fe8992089 100644 --- a/inference-engine/src/legacy_api/src/convert_function_to_cnn_network.cpp +++ b/inference-engine/src/legacy_api/src/convert_function_to_cnn_network.cpp @@ -34,6 +34,9 @@ #include "ngraph_ops/selu_ie.hpp" #include "ngraph_ops/rnn_cell_ie.hpp" #include "ngraph_ops/topk_ie.hpp" +#include "ngraph_ops/rnn_sequence_ie.hpp" +#include "ngraph_ops/lstm_sequence_ie.hpp" +#include "ngraph_ops/gru_sequence_ie.hpp" #include "generic_ie.hpp" #include "exec_graph_info.hpp" @@ -539,6 +542,111 @@ InferenceEngine::details::CNNLayerCreator::CNNLayerCreator(const std::shared_ptr return res; }); + addSpecificCreator({"GRUSequenceIE"}, [](const std::shared_ptr<::ngraph::Node>& node, + const std::map& params) -> CNNLayerPtr { + + LayerParams attrs = {node->get_friendly_name(), 
"GRUSequence", + details::convertPrecision(node->get_output_element_type(0))}; + auto res = std::make_shared(attrs); + res->params = params; + + if (res->params["direction"] == "reverse") + res->params["direction"] = "Backward"; + else if (res->params["direction"] == "forward") + res->params["direction"] = "Forward"; + else + res->params["direction"] = "Bidirectional"; + + res->cellType = RNNSequenceLayer::CellType::GRU; + if (res->params["linear_before_reset"] == "true") { + res->cellType = RNNSequenceLayer::CellType::GRU_LBR; + } + + Builder::NodeConverter converter; + const auto weightsNode = node->input_value(3).get_node_shared_ptr(); + if (converter.canCreate(weightsNode)) { + const auto& weights = converter.createLayer(weightsNode); + res->blobs["weights"] = weights->blobs["custom"]; + res->_weights = weights->blobs["custom"]; + } + + const auto biasNode = node->input_value(4).get_node_shared_ptr(); + if (converter.canCreate(biasNode)) { + const auto& bias = converter.createLayer(biasNode); + res->blobs["biases"] = bias->blobs["custom"]; + res->_biases = bias->blobs["custom"]; + } + return res; + }); + + addSpecificCreator({"RNNSequenceIE"}, [](const std::shared_ptr<::ngraph::Node>& node, + const std::map& params) -> CNNLayerPtr { + + LayerParams attrs = {node->get_friendly_name(), "RNNSequence", + details::convertPrecision(node->get_output_element_type(0))}; + auto res = std::make_shared(attrs); + res->params = params; + + res->cellType = RNNSequenceLayer::CellType::RNN; + + if (res->params["direction"] == "reverse") + res->params["direction"] = "Backward"; + else if (res->params["direction"] == "forward") + res->params["direction"] = "Forward"; + else + res->params["direction"] = "Bidirectional"; + + Builder::NodeConverter converter; + const auto weightsNode = node->input_value(3).get_node_shared_ptr(); + if (converter.canCreate(weightsNode)) { + const auto& weights = converter.createLayer(weightsNode); + res->blobs["weights"] = weights->blobs["custom"]; + 
res->_weights = weights->blobs["custom"]; + } + + const auto biasNode = node->input_value(4).get_node_shared_ptr(); + if (converter.canCreate(biasNode)) { + const auto& bias = converter.createLayer(biasNode); + res->blobs["biases"] = bias->blobs["custom"]; + res->_biases = bias->blobs["custom"]; + } + return res; + }); + + addSpecificCreator({"LSTMSequenceIE"}, [](const std::shared_ptr<::ngraph::Node>& node, + const std::map& params) -> CNNLayerPtr { + + LayerParams attrs = {node->get_friendly_name(), "LSTMSequence", + details::convertPrecision(node->get_output_element_type(0))}; + auto res = std::make_shared(attrs); + res->params = params; + + res->cellType = RNNSequenceLayer::CellType::LSTM; + + if (res->params["direction"] == "reverse") + res->params["direction"] = "Backward"; + else if (res->params["direction"] == "forward") + res->params["direction"] = "Forward"; + else + res->params["direction"] = "Bidirectional"; + + Builder::NodeConverter converter; + const auto weightsNode = node->input_value(4).get_node_shared_ptr(); + if (converter.canCreate(weightsNode)) { + const auto &weights = converter.createLayer(weightsNode); + res->blobs["weights"] = weights->blobs["custom"]; + res->_weights = weights->blobs["custom"]; + } + + const auto biasNode = node->input_value(5).get_node_shared_ptr(); + if (converter.canCreate(biasNode)) { + const auto &bias = converter.createLayer(biasNode); + res->blobs["biases"] = bias->blobs["custom"]; + res->_biases = bias->blobs["custom"]; + } + return res; + }); + REQUIRED_IE_CONVERSION_CREATOR("Broadcast", "Tile"); REQUIRED_IE_CONVERSION_CREATOR("Interpolate", "Interp"); REQUIRED_IE_CONVERSION_CREATOR("NormalizeL2", "NormalizeIE"); @@ -736,13 +844,24 @@ void convertFunctionToICNNNetwork(const std::shared_ptr(consumerLayer) || ::ngraph::as_type_ptr<::ngraph::op::ScaleShiftIE>(consumerLayer) || ::ngraph::as_type_ptr<::ngraph::op::Transpose>(consumerLayer) || + ::ngraph::as_type_ptr<::ngraph::op::LSTMSequenceIE>(consumerLayer) || + 
::ngraph::as_type_ptr<::ngraph::op::RNNSequenceIE>(consumerLayer) || + ::ngraph::as_type_ptr<::ngraph::op::GRUSequenceIE>(consumerLayer) || ::ngraph::as_type_ptr<::ngraph::op::RNNCellIE>(consumerLayer) || ::ngraph::as_type_ptr<::ngraph::op::GRUCellIE>(consumerLayer)) { // Check that all input nodes except zero input are Constants for all ops except DeformableConvolutions // for which the input with index 1 is also dynamic - size_t inputID = ::ngraph::as_type_ptr<::ngraph::op::v1::DeformableConvolution>(consumerLayer) || + size_t inputID = 1; + if (::ngraph::as_type_ptr<::ngraph::op::v1::DeformableConvolution>(consumerLayer) || ::ngraph::as_type_ptr<::ngraph::op::GRUCellIE>(consumerLayer) || - ::ngraph::as_type_ptr<::ngraph::op::RNNCellIE>(consumerLayer)? 2 : 1; + ::ngraph::as_type_ptr<::ngraph::op::RNNCellIE>(consumerLayer) || + ::ngraph::as_type_ptr<::ngraph::op::GRUSequenceIE>(consumerLayer) || + ::ngraph::as_type_ptr<::ngraph::op::RNNSequenceIE>(consumerLayer)) { + inputID = 2; + } else if (::ngraph::as_type_ptr<::ngraph::op::LSTMSequenceIE>(consumerLayer)) { + inputID = 3; + } + for (; inputID < consumerLayer->inputs().size(); ++inputID) { auto inputLayer = consumerLayer->input(inputID).get_source_output().get_node_shared_ptr(); if (inputLayer == constLayer) { diff --git a/inference-engine/src/transformations/include/ngraph_ops/gru_sequence_ie.hpp b/inference-engine/src/transformations/include/ngraph_ops/gru_sequence_ie.hpp new file mode 100644 index 00000000000000..8bd29914c70f33 --- /dev/null +++ b/inference-engine/src/transformations/include/ngraph_ops/gru_sequence_ie.hpp @@ -0,0 +1,59 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include + +#include "ngraph/opsets/opset4.hpp" +#include "ngraph/op/op.hpp" + +namespace ngraph { +namespace op { +class TRANSFORMATIONS_API GRUSequenceIE : public ngraph::op::util::RNNCellBase { +public: + NGRAPH_RTTI_DECLARATION; + + 
GRUSequenceIE(const Output &X, + const Output &H_t, + const Output &seg_lengths, + const Output &WR, + const Output &B, + size_t hidden_size, + op::RecurrentSequenceDirection direction, + const std::vector &activations, + const std::vector &activations_alpha, + const std::vector &activations_beta, + float clip, + bool linear_before_reset); + + GRUSequenceIE() = delete; + + std::shared_ptr clone_with_new_inputs(const OutputVector &new_args) const override; + + void validate_and_infer_types() override; + + std::size_t get_hidden_size() { return m_hidden_size; } + + const std::vector &get_activations() { return m_activations; } + + const std::vector &get_activations_alpha() { return m_activations_alpha; } + + const std::vector &get_activations_beta() { return m_activations_beta; } + + float get_clip() { return m_clip; } + + bool visit_attributes(AttributeVisitor& visitor) override; + +protected: + op::RecurrentSequenceDirection m_direction; + bool m_linear_before_reset; +}; + + } // namespace op +} // namespace ngraph diff --git a/inference-engine/src/transformations/include/ngraph_ops/lstm_sequence_ie.hpp b/inference-engine/src/transformations/include/ngraph_ops/lstm_sequence_ie.hpp new file mode 100644 index 00000000000000..12bd31b66cffdb --- /dev/null +++ b/inference-engine/src/transformations/include/ngraph_ops/lstm_sequence_ie.hpp @@ -0,0 +1,49 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include + +#include "ngraph/opsets/opset4.hpp" +#include "ngraph/op/op.hpp" + +namespace ngraph { +namespace op { +class TRANSFORMATIONS_API LSTMSequenceIE : public ngraph::op::util::RNNCellBase { +public: + NGRAPH_RTTI_DECLARATION; + + LSTMSequenceIE() = delete; + + LSTMSequenceIE(const Output &X, + const Output &H_t, + const Output &C_t, + const Output &seq_lenghts, + const Output &WR, + const Output &B, + size_t hidden_size, + ngraph::op::RecurrentSequenceDirection 
lstm_direction, + const std::vector &activations, + const std::vector &activations_alpha, + const std::vector &activations_beta, + float clip); + + std::shared_ptr clone_with_new_inputs(const OutputVector &new_args) const override; + + void validate_and_infer_types() override; + + ngraph::op::RecurrentSequenceDirection get_direction() { return m_direction; } + + bool visit_attributes(AttributeVisitor& visitor) override; + +protected: + ngraph::op::RecurrentSequenceDirection m_direction; +}; +} // namespace op +} // namespace ngraph diff --git a/inference-engine/src/transformations/include/ngraph_ops/rnn_sequence_ie.hpp b/inference-engine/src/transformations/include/ngraph_ops/rnn_sequence_ie.hpp new file mode 100644 index 00000000000000..0315318edfce82 --- /dev/null +++ b/inference-engine/src/transformations/include/ngraph_ops/rnn_sequence_ie.hpp @@ -0,0 +1,56 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include + +#include "ngraph/opsets/opset4.hpp" +#include "ngraph/op/op.hpp" + +namespace ngraph { +namespace op { +class TRANSFORMATIONS_API RNNSequenceIE : public ngraph::op::util::RNNCellBase { +public: + NGRAPH_RTTI_DECLARATION; + + RNNSequenceIE(const Output &X, + const Output &H_t, + const Output &seq_lengths, + const Output &WR, + const Output &B, + size_t hidden_size, + op::RecurrentSequenceDirection direction, + const std::vector &activations, + const std::vector &activations_alpha, + const std::vector &activations_beta, + float clip); + + RNNSequenceIE() = delete; + + std::shared_ptr clone_with_new_inputs(const OutputVector &new_args) const override; + + void validate_and_infer_types() override; + + std::size_t get_hidden_size() { return m_hidden_size; } + + const std::vector &get_activations() { return m_activations; } + + const std::vector &get_activations_alpha() { return m_activations_alpha; } + + const std::vector &get_activations_beta() { return 
m_activations_beta; } + + float get_clip() { return m_clip; } + + bool visit_attributes(AttributeVisitor& visitor) override; + +protected: + op::RecurrentSequenceDirection m_direction; +}; +} // namespace op +} // namespace ngraph diff --git a/inference-engine/src/transformations/include/transformations/bidirectional_sequences_decomposition.hpp b/inference-engine/src/transformations/include/transformations/bidirectional_sequences_decomposition.hpp new file mode 100644 index 00000000000000..f3379991882837 --- /dev/null +++ b/inference-engine/src/transformations/include/transformations/bidirectional_sequences_decomposition.hpp @@ -0,0 +1,56 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include + +#include + +namespace ngraph { +namespace pass { + +class TRANSFORMATIONS_API BidirectionalLSTMSequenceDecomposition; +class TRANSFORMATIONS_API BidirectionalGRUSequenceDecomposition; +class TRANSFORMATIONS_API BidirectionalRNNSequenceDecomposition; + +} // namespace pass +} // namespace ngraph + +/** + * @ingroup ie_transformation_common_api + * @brief Decompose LSTMSequence to forward and reverse LSTMSequence. + * + */ + +class ngraph::pass::BidirectionalLSTMSequenceDecomposition : public ngraph::pass::MatcherPass { +public: + BidirectionalLSTMSequenceDecomposition(); +}; + +/** + * @ingroup ie_transformation_common_api + * @brief Decompose GRUSequence to forward and reverse GRUSequence. + * + */ + +class ngraph::pass::BidirectionalGRUSequenceDecomposition : public ngraph::pass::MatcherPass { +public: + BidirectionalGRUSequenceDecomposition(); +}; + +/** + * @ingroup ie_transformation_common_api + * @brief Decompose RNNSequence to forward and reverse RNNSequence. 
/**
 * @ingroup ie_transformation_common_api
 * @brief Converts LSTMSequence to the legacy LSTMSequenceIE operation.
 *
 * The SequenceIE op does not use the seq_length input or the num_directions
 * (direction) dimension. The num_directions dimension is therefore squeezed out of
 * all corresponding inputs before the SequenceIE op and unsqueezed back on its outputs.
 *
 * The matcher is set up and registered in the constructor.
 */

class ngraph::pass::ConvertLSTMSequenceMatcher : public ngraph::pass::MatcherPass {
public:
    ConvertLSTMSequenceMatcher();
};
+ */ + +class ngraph::pass::ConvertGRUSequenceMatcher : public ngraph::pass::MatcherPass { +public: + ConvertGRUSequenceMatcher(); +}; + +/** + * @ingroup ie_transformation_common_api + * @brief Converts RNNSequence to legacy RNNSequenceIE. + * SequenceIE op doesn't use seq_length input and num_direction (direction) attribute. + * We squeeze num_direction dimension for all corresponding inputs and unsqueeze them after the SequenceIE op. + */ + +class ngraph::pass::ConvertRNNSequenceMatcher : public ngraph::pass::MatcherPass { +public: + ConvertRNNSequenceMatcher(); +}; diff --git a/inference-engine/src/transformations/src/ngraph_ops/gru_sequence_ie.cpp b/inference-engine/src/transformations/src/ngraph_ops/gru_sequence_ie.cpp new file mode 100644 index 00000000000000..1759c506d56bf6 --- /dev/null +++ b/inference-engine/src/transformations/src/ngraph_ops/gru_sequence_ie.cpp @@ -0,0 +1,85 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ngraph_ops/gru_sequence_ie.hpp" +#include "ngraph/op/util/recurrent_sequence.hpp" + +#include +#include +#include + +using namespace std; +using namespace ngraph; + +NGRAPH_RTTI_DEFINITION(op::GRUSequenceIE, "GRUSequenceIE", 4); + +op::GRUSequenceIE::GRUSequenceIE(const Output& X, + const Output& H_t, + const Output& seq_lenghts, + const Output& WR, + const Output& B, + std::size_t hidden_size, + op::RecurrentSequenceDirection direction, + const std::vector& activations, + const std::vector& activations_alpha, + const std::vector& activations_beta, + float clip, + bool linear_before_reset) + : RNNCellBase({X, H_t, seq_lenghts, WR, B}, hidden_size, clip, activations, activations_alpha, activations_beta), + m_direction(direction), + m_linear_before_reset(linear_before_reset) { + constructor_validate_and_infer_types(); +} + +void op::GRUSequenceIE::validate_and_infer_types() { + for (const auto& input : inputs()) { + if (input.get_partial_shape().rank().is_dynamic()) { + 
set_output_type(0, get_input_element_type(0), PartialShape::dynamic()); + set_output_type(1, get_input_element_type(0), PartialShape::dynamic()); + return; + } + } + // rank validation + auto x_pshape = get_input_partial_shape(0); + auto h_state_pshape = get_input_partial_shape(1); + auto seq_lengths_pshape = get_input_partial_shape(2); + auto wr_pshape = get_input_partial_shape(3); + auto b_pshape = get_input_partial_shape(4); + std::vector pshapes = {x_pshape, h_state_pshape, seq_lengths_pshape, wr_pshape, b_pshape}; + + std::vector in_names = {"X", "H", "seq_lenghts", "WR", "B"}; + // num_direction dimension should be squeezed, we don't support bidirectional case + std::vector ranks = {3, 2, 1, 2, 1}; + for (size_t i = 0; i < pshapes.size(); ++i) { + NGRAPH_CHECK((pshapes[i].rank().get_length() == ranks[i]), + "GRUSequenceIE ", + in_names[i], + " input rank is not correct."); + } + + element::Type arg_type = get_input_element_type(0); + PartialShape output_shape_0{PartialShape::dynamic(3)}; + PartialShape output_shape_1{PartialShape::dynamic(2)}; + if (get_input_partial_shape(0).is_static()) { + size_t batch_size = get_input_partial_shape(0).get_shape()[0]; + size_t seq_length = get_input_partial_shape(0).get_shape()[1]; + output_shape_0 = Shape{batch_size, seq_length, m_hidden_size}; + output_shape_1 = Shape{batch_size, m_hidden_size}; + } + set_output_type(0, arg_type, output_shape_0); + set_output_type(1, arg_type, output_shape_1); +} + +bool op::GRUSequenceIE::visit_attributes(AttributeVisitor& visitor) { + visitor.on_attribute("direction", m_direction); + visitor.on_attribute("linear_before_reset", m_linear_before_reset); + return op::util::RNNCellBase::visit_attributes(visitor); +} + +shared_ptr op::GRUSequenceIE::clone_with_new_inputs(const OutputVector& new_args) const { + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), new_args.at(1), new_args.at(2), new_args.at(3), + new_args.at(4), m_hidden_size, m_direction, 
m_activations, m_activations_alpha, m_activations_beta, m_clip, + m_linear_before_reset); +} diff --git a/inference-engine/src/transformations/src/ngraph_ops/lstm_sequence_ie.cpp b/inference-engine/src/transformations/src/ngraph_ops/lstm_sequence_ie.cpp new file mode 100644 index 00000000000000..ba3d0ba27b099a --- /dev/null +++ b/inference-engine/src/transformations/src/ngraph_ops/lstm_sequence_ie.cpp @@ -0,0 +1,87 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ngraph_ops/lstm_sequence_ie.hpp" +#include "ngraph/op/util/recurrent_sequence.hpp" + +#include +#include +#include + +using namespace std; +using namespace ngraph; + +NGRAPH_RTTI_DEFINITION(op::LSTMSequenceIE, "LSTMSequenceIE", 5); + +op::LSTMSequenceIE::LSTMSequenceIE(const Output &X, + const Output &H_t, + const Output &C_t, + const Output &seq_lenghts, + const Output &WR, + const Output &B, + std::size_t hidden_size, + ngraph::op::RecurrentSequenceDirection direction, + const std::vector &activations, + const std::vector &activations_alpha, + const std::vector &activations_beta, + float clip) + : RNNCellBase({X, H_t, C_t, seq_lenghts, WR, B}, hidden_size, clip, activations, activations_alpha, activations_beta), + m_direction(direction) { + constructor_validate_and_infer_types(); +} + +void op::LSTMSequenceIE::validate_and_infer_types() { + for (const auto& input : inputs()) { + if (input.get_partial_shape().rank().is_dynamic()) { + set_output_type(0, get_input_element_type(0), PartialShape::dynamic()); + set_output_type(1, get_input_element_type(0), PartialShape::dynamic()); + set_output_type(2, get_input_element_type(0), PartialShape::dynamic()); + return; + } + } + // rank validation + auto x_pshape = get_input_partial_shape(0); + auto h_state_pshape = get_input_partial_shape(1); + auto c_state_pshape = get_input_partial_shape(2); + auto seq_lengths_pshape = get_input_partial_shape(3); + auto wr_pshape = get_input_partial_shape(4); + auto b_pshape = 
get_input_partial_shape(5); + + std::vector pshapes = {x_pshape, h_state_pshape, c_state_pshape, + seq_lengths_pshape, wr_pshape, b_pshape}; + std::vector in_names = {"X", "H", "C", "seq_lenghts", "WR", "B"}; + // num_direction dimension should be squeezed, we don't support bidirectional case + std::vector ranks = {3, 2, 2, 1, 2, 1}; + for (size_t i = 0; i < pshapes.size(); ++i) { + NGRAPH_CHECK((pshapes[i].rank().get_length() == ranks[i]), + "LSTMSequenceIE ", + in_names[i], + " input rank is not correct."); + } + + element::Type arg_type = get_input_element_type(0); + PartialShape output_shape_0{PartialShape::dynamic(3)}; + PartialShape output_shape_1{PartialShape::dynamic(2)}; + if (get_input_partial_shape(0).is_static()) { + size_t batch_size = get_input_partial_shape(0).get_shape()[0]; + size_t seq_length = get_input_partial_shape(0).get_shape()[1]; + output_shape_0 = Shape{batch_size, seq_length, m_hidden_size}; + output_shape_1 = Shape{batch_size, m_hidden_size}; + } + set_output_type(0, arg_type, output_shape_0); + set_output_type(1, arg_type, output_shape_1); + set_output_type(2, arg_type, output_shape_1); +} + +bool ngraph::op::LSTMSequenceIE::visit_attributes(AttributeVisitor& visitor) { + visitor.on_attribute("direction", m_direction); + return op::util::RNNCellBase::visit_attributes(visitor); +} + +shared_ptr op::LSTMSequenceIE::clone_with_new_inputs(const OutputVector &new_args) const { + check_new_args_count(this, new_args); + return make_shared(new_args.at(0), new_args.at(1), new_args.at(2), new_args.at(3), + new_args.at(4), new_args.at(5), m_hidden_size, m_direction, m_activations, m_activations_alpha, m_activations_beta, + m_clip); +} diff --git a/inference-engine/src/transformations/src/ngraph_ops/rnn_sequence_ie.cpp b/inference-engine/src/transformations/src/ngraph_ops/rnn_sequence_ie.cpp new file mode 100644 index 00000000000000..e247bbafd3914c --- /dev/null +++ b/inference-engine/src/transformations/src/ngraph_ops/rnn_sequence_ie.cpp @@ -0,0 
+1,81 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ngraph_ops/rnn_sequence_ie.hpp" +#include "ngraph/op/util/recurrent_sequence.hpp" + +#include +#include +#include + +using namespace std; +using namespace ngraph; + +NGRAPH_RTTI_DEFINITION(op::RNNSequenceIE, "RNNSequenceIE", 4); + +op::RNNSequenceIE::RNNSequenceIE(const Output& X, + const Output& H_t, + const Output& seq_lengths, // actually not supported + const Output& WR, + const Output& B, + std::size_t hidden_size, + op::RecurrentSequenceDirection direction, + const std::vector& activations, + const std::vector& activations_alpha, + const std::vector& activations_beta, + float clip) + : RNNCellBase({X, H_t, seq_lengths, WR, B}, hidden_size, clip, activations, activations_alpha, activations_beta), + m_direction(direction) { + constructor_validate_and_infer_types(); +} + +void op::RNNSequenceIE::validate_and_infer_types() { + for (const auto& input : inputs()) { + if (input.get_partial_shape().rank().is_dynamic()) { + set_output_type(0, get_input_element_type(0), PartialShape::dynamic()); + set_output_type(1, get_input_element_type(0), PartialShape::dynamic()); + return; + } + } + // rank validation + auto x_pshape = get_input_partial_shape(0); + auto h_state_pshape = get_input_partial_shape(1); + auto seq_lengths_pshape = get_input_partial_shape(2); + auto wr_pshape = get_input_partial_shape(3); + auto b_pshape = get_input_partial_shape(4); + + std::vector pshapes = {x_pshape, h_state_pshape, seq_lengths_pshape, wr_pshape, b_pshape}; + std::vector in_names = {"X", "H", "seq_lenghts", "WR", "B"}; + // num_direction dimension should be squeezed, we don't support bidirectional case + std::vector ranks = {3, 2, 1, 2, 1}; + for (size_t i = 0; i < pshapes.size(); ++i) { + NGRAPH_CHECK((pshapes[i].rank().get_length() == ranks[i]), + "RNNSequenceIE ", + in_names[i], + " input rank is not correct."); + } + + element::Type arg_type = get_input_element_type(0); + 
PartialShape output_shape_0{PartialShape::dynamic(3)}; + PartialShape output_shape_1{PartialShape::dynamic(2)}; + if (get_input_partial_shape(0).is_static()) { + size_t batch_size = get_input_partial_shape(0).get_shape()[0]; + size_t seq_length = get_input_partial_shape(0).get_shape()[1]; + output_shape_0 = Shape{batch_size, seq_length, m_hidden_size}; + output_shape_1 = Shape{batch_size, m_hidden_size}; + } + set_output_type(0, arg_type, output_shape_0); + set_output_type(1, arg_type, output_shape_1); +} + +bool op::RNNSequenceIE::visit_attributes(AttributeVisitor& visitor) { + visitor.on_attribute("direction", m_direction); + return op::util::RNNCellBase::visit_attributes(visitor); +} + +shared_ptr op::RNNSequenceIE::clone_with_new_inputs(const ngraph::OutputVector &new_args) const { + check_new_args_count(this, new_args); + return make_shared(new_args.at(0), new_args.at(1), new_args.at(2), new_args.at(3), + new_args.at(4), m_hidden_size, m_direction, m_activations, m_activations_alpha, m_activations_beta, m_clip); +} diff --git a/inference-engine/src/transformations/src/transformations/bidirectional_sequences_decomposition.cpp b/inference-engine/src/transformations/src/transformations/bidirectional_sequences_decomposition.cpp new file mode 100644 index 00000000000000..281d470fa14c06 --- /dev/null +++ b/inference-engine/src/transformations/src/transformations/bidirectional_sequences_decomposition.cpp @@ -0,0 +1,196 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/bidirectional_sequences_decomposition.hpp" + +#include + +#include +#include +#include + +ngraph::pass::BidirectionalLSTMSequenceDecomposition::BidirectionalLSTMSequenceDecomposition() { + auto lstm_sequence_ngraph = ngraph::pattern::wrap_type(); + + ngraph::matcher_pass_callback callback = [](pattern::Matcher &m) { + auto lstm_sequence = std::dynamic_pointer_cast(m.get_match_root()); + if (!lstm_sequence) { + return false; + } + + auto 
axis_0 = ngraph::opset4::Constant::create(element::i64, Shape{}, {0}); + auto axis_1 = ngraph::opset4::Constant::create(element::i64, Shape{}, {1}); + auto H = std::make_shared(lstm_sequence->input_value(1), axis_1, 2); + auto C = std::make_shared(lstm_sequence->input_value(2), axis_1, 2); + auto W = std::make_shared(lstm_sequence->input_value(4), axis_0, 2); + auto R = std::make_shared(lstm_sequence->input_value(5), axis_0, 2); + auto B = std::make_shared(lstm_sequence->input_value(6), axis_0, 2); + auto lstm_sequence_forward = std::make_shared( + lstm_sequence->input_value(0), + H->output(0), + C->output(0), + lstm_sequence->input_value(3), + W->output(0), + R->output(0), + B->output(0), + lstm_sequence->get_hidden_size(), + ngraph::op::RecurrentSequenceDirection::FORWARD, + lstm_sequence->get_activations_alpha(), + lstm_sequence->get_activations_beta(), + lstm_sequence->get_activations(), + lstm_sequence->get_clip()); + + auto lstm_sequence_reverse = std::make_shared( + lstm_sequence->input_value(0), + H->output(1), + C->output(1), + lstm_sequence->input_value(3), + W->output(1), + R->output(1), + B->output(1), + lstm_sequence->get_hidden_size(), + ngraph::op::RecurrentSequenceDirection::REVERSE, + lstm_sequence->get_activations_alpha(), + lstm_sequence->get_activations_beta(), + lstm_sequence->get_activations(), + lstm_sequence->get_clip()); + + auto concat_0 = std::make_shared(OutputVector{lstm_sequence_forward->output(0), + lstm_sequence_reverse->output(0)}, 1); + auto concat_1 = std::make_shared(OutputVector{lstm_sequence_forward->output(1), + lstm_sequence_reverse->output(1)}, 1); + auto concat_2 = std::make_shared(OutputVector{lstm_sequence_forward->output(2), + lstm_sequence_reverse->output(2)}, 1); + ngraph::copy_runtime_info(lstm_sequence, {H, C, W, R, B, lstm_sequence_forward, lstm_sequence_reverse, + concat_0, concat_1, concat_2}); + concat_0->set_friendly_name(lstm_sequence->get_friendly_name()+".0"); + 
concat_1->set_friendly_name(lstm_sequence->get_friendly_name()+".1"); + concat_2->set_friendly_name(lstm_sequence->get_friendly_name()+".2"); + ngraph::replace_node(lstm_sequence, {concat_0->output(0), concat_1->output(0), concat_2->output(0)}); + return true; + }; + + auto m = std::make_shared(lstm_sequence_ngraph, "BidirectionalLSTMSequenceDecomposition"); + this->register_matcher(m, callback); +} + +ngraph::pass::BidirectionalGRUSequenceDecomposition::BidirectionalGRUSequenceDecomposition() { + auto gru_sequence_ngraph = ngraph::pattern::wrap_type(); + + ngraph::matcher_pass_callback callback = [](pattern::Matcher &m) { + auto gru_sequence = std::dynamic_pointer_cast(m.get_match_root()); + if (!gru_sequence) { + return false; + } + + auto axis_0 = ngraph::opset4::Constant::create(element::i64, Shape{}, {0}); + auto axis_1 = ngraph::opset4::Constant::create(element::i64, Shape{}, {1}); + auto H = std::make_shared(gru_sequence->input_value(1), axis_1, 2); + auto W = std::make_shared(gru_sequence->input_value(3), axis_0, 2); + auto R = std::make_shared(gru_sequence->input_value(4), axis_0, 2); + auto B = std::make_shared(gru_sequence->input_value(5), axis_0, 2); + auto gru_sequence_forward = std::make_shared( + gru_sequence->input_value(0), + H->output(0), + gru_sequence->input_value(2), + W->output(0), + R->output(0), + B->output(0), + gru_sequence->get_hidden_size(), + ngraph::op::RecurrentSequenceDirection::FORWARD, + gru_sequence->get_activations(), + gru_sequence->get_activations_alpha(), + gru_sequence->get_activations_beta(), + gru_sequence->get_clip(), + gru_sequence->get_linear_before_reset()); + + auto gru_sequence_reverse = std::make_shared( + gru_sequence->input_value(0), + H->output(1), + gru_sequence->input_value(2), + W->output(1), + R->output(1), + B->output(1), + gru_sequence->get_hidden_size(), + ngraph::op::RecurrentSequenceDirection::REVERSE, + gru_sequence->get_activations(), + gru_sequence->get_activations_alpha(), + 
gru_sequence->get_activations_beta(), + gru_sequence->get_clip(), + gru_sequence->get_linear_before_reset()); + + auto concat_0 = std::make_shared(OutputVector{gru_sequence_forward->output(0), + gru_sequence_reverse->output(0)}, 1); + auto concat_1 = std::make_shared(OutputVector{gru_sequence_forward->output(1), + gru_sequence_reverse->output(1)}, 1); + ngraph::copy_runtime_info(gru_sequence, {H, W, R, B, gru_sequence_forward, gru_sequence_reverse, + concat_0, concat_1}); + concat_0->set_friendly_name(gru_sequence->get_friendly_name()+".0"); + concat_1->set_friendly_name(gru_sequence->get_friendly_name()+".1"); + ngraph::replace_node(gru_sequence, {concat_0->output(0), concat_1->output(0)}); + return true; + }; + + auto m = std::make_shared(gru_sequence_ngraph, "BidirectionalGRUSequenceDecomposition"); + this->register_matcher(m, callback); +} + +ngraph::pass::BidirectionalRNNSequenceDecomposition::BidirectionalRNNSequenceDecomposition() { + auto rnn_sequence_ngraph = ngraph::pattern::wrap_type(); + + ngraph::matcher_pass_callback callback = [](pattern::Matcher &m) { + auto rnn_sequence = std::dynamic_pointer_cast(m.get_match_root()); + if (!rnn_sequence) { + return false; + } + + auto axis_0 = ngraph::opset4::Constant::create(element::i64, Shape{}, {0}); + auto axis_1 = ngraph::opset4::Constant::create(element::i64, Shape{}, {1}); + auto H = std::make_shared(rnn_sequence->input_value(1), axis_1, 2); + auto W = std::make_shared(rnn_sequence->input_value(3), axis_0, 2); + auto R = std::make_shared(rnn_sequence->input_value(4), axis_0, 2); + auto B = std::make_shared(rnn_sequence->input_value(5), axis_0, 2); + auto rnn_sequence_forward = std::make_shared( + rnn_sequence->input_value(0), + H->output(0), + rnn_sequence->input_value(2), + W->output(0), + R->output(0), + B->output(0), + rnn_sequence->get_hidden_size(), + ngraph::op::RecurrentSequenceDirection::FORWARD, + rnn_sequence->get_activations(), + rnn_sequence->get_activations_alpha(), + 
rnn_sequence->get_activations_beta(), + rnn_sequence->get_clip()); + + auto rnn_sequence_reverse = std::make_shared( + rnn_sequence->input_value(0), + H->output(1), + rnn_sequence->input_value(2), + W->output(1), + R->output(1), + B->output(1), + rnn_sequence->get_hidden_size(), + ngraph::op::RecurrentSequenceDirection::REVERSE, + rnn_sequence->get_activations(), + rnn_sequence->get_activations_alpha(), + rnn_sequence->get_activations_beta(), + rnn_sequence->get_clip()); + + auto concat_0 = std::make_shared(OutputVector{rnn_sequence_forward->output(0), + rnn_sequence_reverse->output(0)}, 1); + auto concat_1 = std::make_shared(OutputVector{rnn_sequence_forward->output(1), + rnn_sequence_reverse->output(1)}, 1); + ngraph::copy_runtime_info(rnn_sequence, {H, W, R, B, rnn_sequence_forward, rnn_sequence_reverse, + concat_0, concat_1}); + concat_0->set_friendly_name(rnn_sequence->get_friendly_name() + ".0"); + concat_1->set_friendly_name(rnn_sequence->get_friendly_name() + ".1"); + ngraph::replace_node(rnn_sequence, {concat_0->output(0), concat_1->output(0)}); + return true; + }; + + auto m = std::make_shared(rnn_sequence_ngraph, "BidirectionalRNNSequenceDecomposition"); + this->register_matcher(m, callback); +} diff --git a/inference-engine/src/transformations/src/transformations/convert_opset1_to_legacy/convert_opset1_to_legacy.cpp b/inference-engine/src/transformations/src/transformations/convert_opset1_to_legacy/convert_opset1_to_legacy.cpp index 1d6c5dd59b9217..25847676511212 100644 --- a/inference-engine/src/transformations/src/transformations/convert_opset1_to_legacy/convert_opset1_to_legacy.cpp +++ b/inference-engine/src/transformations/src/transformations/convert_opset1_to_legacy/convert_opset1_to_legacy.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -157,6 +158,9 @@ bool ngraph::pass::ConvertOpSet1ToLegacy::run_on_function(std::shared_ptradd_matcher(); anchor->add_matcher(); anchor->add_matcher(); + 
anchor->add_matcher(); + anchor->add_matcher(); + anchor->add_matcher(); anchor->set_name("ngraph::pass::ConvertOpSet1ToLegacy"); // List of final conversion transformations that must to be executed diff --git a/inference-engine/src/transformations/src/transformations/convert_opset1_to_legacy/convert_sequences_to_sequences_ie.cpp b/inference-engine/src/transformations/src/transformations/convert_opset1_to_legacy/convert_sequences_to_sequences_ie.cpp new file mode 100644 index 00000000000000..b1648fdec72647 --- /dev/null +++ b/inference-engine/src/transformations/src/transformations/convert_opset1_to_legacy/convert_sequences_to_sequences_ie.cpp @@ -0,0 +1,195 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/convert_opset1_to_legacy/convert_sequences_to_sequences_ie.hpp" + +#include + +#include +#include +#include + +#include +#include +#include + +ngraph::pass::ConvertLSTMSequenceMatcher::ConvertLSTMSequenceMatcher() { + auto lstm_sequence_ngraph = ngraph::pattern::wrap_type(); + + ngraph::matcher_pass_callback callback = [](pattern::Matcher &m) { + auto lstm_sequence = std::dynamic_pointer_cast(m.get_match_root()); + if (!lstm_sequence) { + return false; + } + + const auto& W = std::dynamic_pointer_cast( + lstm_sequence->input_value(4).get_node_shared_ptr()); + if (!W) { + return false; + } + + const auto& R = std::dynamic_pointer_cast( + lstm_sequence->input_value(5).get_node_shared_ptr()); + if (!R) { + return false; + } + + // for forward/reverse cases we can squeeze num_direction dimension + auto axis_1 = ngraph::opset4::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {1}); + auto in_1 = std::make_shared(lstm_sequence->input(1).get_source_output(), axis_1); + auto in_2 = std::make_shared(lstm_sequence->input(2).get_source_output(), axis_1); + auto concat = std::make_shared(ngraph::NodeVector({W, R}), 2); + auto axis_2 = ngraph::opset4::Constant::create(ngraph::element::i64, 
ngraph::Shape{1}, {0}); + auto in_3 = std::make_shared(concat->output(0), axis_2); + auto in_4 = std::make_shared(lstm_sequence->input(6).get_source_output(), axis_2); + auto lstm_sequence_ie = std::make_shared( + lstm_sequence->input(0).get_source_output(), // X + in_1, // initial_hidden_state + in_2, // initial_cell_state + lstm_sequence->input(3).get_source_output(), + in_3, // WR + in_4, // B + lstm_sequence->get_hidden_size(), + lstm_sequence->get_direction(), + lstm_sequence->get_activations(), + lstm_sequence->get_activations_alpha(), + lstm_sequence->get_activations_beta(), + lstm_sequence->get_clip()); + + auto unsqueeze_axis = ngraph::opset4::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {1}); + auto unsqueeze_1 = std::make_shared(lstm_sequence_ie->output(0), unsqueeze_axis); + auto unsqueeze_2 = std::make_shared(lstm_sequence_ie->output(1), unsqueeze_axis); + auto unsqueeze_3 = std::make_shared(lstm_sequence_ie->output(2), unsqueeze_axis); + + ngraph::copy_runtime_info(lstm_sequence, {concat, lstm_sequence_ie, in_1, in_2, in_3, in_4, unsqueeze_1, + unsqueeze_2, unsqueeze_3}); + unsqueeze_1->set_friendly_name(lstm_sequence->get_friendly_name()+".0"); + unsqueeze_2->set_friendly_name(lstm_sequence->get_friendly_name()+".1"); + unsqueeze_3->set_friendly_name(lstm_sequence->get_friendly_name()+".2"); + ngraph::replace_node(lstm_sequence, {unsqueeze_1->output(0), unsqueeze_2->output(0), unsqueeze_3->output(0)}); + return true; + }; + + auto m = std::make_shared(lstm_sequence_ngraph, "ConvertLSTMSequenceToLSTMSequenceIE"); + this->register_matcher(m, callback); +} + +ngraph::pass::ConvertGRUSequenceMatcher::ConvertGRUSequenceMatcher() { + auto gru_sequence_ngraph = ngraph::pattern::wrap_type(); + + ngraph::matcher_pass_callback callback = [](pattern::Matcher &m) { + auto gru_sequence = std::dynamic_pointer_cast(m.get_match_root()); + if (!gru_sequence) { + return false; + } + + auto W = std::dynamic_pointer_cast( + 
gru_sequence->input_value(3).get_node_shared_ptr()); + if (!W) { + return false; + } + + auto R = std::dynamic_pointer_cast( + gru_sequence->input_value(4).get_node_shared_ptr()); + if (!R) { + return false; + } + + // todo: add exception? + if (gru_sequence->get_direction() == ngraph::op::RecurrentSequenceDirection::BIDIRECTIONAL) + return false; + + // for forward/reverse cases we can squeeze num_direction dimension + auto axis_1 = ngraph::opset4::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {1}); + auto in_1 = std::make_shared(gru_sequence->input(1).get_source_output(), axis_1); + auto concat = std::make_shared(ngraph::NodeVector({W, R}), 2); + auto axis_2 = ngraph::opset4::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {0}); + auto in_3 = std::make_shared(concat->output(0), axis_2); + auto in_4 = std::make_shared(gru_sequence->input(5).get_source_output(), axis_2); + + auto gru_sequence_ie = std::make_shared( + gru_sequence->input(0).get_source_output(), // X + in_1, // initial_hidden_state + gru_sequence->input(2).get_source_output(), + in_3, // WR + in_4, // B + gru_sequence->get_hidden_size(), + gru_sequence->get_direction(), + gru_sequence->get_activations(), + gru_sequence->get_activations_alpha(), + gru_sequence->get_activations_beta(), + gru_sequence->get_clip(), + gru_sequence->get_linear_before_reset()); + + auto unsqueeze_axis = ngraph::opset4::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {1}); + auto unsqueeze_1 = std::make_shared(gru_sequence_ie->output(0), unsqueeze_axis); + auto unsqueeze_2 = std::make_shared(gru_sequence_ie->output(1), unsqueeze_axis); + + ngraph::copy_runtime_info(gru_sequence, {concat, gru_sequence_ie, unsqueeze_1, unsqueeze_2, in_1, in_3, in_4}); + unsqueeze_1->set_friendly_name(gru_sequence->get_friendly_name()+".0"); + unsqueeze_2->set_friendly_name(gru_sequence->get_friendly_name()+".1"); + ngraph::replace_node(gru_sequence, {unsqueeze_1, unsqueeze_2}); + return true; + }; + + auto m = 
std::make_shared(gru_sequence_ngraph, "ConvertGRUSequenceToGRUSequenceIE"); + this->register_matcher(m, callback); +} + +ngraph::pass::ConvertRNNSequenceMatcher::ConvertRNNSequenceMatcher() { + auto rnn_sequence_ngraph = ngraph::pattern::wrap_type(); + + ngraph::matcher_pass_callback callback = [](pattern::Matcher &m) { + auto rnn_sequence = std::dynamic_pointer_cast(m.get_match_root()); + if (!rnn_sequence) { + return false; + } + + auto W = std::dynamic_pointer_cast( + rnn_sequence->input_value(3).get_node_shared_ptr()); + if (!W) { + return false; + } + + auto R = std::dynamic_pointer_cast( + rnn_sequence->input_value(4).get_node_shared_ptr()); + if (!R) { + return false; + } + + // for forward/reverse cases we can squeeze num_direction dimension + auto axis_1 = ngraph::opset4::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {1}); + auto in_1 = std::make_shared(rnn_sequence->input(1).get_source_output(), axis_1); + auto concat = std::make_shared(ngraph::NodeVector({W, R}), 2); + auto axis_2 = ngraph::opset4::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {0}); + auto in_3 = std::make_shared(concat->output(0), axis_2); + auto in_4 = std::make_shared(rnn_sequence->input(5).get_source_output(), axis_2); + auto rnn_sequence_ie = std::make_shared( + rnn_sequence->input(0).get_source_output(), // X + in_1, // initial_hidden_state + rnn_sequence->input_value(2), + in_3, // WR + in_4, // B + rnn_sequence->get_hidden_size(), + rnn_sequence->get_direction(), + rnn_sequence->get_activations(), + rnn_sequence->get_activations_alpha(), + rnn_sequence->get_activations_beta(), + rnn_sequence->get_clip()); + + auto unsqueeze_axis = ngraph::opset4::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {1}); + auto unsqueeze_1 = std::make_shared(rnn_sequence_ie->output(0), unsqueeze_axis); + auto unsqueeze_2 = std::make_shared(rnn_sequence_ie->output(1), unsqueeze_axis); + + ngraph::copy_runtime_info(rnn_sequence, {concat, rnn_sequence_ie, in_1, in_3, in_4, 
unsqueeze_1, + unsqueeze_2}); + unsqueeze_1->set_friendly_name(rnn_sequence->get_friendly_name()+".0"); + unsqueeze_2->set_friendly_name(rnn_sequence->get_friendly_name()+".1"); + ngraph::replace_node(rnn_sequence, {unsqueeze_1->output(0), unsqueeze_2->output(0)}); + return true; + }; + + auto m = std::make_shared(rnn_sequence_ngraph, "ConvertRNNSequenceToRNNSequenceIE"); + this->register_matcher(m, callback); +} \ No newline at end of file diff --git a/inference-engine/tests/functional/inference_engine/cnn_network/convert_ngraph_to_cnn_network_tests.cpp b/inference-engine/tests/functional/inference_engine/cnn_network/convert_ngraph_to_cnn_network_tests.cpp index 3e7b55ed73e4ea..6daf3297e17c7e 100644 --- a/inference-engine/tests/functional/inference_engine/cnn_network/convert_ngraph_to_cnn_network_tests.cpp +++ b/inference-engine/tests/functional/inference_engine/cnn_network/convert_ngraph_to_cnn_network_tests.cpp @@ -85,15 +85,15 @@ TEST(ConvertFunctionToCNNNetworkTests, OpsShouldBeConvertedToIERepresentation) { std::make_shared(), std::make_shared(), std::make_shared(), - // std::make_shared(), todo: enable after GRUSequence support + // std::make_shared(), todo: enable after GRUSequence support std::make_shared(), std::make_shared(), std::make_shared(), - // std::make_shared(), todo: enable after LSTMSequence support + // std::make_shared(), todo: enable after LSTMSequence support std::make_shared(), std::make_shared(), std::make_shared(), - // std::make_shared(), todo: enable after RNNSequence support + // std::make_shared(), todo: enable after RNNSequence support std::make_shared(), std::make_shared(), std::make_shared(), diff --git a/inference-engine/tests/functional/inference_engine/transformations/convert_sequences_to_sequences_ie_test.cpp b/inference-engine/tests/functional/inference_engine/transformations/convert_sequences_to_sequences_ie_test.cpp new file mode 100644 index 00000000000000..a8555d5ecff87f --- /dev/null +++ 
b/inference-engine/tests/functional/inference_engine/transformations/convert_sequences_to_sequences_ie_test.cpp @@ -0,0 +1,271 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "common_test_utils/test_common.hpp" +#include "common_test_utils/ngraph_test_utils.hpp" + +using namespace testing; + +TEST(TransformationTests, GRUSequenceConversionTest) { + std::shared_ptr f(nullptr), f_ref(nullptr); + std::shared_ptr sequence; + + const size_t batch_size = 2; + const size_t input_size = 3; + const size_t hidden_size = 3; + const size_t gates_count = 3; + const size_t num_directions = 1; + { + const auto X = std::make_shared(ngraph::element::f32, + ngraph::Shape{batch_size, 1, input_size}); + const auto W = + std::make_shared(ngraph::element::f32, + ngraph::Shape{num_directions, gates_count * hidden_size, input_size}); + const auto R = + std::make_shared(ngraph::element::f32, + ngraph::Shape{num_directions, gates_count * hidden_size, hidden_size}); + const auto H_t = std::make_shared(ngraph::element::f32, + ngraph::Shape{batch_size, num_directions, hidden_size}); + const auto B = std::make_shared(ngraph::element::f32, + ngraph::Shape{num_directions, gates_count * hidden_size}); + + const auto seq_len = std::make_shared(ngraph::element::i32, ngraph::Shape{batch_size}); + sequence = std::make_shared(X, H_t, seq_len, W, R, B, hidden_size, + ngraph::op::RecurrentSequenceDirection::FORWARD); + sequence->set_friendly_name("test_sequence"); + + f = std::make_shared(ngraph::NodeVector{sequence}, ngraph::ParameterVector{X, H_t}); + ngraph::pass::Manager manager; + manager.register_pass(); + manager.register_pass(); + manager.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + } + + { + const auto X = std::make_shared(ngraph::element::f32, + ngraph::Shape{batch_size, 1, input_size}); + const auto W = + 
std::make_shared(ngraph::element::f32, + ngraph::Shape{num_directions, gates_count * hidden_size, input_size}); + const auto R = + std::make_shared(ngraph::element::f32, + ngraph::Shape{num_directions, gates_count * hidden_size, hidden_size}); + const auto H_t = std::make_shared(ngraph::element::f32, + ngraph::Shape{batch_size, num_directions, hidden_size}); + const auto B = std::make_shared(ngraph::element::f32, + ngraph::Shape{num_directions, gates_count * hidden_size}); + + const auto seq_len = std::make_shared(ngraph::element::i32, ngraph::Shape{batch_size}, 1); + auto axis_1 = ngraph::opset4::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {1}); + auto in_1 = std::make_shared(H_t, axis_1); + auto concat = std::make_shared(ngraph::NodeVector({W, R}), 2); + auto axis_2 = ngraph::opset4::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {0}); + auto in_3 = std::make_shared(concat->output(0), axis_2); + auto in_4 = std::make_shared(B, axis_2); + auto sequence_ie = std::make_shared(X, + in_1, + seq_len, // this input is not supported + in_3, + in_4, + sequence->get_hidden_size(), + sequence->get_direction(), + sequence->get_activations(), + sequence->get_activations_alpha(), + sequence->get_activations_beta(), + sequence->get_clip(), + sequence->get_linear_before_reset()); + sequence_ie->set_friendly_name("test_sequence"); + + auto unsqueeze_axis = ngraph::opset4::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {1}); + auto unsqueeze_1 = std::make_shared(sequence_ie->output(0), unsqueeze_axis); + auto unsqueeze_2 = std::make_shared(sequence_ie->output(1), unsqueeze_axis); + f_ref = std::make_shared(ngraph::NodeVector{unsqueeze_1}, ngraph::ParameterVector{X, H_t}); + } + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; + + auto result_node_of_converted_f = f->get_output_op(0); + auto sequence_node = result_node_of_converted_f->input_value(0).get_node_shared_ptr() + ->input_value(0).get_node_shared_ptr(); +} + 
+TEST(TransformationTests, RNNSequenceConversionTest) { + const size_t hidden_size = 3; + const size_t num_directions = 1; + const size_t batch_size = 2; + std::shared_ptr f(nullptr), f_ref(nullptr); + std::shared_ptr sequence; + + { + auto X = std::make_shared(ngraph::element::f32, ngraph::Shape{batch_size, 1, 3}); + auto H = std::make_shared(ngraph::element::f32, ngraph::Shape{batch_size, num_directions, 3}); + auto W = std::make_shared(ngraph::element::f32, ngraph::Shape{num_directions, 3, 3}); + auto R = std::make_shared(ngraph::element::f32, ngraph::Shape{num_directions, 3, 3}); + auto B = std::make_shared(ngraph::element::f32, ngraph::Shape{num_directions, 3}); + auto seq_len = std::make_shared(ngraph::element::f32, ngraph::Shape{2}); + sequence = std::make_shared(X, H, seq_len, W, R, B, hidden_size, + ngraph::op::RecurrentSequenceDirection::FORWARD); + sequence->set_friendly_name("test_sequence"); + + f = std::make_shared(ngraph::NodeVector{sequence}, ngraph::ParameterVector{X, H}); + ngraph::pass::Manager manager; + manager.register_pass(); + manager.register_pass(); + manager.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + } + + { + auto X = std::make_shared(ngraph::element::f32, ngraph::Shape{batch_size, 1, 3}); + auto H = std::make_shared(ngraph::element::f32, ngraph::Shape{batch_size, num_directions, 3}); + auto W = std::make_shared(ngraph::element::f32, ngraph::Shape{num_directions, 3, 3}); + auto R = std::make_shared(ngraph::element::f32, ngraph::Shape{num_directions, 3, 3}); + auto B = std::make_shared(ngraph::element::f32, ngraph::Shape{num_directions, 3}); + auto seq_len = std::make_shared(ngraph::element::f32, ngraph::Shape{batch_size}, 1); + auto axis_1 = ngraph::opset4::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {1}); + auto in_1 = std::make_shared(H, axis_1); + auto concat = std::make_shared(ngraph::NodeVector({W, R}), 2); + auto axis_2 = ngraph::opset4::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {0}); + auto 
in_3 = std::make_shared(concat->output(0), axis_2); + auto in_4 = std::make_shared(B, axis_2); + auto sequence_ie = std::make_shared(X, + in_1, + seq_len, + in_3, + in_4, + sequence->get_hidden_size(), + sequence->get_direction(), + sequence->get_activations(), + sequence->get_activations_alpha(), + sequence->get_activations_beta(), + sequence->get_clip()); + + auto unsqueeze_axis = ngraph::opset4::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {1}); + auto unsqueeze_1 = std::make_shared(sequence_ie->output(0), unsqueeze_axis); + auto unsqueeze_2 = std::make_shared(sequence_ie->output(1), unsqueeze_axis); + sequence_ie->set_friendly_name("test_sequence"); + f_ref = std::make_shared(ngraph::NodeVector{unsqueeze_1}, ngraph::ParameterVector{X, H}); + } + + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; + + auto result_node_of_converted_f = f->get_output_op(0); + auto sequence_node = result_node_of_converted_f->input_value(0).get_node_shared_ptr() + ->input_value(0).get_node_shared_ptr(); +} + +TEST(TransformationTests, LSTMSequenceConversionTest) { + const size_t batch_size = 2; + const size_t input_size = 3; + const size_t hidden_size = 3; + const size_t gates_count = 4; + const size_t num_directions = 1; + std::shared_ptr f(nullptr), f_ref(nullptr); + std::shared_ptr sequence; + { + const auto X = std::make_shared(ngraph::element::f32, + ngraph::Shape{batch_size, 10, input_size}); + const auto W = + std::make_shared(ngraph::element::f32, + ngraph::Shape{num_directions, + gates_count * hidden_size, input_size}); + const auto R = + std::make_shared(ngraph::element::f32, + ngraph::Shape{num_directions, + gates_count * hidden_size, hidden_size}); + const auto H_t = std::make_shared(ngraph::element::f32, + ngraph::Shape{batch_size, num_directions, + hidden_size}); + const auto C_t = std::make_shared(ngraph::element::f32, + ngraph::Shape{batch_size, num_directions, hidden_size}); + const auto B = 
std::make_shared(ngraph::element::f32, + ngraph::Shape{num_directions, + gates_count * hidden_size}); + const auto seq_len = std::make_shared(ngraph::element::i32, ngraph::Shape{batch_size}); + sequence = std::make_shared(X, H_t, C_t, seq_len, W, R, B, hidden_size, + ngraph::op::RecurrentSequenceDirection::FORWARD); + sequence->set_friendly_name("test_sequence"); + + f = std::make_shared(ngraph::OutputVector{sequence->output(0)}, ngraph::ParameterVector{X, H_t, C_t}); + ngraph::pass::Manager manager; + manager.register_pass(); + manager.register_pass(); + manager.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + } + + { + const auto X = std::make_shared(ngraph::element::f32, + ngraph::Shape{batch_size, 10, input_size}); + const auto W = + std::make_shared(ngraph::element::f32, + ngraph::Shape{num_directions, + gates_count * hidden_size, input_size}); + const auto R = + std::make_shared(ngraph::element::f32, + ngraph::Shape{num_directions, + gates_count * hidden_size, hidden_size}); + const auto H_t = std::make_shared(ngraph::element::f32, + ngraph::Shape{batch_size, num_directions, hidden_size}); + const auto C_t = std::make_shared(ngraph::element::f32, + ngraph::Shape{batch_size, num_directions, hidden_size}); + const auto seq_lenghts = std::make_shared(ngraph::element::f32, + ngraph::Shape{batch_size}); + const auto B = std::make_shared(ngraph::element::f32, + ngraph::Shape{num_directions, + gates_count * hidden_size}); + // const auto seq_len = std::make_shared(ngraph::element::i32, ngraph::Shape{1}, 1); + auto axis_1 = ngraph::opset4::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {1}); + auto in_1 = std::make_shared(H_t, axis_1); + auto in_2 = std::make_shared(C_t, axis_1); + auto concat = std::make_shared(ngraph::NodeVector({W, R}), 2); + auto axis_2 = ngraph::opset4::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {0}); + auto in_3 = std::make_shared(concat->output(0), axis_2); + auto in_4 = std::make_shared(B, axis_2); + auto 
sequence_ie = std::make_shared(X, + in_1, + in_2, + seq_lenghts, + in_3, + in_4, + sequence->get_hidden_size(), + sequence->get_direction(), + sequence->get_activations(), + sequence->get_activations_alpha(), + sequence->get_activations_beta(), + sequence->get_clip()); + sequence_ie->set_friendly_name("test_sequence"); + auto unsqueeze_axis = ngraph::opset4::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {1}); + auto unsqueeze_1 = std::make_shared(sequence_ie->output(0), unsqueeze_axis); + auto unsqueeze_2 = std::make_shared(sequence_ie->output(1), unsqueeze_axis); + auto unsqueeze_3 = std::make_shared(sequence_ie->output(2), unsqueeze_axis); + f_ref = std::make_shared(ngraph::NodeVector{unsqueeze_1}, + ngraph::ParameterVector{X, H_t, C_t}); + } + + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; + + auto result_node_of_converted_f = f->get_output_op(0); + auto sequence_node = result_node_of_converted_f->input_value(0).get_node_shared_ptr() + ->input_value(0).get_node_shared_ptr(); +} \ No newline at end of file diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/gru_sequence.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/gru_sequence.cpp new file mode 100644 index 00000000000000..060e36e88d2a77 --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/gru_sequence.cpp @@ -0,0 +1,59 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "single_layer_tests/gru_sequence.hpp" +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; + +namespace { + // without clip values increase rapidly, so use only seq_lenghts = 2 + std::vector seq_lengths_zero_clip{2}; + std::vector seq_lengths_clip_non_zero{20}; + std::vector batch{1, 10}; + std::vector hidden_size{1, 10}; + std::vector 
input_size{10}; + std::vector> activations = {{"relu", "tanh"}, {"tanh", "sigmoid"}, {"sigmoid", "tanh"}, + {"tanh", "relu"}}; + std::vector linear_before_reset = {true, false}; + std::vector clip{0.f}; + std::vector clip_non_zeros{0.7f}; + std::vector direction = {ngraph::op::RecurrentSequenceDirection::FORWARD, + ngraph::op::RecurrentSequenceDirection::REVERSE, + ngraph::op::RecurrentSequenceDirection::BIDIRECTIONAL + }; + std::vector netPrecisions = {InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16}; + + INSTANTIATE_TEST_CASE_P(GRUSequenceCommonZeroClip, GRUSequenceTest, + ::testing::Combine( + ::testing::ValuesIn(seq_lengths_zero_clip), + ::testing::ValuesIn(batch), + ::testing::ValuesIn(hidden_size), + ::testing::ValuesIn(input_size), + ::testing::ValuesIn(activations), + ::testing::ValuesIn(clip), + ::testing::ValuesIn(linear_before_reset), + ::testing::ValuesIn(direction), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + GRUSequenceTest::getTestCaseName); + + INSTANTIATE_TEST_CASE_P(GRUSequenceCommonClip, GRUSequenceTest, + ::testing::Combine( + ::testing::ValuesIn(seq_lengths_clip_non_zero), + ::testing::ValuesIn(batch), + ::testing::ValuesIn(hidden_size), + ::testing::ValuesIn(input_size), + ::testing::ValuesIn(activations), + ::testing::ValuesIn(clip_non_zeros), + ::testing::ValuesIn(linear_before_reset), + ::testing::ValuesIn(direction), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + GRUSequenceTest::getTestCaseName); + +} // namespace diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/lstm_sequence.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/lstm_sequence.cpp new file mode 100644 index 00000000000000..8611f0388cfc33 --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/lstm_sequence.cpp @@ -0,0 +1,57 
@@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "single_layer_tests/lstm_sequence.hpp" +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; + +namespace { + // without clip values increase rapidly, so use only seq_lenghts = 2 + std::vector seq_lengths_zero_clip{2}; + std::vector seq_lengths_clip_non_zero{20}; + std::vector batch{1, 10}; + std::vector hidden_size{1, 10}; + std::vector input_size{10}; + std::vector> activations = {{"relu", "sigmoid", "tanh"}, {"sigmoid", "tanh", "tanh"}, + {"tanh", "relu", "sigmoid"}, {"sigmoid", "sigmoid", "sigmoid"}, + {"tanh", "tanh", "tanh"}, {"relu", "relu", "relu"}}; + std::vector clip{0.f}; + std::vector clip_non_zeros{0.7f}; + std::vector direction = {ngraph::op::RecurrentSequenceDirection::FORWARD, + ngraph::op::RecurrentSequenceDirection::REVERSE, + ngraph::op::RecurrentSequenceDirection::BIDIRECTIONAL + }; + std::vector netPrecisions = {InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16}; + + INSTANTIATE_TEST_CASE_P(LSTMSequenceCommonZeroClip, LSTMSequenceTest, + ::testing::Combine( + ::testing::ValuesIn(seq_lengths_zero_clip), + ::testing::ValuesIn(batch), + ::testing::ValuesIn(hidden_size), + ::testing::ValuesIn(input_size), + ::testing::ValuesIn(activations), + ::testing::ValuesIn(clip), + ::testing::ValuesIn(direction), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + LSTMSequenceTest::getTestCaseName); + + INSTANTIATE_TEST_CASE_P(LSTMSequenceCommonClip, LSTMSequenceTest, + ::testing::Combine( + ::testing::ValuesIn(seq_lengths_clip_non_zero), + ::testing::ValuesIn(batch), + ::testing::ValuesIn(hidden_size), + ::testing::ValuesIn(input_size), + ::testing::ValuesIn(activations), + ::testing::ValuesIn(clip_non_zeros), + ::testing::ValuesIn(direction), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + 
LSTMSequenceTest::getTestCaseName); + +} // namespace diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/rnn_sequence.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/rnn_sequence.cpp new file mode 100644 index 00000000000000..8415f53471b55b --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/rnn_sequence.cpp @@ -0,0 +1,55 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "single_layer_tests/rnn_sequence.hpp" +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; + +namespace { + // without clip values increase rapidly, so use only seq_lenghts = 2 + std::vector seq_lengths_zero_clip{2}; + std::vector seq_lengths_clip_non_zero{20}; + std::vector batch{1, 10}; + std::vector hidden_size{1, 10}; + std::vector input_size{10}; + std::vector> activations = {{"relu"}, {"sigmoid"}, {"tanh"}}; + std::vector clip{0.f}; + std::vector clip_non_zeros{0.7f}; + std::vector direction = {ngraph::op::RecurrentSequenceDirection::FORWARD, + ngraph::op::RecurrentSequenceDirection::REVERSE, + ngraph::op::RecurrentSequenceDirection::BIDIRECTIONAL + }; + std::vector netPrecisions = {InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16}; + + INSTANTIATE_TEST_CASE_P(RNNSequenceCommonZeroClip, RNNSequenceTest, + ::testing::Combine( + ::testing::ValuesIn(seq_lengths_zero_clip), + ::testing::ValuesIn(batch), + ::testing::ValuesIn(hidden_size), + ::testing::ValuesIn(input_size), + ::testing::ValuesIn(activations), + ::testing::ValuesIn(clip), + ::testing::ValuesIn(direction), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + RNNSequenceTest::getTestCaseName); + + INSTANTIATE_TEST_CASE_P(RNNSequenceCommonClip, RNNSequenceTest, + ::testing::Combine( + 
::testing::ValuesIn(seq_lengths_clip_non_zero), + ::testing::ValuesIn(batch), + ::testing::ValuesIn(hidden_size), + ::testing::ValuesIn(input_size), + ::testing::ValuesIn(activations), + ::testing::ValuesIn(clip_non_zeros), + ::testing::ValuesIn(direction), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + RNNSequenceTest::getTestCaseName); + +} // namespace diff --git a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/gru_sequence.hpp b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/gru_sequence.hpp new file mode 100644 index 00000000000000..290e4e3092ea6c --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/gru_sequence.hpp @@ -0,0 +1,40 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include "functional_test_utils/layer_test_utils.hpp" +#include "ngraph_functions/builders.hpp" +#include "ngraph_functions/utils/ngraph_helpers.hpp" + +namespace LayerTestsDefinitions { + +using GRUSequenceParams = typename std::tuple< + // bool, // using decompose to sub-ops transformation + size_t, // seq_lengths + size_t, // batch + size_t, // hidden size + size_t, // input size + std::vector, // activations + float, // clip + bool, // linear_before_reset + ngraph::op::RecurrentSequenceDirection, // direction + InferenceEngine::Precision, // Network precision + std::string>; // Device name + +class GRUSequenceTest : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon { +public: + static std::string getTestCaseName(const testing::TestParamInfo &obj); + +protected: + void SetUp() override; +}; + +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/lstm_sequence.hpp 
b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/lstm_sequence.hpp new file mode 100644 index 00000000000000..3aee251b3d5d9b --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/lstm_sequence.hpp @@ -0,0 +1,39 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include "functional_test_utils/layer_test_utils.hpp" +#include "ngraph_functions/builders.hpp" +#include "ngraph_functions/utils/ngraph_helpers.hpp" + +namespace LayerTestsDefinitions { + +using LSTMSequenceParams = typename std::tuple< + // bool, // using decompose to sub-ops transformation + size_t, // seq_lengths + size_t, // batch + size_t, // hidden size + size_t, // input size + std::vector, // activations + float, // clip + ngraph::op::RecurrentSequenceDirection, // direction + InferenceEngine::Precision, // Network precision + std::string>; // Device name + +class LSTMSequenceTest : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon { +public: + static std::string getTestCaseName(const testing::TestParamInfo &obj); + +protected: + void SetUp() override; +}; + +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/rnn_sequence.hpp b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/rnn_sequence.hpp new file mode 100644 index 00000000000000..46c3d5703af365 --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/rnn_sequence.hpp @@ -0,0 +1,39 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include "functional_test_utils/layer_test_utils.hpp" +#include "ngraph_functions/builders.hpp" +#include "ngraph_functions/utils/ngraph_helpers.hpp" + +namespace 
LayerTestsDefinitions { + +using RNNSequenceParams = typename std::tuple< + // bool, // using decompose to sub-ops transformation + size_t, // seq_lengths + size_t, // batch + size_t, // hidden size + size_t, // input size + std::vector, // activations + float, // clip + ngraph::op::RecurrentSequenceDirection, // direction + InferenceEngine::Precision, // Network precision + std::string>; // Device name + +class RNNSequenceTest : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon { +public: + static std::string getTestCaseName(const testing::TestParamInfo &obj); + +protected: + void SetUp() override; +}; + +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/gru_cell.cpp b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/gru_cell.cpp index 0750819b1fa676..fa004c1b3cc550 100644 --- a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/gru_cell.cpp +++ b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/gru_cell.cpp @@ -72,7 +72,8 @@ void GRUCellTest::SetUp() { auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); auto params = ngraph::builder::makeParams(ngPrc, {inputShapes[0], inputShapes[1]}); std::vector WRB = {inputShapes[2], inputShapes[3], inputShapes[4]}; - auto gru_cell = ngraph::builder::makeGRUCell(ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)), + auto gru_cell = ngraph::builder::makeGRU( + ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)), WRB, hidden_size, activations, {}, {}, clip, linear_before_reset); ngraph::ResultVector results{std::make_shared(gru_cell->output(0))}; function = std::make_shared(results, params, "gru_cell"); diff --git a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/gru_sequence.cpp 
b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/gru_sequence.cpp new file mode 100644 index 00000000000000..1327831a457260 --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/gru_sequence.cpp @@ -0,0 +1,98 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include + +#include "ie_core.hpp" + +#include "common_test_utils/common_utils.hpp" +#include "functional_test_utils/blob_utils.hpp" +#include "functional_test_utils/precision_utils.hpp" +#include "functional_test_utils/plugin_cache.hpp" +#include "functional_test_utils/skip_tests_config.hpp" + +#include "single_layer_tests/gru_sequence.hpp" +#include + +namespace LayerTestsDefinitions { + + std::string GRUSequenceTest::getTestCaseName(const testing::TestParamInfo &obj) { + //bool should_decompose; + size_t seq_lenghts; + size_t batch; + size_t hidden_size; + size_t input_size; + std::vector activations; + std::vector activations_alpha; + std::vector activations_beta; + float clip; + bool linear_before_reset; + ngraph::op::RecurrentSequenceDirection direction; + InferenceEngine::Precision netPrecision; + std::string targetDevice; + std::tie(seq_lenghts, batch, hidden_size, input_size, activations, clip, linear_before_reset, direction, netPrecision, + targetDevice) = obj.param; + std::vector> inputShapes = { + {{batch, input_size}, {batch, hidden_size}, {batch, hidden_size}, {3 * hidden_size, input_size}, + {3 * hidden_size, hidden_size}, {(linear_before_reset ? 
4 : 3) * hidden_size}}, + }; + std::ostringstream result; + result << "seq_lenghts" << seq_lenghts << "_"; + result << "batch=" << batch << "_"; + result << "hidden_size=" << hidden_size << "_"; + result << "input_size=" << input_size << "_"; + result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_"; + result << "activations=" << CommonTestUtils::vec2str(activations) << "_"; + result << "direction=" << direction << "_"; + result << "clip=" << clip << "_"; + result << "netPRC=" << netPrecision.name() << "_"; + result << "targetDevice=" << targetDevice << "_"; + return result.str(); + } + + void GRUSequenceTest::SetUp() { + size_t seq_lenghts; + // bool should_decompose; + size_t batch; + size_t hidden_size; + size_t input_size; + std::vector activations; + std::vector activations_alpha; + std::vector activations_beta; + float clip; + bool linear_before_reset; + ngraph::op::RecurrentSequenceDirection direction; + InferenceEngine::Precision netPrecision; + std::tie(seq_lenghts, batch, hidden_size, input_size, activations, clip, linear_before_reset, direction, netPrecision, + targetDevice) = this->GetParam(); + size_t num_directions = direction == ngraph::op::RecurrentSequenceDirection::BIDIRECTIONAL ? 2 : 1; + std::vector> inputShapes = { + {{batch, seq_lenghts, input_size}, {batch, num_directions, hidden_size}, {batch}, + {num_directions, 3 * hidden_size, input_size}, {num_directions, 3 * hidden_size, hidden_size}, + {num_directions, (linear_before_reset ? 
4 : 3) * hidden_size}}, + }; + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + auto params = ngraph::builder::makeParams(ngPrc, {inputShapes[0], inputShapes[1]}); + std::vector WRB = {inputShapes[3], inputShapes[4], inputShapes[5], inputShapes[2]}; + auto gru_sequence = ngraph::builder::makeGRU(ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)), + WRB, hidden_size, activations, {}, {}, clip, linear_before_reset, true, direction); + ngraph::ResultVector results{std::make_shared(gru_sequence->output(0)), + std::make_shared(gru_sequence->output(1))}; + function = std::make_shared(results, params, "gru_sequence"); + if (direction == ngraph::op::RecurrentSequenceDirection::BIDIRECTIONAL) { + ngraph::pass::Manager m; + m.register_pass(); + m.run_passes(function); + } + } + + + TEST_P(GRUSequenceTest, CompareWithRefs) { + Run(); + }; +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/lstm_cell.cpp b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/lstm_cell.cpp index 2c8c9c71b4a439..733c740784377d 100644 --- a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/lstm_cell.cpp +++ b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/lstm_cell.cpp @@ -70,7 +70,7 @@ void LSTMCellTest::SetUp() { auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); auto params = ngraph::builder::makeParams(ngPrc, {inputShapes[0], inputShapes[1], inputShapes[2]}); std::vector WRB = {inputShapes[3], inputShapes[4], inputShapes[5]}; - auto lstm_cell = ngraph::builder::makeLSTMCell(ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)), + auto lstm_cell = ngraph::builder::makeLSTM(ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)), WRB, hidden_size, activations, {}, {}, clip); ngraph::ResultVector 
results{std::make_shared(lstm_cell->output(0)), std::make_shared(lstm_cell->output(1))}; diff --git a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/lstm_sequence.cpp b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/lstm_sequence.cpp new file mode 100644 index 00000000000000..b1edaa9b0a4b99 --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/lstm_sequence.cpp @@ -0,0 +1,96 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include + +#include "ie_core.hpp" + +#include "common_test_utils/common_utils.hpp" +#include "functional_test_utils/blob_utils.hpp" +#include "functional_test_utils/precision_utils.hpp" +#include "functional_test_utils/plugin_cache.hpp" +#include "functional_test_utils/skip_tests_config.hpp" + +#include "single_layer_tests/lstm_sequence.hpp" +#include + +namespace LayerTestsDefinitions { + + std::string LSTMSequenceTest::getTestCaseName(const testing::TestParamInfo &obj) { + //bool should_decompose; + size_t seq_lenghts; + size_t batch; + size_t hidden_size; + size_t input_size; + std::vector activations; + std::vector activations_alpha; + std::vector activations_beta; + float clip; + ngraph::op::RecurrentSequenceDirection direction; + InferenceEngine::Precision netPrecision; + std::string targetDevice; + std::tie(seq_lenghts, batch, hidden_size, input_size, activations, clip, direction, netPrecision, + targetDevice) = obj.param; + std::vector> inputShapes = { + {{batch, input_size}, {batch, hidden_size}, {batch, hidden_size}, {4 * hidden_size, input_size}, + {4 * hidden_size, hidden_size}, {4 * hidden_size}}, + }; + std::ostringstream result; + result << "seq_lenghts" << seq_lenghts << "_"; + result << "batch=" << batch << "_"; + result << "hidden_size=" << hidden_size << "_"; + result << "input_size=" << input_size << "_"; + result << "IS=" << 
CommonTestUtils::vec2str(inputShapes) << "_"; + result << "activations=" << CommonTestUtils::vec2str(activations) << "_"; + result << "direction=" << direction << "_"; + result << "clip=" << clip << "_"; + result << "netPRC=" << netPrecision.name() << "_"; + result << "targetDevice=" << targetDevice << "_"; + return result.str(); + } + + void LSTMSequenceTest::SetUp() { + size_t seq_lenghts; + // bool should_decompose; + size_t batch; + size_t hidden_size; + size_t input_size; + std::vector activations; + std::vector activations_alpha; + std::vector activations_beta; + float clip; + ngraph::op::RecurrentSequenceDirection direction; + InferenceEngine::Precision netPrecision; + std::tie(seq_lenghts, batch, hidden_size, input_size, activations, clip, direction, netPrecision, + targetDevice) = this->GetParam(); + size_t num_directions = direction == ngraph::op::RecurrentSequenceDirection::BIDIRECTIONAL ? 2 : 1; + std::vector> inputShapes = { + {{batch, seq_lenghts, input_size}, {batch, num_directions, hidden_size}, {batch, num_directions, hidden_size}, + {batch}, {num_directions, 4 * hidden_size, input_size}, {num_directions, 4 * hidden_size, hidden_size}, {num_directions, 4 * hidden_size}}, + }; + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + auto params = ngraph::builder::makeParams(ngPrc, {inputShapes[0], inputShapes[1], inputShapes[2]}); + std::vector WRB = {inputShapes[4], inputShapes[5], inputShapes[6], inputShapes[3]}; + auto lstm_sequence = ngraph::builder::makeLSTM(ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)), + WRB, hidden_size, activations, {}, {}, clip, true, direction); + ngraph::ResultVector results{std::make_shared(lstm_sequence->output(0)), + std::make_shared(lstm_sequence->output(1)), + std::make_shared(lstm_sequence->output(2))}; + function = std::make_shared(results, params, "lstm_sequence"); + if (direction == ngraph::op::RecurrentSequenceDirection::BIDIRECTIONAL) { + 
ngraph::pass::Manager m; + m.register_pass(); + m.run_passes(function); + } + } + + + TEST_P(LSTMSequenceTest, CompareWithRefs) { + Run(); + }; +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/rnn_cell.cpp b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/rnn_cell.cpp index 97c1c08b63842e..ec8616019b2005 100644 --- a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/rnn_cell.cpp +++ b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/rnn_cell.cpp @@ -64,7 +64,8 @@ void RNNCellTest::SetUp() { auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); auto params = ngraph::builder::makeParams(ngPrc, {inputShapes[0], inputShapes[1]}); std::vector WRB = {inputShapes[2], inputShapes[3], inputShapes[4]}; - auto rnn_cell = ngraph::builder::makeRNNCell(ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)), + auto rnn_cell = ngraph::builder::makeRNN( + ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)), WRB, hidden_size, activations, {}, {}, clip); ngraph::ResultVector results{std::make_shared(rnn_cell)}; function = std::make_shared(results, params, "rnn_cell"); diff --git a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/rnn_sequence.cpp b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/rnn_sequence.cpp new file mode 100644 index 00000000000000..63f9e85002686c --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/rnn_sequence.cpp @@ -0,0 +1,96 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include + +#include "ie_core.hpp" + +#include "common_test_utils/common_utils.hpp" +#include "functional_test_utils/blob_utils.hpp" +#include "functional_test_utils/precision_utils.hpp" +#include 
"functional_test_utils/plugin_cache.hpp" +#include "functional_test_utils/skip_tests_config.hpp" + +#include "single_layer_tests/rnn_sequence.hpp" +#include + +namespace LayerTestsDefinitions { + + std::string RNNSequenceTest::getTestCaseName(const testing::TestParamInfo &obj) { + //bool should_decompose; + size_t seq_lenghts; + size_t batch; + size_t hidden_size; + size_t input_size; + std::vector activations; + std::vector activations_alpha; + std::vector activations_beta; + float clip; + ngraph::op::RecurrentSequenceDirection direction; + InferenceEngine::Precision netPrecision; + std::string targetDevice; + std::tie(seq_lenghts, batch, hidden_size, input_size, activations, clip, direction, netPrecision, + targetDevice) = obj.param; + std::vector> inputShapes = { + {{batch, input_size}, {batch, hidden_size}, {batch, hidden_size}, {hidden_size, input_size}, + {hidden_size, hidden_size}, {hidden_size}}, + }; + std::ostringstream result; + result << "seq_lenghts" << seq_lenghts << "_"; + result << "batch=" << batch << "_"; + result << "hidden_size=" << hidden_size << "_"; + result << "input_size=" << input_size << "_"; + result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_"; + result << "activations=" << CommonTestUtils::vec2str(activations) << "_"; + result << "direction=" << direction << "_"; + result << "clip=" << clip << "_"; + result << "netPRC=" << netPrecision.name() << "_"; + result << "targetDevice=" << targetDevice << "_"; + return result.str(); + } + + void RNNSequenceTest::SetUp() { + size_t seq_lenghts; + // bool should_decompose; + size_t batch; + size_t hidden_size; + size_t input_size; + std::vector activations; + std::vector activations_alpha; + std::vector activations_beta; + float clip; + ngraph::op::RecurrentSequenceDirection direction; + InferenceEngine::Precision netPrecision; + std::tie(seq_lenghts, batch, hidden_size, input_size, activations, clip, direction, netPrecision, + targetDevice) = this->GetParam(); + size_t num_directions 
= direction == ngraph::op::RecurrentSequenceDirection::BIDIRECTIONAL ? 2 : 1; + std::vector> inputShapes = { + {{batch, seq_lenghts, input_size}, {batch, num_directions, hidden_size}, {batch}, + {num_directions, hidden_size, input_size}, {num_directions, hidden_size, hidden_size}, + {num_directions, hidden_size}}, + }; + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + auto params = ngraph::builder::makeParams(ngPrc, {inputShapes[0], inputShapes[1]}); + std::vector WRB = {inputShapes[3], inputShapes[4], inputShapes[5], inputShapes[2]}; + auto rnn_sequence = ngraph::builder::makeRNN(ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)), + WRB, hidden_size, activations, {}, {}, clip, true, direction); + ngraph::ResultVector results{std::make_shared(rnn_sequence->output(0)), + std::make_shared(rnn_sequence->output(1))}; + function = std::make_shared(results, params, "rnn_sequence"); + if (direction == ngraph::op::RecurrentSequenceDirection::BIDIRECTIONAL) { + ngraph::pass::Manager m; + m.register_pass(); + m.run_passes(function); + } + } + + + TEST_P(RNNSequenceTest, CompareWithRefs) { + Run(); + }; +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/ngraph_functions/include/ngraph_functions/builders.hpp b/inference-engine/tests/ngraph_functions/include/ngraph_functions/builders.hpp index afc245c3bd5299..bcddb14d90512b 100644 --- a/inference-engine/tests/ngraph_functions/include/ngraph_functions/builders.hpp +++ b/inference-engine/tests/ngraph_functions/include/ngraph_functions/builders.hpp @@ -390,35 +390,40 @@ std::shared_ptr makePad(const ngraph::Output& data, std::shared_ptr makeBatchNormInference(const ngraph::Output& data, double epsilon); -std::shared_ptr makeLSTMCell(const OutputVector& in, - const std::vector& WRB, +std::shared_ptr makeLSTM(const OutputVector& in, + const std::vector& constants, std::size_t hidden_size, const std::vector& activations = std::vector{"sigmoid", "tanh", 
"tanh"}, const std::vector& activations_alpha = {}, const std::vector& activations_beta = {}, - float clip = 0.f); - -std::shared_ptr makeGRUCell(const OutputVector& in, - const std::vector& WRB, - std::size_t hidden_size, - const std::vector& activations = - std::vector{"sigmoid", "tanh"}, - const std::vector& activations_alpha = {}, - const std::vector& activations_beta = {}, - float clip = 0.f, - bool linear_before_reset = false); - -std::shared_ptr makeRNNCell(const OutputVector& in, - const std::vector& WRB, - std::size_t hidden_size, - const std::vector& activations = std::vector{"tanh"}, - const std::vector& activations_alpha = {}, - const std::vector& activations_beta = {}, - float clip = 0.f); + float clip = 0.f, + bool make_sequence = false, + ngraph::op::RecurrentSequenceDirection direction = ngraph::op::RecurrentSequenceDirection::FORWARD); + +std::shared_ptr makeGRU(const OutputVector& in, + const std::vector& constants, + std::size_t hidden_size, + const std::vector& activations = + std::vector{"sigmoid", "tanh"}, + const std::vector& activations_alpha = {}, + const std::vector& activations_beta = {}, + float clip = 0.f, + bool linear_before_reset = false, + bool make_sequence = false, + ngraph::op::RecurrentSequenceDirection direction = ngraph::op::RecurrentSequenceDirection::FORWARD); + +std::shared_ptr makeRNN(const OutputVector& in, + const std::vector& constants, + std::size_t hidden_size, + const std::vector& activations = std::vector{"tanh"}, + const std::vector& activations_alpha = {}, + const std::vector& activations_beta = {}, + float clip = 0.f, + bool make_sequence = false, + ngraph::op::RecurrentSequenceDirection direction = ngraph::op::RecurrentSequenceDirection::FORWARD); std::shared_ptr makeTile(const ngraph::Output& in, const std::vector& repeats); - } // namespace builder } // namespace ngraph diff --git a/inference-engine/tests/ngraph_functions/src/gru_cell.cpp b/inference-engine/tests/ngraph_functions/src/gru_cell.cpp index 
487959fa5a58f3..784daf288796ce 100644 --- a/inference-engine/tests/ngraph_functions/src/gru_cell.cpp +++ b/inference-engine/tests/ngraph_functions/src/gru_cell.cpp @@ -10,21 +10,30 @@ namespace ngraph { namespace builder { -std::shared_ptr makeGRUCell(const OutputVector& in, - const std::vector& WRB, - std::size_t hidden_size, - const std::vector& activations, - const std::vector& activations_alpha, - const std::vector& activations_beta, - float clip, - bool linear_before_reset) { +std::shared_ptr makeGRU(const OutputVector& in, + const std::vector& constants, + std::size_t hidden_size, + const std::vector& activations, + const std::vector& activations_alpha, + const std::vector& activations_beta, + float clip, + bool linear_before_reset, + bool make_sequence, + ngraph::op::RecurrentSequenceDirection direction) { std::vector empty; - auto W = ngraph::builder::makeConstant(in[0].get_element_type(), WRB[0], empty, true); - auto R = ngraph::builder::makeConstant(in[0].get_element_type(), WRB[1], empty, true); - auto B = ngraph::builder::makeConstant(in[0].get_element_type(), WRB[2], empty, true); - return std::make_shared(in[0], in[1], W, R, B, hidden_size, activations, - activations_alpha, activations_beta, clip, linear_before_reset); + auto W = ngraph::builder::makeConstant(in[0].get_element_type(), constants[0], empty, true); + auto R = ngraph::builder::makeConstant(in[0].get_element_type(), constants[1], empty, true); + auto B = ngraph::builder::makeConstant(in[0].get_element_type(), constants[2], empty, true); + if (!make_sequence) { + return std::make_shared(in[0], in[1], W, R, B, hidden_size, activations, + activations_alpha, activations_beta, clip, + linear_before_reset); + } else { + std::vector lenghts(in[0].get_shape()[0], in[0].get_shape()[1]); + auto seq_lenghts = ngraph::builder::makeConstant(in[0].get_element_type(), constants[3], lenghts, false); + return std::make_shared(in[0], in[1], seq_lenghts, W, R, B, hidden_size, direction, + activations, 
activations_alpha, activations_beta, clip, linear_before_reset); + } } - } // namespace builder } // namespace ngraph \ No newline at end of file diff --git a/inference-engine/tests/ngraph_functions/src/lstm_cell.cpp b/inference-engine/tests/ngraph_functions/src/lstm_cell.cpp index 38f39f718900cb..944c8b6592a8fa 100644 --- a/inference-engine/tests/ngraph_functions/src/lstm_cell.cpp +++ b/inference-engine/tests/ngraph_functions/src/lstm_cell.cpp @@ -10,20 +10,28 @@ namespace ngraph { namespace builder { -std::shared_ptr makeLSTMCell(const std::vector>& in, - const std::vector& WRB, - std::size_t hidden_size, - const std::vector& activations, - const std::vector& activations_alpha, - const std::vector& activations_beta, - float clip) { +std::shared_ptr makeLSTM(const std::vector>& in, + const std::vector& constants, + std::size_t hidden_size, + const std::vector& activations, + const std::vector& activations_alpha, + const std::vector& activations_beta, + float clip, + bool make_sequence, + ngraph::op::RecurrentSequenceDirection direction) { std::vector empty; - auto W = ngraph::builder::makeConstant(in[0].get_element_type(), WRB[0], empty, true); - auto R = ngraph::builder::makeConstant(in[0].get_element_type(), WRB[1], empty, true); - auto B = ngraph::builder::makeConstant(in[0].get_element_type(), WRB[2], empty, true); - return std::make_shared(in[0], in[1], in[2], W, R, B, hidden_size, activations, - activations_alpha, activations_beta, clip); + auto W = ngraph::builder::makeConstant(in[0].get_element_type(), constants[0], empty, true); + auto R = ngraph::builder::makeConstant(in[0].get_element_type(), constants[1], empty, true); + auto B = ngraph::builder::makeConstant(in[0].get_element_type(), constants[2], empty, true); + if (!make_sequence) { + return std::make_shared(in[0], in[1], in[2], W, R, B, hidden_size, activations, + activations_alpha, activations_beta, clip); + } else { + std::vector lenghts(in[0].get_shape()[0], in[0].get_shape()[1]); + auto 
seq_lenghts = ngraph::builder::makeConstant(in[0].get_element_type(), constants[3], lenghts, false); + return std::make_shared(in[0], in[1], in[2], seq_lenghts, W, R, B, hidden_size, direction, + activations_alpha, activations_beta, activations, clip); + } } - } // namespace builder } // namespace ngraph \ No newline at end of file diff --git a/inference-engine/tests/ngraph_functions/src/rnn_cell.cpp b/inference-engine/tests/ngraph_functions/src/rnn_cell.cpp index 824c4a8bc1ace7..0d87ec19d1e216 100644 --- a/inference-engine/tests/ngraph_functions/src/rnn_cell.cpp +++ b/inference-engine/tests/ngraph_functions/src/rnn_cell.cpp @@ -10,20 +10,28 @@ namespace ngraph { namespace builder { -std::shared_ptr makeRNNCell(const OutputVector& in, - const std::vector& WRB, - std::size_t hidden_size, - const std::vector& activations, - const std::vector& activations_alpha, - const std::vector& activations_beta, - float clip) { +std::shared_ptr makeRNN(const OutputVector& in, + const std::vector& constants, + std::size_t hidden_size, + const std::vector& activations, + const std::vector& activations_alpha, + const std::vector& activations_beta, + float clip, + bool make_sequence, + ngraph::op::RecurrentSequenceDirection direction) { std::vector empty; - auto W = ngraph::builder::makeConstant(in[0].get_element_type(), WRB[0], empty, true); - auto R = ngraph::builder::makeConstant(in[0].get_element_type(), WRB[1], empty, true); - auto B = ngraph::builder::makeConstant(in[0].get_element_type(), WRB[2], empty, true); - return std::make_shared(in[0], in[1], W, R, B, hidden_size, activations, - activations_alpha, activations_beta, clip); + auto W = ngraph::builder::makeConstant(in[0].get_element_type(), constants[0], empty, true); + auto R = ngraph::builder::makeConstant(in[0].get_element_type(), constants[1], empty, true); + auto B = ngraph::builder::makeConstant(in[0].get_element_type(), constants[2], empty, true); + if (!make_sequence) { + return std::make_shared(in[0], in[1], W, R, 
B, hidden_size, activations, + activations_alpha, activations_beta, clip); + } else { + std::vector lenghts(in[0].get_shape()[0], in[0].get_shape()[1]); + auto seq_lenghts = ngraph::builder::makeConstant(in[0].get_element_type(), constants[3], lenghts, false); + return std::make_shared(in[0], in[1], seq_lenghts, W, R, B, hidden_size, direction, + activations, activations_alpha, activations_beta, clip); + } } - } // namespace builder } // namespace ngraph \ No newline at end of file diff --git a/ngraph/core/include/ngraph/op/gru_sequence.hpp b/ngraph/core/include/ngraph/op/gru_sequence.hpp new file mode 100644 index 00000000000000..39ee0dd91ff6c6 --- /dev/null +++ b/ngraph/core/include/ngraph/op/gru_sequence.hpp @@ -0,0 +1,67 @@ +//***************************************************************************** +// Copyright 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** + +#pragma once + +#include +#include +#include + +#include "ngraph/op/op.hpp" +#include "ngraph/op/util/rnn_cell_base.hpp" + +namespace ngraph +{ + namespace op + { + namespace v5 + { + class NGRAPH_API GRUSequence : public util::RNNCellBase + { + public: + NGRAPH_RTTI_DECLARATION; + GRUSequence(); + + GRUSequence(const Output& X, + const Output& H_t, + const Output& sequence_lengths, + const Output& W, + const Output& R, + const Output& B, + size_t hidden_size, + op::RecurrentSequenceDirection direction, + const std::vector& activations = + std::vector{"sigmoid", "tanh"}, + const std::vector& activations_alpha = {}, + const std::vector& activations_beta = {}, + float clip = 0.f, + bool linear_before_reset = false); + + std::shared_ptr + clone_with_new_inputs(const OutputVector& new_args) const override; + + void validate_and_infer_types() override; + + bool visit_attributes(AttributeVisitor& visitor) override; + bool get_linear_before_reset() const { return m_linear_before_reset; } + op::RecurrentSequenceDirection get_direction() const { return m_direction; } + protected: + op::RecurrentSequenceDirection m_direction; + bool m_linear_before_reset; + }; + } + } // namespace op +} // namespace ngraph diff --git a/ngraph/core/include/ngraph/op/lstm_sequence.hpp b/ngraph/core/include/ngraph/op/lstm_sequence.hpp index 7fbe1f32ac97ff..81cf782ac40768 100644 --- a/ngraph/core/include/ngraph/op/lstm_sequence.hpp +++ b/ngraph/core/include/ngraph/op/lstm_sequence.hpp @@ -47,8 +47,7 @@ namespace ngraph class NGRAPH_API LSTMSequence : public util::FusedOp { public: - static constexpr NodeTypeInfo type_info{"LSTMSequence", 0}; - const NodeTypeInfo& get_type_info() const override { return type_info; } + NGRAPH_RTTI_DECLARATION; LSTMSequence() = default; using direction = RecurrentSequenceDirection; @@ -102,11 +101,11 @@ namespace ngraph const std::int64_t hidden_size, const direction lstm_direction, 
LSTMWeightsFormat weights_format = LSTMWeightsFormat::IFCO, - const std::vector activations_alpha = {}, - const std::vector activations_beta = {}, - const std::vector activations = {"sigmoid", - "tanh", - "tanh"}, + const std::vector& activations_alpha = {}, + const std::vector& activations_beta = {}, + const std::vector& activations = {"sigmoid", + "tanh", + "tanh"}, const float clip_threshold = 0, const bool input_forget = false) : LSTMSequence( @@ -186,7 +185,7 @@ namespace ngraph }; } - namespace v1 + namespace v5 { /// /// \brief Class for lstm sequence node. @@ -200,8 +199,7 @@ namespace ngraph class NGRAPH_API LSTMSequence : public util::RNNCellBase { public: - static constexpr NodeTypeInfo type_info{"LSTMSequence", 1}; - const NodeTypeInfo& get_type_info() const override { return type_info; } + NGRAPH_RTTI_DECLARATION; LSTMSequence() = default; using direction = RecurrentSequenceDirection; @@ -216,11 +214,11 @@ namespace ngraph const Output& B, const std::int64_t hidden_size, const direction lstm_direction, - const std::vector activations_alpha = {}, - const std::vector activations_beta = {}, - const std::vector activations = {"sigmoid", - "tanh", - "tanh"}, + const std::vector& activations_alpha = {}, + const std::vector& activations_beta = {}, + const std::vector& activations = {"sigmoid", + "tanh", + "tanh"}, const float clip = 0.f) : RNNCellBase( {X, initial_hidden_state, initial_cell_state, sequence_lengths, W, R, B}, @@ -237,7 +235,7 @@ namespace ngraph void validate_and_infer_types() override; bool visit_attributes(AttributeVisitor& visitor) override; - virtual std::shared_ptr + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; direction get_direction() const { return m_direction; } diff --git a/ngraph/core/include/ngraph/op/rnn_sequence.hpp b/ngraph/core/include/ngraph/op/rnn_sequence.hpp new file mode 100644 index 00000000000000..37b30376d0fc96 --- /dev/null +++ b/ngraph/core/include/ngraph/op/rnn_sequence.hpp @@ 
-0,0 +1,66 @@ +//***************************************************************************** +// Copyright 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +#pragma once + +#include +#include +#include + +#include "ngraph/op/op.hpp" +#include "ngraph/op/util/rnn_cell_base.hpp" + +namespace ngraph +{ + namespace op + { + namespace v5 + { + class NGRAPH_API RNNSequence : public util::RNNCellBase + { + public: + NGRAPH_RTTI_DECLARATION; + + RNNSequence(); + + RNNSequence( + const Output& X, + const Output& H_t, + const Output& sequence_lengths, + const Output& W, + const Output& R, + const Output& B, + size_t hidden_size, + op::RecurrentSequenceDirection direction, + const std::vector& activations = std::vector{"tanh"}, + const std::vector& activations_alpha = {}, + const std::vector& activations_beta = {}, + float clip = 0.f); + + std::shared_ptr + clone_with_new_inputs(const OutputVector& new_args) const override; + + void validate_and_infer_types() override; + + bool visit_attributes(AttributeVisitor& visitor) override; + + op::RecurrentSequenceDirection get_direction() const { return m_direction; } + protected: + op::RecurrentSequenceDirection m_direction; + }; + } + } // namespace op +} // namespace ngraph diff --git a/ngraph/core/include/ngraph/ops.hpp b/ngraph/core/include/ngraph/ops.hpp index f070bac0b486a3..e3f2ac94289d4e 100644 --- 
a/ngraph/core/include/ngraph/ops.hpp +++ b/ngraph/core/include/ngraph/ops.hpp @@ -74,6 +74,7 @@ #include "ngraph/op/grn.hpp" #include "ngraph/op/group_conv.hpp" #include "ngraph/op/gru_cell.hpp" +#include "ngraph/op/gru_sequence.hpp" #include "ngraph/op/hard_sigmoid.hpp" #include "ngraph/op/hswish.hpp" #include "ngraph/op/interpolate.hpp" @@ -131,6 +132,7 @@ #include "ngraph/op/reverse.hpp" #include "ngraph/op/reverse_sequence.hpp" #include "ngraph/op/rnn_cell.hpp" +#include "ngraph/op/rnn_sequence.hpp" #include "ngraph/op/roi_align.hpp" #include "ngraph/op/roi_pooling.hpp" #include "ngraph/op/round.hpp" diff --git a/ngraph/core/include/ngraph/opsets/opset4_tbl.hpp b/ngraph/core/include/ngraph/opsets/opset4_tbl.hpp index 001af3f3fc8846..43c3cf1d04304d 100644 --- a/ngraph/core/include/ngraph/opsets/opset4_tbl.hpp +++ b/ngraph/core/include/ngraph/opsets/opset4_tbl.hpp @@ -156,8 +156,8 @@ NGRAPH_OP(Atanh, ngraph::op::v3) NGRAPH_OP(CTCLoss, ngraph::op::v4) NGRAPH_OP(HSwish, ngraph::op::v4) NGRAPH_OP(Interpolate, ngraph::op::v4) -NGRAPH_OP(NonMaxSuppression, ngraph::op::v4) NGRAPH_OP(Mish, ngraph::op::v4) +NGRAPH_OP(NonMaxSuppression, ngraph::op::v4) NGRAPH_OP(ReduceL1, ngraph::op::v4) NGRAPH_OP(ReduceL2, ngraph::op::v4) NGRAPH_OP(SoftPlus, ngraph::op::v4) diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/sequences.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/sequences.hpp new file mode 100644 index 00000000000000..e236bbdb57460f --- /dev/null +++ b/ngraph/core/reference/include/ngraph/runtime/reference/sequences.hpp @@ -0,0 +1,539 @@ +//***************************************************************************** +// Copyright 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace ngraph +{ + namespace runtime + { + namespace reference + { + enum class CellType + { + RNN, + GRU, + LSTM, + }; + + struct CellArgs + { + std::string activation_f; // RNN + std::string activation_g; // RNN/GRU + std::string activation_h; // RNN/GRU/LSTM + float clip; // RNN/GRU/LSTM + bool linear_before_reset = false; // GRU + }; + + template + void cell_pass(CellType type, + const std::vector& inputs, + const std::vector& shapes, + const std::vector& outputs, + const CellArgs& args, + bool is_reverse) + { + auto squeeze_axis = [](const Shape& shape, size_t axis) -> Shape { + Shape new_shape(shape.size() - 1); + for (size_t i = 0, j = 0; i < shape.size(); ++i) + { + if (i != axis) + { + new_shape[j] = shape[i]; + j++; + } + } + return new_shape; + }; + + size_t x_shape_size = ngraph::shape_size(shapes[0]); + + // split X + size_t num_splits = shapes[0].at(1); + std::vector> in_seqs( + num_splits, std::vector(x_shape_size / num_splits * sizeof(T))); + std::vector pointers(num_splits); + for (size_t i = 0; i < num_splits; ++i) + pointers[is_reverse ? 
num_splits - i - 1 : i] = in_seqs[i].data(); + reference::split(inputs[0], shapes[0], sizeof(T), 1, num_splits, pointers.data()); + + Shape part_shape{shapes[0][0], 1, shapes[2][2]}; + size_t part_shape_size = ngraph::shape_size(part_shape); + std::vector> h_list( + num_splits, std::vector(ngraph::shape_size(part_shape) * sizeof(T))); + + // use outputs as a buffer for temporarily values + char* H_i = outputs[1]; + std::memcpy(H_i, inputs[2], ngraph::shape_size(shapes[2]) * sizeof(T)); + + char* C_i = nullptr; // LSTMCell only + if (type == CellType::LSTM) + { + C_i = outputs[2]; + std::memcpy(C_i, inputs[3], ngraph::shape_size(shapes[3]) * sizeof(T)); + } + + for (size_t time_step = 0; time_step < num_splits; ++time_step) + { + if (type == CellType::LSTM) + { + runtime::reference::lstm_cell( + reinterpret_cast(in_seqs[time_step].data()), + squeeze_axis(shapes[0], 1), + reinterpret_cast(H_i), + squeeze_axis(shapes[2], 1), + reinterpret_cast(C_i), + squeeze_axis(shapes[3], 1), + reinterpret_cast(inputs[4]), + squeeze_axis(shapes[4], 0), + reinterpret_cast(inputs[5]), + squeeze_axis(shapes[5], 0), + reinterpret_cast(inputs[6]), + squeeze_axis(shapes[6], 0), + reinterpret_cast(outputs[1]), + reinterpret_cast(outputs[2]), + args.activation_f, + args.activation_g, + args.activation_h, + args.clip); + } + else if (type == CellType::RNN) + { + runtime::reference::rnn_cell( + reinterpret_cast(in_seqs[time_step].data()), + squeeze_axis(shapes[0], 1), + reinterpret_cast(H_i), + squeeze_axis(shapes[2], 1), + reinterpret_cast(inputs[3]), + squeeze_axis(shapes[3], 0), + reinterpret_cast(inputs[4]), + squeeze_axis(shapes[4], 0), + reinterpret_cast(inputs[5]), + squeeze_axis(shapes[5], 0), + reinterpret_cast(outputs[1]), + args.activation_f, + args.clip); + } + else if (type == CellType::GRU) + { + runtime::reference::gru_cell( + reinterpret_cast(in_seqs[time_step].data()), + squeeze_axis(shapes[0], 1), + reinterpret_cast(H_i), + squeeze_axis(shapes[2], 1), + 
reinterpret_cast(inputs[3]), + squeeze_axis(shapes[3], 0), + reinterpret_cast(inputs[4]), + squeeze_axis(shapes[4], 0), + reinterpret_cast(inputs[5]), + squeeze_axis(shapes[5], 0), + reinterpret_cast(outputs[1]), + args.activation_f, + args.activation_g, + args.clip, + args.linear_before_reset); + } + std::memcpy(h_list[time_step].data(), outputs[1], part_shape_size * sizeof(T)); + } + // The tensor that concats all the intermediate output values of the hidden. + // It has shape [batch_size, seq_length, hidden_size] + std::vector in_shapes(num_splits, part_shape); + std::vector to_concat_pointers(num_splits); + for (size_t i = 0; i < num_splits; ++i) + to_concat_pointers[is_reverse ? num_splits - i - 1 : i] = h_list[i].data(); + runtime::reference::concat(to_concat_pointers, + outputs[0], + in_shapes, + {shapes[0][0], shapes[0][1], shapes[2][2]}, + 1, + sizeof(T)); + } + + template + void lstm_sequence(const char* X, + const Shape& X_shape, + const char* H, + const Shape& H_shape, + const char* C, + const Shape& C_shape, + const char* seq_lengths, + const Shape& seq_lengths_shape, + const char* W, + const Shape& W_shape, + const char* R, + const Shape& R_shape, + const char* B, + const Shape& B_shape, + char* Y, + char* Ho, + char* Co, + const std::string& activation_f, + const std::string& activation_g, + const std::string& activation_h, + float clip, + op::RecurrentSequenceDirection direction) + { + OutputVector results; + if (direction == op::RecurrentSequenceDirection::FORWARD || + direction == op::RecurrentSequenceDirection::REVERSE) + { + CellArgs args; + args.activation_f = activation_f; + args.activation_g = activation_g; + args.activation_h = activation_h; + args.clip = clip; + std::vector inputs = {X, seq_lengths, H, C, W, R, B}; + std::vector outputs = {Y, Ho, Co}; + std::vector shapes = { + X_shape, seq_lengths_shape, H_shape, C_shape, W_shape, R_shape, B_shape}; + cell_pass(CellType::LSTM, + inputs, + shapes, + outputs, + args, + direction == 
op::RecurrentSequenceDirection::REVERSE); + } + else if (direction == op::RecurrentSequenceDirection::BIDIRECTIONAL) + { + // Split bidirectional case to forward + reverse passes. + // split inputs + std::vector> H_split( + 2, std::vector(ngraph::shape_size(H_shape) / 2)); + std::vector> C_split( + 2, std::vector(ngraph::shape_size(C_shape) / 2)); + std::vector> W_split( + 2, std::vector(ngraph::shape_size(W_shape) / 2)); + std::vector> R_split( + 2, std::vector(ngraph::shape_size(R_shape) / 2)); + std::vector> B_split( + 2, std::vector(ngraph::shape_size(B_shape) / 2)); + char* h_pointers[2] = {H_split[0].data(), H_split[1].data()}; + char* c_pointers[2] = {C_split[0].data(), C_split[1].data()}; + char* w_pointers[2] = {W_split[0].data(), W_split[1].data()}; + char* r_pointers[2] = {R_split[0].data(), R_split[1].data()}; + char* b_pointers[2] = {B_split[0].data(), B_split[1].data()}; + reference::split(H, H_shape, sizeof(T), 1, 2, h_pointers); + reference::split(C, C_shape, sizeof(T), 1, 2, c_pointers); + reference::split(W, W_shape, sizeof(T), 1, 2, w_pointers); + reference::split(R, R_shape, sizeof(T), 1, 2, r_pointers); + reference::split(B, B_shape, sizeof(T), 1, 2, b_pointers); + std::vector> forward_res( + 3, std::vector(H_shape[0] * H_shape[2])); + std::vector> reverse_res( + 3, std::vector(H_shape[0] * H_shape[2])); + + CellArgs args; + args.activation_f = activation_f; + args.activation_g = activation_g; + args.activation_h = activation_h; + args.clip = clip; + std::vector shapes = { + X_shape, seq_lengths_shape, H_shape, C_shape, W_shape, R_shape, B_shape}; + // forward pass + cell_pass( + CellType::LSTM, + {X, + seq_lengths, + h_pointers[0], + c_pointers[0], + w_pointers[0], + r_pointers[0], + b_pointers[0]}, + shapes, + {forward_res[0].data(), forward_res[1].data(), forward_res[2].data()}, + args, + false); + // reverse pass + cell_pass( + CellType::LSTM, + {X, + seq_lengths, + h_pointers[1], + c_pointers[1], + w_pointers[1], + r_pointers[1], + 
b_pointers[1]}, + shapes, + {reverse_res[0].data(), reverse_res[1].data(), reverse_res[2].data()}, + args, + true); + + // Stack together respective outputs from both forward and reverse passes. + std::vector in_shapes = {{H_shape[0], 1, H_shape[2]}, + {H_shape[0], 1, H_shape[2]}, + {H_shape[0], 1, H_shape[2]}}; + Shape output_shape = {{H_shape[0], 2, H_shape[2]}}; + + runtime::reference::concat({forward_res[0].data(), reverse_res[0].data()}, + Y, + in_shapes, + output_shape, + 1, + sizeof(T)); + runtime::reference::concat({forward_res[1].data(), reverse_res[1].data()}, + Ho, + in_shapes, + output_shape, + 1, + sizeof(T)); + runtime::reference::concat({forward_res[2].data(), reverse_res[2].data()}, + Co, + in_shapes, + output_shape, + 1, + sizeof(T)); + } + } + + template + void gru_sequence(const char* X, + const Shape& X_shape, + const char* H, + const Shape& H_shape, + const char* seq_lengths, + const Shape& seq_lengths_shape, + const char* W, + const Shape& W_shape, + const char* R, + const Shape& R_shape, + const char* B, + const Shape& B_shape, + char* Y, + char* Ho, + const std::string& activation_f, + const std::string& activation_g, + const float clip, + const op::RecurrentSequenceDirection direction, + const bool linear_before_reset) + { + OutputVector results; + if (direction == op::RecurrentSequenceDirection::FORWARD || + direction == op::RecurrentSequenceDirection::REVERSE) + { + CellArgs args; + args.activation_f = activation_f; + args.activation_g = activation_g; + args.linear_before_reset = linear_before_reset; + args.clip = clip; + std::vector inputs = {X, seq_lengths, H, W, R, B}; + std::vector outputs = {Y, Ho}; + std::vector shapes = { + X_shape, seq_lengths_shape, H_shape, W_shape, R_shape, B_shape}; + cell_pass(CellType::GRU, + inputs, + shapes, + outputs, + args, + direction == op::RecurrentSequenceDirection::REVERSE); + } + else if (direction == op::RecurrentSequenceDirection::BIDIRECTIONAL) + { + // Split bidirectional case to forward + 
reverse passes. + // split inputs + std::vector> H_split( + 2, std::vector(ngraph::shape_size(H_shape) / 2)); + std::vector> W_split( + 2, std::vector(ngraph::shape_size(W_shape) / 2)); + std::vector> R_split( + 2, std::vector(ngraph::shape_size(R_shape) / 2)); + std::vector> B_split( + 2, std::vector(ngraph::shape_size(B_shape) / 2)); + char* h_pointers[2] = {H_split[0].data(), H_split[1].data()}; + char* w_pointers[2] = {W_split[0].data(), W_split[1].data()}; + char* r_pointers[2] = {R_split[0].data(), R_split[1].data()}; + char* b_pointers[2] = {B_split[0].data(), B_split[1].data()}; + reference::split(H, H_shape, sizeof(T), 1, 2, h_pointers); + reference::split(W, W_shape, sizeof(T), 1, 2, w_pointers); + reference::split(R, R_shape, sizeof(T), 1, 2, r_pointers); + reference::split(B, B_shape, sizeof(T), 1, 2, b_pointers); + std::vector> forward_res( + 2, std::vector(H_shape[0] * H_shape[2])); + std::vector> reverse_res( + 2, std::vector(H_shape[0] * H_shape[2])); + + CellArgs args; + args.activation_f = activation_f; + args.activation_g = activation_g; + args.linear_before_reset = linear_before_reset; + args.clip = clip; + std::vector shapes = { + X_shape, seq_lengths_shape, H_shape, W_shape, R_shape, B_shape}; + // forward pass + cell_pass(CellType::GRU, + {X, + seq_lengths, + h_pointers[0], + w_pointers[0], + r_pointers[0], + b_pointers[0]}, + shapes, + {forward_res[0].data(), forward_res[1].data()}, + args, + false); + // reverse pass + cell_pass(CellType::GRU, + {X, + seq_lengths, + h_pointers[1], + w_pointers[1], + r_pointers[1], + b_pointers[1]}, + shapes, + {reverse_res[0].data(), reverse_res[1].data()}, + args, + true); + + // Stack together respective outputs from both forward and reverse passes. 
+ std::vector in_shapes = {{H_shape[0], 1, H_shape[2]}, + {H_shape[0], 1, H_shape[2]}}; + Shape output_shape = {{H_shape[0], 2, H_shape[2]}}; + + runtime::reference::concat({forward_res[0].data(), reverse_res[0].data()}, + Y, + in_shapes, + output_shape, + 1, + sizeof(T)); + runtime::reference::concat({forward_res[1].data(), reverse_res[1].data()}, + Ho, + in_shapes, + output_shape, + 1, + sizeof(T)); + } + } + + template + void rnn_sequence(const char* X, + const Shape& X_shape, + const char* H, + const Shape& H_shape, + const char* seq_lengths, + const Shape& seq_lengths_shape, + const char* W, + const Shape& W_shape, + const char* R, + const Shape& R_shape, + const char* B, + const Shape& B_shape, + char* Y, + char* Ho, + const std::string& activation_f, + float clip, + const op::RecurrentSequenceDirection direction) + { + OutputVector results; + if (direction == op::RecurrentSequenceDirection::FORWARD || + direction == op::RecurrentSequenceDirection::REVERSE) + { + CellArgs args; + args.activation_f = activation_f; + args.clip = clip; + std::vector inputs = {X, seq_lengths, H, W, R, B}; + std::vector outputs = {Y, Ho}; + std::vector shapes = { + X_shape, seq_lengths_shape, H_shape, W_shape, R_shape, B_shape}; + cell_pass(CellType::RNN, + inputs, + shapes, + outputs, + args, + direction == op::RecurrentSequenceDirection::REVERSE); + } + else if (direction == op::RecurrentSequenceDirection::BIDIRECTIONAL) + { + // Split bidirectional case to forward + reverse passes. 
+ // split inputs + std::vector> H_split( + 2, std::vector(ngraph::shape_size(H_shape) / 2)); + std::vector> W_split( + 2, std::vector(ngraph::shape_size(W_shape) / 2)); + std::vector> R_split( + 2, std::vector(ngraph::shape_size(R_shape) / 2)); + std::vector> B_split( + 2, std::vector(ngraph::shape_size(B_shape) / 2)); + char* h_pointers[2] = {H_split[0].data(), H_split[1].data()}; + char* w_pointers[2] = {W_split[0].data(), W_split[1].data()}; + char* r_pointers[2] = {R_split[0].data(), R_split[1].data()}; + char* b_pointers[2] = {B_split[0].data(), B_split[1].data()}; + reference::split(H, H_shape, sizeof(T), 1, 2, h_pointers); + reference::split(W, W_shape, sizeof(T), 1, 2, w_pointers); + reference::split(R, R_shape, sizeof(T), 1, 2, r_pointers); + reference::split(B, B_shape, sizeof(T), 1, 2, b_pointers); + std::vector> forward_res( + 2, std::vector(H_shape[0] * H_shape[2])); + std::vector> reverse_res( + 2, std::vector(H_shape[0] * H_shape[2])); + + CellArgs args; + args.activation_f = activation_f; + args.clip = clip; + std::vector shapes = { + X_shape, seq_lengths_shape, H_shape, W_shape, R_shape, B_shape}; + // forward pass + cell_pass(CellType::RNN, + {X, + seq_lengths, + h_pointers[0], + w_pointers[0], + r_pointers[0], + b_pointers[0]}, + shapes, + {forward_res[0].data(), forward_res[1].data()}, + args, + false); + // reverse pass + cell_pass(CellType::RNN, + {X, + seq_lengths, + h_pointers[1], + w_pointers[1], + r_pointers[1], + b_pointers[1]}, + shapes, + {reverse_res[0].data(), reverse_res[1].data()}, + args, + true); + + // Stack together respective outputs from both forward and reverse passes. 
+ std::vector in_shapes = {{H_shape[0], 1, H_shape[2]}, + {H_shape[0], 1, H_shape[2]}}; + Shape output_shape = {{H_shape[0], 2, H_shape[2]}}; + + runtime::reference::concat({forward_res[0].data(), reverse_res[0].data()}, + Y, + in_shapes, + output_shape, + 1, + sizeof(T)); + runtime::reference::concat({forward_res[1].data(), reverse_res[1].data()}, + Ho, + in_shapes, + output_shape, + 1, + sizeof(T)); + } + } + } + } +} diff --git a/ngraph/core/src/op/gru_cell.cpp b/ngraph/core/src/op/gru_cell.cpp index fff0fddcb696df..f84c4dee2ae34b 100644 --- a/ngraph/core/src/op/gru_cell.cpp +++ b/ngraph/core/src/op/gru_cell.cpp @@ -109,6 +109,14 @@ bool op::v3::GRUCell::visit_attributes(AttributeVisitor& visitor) void op::v3::GRUCell::validate_and_infer_types() { + for (const auto& input : inputs()) + { + if (input.get_partial_shape().rank().is_dynamic()) + { + set_output_type(0, get_input_element_type(0), PartialShape::dynamic()); + return; + } + } auto merged_batch_size = Dimension::dynamic(); auto merged_hidden_size = Dimension::dynamic(); auto result_et = element::dynamic; diff --git a/ngraph/core/src/op/gru_sequence.cpp b/ngraph/core/src/op/gru_sequence.cpp new file mode 100644 index 00000000000000..fc7cb620d3dd73 --- /dev/null +++ b/ngraph/core/src/op/gru_sequence.cpp @@ -0,0 +1,199 @@ +//***************************************************************************** +// Copyright 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** + +#include +#include +#include + +#include "ngraph/op/gru_sequence.hpp" +#include "ngraph/op/util/recurrent_sequence.hpp" +#include "ngraph/opsets/opset4.hpp" + +using namespace std; +using namespace ngraph; + +NGRAPH_RTTI_DEFINITION(op::v5::GRUSequence, "GRUSequence", 5); + +op::v5::GRUSequence::GRUSequence() + : m_direction(op::RecurrentSequenceDirection::FORWARD) + , m_linear_before_reset(false) +{ +} + +op::v5::GRUSequence::GRUSequence(const Output& X, + const Output& H_t, + const Output& sequence_lengths, + const Output& W, + const Output& R, + const Output& B, + std::size_t hidden_size, + op::RecurrentSequenceDirection direction, + const std::vector& activations, + const std::vector& activations_alpha, + const std::vector& activations_beta, + float clip, + bool linear_before_reset) + : RNNCellBase({X, H_t, sequence_lengths, W, R, B}, + hidden_size, + clip, + activations, + activations_alpha, + activations_beta) + , m_direction(direction) + , m_linear_before_reset(linear_before_reset) +{ + constructor_validate_and_infer_types(); +} + +void op::v5::GRUSequence::validate_and_infer_types() +{ + for (const auto& input : inputs()) + { + if (input.get_partial_shape().rank().is_dynamic()) + { + set_output_type(0, get_input_element_type(0), PartialShape::dynamic()); + set_output_type(1, get_input_element_type(0), PartialShape::dynamic()); + return; + } + } + + auto gru_seq_gates_count = 3; + auto merged_batch_size = Dimension::dynamic(); + auto merged_hidden_size = Dimension::dynamic(); + auto merged_num_directions = Dimension::dynamic(); + auto result_et = element::dynamic; + + auto x_pshape = get_input_partial_shape(0); + auto ht_pshape = get_input_partial_shape(1); + auto sl_pshape = get_input_partial_shape(2); + auto w_pshape = get_input_partial_shape(3); + auto r_pshape = get_input_partial_shape(4); + auto b_pshape = get_input_partial_shape(5); + + 
ngraph::op::util::validate_seq_input_rank_dimension( + {x_pshape, ht_pshape, sl_pshape, w_pshape, r_pshape, b_pshape}); + + // Validate input types and save result for output type + NODE_VALIDATION_CHECK( + this, + element::Type::merge(result_et, result_et, get_input_element_type(0)) && + element::Type::merge(result_et, result_et, get_input_element_type(1)) && + element::Type::merge(result_et, result_et, get_input_element_type(3)) && + element::Type::merge(result_et, result_et, get_input_element_type(4)) && + element::Type::merge(result_et, result_et, get_input_element_type(5)), + "Element types for X, initial_hidden_state, W, R and B inputs do not " + "match."); + + // Merge batch_size dimension across all inputs to evaluate output[0] dimension + NODE_VALIDATION_CHECK(this, + Dimension::merge(merged_batch_size, merged_batch_size, ht_pshape[0]) && + Dimension::merge(merged_batch_size, merged_batch_size, x_pshape[0]) && + Dimension::merge(merged_batch_size, merged_batch_size, sl_pshape[0]), + "Parameter batch_size not matched in RNNSequence."); + + // Merge hidden_size dimension across all inputs to evaluate output dimension + NODE_VALIDATION_CHECK(this, + Dimension::merge(merged_hidden_size, merged_hidden_size, ht_pshape[2]) && + Dimension::merge(merged_hidden_size, merged_hidden_size, r_pshape[2]), + "Parameter hidden_size not matched RNNSequence."); + + // Merge num_directions dimension across all inputs to evaluate output dimension + NODE_VALIDATION_CHECK( + this, + Dimension::merge(merged_num_directions, merged_num_directions, ht_pshape[1]) && + Dimension::merge(merged_num_directions, merged_num_directions, w_pshape[0]) && + Dimension::merge(merged_num_directions, merged_num_directions, r_pshape[0]) && + Dimension::merge(merged_num_directions, merged_num_directions, b_pshape[0]), + "Parameter num_directions not matched in RNNSequence."); + + // Validate hidden_size value for W, R, B inputs + if (merged_hidden_size.is_static()) + { + if (w_pshape[1].is_static()) 
+ { + NODE_VALIDATION_CHECK( + this, + w_pshape[1].compatible(merged_hidden_size * gru_seq_gates_count), + "Parameter hidden_size mistmatched in W input. Current value is: ", + w_pshape[1].get_length(), + ", expected: ", + merged_hidden_size.get_length() * gru_seq_gates_count, + "."); + } + + if (r_pshape[1].is_static()) + { + NODE_VALIDATION_CHECK( + this, + r_pshape[1].compatible(merged_hidden_size * gru_seq_gates_count), + "Parameter hidden_size mistmatched in R input. Current value is: ", + r_pshape[1].get_length(), + ", expected: ", + merged_hidden_size.get_length() * gru_seq_gates_count, + "."); + } + + if (b_pshape[1].is_static()) + { + NODE_VALIDATION_CHECK( + this, + b_pshape[1].compatible(merged_hidden_size * (m_linear_before_reset + ? (gru_seq_gates_count + 1) + : gru_seq_gates_count)), + "Parameter hidden_size mistmatched in B input. Current value is: ", + b_pshape[1].get_length(), + ", expected: ", + merged_hidden_size.get_length() * + (m_linear_before_reset ? (gru_seq_gates_count + 1) : gru_seq_gates_count), + "."); + } + } + + // Mark inputs which are relevant to output parameters + for (size_t i = 0; i <= 5; ++i) + set_input_is_relevant_to_shape(i); + + // Set output size, type and shape + set_output_size(2); + set_output_type( + 0, result_et, {merged_batch_size, merged_num_directions, x_pshape[1], merged_hidden_size}); + set_output_type(1, result_et, {merged_batch_size, merged_num_directions, merged_hidden_size}); +} + +bool op::v5::GRUSequence::visit_attributes(AttributeVisitor& visitor) +{ + visitor.on_attribute("direction", m_direction); + visitor.on_attribute("linear_before_reset", m_linear_before_reset); + return op::util::RNNCellBase::visit_attributes(visitor); +} + +shared_ptr op::v5::GRUSequence::clone_with_new_inputs(const OutputVector& new_args) const +{ + check_new_args_count(this, new_args); + return make_shared(new_args.at(0), + new_args.at(1), + new_args.at(2), + new_args.at(3), + new_args.at(4), + new_args.at(5), + m_hidden_size, + 
m_direction, + m_activations, + m_activations_alpha, + m_activations_beta, + m_clip, + m_linear_before_reset); +} diff --git a/ngraph/core/src/op/lstm_cell.cpp b/ngraph/core/src/op/lstm_cell.cpp index 6f72cc0681eb94..0d2b24d53eae9a 100644 --- a/ngraph/core/src/op/lstm_cell.cpp +++ b/ngraph/core/src/op/lstm_cell.cpp @@ -142,6 +142,16 @@ bool ngraph::op::v0::LSTMCell::visit_attributes(AttributeVisitor& visitor) void op::v0::LSTMCell::validate_and_infer_types() { + for (const auto& input : inputs()) + { + if (input.get_partial_shape().rank().is_dynamic()) + { + set_output_type(0, get_input_element_type(0), PartialShape::dynamic()); + set_output_type(1, get_input_element_type(0), PartialShape::dynamic()); + return; + } + } + std::vector input_param{}; auto merged_batch_size = Dimension::dynamic(); @@ -436,6 +446,15 @@ bool ngraph::op::v4::LSTMCell::visit_attributes(AttributeVisitor& visitor) void op::v4::LSTMCell::validate_and_infer_types() { + for (const auto& input : inputs()) + { + if (input.get_partial_shape().rank().is_dynamic()) + { + set_output_type(0, get_input_element_type(0), PartialShape::dynamic()); + set_output_type(1, get_input_element_type(0), PartialShape::dynamic()); + return; + } + } auto merged_batch_size = Dimension::dynamic(); auto merged_hidden_size = Dimension::dynamic(); auto result_et = element::dynamic; @@ -448,11 +467,6 @@ void op::v4::LSTMCell::validate_and_infer_types() const auto& r_pshape = get_input_partial_shape(4); const auto& b_pshape = get_input_partial_shape(5); - // Validate rank and dimension for initial_cell_state input - NODE_VALIDATION_CHECK(this, - (ct_pshape.rank().is_static()), - "LSTMCell input tensor initial_cell_state shall have static rank."); - NODE_VALIDATION_CHECK(this, (ct_pshape.rank().get_length() == 2), "LSTMCell input tensor initial_cell_state shall have dimension 2D."); diff --git a/ngraph/core/src/op/lstm_sequence.cpp b/ngraph/core/src/op/lstm_sequence.cpp index 10a5b75efce18a..ab3607c425eacf 100644 --- 
a/ngraph/core/src/op/lstm_sequence.cpp +++ b/ngraph/core/src/op/lstm_sequence.cpp @@ -29,8 +29,8 @@ using namespace ngraph; using namespace std; -constexpr NodeTypeInfo op::v1::LSTMSequence::type_info; -constexpr NodeTypeInfo op::v0::LSTMSequence::type_info; +NGRAPH_RTTI_DEFINITION(op::v0::LSTMSequence, "LSTMSequence", 0); +NGRAPH_RTTI_DEFINITION(op::v5::LSTMSequence, "LSTMSequence", 5); bool ngraph::op::v0::LSTMSequence::visit_attributes(AttributeVisitor& visitor) { @@ -353,7 +353,7 @@ void op::v0::LSTMSequence::validate_and_infer_types() // Validate hidden_size value for W, R, B and P inputs if (merged_hidden_size.is_static()) { - if (w_pshape[0].is_static()) + if (w_pshape[1].is_static()) { NODE_VALIDATION_CHECK( this, @@ -365,7 +365,7 @@ void op::v0::LSTMSequence::validate_and_infer_types() "."); } - if (r_pshape[0].is_static()) + if (r_pshape[1].is_static()) { NODE_VALIDATION_CHECK( this, @@ -377,7 +377,7 @@ void op::v0::LSTMSequence::validate_and_infer_types() "."); } - if (b_pshape[0].is_static()) + if (b_pshape[1].is_static()) { NODE_VALIDATION_CHECK( this, @@ -389,7 +389,7 @@ void op::v0::LSTMSequence::validate_and_infer_types() "."); } - if (p_pshape[0].is_static()) + if (p_pshape[1].is_static()) { NODE_VALIDATION_CHECK( this, @@ -419,18 +419,18 @@ void op::v0::LSTMSequence::validate_and_infer_types() set_output_type(2, result_et, {merged_batch_size, merged_num_directions, merged_hidden_size}); } -bool ngraph::op::v1::LSTMSequence::visit_attributes(AttributeVisitor& visitor) +bool ngraph::op::v5::LSTMSequence::visit_attributes(AttributeVisitor& visitor) { visitor.on_attribute("direction", m_direction); return op::util::RNNCellBase::visit_attributes(visitor); } -shared_ptr op::v1::LSTMSequence::clone_with_new_inputs(const OutputVector& new_args) const +shared_ptr op::v5::LSTMSequence::clone_with_new_inputs(const OutputVector& new_args) const { check_new_args_count(this, new_args); if (new_args.size() == 7) { - return make_shared(new_args.at(0), // X + 
return make_shared(new_args.at(0), // X new_args.at(1), // initial_hidden_state new_args.at(2), // initial_cell_state new_args.at(3), // sequence_lengths @@ -450,8 +450,18 @@ shared_ptr op::v1::LSTMSequence::clone_with_new_inputs(const OutputVector& } } -void op::v1::LSTMSequence::validate_and_infer_types() +void op::v5::LSTMSequence::validate_and_infer_types() { + for (const auto& input : inputs()) + { + if (input.get_partial_shape().rank().is_dynamic()) + { + set_output_type(0, get_input_element_type(0), PartialShape::dynamic()); + set_output_type(1, get_input_element_type(0), PartialShape::dynamic()); + set_output_type(2, get_input_element_type(0), PartialShape::dynamic()); + return; + } + } std::vector input_param{}; auto lstm_seq_gates_count = 4; @@ -482,10 +492,6 @@ void op::v1::LSTMSequence::validate_and_infer_types() ngraph::op::util::validate_seq_input_rank_dimension(input_param); // Validate rank and dimension for initial_cell_state input - NODE_VALIDATION_CHECK(this, - (ct_pshape.rank().is_static()), - "LSTMSequence input tensor initial_cell_state shall have static rank."); - NODE_VALIDATION_CHECK(this, (ct_pshape.rank().get_length() == 3), "LSTMSequence input tensor initial_cell_state shall have dimension 3D."); @@ -532,7 +538,7 @@ void op::v1::LSTMSequence::validate_and_infer_types() // Validate hidden_size value for W, R, B inputs if (merged_hidden_size.is_static()) { - if (w_pshape[0].is_static()) + if (w_pshape[1].is_static()) { NODE_VALIDATION_CHECK( this, @@ -544,7 +550,7 @@ void op::v1::LSTMSequence::validate_and_infer_types() "."); } - if (r_pshape[0].is_static()) + if (r_pshape[1].is_static()) { NODE_VALIDATION_CHECK( this, @@ -556,7 +562,7 @@ void op::v1::LSTMSequence::validate_and_infer_types() "."); } - if (b_pshape[0].is_static()) + if (b_pshape[1].is_static()) { NODE_VALIDATION_CHECK( this, diff --git a/ngraph/core/src/op/rnn_cell.cpp b/ngraph/core/src/op/rnn_cell.cpp index 6310b237f66b00..8c2357b2987827 100644 --- 
a/ngraph/core/src/op/rnn_cell.cpp +++ b/ngraph/core/src/op/rnn_cell.cpp @@ -83,6 +83,14 @@ bool op::v0::RNNCell::visit_attributes(AttributeVisitor& visitor) void op::v0::RNNCell::validate_and_infer_types() { + for (const auto& input : inputs()) + { + if (input.get_partial_shape().rank().is_dynamic()) + { + set_output_type(0, get_input_element_type(0), PartialShape::dynamic()); + return; + } + } auto merged_batch_size = Dimension::dynamic(); auto merged_hidden_size = Dimension::dynamic(); auto result_et = element::dynamic; diff --git a/ngraph/core/src/op/rnn_sequence.cpp b/ngraph/core/src/op/rnn_sequence.cpp new file mode 100644 index 00000000000000..5087b631d1e1d1 --- /dev/null +++ b/ngraph/core/src/op/rnn_sequence.cpp @@ -0,0 +1,192 @@ +//***************************************************************************** +// Copyright 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** + +#include "ngraph/op/rnn_sequence.hpp" +#include "ngraph/op/util/recurrent_sequence.hpp" +#include "ngraph/opsets/opset4.hpp" + +#include +#include +#include + +using namespace std; +using namespace ngraph; + +NGRAPH_RTTI_DEFINITION(op::v5::RNNSequence, "RNNSequence", 4); + +op::v5::RNNSequence::RNNSequence() + : m_direction(op::RecurrentSequenceDirection::FORWARD) +{ +} + +op::v5::RNNSequence::RNNSequence(const Output& X, + const Output& H_t, + const Output& sequence_lengths, + const Output& W, + const Output& R, + const Output& B, + std::size_t hidden_size, + op::RecurrentSequenceDirection direction, + const std::vector& activations, + const std::vector& activations_alpha, + const std::vector& activations_beta, + float clip) + : RNNCellBase({X, H_t, sequence_lengths, W, R, B}, + hidden_size, + clip, + activations, + activations_alpha, + activations_beta) + , m_direction(direction) +{ + constructor_validate_and_infer_types(); +} + +void op::v5::RNNSequence::validate_and_infer_types() +{ + for (const auto& input : inputs()) + { + if (input.get_partial_shape().rank().is_dynamic()) + { + set_output_type(0, get_input_element_type(0), PartialShape::dynamic()); + set_output_type(1, get_input_element_type(0), PartialShape::dynamic()); + return; + } + } + + auto rnn_seq_gates_count = 1; + auto merged_batch_size = Dimension::dynamic(); + auto merged_hidden_size = Dimension::dynamic(); + auto merged_num_directions = Dimension::dynamic(); + auto result_et = element::dynamic; + + auto x_pshape = get_input_partial_shape(0); + auto ht_pshape = get_input_partial_shape(1); + auto sl_pshape = get_input_partial_shape(2); + auto w_pshape = get_input_partial_shape(3); + auto r_pshape = get_input_partial_shape(4); + auto b_pshape = get_input_partial_shape(5); + + ngraph::op::util::validate_seq_input_rank_dimension( + {x_pshape, ht_pshape, sl_pshape, w_pshape, r_pshape, b_pshape}); + + // Validate input 
types and save result for output type + NODE_VALIDATION_CHECK( + this, + element::Type::merge(result_et, result_et, get_input_element_type(0)) && + element::Type::merge(result_et, result_et, get_input_element_type(1)) && + element::Type::merge(result_et, result_et, get_input_element_type(3)) && + element::Type::merge(result_et, result_et, get_input_element_type(4)) && + element::Type::merge(result_et, result_et, get_input_element_type(5)), + "Element types for X, initial_hidden_state, W, R and B inputs do not " + "match."); + + // Merge batch_size dimension across all inputs to evaluate output[0] dimension + NODE_VALIDATION_CHECK(this, + Dimension::merge(merged_batch_size, merged_batch_size, ht_pshape[0]) && + Dimension::merge(merged_batch_size, merged_batch_size, x_pshape[0]) && + Dimension::merge(merged_batch_size, merged_batch_size, sl_pshape[0]), + "Parameter batch_size not matched in RNNSequence."); + + // Merge hidden_size dimension across all inputs to evaluate output dimension + NODE_VALIDATION_CHECK(this, + Dimension::merge(merged_hidden_size, merged_hidden_size, ht_pshape[2]) && + Dimension::merge(merged_hidden_size, merged_hidden_size, r_pshape[2]), + "Parameter hidden_size not matched RNNSequence."); + + // Merge num_directions dimension across all inputs to evaluate output dimension + NODE_VALIDATION_CHECK( + this, + Dimension::merge(merged_num_directions, merged_num_directions, ht_pshape[1]) && + Dimension::merge(merged_num_directions, merged_num_directions, w_pshape[0]) && + Dimension::merge(merged_num_directions, merged_num_directions, r_pshape[0]) && + Dimension::merge(merged_num_directions, merged_num_directions, b_pshape[0]), + "Parameter num_directions not matched in RNNSequence."); + + // Validate hidden_size value for W, R, B inputs + if (merged_hidden_size.is_static()) + { + if (w_pshape[1].is_static()) + { + NODE_VALIDATION_CHECK( + this, + w_pshape[1].compatible(merged_hidden_size * rnn_seq_gates_count), + "Parameter hidden_size mistmatched 
in W input. Current value is: ", + w_pshape[1].get_length(), + ", expected: ", + merged_hidden_size.get_length() * rnn_seq_gates_count, + "."); + } + + if (r_pshape[1].is_static()) + { + NODE_VALIDATION_CHECK( + this, + r_pshape[1].compatible(merged_hidden_size * rnn_seq_gates_count), + "Parameter hidden_size mistmatched in R input. Current value is: ", + r_pshape[1].get_length(), + ", expected: ", + merged_hidden_size.get_length() * rnn_seq_gates_count, + "."); + } + + if (b_pshape[1].is_static()) + { + NODE_VALIDATION_CHECK( + this, + b_pshape[1].compatible(merged_hidden_size * rnn_seq_gates_count), + "Parameter hidden_size mistmatched in B input. Current value is: ", + b_pshape[1].get_length(), + ", expected: ", + merged_hidden_size.get_length() * rnn_seq_gates_count, + "."); + } + } + + // Mark inputs which are relevant to output parameters + for (size_t i = 0; i <= 5; ++i) + set_input_is_relevant_to_shape(i); + + // Set output size, type and shape + set_output_size(2); + set_output_type( + 0, result_et, {merged_batch_size, merged_num_directions, x_pshape[1], merged_hidden_size}); + set_output_type(1, result_et, {merged_batch_size, merged_num_directions, merged_hidden_size}); +} + +bool op::v5::RNNSequence::visit_attributes(AttributeVisitor& visitor) +{ + visitor.on_attribute("direction", m_direction); + return op::util::RNNCellBase::visit_attributes(visitor); +} + +shared_ptr + op::v5::RNNSequence::clone_with_new_inputs(const ngraph::OutputVector& new_args) const +{ + check_new_args_count(this, new_args); + return make_shared(new_args.at(0), + new_args.at(1), + new_args.at(2), + new_args.at(3), + new_args.at(4), + new_args.at(5), + m_hidden_size, + m_direction, + m_activations, + m_activations_alpha, + m_activations_beta, + m_clip); +} diff --git a/ngraph/test/CMakeLists.txt b/ngraph/test/CMakeLists.txt index dd9c3514f47955..8a1caedaaf0413 100644 --- a/ngraph/test/CMakeLists.txt +++ b/ngraph/test/CMakeLists.txt @@ -129,6 +129,7 @@ set(SRC 
type_prop/group_convolution.cpp type_prop/group_convolution_backprop_data.cpp type_prop/gru_cell.cpp + type_prop/gru_sequence.cpp type_prop/hard_sigmoid.cpp type_prop/hswish.cpp type_prop/interpolate.cpp @@ -160,6 +161,7 @@ set(SRC type_prop/reverse_sequence.cpp type_prop/roi_align.cpp type_prop/rnn_cell.cpp + type_prop/rnn_sequence.cpp type_prop/scatter_elements_update.cpp type_prop/scatter_nd_update.cpp type_prop/scatter_update.cpp diff --git a/ngraph/test/attributes.cpp b/ngraph/test/attributes.cpp index 093cfbf26b0b54..f7f600d51d54a1 100644 --- a/ngraph/test/attributes.cpp +++ b/ngraph/test/attributes.cpp @@ -1099,7 +1099,7 @@ TEST(attributes, lstm_cell_op) TEST(attributes, lstm_sequence_op) { - FactoryRegistry::get().register_factory(); + FactoryRegistry::get().register_factory(); const size_t batch_size = 4; const size_t num_directions = 2; @@ -1126,7 +1126,7 @@ TEST(attributes, lstm_sequence_op) const std::vector activations = {"tanh", "sigmoid", "tanh"}; const float clip_threshold = 0.5f; - const auto lstm_sequence = make_shared(X, + const auto lstm_sequence = make_shared(X, initial_hidden_state, initial_cell_state, sequence_lengths, @@ -1140,7 +1140,7 @@ TEST(attributes, lstm_sequence_op) activations, clip_threshold); NodeBuilder builder(lstm_sequence); - auto g_lstm_sequence = as_type_ptr(builder.create()); + auto g_lstm_sequence = as_type_ptr(builder.create()); EXPECT_EQ(g_lstm_sequence->get_hidden_size(), lstm_sequence->get_hidden_size()); EXPECT_EQ(g_lstm_sequence->get_activations(), lstm_sequence->get_activations()); diff --git a/ngraph/test/runtime/interpreter/int_executable.hpp b/ngraph/test/runtime/interpreter/int_executable.hpp index 26e35ef2f7b8ea..74c1c0138cd8df 100644 --- a/ngraph/test/runtime/interpreter/int_executable.hpp +++ b/ngraph/test/runtime/interpreter/int_executable.hpp @@ -85,6 +85,7 @@ #include "ngraph/runtime/reference/round.hpp" #include "ngraph/runtime/reference/scatter_nd_update.hpp" #include 
"ngraph/runtime/reference/select.hpp" +#include "ngraph/runtime/reference/sequences.hpp" #include "ngraph/runtime/reference/sigmoid.hpp" #include "ngraph/runtime/reference/sign.hpp" #include "ngraph/runtime/reference/sin.hpp" @@ -758,6 +759,82 @@ class INTERPRETER_BACKEND_API ngraph::runtime::interpreter::INTExecutable : publ rnn_cell->get_clip()); break; } + case OP_TYPEID::LSTMSequence: + case OP_TYPEID::LSTMSequence_v5: + { + auto lstm_seq = static_cast(&node); + runtime::reference::lstm_sequence(args[0]->get_data_ptr(), + args[0]->get_shape(), + args[1]->get_data_ptr(), + args[1]->get_shape(), + args[2]->get_data_ptr(), + args[2]->get_shape(), + args[3]->get_data_ptr(), + args[3]->get_shape(), + args[4]->get_data_ptr(), + args[4]->get_shape(), + args[5]->get_data_ptr(), + args[5]->get_shape(), + args[6]->get_data_ptr(), + args[6]->get_shape(), + out[0]->get_data_ptr(), + out[1]->get_data_ptr(), + out[2]->get_data_ptr(), + lstm_seq->get_activations()[0], + lstm_seq->get_activations()[1], + lstm_seq->get_activations()[2], + lstm_seq->get_clip(), + lstm_seq->get_direction()); + break; + } + case OP_TYPEID::GRUSequence_v5: + { + auto gru_seq = static_cast(&node); + runtime::reference::gru_sequence(args[0]->get_data_ptr(), + args[0]->get_shape(), + args[1]->get_data_ptr(), + args[1]->get_shape(), + args[2]->get_data_ptr(), + args[2]->get_shape(), + args[3]->get_data_ptr(), + args[3]->get_shape(), + args[4]->get_data_ptr(), + args[4]->get_shape(), + args[5]->get_data_ptr(), + args[5]->get_shape(), + out[0]->get_data_ptr(), + out[1]->get_data_ptr(), + gru_seq->get_activations()[0], + gru_seq->get_activations()[1], + gru_seq->get_clip(), + gru_seq->get_direction(), + gru_seq->get_linear_before_reset() + + ); + break; + } + case OP_TYPEID::RNNSequence_v5: + { + auto rnn_seq = static_cast(&node); + runtime::reference::rnn_sequence(args[0]->get_data_ptr(), + args[0]->get_shape(), + args[1]->get_data_ptr(), + args[1]->get_shape(), + args[2]->get_data_ptr(), + 
args[2]->get_shape(), + args[3]->get_data_ptr(), + args[3]->get_shape(), + args[4]->get_data_ptr(), + args[4]->get_shape(), + args[5]->get_data_ptr(), + args[5]->get_shape(), + out[0]->get_data_ptr(), + out[1]->get_data_ptr(), + rnn_seq->get_activations()[0], + rnn_seq->get_clip(), + rnn_seq->get_direction()); + break; + } case OP_TYPEID::Log: { size_t element_count = shape_size(node.get_output_shape(0)); @@ -1285,7 +1362,6 @@ class INTERPRETER_BACKEND_API ngraph::runtime::interpreter::INTExecutable : publ case OP_TYPEID::GroupConvolutionBackpropData: case OP_TYPEID::HardSigmoid: case OP_TYPEID::Interpolate: - case OP_TYPEID::LSTMSequence: case OP_TYPEID::MVN: case OP_TYPEID::NormalizeL2: case OP_TYPEID::PRelu: diff --git a/ngraph/test/runtime/interpreter/opset_int_tbl.hpp b/ngraph/test/runtime/interpreter/opset_int_tbl.hpp index f44775d4010fb3..9ac3292d90ea39 100644 --- a/ngraph/test/runtime/interpreter/opset_int_tbl.hpp +++ b/ngraph/test/runtime/interpreter/opset_int_tbl.hpp @@ -48,3 +48,9 @@ NGRAPH_OP(ScatterUpdate, op::v3) NGRAPH_OP(CTCLoss, op::v4) NGRAPH_OP(LSTMCell, op::v4) #undef ID_SUFFIX + +#define ID_SUFFIX(NAME) NAME##_v5 +NGRAPH_OP(LSTMSequence, op::v5) +NGRAPH_OP(GRUSequence, op::v5) +NGRAPH_OP(RNNSequence, op::v5) +#undef ID_SUFFIX diff --git a/ngraph/test/type_prop/gru_cell.cpp b/ngraph/test/type_prop/gru_cell.cpp index 9ed57293fcb940..a7b3558b908791 100644 --- a/ngraph/test/type_prop/gru_cell.cpp +++ b/ngraph/test/type_prop/gru_cell.cpp @@ -231,38 +231,38 @@ TEST(type_prop, gru_cell_invalid_input_dynamic_rank) PartialShape{gates_count * hidden_size, hidden_size}); auto H_t = make_shared(element::f32, PartialShape{batch_size, hidden_size}); + auto check_dynamic_gru = [](const shared_ptr& gru) -> bool { + return gru->output(0).get_partial_shape() == PartialShape::dynamic() && + gru->output(0).get_element_type() == gru->input(0).get_element_type(); + }; + // Invalid dynamic rank for W tensor. 
auto W = make_shared(element::f32, PartialShape::dynamic(Rank::dynamic())); - ASSERT_THROW(make_shared(X, H_t, W, R, hidden_size), - ngraph::NodeValidationFailure) - << "GRUCell node was created with invalid data."; + auto gru_w = make_shared(X, H_t, W, R, hidden_size); + EXPECT_EQ(check_dynamic_gru(gru_w), true); // Invalid dynamic rank for X tensor. W = make_shared(element::f32, PartialShape{hidden_size, input_size}); X = make_shared(element::f32, PartialShape::dynamic(Rank::dynamic())); - ASSERT_THROW(make_shared(X, H_t, W, R, hidden_size), - ngraph::NodeValidationFailure) - << "GRUCell node was created with invalid data."; + auto gru_x = make_shared(X, H_t, W, R, hidden_size); + EXPECT_EQ(check_dynamic_gru(gru_x), true); // Invalid dynamic rank for H_t tensor. X = make_shared(element::f32, PartialShape{batch_size, input_size}); H_t = make_shared(element::f32, PartialShape::dynamic(Rank::dynamic())); - ASSERT_THROW(make_shared(X, H_t, W, R, hidden_size), - ngraph::NodeValidationFailure) - << "GRUCell node was created with invalid data."; + auto gru_h = make_shared(X, H_t, W, R, hidden_size); + EXPECT_EQ(check_dynamic_gru(gru_h), true); // Invalid dynamic rank for R tensor. H_t = make_shared(element::f32, PartialShape{batch_size, hidden_size}); R = make_shared(element::f32, PartialShape::dynamic(Rank::dynamic())); - ASSERT_THROW(make_shared(X, H_t, W, R, hidden_size), - ngraph::NodeValidationFailure) - << "GRUCell node was created with invalid data."; + auto gru_r = make_shared(X, H_t, W, R, hidden_size); + EXPECT_EQ(check_dynamic_gru(gru_r), true); // Invalid dynamic rank for B tensor. 
R = make_shared(element::f32, PartialShape{gates_count * hidden_size, hidden_size}); auto B = make_shared(element::f32, PartialShape::dynamic(Rank::dynamic())); - ASSERT_THROW(make_shared(X, H_t, W, R, B, hidden_size), - ngraph::NodeValidationFailure) - << "GRUCell node was created with invalid data."; + auto gru_b = make_shared(X, H_t, W, R, B, hidden_size); + EXPECT_EQ(check_dynamic_gru(gru_b), true); } diff --git a/ngraph/test/type_prop/gru_sequence.cpp b/ngraph/test/type_prop/gru_sequence.cpp new file mode 100644 index 00000000000000..105d8e3bb5a43b --- /dev/null +++ b/ngraph/test/type_prop/gru_sequence.cpp @@ -0,0 +1,64 @@ +//***************************************************************************** +// Copyright 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** + +#include "gtest/gtest.h" +#include "ngraph/ngraph.hpp" +#include "ngraph/opsets/opset4.hpp" +#include "util/type_prop.hpp" + +using namespace std; +using namespace ngraph; + +TEST(type_prop, gru_sequence_forward) +{ + const size_t batch_size = 8; + const size_t num_directions = 1; + const size_t seq_length = 6; + const size_t input_size = 4; + const size_t hidden_size = 128; + + const auto X = + make_shared(element::f32, Shape{batch_size, seq_length, input_size}); + const auto initial_hidden_state = make_shared( + element::f32, Shape{batch_size, num_directions, hidden_size}); + const auto sequence_lengths = make_shared(element::i32, Shape{batch_size}); + const auto W = make_shared( + element::f32, Shape{num_directions, 3 * hidden_size, input_size}); + const auto R = make_shared( + element::f32, Shape{num_directions, 3 * hidden_size, hidden_size}); + const auto B = + make_shared(element::f32, Shape{num_directions, 3 * hidden_size}); + + const auto direction = op::RecurrentSequenceDirection::FORWARD; + + const auto sequence = make_shared( + X, initial_hidden_state, sequence_lengths, W, R, B, hidden_size, direction); + + EXPECT_EQ(sequence->get_hidden_size(), hidden_size); + EXPECT_EQ(sequence->get_direction(), op::RecurrentSequenceDirection::FORWARD); + EXPECT_TRUE(sequence->get_activations_alpha().empty()); + EXPECT_TRUE(sequence->get_activations_beta().empty()); + EXPECT_EQ(sequence->get_activations()[0], "sigmoid"); + EXPECT_EQ(sequence->get_activations()[1], "tanh"); + EXPECT_EQ(sequence->get_clip(), 0.f); + EXPECT_EQ(sequence->get_linear_before_reset(), false); + EXPECT_EQ(sequence->get_output_element_type(0), element::f32); + EXPECT_EQ(sequence->outputs().size(), 2); + EXPECT_EQ(sequence->get_output_shape(0), + (Shape{batch_size, num_directions, seq_length, hidden_size})); + EXPECT_EQ(sequence->get_output_element_type(1), element::f32); + EXPECT_EQ(sequence->get_output_shape(1), 
(Shape{batch_size, num_directions, hidden_size})); +} diff --git a/ngraph/test/type_prop/lstm_cell.cpp b/ngraph/test/type_prop/lstm_cell.cpp index 48b89cd65d5603..e8275d8973f87a 100644 --- a/ngraph/test/type_prop/lstm_cell.cpp +++ b/ngraph/test/type_prop/lstm_cell.cpp @@ -290,46 +290,46 @@ TEST(type_prop, lstm_cell_invalid_input_dynamic_rank) auto H_t = make_shared(element::f32, PartialShape{batch_size, hidden_size}); auto C_t = make_shared(element::f32, PartialShape{batch_size, hidden_size}); + auto check_dynamic_lstm = [](const shared_ptr& lstm) -> bool { + return lstm->output(0).get_partial_shape() == PartialShape::dynamic() && + lstm->output(1).get_partial_shape() == PartialShape::dynamic() && + lstm->output(0).get_element_type() == lstm->input(0).get_element_type(); + }; + // Invalid dynamic rank for W tensor. W = make_shared(element::f32, PartialShape::dynamic(Rank::dynamic())); - ASSERT_THROW(make_shared(X, H_t, C_t, W, R, hidden_size), - ngraph::NodeValidationFailure) - << "LSTMCell node was created with invalid data."; + auto lstm = make_shared(X, H_t, C_t, W, R, hidden_size); + EXPECT_EQ(check_dynamic_lstm(lstm), true); // Invalid dynamic rank for X tensor. W = make_shared(element::f32, PartialShape{gates_count * hidden_size, input_size}); X = make_shared(element::f32, PartialShape::dynamic(Rank::dynamic())); - ASSERT_THROW(make_shared(X, H_t, C_t, W, R, hidden_size), - ngraph::NodeValidationFailure) - << "LSTMCell node was created with invalid data."; + lstm = make_shared(X, H_t, C_t, W, R, hidden_size); + EXPECT_EQ(check_dynamic_lstm(lstm), true); // Invalid dynamic rank for H_t tensor. 
X = make_shared(element::f32, PartialShape{batch_size, input_size}); H_t = make_shared(element::f32, PartialShape::dynamic(Rank::dynamic())); - ASSERT_THROW(make_shared(X, H_t, C_t, W, R, hidden_size), - ngraph::NodeValidationFailure) - << "LSTMCell node was created with invalid data."; + lstm = make_shared(X, H_t, C_t, W, R, hidden_size); + EXPECT_EQ(check_dynamic_lstm(lstm), true); // Invalid dynamic rank for C_t tensor. H_t = make_shared(element::f32, PartialShape{batch_size, hidden_size}); C_t = make_shared(element::f32, PartialShape::dynamic(Rank::dynamic())); - ASSERT_THROW(make_shared(X, H_t, C_t, W, R, hidden_size), - ngraph::NodeValidationFailure) - << "LSTMCell node was created with invalid data."; + lstm = make_shared(X, H_t, C_t, W, R, hidden_size); + EXPECT_EQ(check_dynamic_lstm(lstm), true); // Invalid dynamic rank for R tensor. C_t = make_shared(element::f32, PartialShape{batch_size, hidden_size}); R = make_shared(element::f32, PartialShape::dynamic(Rank::dynamic())); - ASSERT_THROW(make_shared(X, H_t, C_t, W, R, hidden_size), - ngraph::NodeValidationFailure) - << "LSTMCell node was created with invalid data."; + lstm = make_shared(X, H_t, C_t, W, R, hidden_size); + EXPECT_EQ(check_dynamic_lstm(lstm), true); // Invalid dynamic rank for B tensor. 
R = make_shared(element::f32, PartialShape{gates_count * hidden_size, hidden_size}); auto B = make_shared(element::f32, PartialShape::dynamic(Rank::dynamic())); - ASSERT_THROW(make_shared(X, H_t, C_t, W, R, B, hidden_size), - ngraph::NodeValidationFailure) - << "LSTMCell node was created with invalid data."; + lstm = make_shared(X, H_t, C_t, W, R, B, hidden_size); + EXPECT_EQ(check_dynamic_lstm(lstm), true); } diff --git a/ngraph/test/type_prop/lstm_sequence.cpp b/ngraph/test/type_prop/lstm_sequence.cpp index 491712a9f4860f..16d628cddf0fa1 100644 --- a/ngraph/test/type_prop/lstm_sequence.cpp +++ b/ngraph/test/type_prop/lstm_sequence.cpp @@ -41,7 +41,7 @@ struct recurrent_sequence_parameters // // Create and initialize default input test tensors. // -shared_ptr +shared_ptr lstm_seq_tensor_initialization(const recurrent_sequence_parameters& param) { auto batch_size = param.batch_size; @@ -65,7 +65,7 @@ shared_ptr const auto B = make_shared(et, PartialShape{num_directions, hidden_size * 4}); - const auto lstm_sequence = make_shared(); + const auto lstm_sequence = make_shared(); lstm_sequence->set_argument(0, X); lstm_sequence->set_argument(1, initial_hidden_state); @@ -102,7 +102,7 @@ TEST(type_prop, lstm_sequence_forward) const auto lstm_direction = op::RecurrentSequenceDirection::FORWARD; - const auto lstm_sequence = make_shared(X, + const auto lstm_sequence = make_shared(X, initial_hidden_state, initial_cell_state, sequence_lengths, @@ -121,6 +121,7 @@ TEST(type_prop, lstm_sequence_forward) EXPECT_EQ(lstm_sequence->get_activations()[2], "tanh"); EXPECT_EQ(lstm_sequence->get_clip(), 0.f); EXPECT_EQ(lstm_sequence->get_output_element_type(0), element::f32); + EXPECT_EQ(lstm_sequence->outputs().size(), 3); EXPECT_EQ(lstm_sequence->get_output_shape(0), (Shape{batch_size, num_directions, seq_length, hidden_size})); EXPECT_EQ(lstm_sequence->get_output_element_type(1), element::f32); @@ -151,12 +152,12 @@ TEST(type_prop, lstm_sequence_bidirectional) const auto B = 
make_shared(element::f32, Shape{num_directions, 4 * hidden_size}); - const auto lstm_direction = op::v1::LSTMSequence::direction::BIDIRECTIONAL; + const auto lstm_direction = op::v5::LSTMSequence::direction::BIDIRECTIONAL; const std::vector activations_alpha = {2.7, 7.0, 32.367}; const std::vector activations_beta = {0.0, 5.49, 6.0}; const std::vector activations = {"tanh", "sigmoid", "sigmoid"}; - const auto lstm_sequence = make_shared(X, + const auto lstm_sequence = make_shared(X, initial_hidden_state, initial_cell_state, sequence_lengths, @@ -169,7 +170,7 @@ TEST(type_prop, lstm_sequence_bidirectional) activations_beta, activations); EXPECT_EQ(lstm_sequence->get_hidden_size(), hidden_size); - EXPECT_EQ(lstm_sequence->get_direction(), op::v1::LSTMSequence::direction::BIDIRECTIONAL); + EXPECT_EQ(lstm_sequence->get_direction(), op::v5::LSTMSequence::direction::BIDIRECTIONAL); EXPECT_EQ(lstm_sequence->get_activations_alpha(), activations_alpha); EXPECT_EQ(lstm_sequence->get_activations_beta(), activations_beta); EXPECT_EQ(lstm_sequence->get_activations()[0], "tanh"); @@ -351,6 +352,13 @@ TEST(type_prop, lstm_sequence_invalid_input_dynamic_rank) param.hidden_size = 256; param.et = element::f32; + auto check_dynamic_lstm = [](const shared_ptr& lstm) -> bool { + return lstm->output(0).get_partial_shape() == PartialShape::dynamic() && + lstm->output(1).get_partial_shape() == PartialShape::dynamic() && + lstm->output(2).get_partial_shape() == PartialShape::dynamic() && + lstm->output(0).get_element_type() == lstm->input(0).get_element_type(); + }; + auto lstm_sequence = lstm_seq_tensor_initialization(param); auto invalid_dynamic_tensor = make_shared(param.et, PartialShape::dynamic(Rank::dynamic())); @@ -361,7 +369,7 @@ TEST(type_prop, lstm_sequence_invalid_input_dynamic_rank) { lstm_sequence = lstm_seq_tensor_initialization(param); lstm_sequence->set_argument(i, invalid_dynamic_tensor); - ASSERT_THROW(lstm_sequence->validate_and_infer_types(), ngraph::CheckFailure) - << 
"LSTMSequence node was created with invalid data."; + lstm_sequence->validate_and_infer_types(); + EXPECT_EQ(check_dynamic_lstm(lstm_sequence), true); } } diff --git a/ngraph/test/type_prop/rnn_cell.cpp b/ngraph/test/type_prop/rnn_cell.cpp index 5db9e15971022b..627457edbb9c93 100644 --- a/ngraph/test/type_prop/rnn_cell.cpp +++ b/ngraph/test/type_prop/rnn_cell.cpp @@ -223,37 +223,36 @@ TEST(type_prop, rnn_cell_invalid_input_dynamic_rank) auto R = make_shared(element::f32, Shape{hidden_size, hidden_size}); auto H_t = make_shared(element::f32, Shape{batch_size, hidden_size}); + auto check_dynamic_rnn = [](const shared_ptr& rnn) -> bool { + return rnn->output(0).get_partial_shape() == PartialShape::dynamic() && + rnn->output(0).get_element_type() == rnn->input(0).get_element_type(); + }; // Invalid dynamic rank for W tensor. auto W = make_shared(element::f32, PartialShape::dynamic(Rank::dynamic())); - ASSERT_THROW(make_shared(X, H_t, W, R, hidden_size), - ngraph::NodeValidationFailure) - << "RNNCell node was created with invalid data."; + auto rnn_w = make_shared(X, H_t, W, R, hidden_size); + EXPECT_EQ(check_dynamic_rnn(rnn_w), true); // Invalid dynamic rank for X tensor. W = make_shared(element::f32, PartialShape{hidden_size, input_size}); X = make_shared(element::f32, PartialShape::dynamic(Rank::dynamic())); - ASSERT_THROW(make_shared(X, H_t, W, R, hidden_size), - ngraph::NodeValidationFailure) - << "RNNCell node was created with invalid data."; + auto rnn_x = make_shared(X, H_t, W, R, hidden_size); + EXPECT_EQ(check_dynamic_rnn(rnn_x), true); // Invalid dynamic rank for H_t tensor. 
X = make_shared(element::f32, Shape{batch_size, input_size}); H_t = make_shared(element::f32, PartialShape::dynamic(Rank::dynamic())); - ASSERT_THROW(make_shared(X, H_t, W, R, hidden_size), - ngraph::NodeValidationFailure) - << "RNNCell node was created with invalid data."; + auto rnn_h = make_shared(X, H_t, W, R, hidden_size); + EXPECT_EQ(check_dynamic_rnn(rnn_h), true); // Invalid dynamic rank for R tensor. H_t = make_shared(element::f32, Shape{batch_size, hidden_size}); R = make_shared(element::f32, PartialShape::dynamic(Rank::dynamic())); - ASSERT_THROW(make_shared(X, H_t, W, R, hidden_size), - ngraph::NodeValidationFailure) - << "RNNCell node was created with invalid data."; + auto rnn_r = make_shared(X, H_t, W, R, hidden_size); + EXPECT_EQ(check_dynamic_rnn(rnn_r), true); // Invalid dynamic rank for B tensor. R = make_shared(element::f32, PartialShape{hidden_size, hidden_size}); auto B = make_shared(element::f32, PartialShape::dynamic(Rank::dynamic())); - ASSERT_THROW(make_shared(X, H_t, W, R, B, hidden_size), - ngraph::NodeValidationFailure) - << "RNNCell node was created with invalid data."; + auto rnn_b = make_shared(X, H_t, W, R, B, hidden_size); + EXPECT_EQ(check_dynamic_rnn(rnn_b), true); } diff --git a/ngraph/test/type_prop/rnn_sequence.cpp b/ngraph/test/type_prop/rnn_sequence.cpp new file mode 100644 index 00000000000000..a3dfb6c5b630cb --- /dev/null +++ b/ngraph/test/type_prop/rnn_sequence.cpp @@ -0,0 +1,62 @@ +//***************************************************************************** +// Copyright 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +#include "gtest/gtest.h" +#include "ngraph/ngraph.hpp" +#include "ngraph/opsets/opset4.hpp" +#include "util/type_prop.hpp" + +using namespace std; +using namespace ngraph; + +TEST(type_prop, rnn_sequence_forward) +{ + const size_t batch_size = 8; + const size_t num_directions = 1; + const size_t seq_length = 6; + const size_t input_size = 4; + const size_t hidden_size = 128; + + const auto X = + make_shared(element::f32, Shape{batch_size, seq_length, input_size}); + const auto initial_hidden_state = make_shared( + element::f32, Shape{batch_size, num_directions, hidden_size}); + const auto sequence_lengths = make_shared(element::i32, Shape{batch_size}); + + const auto W = make_shared(element::f32, + Shape{num_directions, hidden_size, input_size}); + const auto R = make_shared(element::f32, + Shape{num_directions, hidden_size, hidden_size}); + const auto B = make_shared(element::f32, Shape{num_directions, hidden_size}); + + const auto direction = op::RecurrentSequenceDirection::FORWARD; + + const auto sequence = make_shared( + X, initial_hidden_state, sequence_lengths, W, R, B, hidden_size, direction); + + EXPECT_EQ(sequence->get_hidden_size(), hidden_size); + EXPECT_EQ(sequence->get_direction(), op::RecurrentSequenceDirection::FORWARD); + EXPECT_TRUE(sequence->get_activations_alpha().empty()); + EXPECT_TRUE(sequence->get_activations_beta().empty()); + EXPECT_EQ(sequence->get_activations()[0], "tanh"); + EXPECT_EQ(sequence->get_clip(), 0.f); + 
EXPECT_EQ(sequence->get_output_element_type(0), element::f32); + EXPECT_EQ(sequence->outputs().size(), 2); + EXPECT_EQ(sequence->get_output_shape(0), + (Shape{batch_size, num_directions, seq_length, hidden_size})); + EXPECT_EQ(sequence->get_output_element_type(1), element::f32); + EXPECT_EQ(sequence->get_output_shape(1), (Shape{batch_size, num_directions, hidden_size})); +} From 8e6d9470bbe44abea1ce24bc2296e0f6a3a5efcd Mon Sep 17 00:00:00 2001 From: Andrey Dmitriev Date: Tue, 8 Sep 2020 10:46:10 +0300 Subject: [PATCH 22/66] [GNA] Handling input orientation (#1851) Added test Add fix --- .../src/gna_plugin/gna_graph_compiler.cpp | 5 ++ .../src/gna_plugin/gna_plugin.cpp | 55 +++++---------- .../gna_plugin/optimizer/gna_pass_manager.cpp | 6 +- .../handling_orientation_conv.cpp | 30 +++++++++ .../handling_orientation_conv.hpp | 31 +++++++++ .../handling_orientation_conv.cpp | 67 +++++++++++++++++++ 6 files changed, 155 insertions(+), 39 deletions(-) create mode 100644 inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/handling_orientation_conv.cpp create mode 100644 inference-engine/tests/functional/plugin/shared/include/subgraph_tests/handling_orientation_conv.hpp create mode 100644 inference-engine/tests/functional/plugin/shared/src/subgraph_tests/handling_orientation_conv.cpp diff --git a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp index 481d111a0f83a4..31372b5d4aab38 100644 --- a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp +++ b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp @@ -398,6 +398,11 @@ void GNAGraphCompiler::ConvolutionPrimitive(InferenceEngine::CNNLayerPtr layer) ptr_weights, ptr_biases); + if (inputs->getLayout() == Layout::NHWC) { + currentComponent.orientation_in = kDnnInterleavedOrientation; + currentComponent.orientation_out = kDnnInterleavedOrientation; + } + size_t num_data_bytes_out = 
InferenceEngine::details::product(begin(outputs->getDims()), end(outputs->getDims())) * outputs->getPrecision().size(); diff --git a/inference-engine/src/gna_plugin/gna_plugin.cpp b/inference-engine/src/gna_plugin/gna_plugin.cpp index ff5d1b5f747177..38c0108d9b2850 100644 --- a/inference-engine/src/gna_plugin/gna_plugin.cpp +++ b/inference-engine/src/gna_plugin/gna_plugin.cpp @@ -699,54 +699,35 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) { } } if (withConv) { - for (auto &layer : sortedNet) { - for (int i = 0; CNNNetHasPrevLayer(layer.get(), i); i++) { - auto prevLayer = CNNNetPrevLayer(layer.get(), i); - if (!skippedLayers.count(prevLayer->name)) { - if (CNNNetHasPrevLayer(prevLayer.get())) { - continue; - } + for (auto &inputLayer : sortedNet) { + if (!LayerInfo(inputLayer).isInput()) { + continue; + } + auto doesntHaveGnaMapping = [this] (CNNLayerPtr l) { + auto dnnLayer = graphCompiler.dnnComponents.findComponent(l); + return dnnLayer == nullptr; + }; - // we are in the one of input layers - if (LayerInfo(prevLayer).isMemory()) { - continue; - } - } - - auto dnnLayer = graphCompiler.dnnComponents.findComponent(layer); - string inputName = prevLayer->name; - std::vector inputs; - if (skippedLayers.count(prevLayer->name)) { - inputs = skippedLayers[prevLayer->name]; - } else { - inputs.push_back(inputName); - } + auto nextLayers = CNNNetGetAllNextLayersSkipCertain(inputLayer, -1, doesntHaveGnaMapping); + for (auto &nextLayer : nextLayers) { + auto dnnLayer = graphCompiler.dnnComponents.findComponent(nextLayer); // non functional layer - skipped by gna if (nullptr == dnnLayer) { - // storing input name for skipped layer - if (skippedLayers[inputName].size() == 0) { - skippedLayers[layer->name].push_back(inputName); - } else { - skippedLayers[layer->name] = skippedLayers[inputName]; - } - continue; + THROW_GNA_LAYER_EXCEPTION(inputLayer) << " gna mapped layer search connection failed"; } - // input orientation might be already initialized, thus verify 
that it matches - for (auto input : inputs) { - if (!inputsDesc->orientation_in.count(input)) { - inputsDesc->orientation_in[input] = dnnLayer->orientation_in; - } else { - if (inputsDesc->orientation_in[input] != dnnLayer->orientation_in) { - THROW_GNA_EXCEPTION << "orientation for input layer: " << input << "cannot be calculated"; - } + if (!inputsDesc->orientation_in.count(inputLayer->name)) { + inputsDesc->orientation_in[inputLayer->name] = dnnLayer->orientation_in; + } else { + if (inputsDesc->orientation_in[inputLayer->name] != dnnLayer->orientation_in) { + THROW_GNA_EXCEPTION << "orientation for input layer: " << inputLayer->name << "cannot be calculated"; } } } } } else { - for (auto& inputLayer : inputLayers) { + for (auto &inputLayer : inputLayers) { inputsDesc->orientation_in[inputLayer->name] = kDnnInterleavedOrientation; } } diff --git a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp index ca6e6a1d5272f0..760838b3ab81a2 100644 --- a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp +++ b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp @@ -1290,9 +1290,11 @@ void FuseMultipleIdentitiesPass::run() { if (LayerInfo(l).isNonFunctional() || LayerInfo(l).has32BInput()) continue; gnalog() << "CNNNetPrevLayer skip non functional from :: " << l->name; - auto prevLayersReached = CNNNetGetPrevLayersSkip(l, [](CNNLayerPtr ptr) { + auto isFunctional = [](CNNLayerPtr ptr) { return !LayerInfo(ptr).isNonFunctional(); - }); + }; + + auto prevLayersReached = CNNNetGetPrevLayersSkip(l, isFunctional); prevLayersReached.erase(std::remove_if(prevLayersReached.begin(), prevLayersReached.end(), [] (const std::pair & candidate) { diff --git a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/handling_orientation_conv.cpp b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/handling_orientation_conv.cpp new 
file mode 100644 index 00000000000000..959a9aa2325504 --- /dev/null +++ b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/handling_orientation_conv.cpp @@ -0,0 +1,30 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "subgraph_tests/handling_orientation_conv.hpp" +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16, +}; + +const std::vector> configs = { + { + {"GNA_SCALE_FACTOR_0", "1"}, + {"GNA_SCALE_FACTOR_1", "1"}, + {"GNA_COMPACT_MODE", "NO"}, + } +}; + +INSTANTIATE_TEST_CASE_P(handling_orientation, HandlingOrientationClass, + ::testing::Combine( + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_GNA), + ::testing::ValuesIn(configs)), + HandlingOrientationClass::getTestCaseName); + diff --git a/inference-engine/tests/functional/plugin/shared/include/subgraph_tests/handling_orientation_conv.hpp b/inference-engine/tests/functional/plugin/shared/include/subgraph_tests/handling_orientation_conv.hpp new file mode 100644 index 00000000000000..eeff3bebae8e9a --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/include/subgraph_tests/handling_orientation_conv.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + +#include "functional_test_utils/layer_test_utils.hpp" +#include "ngraph_functions/utils/ngraph_helpers.hpp" +#include "ngraph_functions/builders.hpp" + +namespace LayerTestsDefinitions { +typedef std::tuple< + InferenceEngine::Precision, //Network precision + std::string, //Device name + std::map //Configuration +> HandlingOrientationParams; + +class HandlingOrientationClass : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon { 
+public: + static std::string getTestCaseName(const testing::TestParamInfo &obj); + +protected: + void SetUp() override; +}; +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/shared/src/subgraph_tests/handling_orientation_conv.cpp b/inference-engine/tests/functional/plugin/shared/src/subgraph_tests/handling_orientation_conv.cpp new file mode 100644 index 00000000000000..be38326f132bcd --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/src/subgraph_tests/handling_orientation_conv.cpp @@ -0,0 +1,67 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include +#include +#include +#include +#include +#include "functional_test_utils/precision_utils.hpp" +#include "functional_test_utils/skip_tests_config.hpp" +#include "subgraph_tests/handling_orientation_conv.hpp" + +namespace LayerTestsDefinitions { + std::string HandlingOrientationClass::getTestCaseName(const testing::TestParamInfo &obj) { + InferenceEngine::Precision netPrecision; + std::string targetName; + std::map configuration; + std::tie(netPrecision, targetName, configuration) = obj.param; + std::ostringstream results; + + results << "netPRC=" << netPrecision.name() << "_"; + results << "targetDevice=" << targetName << "_"; + return results.str(); + } + + void HandlingOrientationClass::SetUp() { + InferenceEngine::Precision netPrecision; + std::tie(netPrecision, targetDevice, configuration) = this->GetParam(); + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + + auto params = ngraph::builder::makeParams(ngPrc, { {1, 336} , {1, 336}}); + + std::vector outFormShapes1 = { 1, 1, 168, 2 }; + std::vector outFormShapes2 = { 1, 336, 1, 1 }; + auto pattern1 = std::make_shared(ngraph::element::Type_t::i64, ngraph::Shape{ 4 }, outFormShapes1); + auto reshape1 = std::make_shared(params[0], pattern1, false); + + auto pattern2 = std::make_shared(ngraph::element::Type_t::i64, ngraph::Shape{ 4 }, 
outFormShapes2); + auto reshape2 = std::make_shared(params[1], pattern2, false); + + auto permute1 = std::make_shared(reshape1, + ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{ 4 }, { 0, 3, 1, 2 })); + + auto conv1 = ngraph::builder::makeConvolution(permute1, ngPrc, { 1, 8 }, { 1, 1 }, { 0, 0 }, { 0, 0 }, { 1, 1 }, + ngraph::op::PadType::VALID, 12); + + auto permute2 = std::make_shared(conv1, + ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{ 4 }, { 0, 2, 3, 1 })); + + auto conv2 = ngraph::builder::makeConvolution(reshape2, ngPrc, { 1, 1 }, { 1, 1 }, { 0, 0 }, { 0, 0 }, { 1, 1 }, + ngraph::op::PadType::VALID, 336); + + std::vector outFormShapes3 = { 1, 1932 }; + std::vector outFormShapes4 = { 1, 336 }; + auto pattern3 = std::make_shared(ngraph::element::Type_t::i64, ngraph::Shape{ 2 }, outFormShapes3); + auto pattern4 = std::make_shared(ngraph::element::Type_t::i64, ngraph::Shape{ 2 }, outFormShapes4); + auto reshape3 = std::make_shared(permute2, pattern3, false); + auto reshape4 = std::make_shared(conv2, pattern4, false); + ngraph::ResultVector results{ std::make_shared(reshape3), + std::make_shared(reshape4)}; + function = std::make_shared(results, params, "RemovePermutationPass"); + } + + TEST_P(HandlingOrientationClass, CompareWithRefs){ + Run(); + }; +} // namespace LayerTestsDefinitions From 6357ce83c5a0648410350ad95346d933d3628d84 Mon Sep 17 00:00:00 2001 From: Anna Alberska Date: Tue, 8 Sep 2020 09:57:44 +0200 Subject: [PATCH 23/66] [GNA] add support for NCHW & NHWC layouts for exporting output (#2031) * [GNA] add support for NCHW & NHWC ExportScores * fix cpplint --- .../src/gna_plugin/gna_plugin.cpp | 84 +++++++++++-------- .../remove_permutations_NHWC_to_NCHW_pass.cpp | 50 +++++++++++ 2 files changed, 98 insertions(+), 36 deletions(-) diff --git a/inference-engine/src/gna_plugin/gna_plugin.cpp b/inference-engine/src/gna_plugin/gna_plugin.cpp index 38c0108d9b2850..820fc70d9016c3 100644 --- 
a/inference-engine/src/gna_plugin/gna_plugin.cpp +++ b/inference-engine/src/gna_plugin/gna_plugin.cpp @@ -994,7 +994,8 @@ bool GNAPlugin::WaitFor(uint32_t request_idx, int64_t millisTimeout) { for (auto && outputBlobIt : request) { auto & outputBlob = outputBlobIt.second; auto & outputDesc = outputsDesc[output_idx]; - if (outputBlob->getTensorDesc().getLayout() == Layout::NC) { + if (outputBlob->getTensorDesc().getLayout() == Layout::NC || outputBlob->getTensorDesc().getLayout() == Layout::CN + || outputBlob->getTensorDesc().getLayout() == Layout::NCHW || outputBlob->getTensorDesc().getLayout() == Layout::NHWC) { // TODO: rotate can be incorporated with exporting - used only in unit tests so far // TODO: restore: // if (orientation_out != kDnnInterleavedOrientation) { @@ -1010,56 +1011,67 @@ bool GNAPlugin::WaitFor(uint32_t request_idx, int64_t millisTimeout) { // dims[0], // dims[dims.size() - 1]); // } + auto is2D = outputBlob->getTensorDesc().getLayout() == Layout::NC || outputBlob->getTensorDesc().getLayout() == Layout::CN; auto& exportOutputDims = outputBlob->getTensorDesc().getDims(); + auto batchSize = exportOutputDims[0]; + auto elementsPerBatch = is2D ? 
exportOutputDims[exportOutputDims.size() - 1] + : exportOutputDims[exportOutputDims.size() - 1] + * exportOutputDims[exportOutputDims.size() - 2] + * exportOutputDims[exportOutputDims.size() - 3]; + ExportScores(outputBlob->buffer(), outputDesc.ptrs[request_idx], outputDesc.orientation, - exportOutputDims[0], - exportOutputDims[exportOutputDims.size() - 2], - exportOutputDims[exportOutputDims.size() - 1], - exportOutputDims[exportOutputDims.size() - 1], - exportOutputDims[exportOutputDims.size() - 1], + batchSize, + batchSize, + elementsPerBatch, + elementsPerBatch, + elementsPerBatch, outputDesc.num_bytes_per_element, sizeof(float)); - } - if (gnadevice) { + if (gnadevice) { #ifdef PLOT - FILE *f = nullptr; - static int num_infers = 0; - { - f = fopen("ex_scores.txt", "w"); - } - num_infers++; - if (f) { - auto dims = outputBlob->getTensorDesc().getDims(); - for (int i = 0; i < dims[dims.size() - 2]; i++) { - for (int j = 0; j < dims[dims.size() - 1]; j++) { - fprintf(f, "%d ", outputBlob->cbuffer().as()[dims[dims.size() - 1] * i + j]); + FILE* f = nullptr; + static int num_infers = 0; + { + f = fopen("ex_scores.txt", "w"); } - fprintf(f, "\n"); + num_infers++; + if (f) { + auto dims = outputBlob->getTensorDesc().getDims(); + for (int i = 0; i < dims[dims.size() - 2]; i++) { + for (int j = 0; j < dims[dims.size() - 1]; j++) { + fprintf(f, "%d ", outputBlob->cbuffer().as()[dims[dims.size() - 1] * i + j]); + } + fprintf(f, "\n"); + } + fprintf(f, "\n\n"); } - fprintf(f, "\n\n"); - } #endif - ConvertToFloat(outputBlob->buffer(), - outputBlob->buffer(), - outputBlob->getTensorDesc().getDims()[outputBlob->getTensorDesc().getDims().size() - 1], - outputBlob->getTensorDesc().getDims()[outputBlob->getTensorDesc().getDims().size() - 2], - outputDesc.scale_factor); + ConvertToFloat(outputBlob->buffer(), + outputBlob->buffer(), + elementsPerBatch, + batchSize, + outputDesc.scale_factor); #ifdef PLOT - if (f) { - auto dims = outputBlob->getTensorDesc().getDims(); - for (int i 
= 0; i < dims[dims.size() - 2]; i++) { - for (int j = 0; j < dims[dims.size() - 1]; j++) { - fprintf(f, "%.2f ", outputBlob->cbuffer().as()[dims[dims.size() - 1] * i + j]); - } - fprintf(f, "\n"); + if (f) { + auto dims = outputBlob->getTensorDesc().getDims(); + for (int i = 0; i < dims[dims.size() - 2]; i++) { + for (int j = 0; j < dims[dims.size() - 1]; j++) { + fprintf(f, "%.2f ", outputBlob->cbuffer().as()[dims[dims.size() - 1] * i + j]); + } + fprintf(f, "\n"); + } + fclose(f); } - fclose(f); - } #endif + } + } else { + THROW_GNA_EXCEPTION << "Expected output blob to have Layout::NC, Layout::CN, Layout::NCHW or Layout::NHWC. But was " + << outputBlob->getTensorDesc().getLayout(); } + output_idx++; } return true; diff --git a/inference-engine/tests/functional/plugin/gna/pass_tests/remove_permutations_NHWC_to_NCHW_pass.cpp b/inference-engine/tests/functional/plugin/gna/pass_tests/remove_permutations_NHWC_to_NCHW_pass.cpp index 42b6c40c667a3b..65d0dfb7cc19ea 100644 --- a/inference-engine/tests/functional/plugin/gna/pass_tests/remove_permutations_NHWC_to_NCHW_pass.cpp +++ b/inference-engine/tests/functional/plugin/gna/pass_tests/remove_permutations_NHWC_to_NCHW_pass.cpp @@ -86,10 +86,53 @@ class RemovePermutationsNHWCToNCHWPassTest : public testing::WithParamInterface< } }; +class RemovePermutationsNHWCToNCHWPass4DOutputTest : public testing::WithParamInterface, + public LayerTestsUtils::LayerTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + InferenceEngine::Precision netPrecision; + std::string targetDevice; + std::map configuration; + std::tie(netPrecision, targetDevice, configuration) = obj.param; + + std::ostringstream result; + result << "netPRC=" << netPrecision.name() << "_"; + result << "targetDevice=" << targetDevice << "_"; + for (auto const& configItem : configuration) { + result << "_configItem=" << configItem.first << "_" << configItem.second; + } + return result.str(); + } + +protected: + void SetUp() 
override { + InferenceEngine::Precision netPrecision; + std::tie(netPrecision, targetDevice, configuration) = this->GetParam(); + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + + auto params = ngraph::builder::makeParams(ngPrc, { {1, 1, 168, 2} }); + auto permute1 = std::make_shared(params[0], + ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{ 4 }, { 0, 3, 1, 2 })); + + auto conv1 = ngraph::builder::makeConvolution(permute1, ngPrc, { 1, 8 }, { 1, 1 }, { 0, 0 }, { 0, 0 }, { 1, 1 }, ngraph::op::PadType::VALID, 12); + + auto permute2 = std::make_shared(conv1, + ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{ 4 }, { 0, 2, 3, 1 })); + + ngraph::ResultVector results{ std::make_shared(permute2) }; + + function = std::make_shared(results, params, "RemovePermutationPass4DOutput"); + } +}; + TEST_P(RemovePermutationsNHWCToNCHWPassTest, CompareWithRefImpl) { Run(); }; + TEST_P(RemovePermutationsNHWCToNCHWPass4DOutputTest, CompareWithRefImpl) { + Run(); + }; + const std::vector netPrecisions = { InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP16 @@ -109,5 +152,12 @@ class RemovePermutationsNHWCToNCHWPassTest : public testing::WithParamInterface< ::testing::ValuesIn(configs)), RemovePermutationsNHWCToNCHWPassTest::getTestCaseName); + INSTANTIATE_TEST_CASE_P(PermutationPass, RemovePermutationsNHWCToNCHWPass4DOutputTest, + ::testing::Combine( + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_GNA), + ::testing::ValuesIn(configs)), + RemovePermutationsNHWCToNCHWPass4DOutputTest::getTestCaseName); + } // namespace LayerTestsDefinitions From ef3b9e1d1f376ed7bde97aa70f8ab344df41213b Mon Sep 17 00:00:00 2001 From: Daria Mityagina Date: Tue, 8 Sep 2020 11:21:18 +0300 Subject: [PATCH 24/66] [IE][VPU]: myriad_compile doesn't work without device - fix (#2054) * Revert setting deprecated PLATFORM config option in compile tool --- 
inference-engine/tools/compile_tool/main.cpp | 7 ++++--- inference-engine/tools/vpu/vpu_compile/main.cpp | 4 ++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/inference-engine/tools/compile_tool/main.cpp b/inference-engine/tools/compile_tool/main.cpp index 2bc40c48bb5466..788ec7c23958b8 100644 --- a/inference-engine/tools/compile_tool/main.cpp +++ b/inference-engine/tools/compile_tool/main.cpp @@ -113,9 +113,6 @@ static bool parseCommandLine(int *argc, char ***argv, InferenceEngine::Core& ie) if (std::string::npos != FLAGS_d.find("MYRIAD")) { std::vector myriadDeviceIds = ie.GetMetric("MYRIAD", METRIC_KEY(AVAILABLE_DEVICES)); - if (myriadDeviceIds.empty()) { - throw std::runtime_error{"No available MYRIAD devices"}; - } } if (1 < *argc) { @@ -151,6 +148,10 @@ static std::map parseConfig(const std::string& configN static std::map configure(const std::string &configFile, const std::string &xmlFileName) { auto config = parseConfig(configFile); + IE_SUPPRESS_DEPRECATED_START + config[VPU_MYRIAD_CONFIG_KEY(PLATFORM)] = "VPU_MYRIAD_2480"; + IE_SUPPRESS_DEPRECATED_END + if (!FLAGS_VPU_NUMBER_OF_SHAVES.empty()) { config[InferenceEngine::MYRIAD_NUMBER_OF_SHAVES] = FLAGS_VPU_NUMBER_OF_SHAVES; } diff --git a/inference-engine/tools/vpu/vpu_compile/main.cpp b/inference-engine/tools/vpu/vpu_compile/main.cpp index 5ad660db63ac2b..1a3fc4ff6943d5 100644 --- a/inference-engine/tools/vpu/vpu_compile/main.cpp +++ b/inference-engine/tools/vpu/vpu_compile/main.cpp @@ -105,6 +105,10 @@ static bool parseCommandLine(int *argc, char ***argv) { static std::map configure(const std::string &configFile, const std::string &xmlFileName) { auto config = parseConfig(configFile); + IE_SUPPRESS_DEPRECATED_START + config[VPU_MYRIAD_CONFIG_KEY(PLATFORM)] = "VPU_MYRIAD_2480"; + IE_SUPPRESS_DEPRECATED_END + if (!FLAGS_VPU_NUMBER_OF_SHAVES.empty()) { config[InferenceEngine::MYRIAD_NUMBER_OF_SHAVES] = FLAGS_VPU_NUMBER_OF_SHAVES; } From 671ddeea93a46d25c799cd2b88cf3f1f940a9938 Mon Sep 17 
00:00:00 2001 From: Mikhail Letavin Date: Tue, 8 Sep 2020 11:48:11 +0300 Subject: [PATCH 25/66] [IE CLDNN] Switch back to old TI unroller to restore correct matching of TensorIterator output names (#2096) --- inference-engine/src/cldnn_engine/cldnn_engine.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inference-engine/src/cldnn_engine/cldnn_engine.cpp b/inference-engine/src/cldnn_engine/cldnn_engine.cpp index 14effa2a00f4dc..3d9b167076181e 100644 --- a/inference-engine/src/cldnn_engine/cldnn_engine.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_engine.cpp @@ -139,7 +139,8 @@ InferenceEngine::ICNNNetwork::Ptr clDNNEngine::CloneAndTransformNetwork(const In // Apply all transformations to TensorIterator body ti_manager.register_pass(manager); // Unroll will be called after all conversions - ti_manager.register_pass(); + // temporarily switch back to plugin unroller from NGraph unroller until TI output names are corrected + // ti_manager.register_pass(); ti_manager.run_passes(nGraphFunc); clonedNetwork = InferenceEngine::details::convertFunctionToICNNNetwork(nGraphFunc, *clonedNetwork); From 8d74accf52252e390a233770284c36c7390c0755 Mon Sep 17 00:00:00 2001 From: "Roman Vyunov (Intel)" Date: Tue, 8 Sep 2020 11:52:05 +0300 Subject: [PATCH 26/66] [IE][VPU]: Decompose Swish to Sigmoid + Prod (#2107) * Workaround to decompose Swish to Sigmoid + Multiply --- .../include/vpu/middleend/pass_manager.hpp | 1 + .../include/vpu/stage_builder.hpp | 15 +++++ .../src/middleend/pass_manager.cpp | 6 ++ .../src/middleend/passes/decompose_swish.cpp | 61 +++++++++++++++++++ .../graph_transformer/src/stages/eltwise.cpp | 16 +++++ .../graph_transformer/src/stages/sigmoid.cpp | 11 +++- 6 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 inference-engine/src/vpu/graph_transformer/src/middleend/passes/decompose_swish.cpp diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/middleend/pass_manager.hpp 
b/inference-engine/src/vpu/graph_transformer/include/vpu/middleend/pass_manager.hpp index 9eff84e4b74ebd..658ea626e63f6d 100644 --- a/inference-engine/src/vpu/graph_transformer/include/vpu/middleend/pass_manager.hpp +++ b/inference-engine/src/vpu/graph_transformer/include/vpu/middleend/pass_manager.hpp @@ -93,6 +93,7 @@ class PassManager final { // Model common adaptation // + Pass::Ptr decomposeSwish(); Pass::Ptr eliminateConstConcat(); Pass::Ptr splitGroupedConv(); Pass::Ptr splitConv3DInto2D(); diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/stage_builder.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/stage_builder.hpp index abd68f05bb08dc..cb3e57fd45e35f 100644 --- a/inference-engine/src/vpu/graph_transformer/include/vpu/stage_builder.hpp +++ b/inference-engine/src/vpu/graph_transformer/include/vpu/stage_builder.hpp @@ -297,6 +297,21 @@ class StageBuilder final { const std::string& name, const DataVector& inputs, const DataVector& outputs); + + Stage addSigmoidStage( + const Model& model, + const std::string& name, + const ie::CNNLayerPtr& layer, + const DataVector& inputs, + const DataVector& outputs); + + Stage addProdStage( + const Model& model, + const std::string& name, + const ie::CNNLayerPtr& layer, + const Data& input0, + const Data& input1, + const Data& output); }; } // namespace vpu diff --git a/inference-engine/src/vpu/graph_transformer/src/middleend/pass_manager.cpp b/inference-engine/src/vpu/graph_transformer/src/middleend/pass_manager.cpp index 9130b6682593fb..ce0a6451d0506a 100644 --- a/inference-engine/src/vpu/graph_transformer/src/middleend/pass_manager.cpp +++ b/inference-engine/src/vpu/graph_transformer/src/middleend/pass_manager.cpp @@ -87,6 +87,12 @@ PassSet::Ptr PassManager::buildMiddleEnd() { // initial dump pass must be the first dump ADD_DUMP_PASS("initial"); + // + // Decompose swish layer to Sigmoid + Multiply + // + ADD_PASS(decomposeSwish); + ADD_DUMP_PASS("decomposeSwish"); + // // Convert shape 
notation // diff --git a/inference-engine/src/vpu/graph_transformer/src/middleend/passes/decompose_swish.cpp b/inference-engine/src/vpu/graph_transformer/src/middleend/passes/decompose_swish.cpp new file mode 100644 index 00000000000000..f5c84d06277c41 --- /dev/null +++ b/inference-engine/src/vpu/graph_transformer/src/middleend/passes/decompose_swish.cpp @@ -0,0 +1,61 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +namespace vpu { + +namespace { + +class PassImpl final : public Pass { +public: + explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : + _stageBuilder(stageBuilder) { + } + + void run(const Model& model) override; + +private: + StageBuilder::Ptr _stageBuilder; +}; + +void PassImpl::run(const Model& model) { + VPU_PROFILE(decomposeSwish); + + for (const auto& swish : model->getStages()) { + if (swish->type() != StageType::Swish) { + continue; + } + const auto inputData = swish->input(0); + const auto outputData = swish->output(0); + const auto name = swish->name(); + const auto& layer = swish->origLayer(); + + model->removeStage(swish); + + const auto sigmoidOutput = model->addNewData(inputData->name() + "@sigmoid", inputData->desc()); + + _stageBuilder->addSigmoidStage( + model, + name + "@sigmoid", + layer, + {inputData}, + {sigmoidOutput}); + _stageBuilder->addProdStage( + model, + name + "@prod", + layer, + inputData, + sigmoidOutput, + outputData); + } +} + +} // namespace + +Pass::Ptr PassManager::decomposeSwish() { + return std::make_shared(_stageBuilder); +} + +} // namespace vpu diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/eltwise.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/eltwise.cpp index 065706302997e1..b59e16de275ef5 100644 --- a/inference-engine/src/vpu/graph_transformer/src/stages/eltwise.cpp +++ b/inference-engine/src/vpu/graph_transformer/src/stages/eltwise.cpp @@ -350,6 +350,22 @@ Stage StageBuilder::addSumStage( {output}); } +Stage 
StageBuilder::addProdStage( + const Model& model, + const std::string& name, + const ie::CNNLayerPtr& layer, + const Data& input0, + const Data& input1, + const Data& output) { + const Data& fakeInput2 = model->addFakeData(); + return model->addNewStage( + name, + StageType::Prod, + layer, + {input0, input1, fakeInput2}, + {output}); +} + Stage StageBuilder::addMaxStage( const Model& model, const std::string& name, diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/sigmoid.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/sigmoid.cpp index bf3df84ca06a1d..c044230c1baed8 100644 --- a/inference-engine/src/vpu/graph_transformer/src/stages/sigmoid.cpp +++ b/inference-engine/src/vpu/graph_transformer/src/stages/sigmoid.cpp @@ -33,7 +33,16 @@ void FrontEnd::parseSigmoid(const Model& model, const ie::CNNLayerPtr& layer, co IE_ASSERT(inputs.size() == 1); IE_ASSERT(outputs.size() == 1); - model->addNewStage(layer->name, StageType::Sigmoid, layer, inputs, outputs); + _stageBuilder->addSigmoidStage(model, layer->name, layer, inputs, outputs); +} + +Stage StageBuilder::addSigmoidStage( + const Model& model, + const std::string& name, + const ie::CNNLayerPtr& layer, + const DataVector& inputs, + const DataVector& outputs) { + return model->addNewStage(name, StageType::Sigmoid, layer, inputs, outputs); } } // namespace vpu From f4cb425396022501e53a85fb55f6423f61b43a13 Mon Sep 17 00:00:00 2001 From: Jan Iwaszkiewicz Date: Tue, 8 Sep 2020 13:11:05 +0200 Subject: [PATCH 27/66] [nGraph] Enable u1 data type in Constant PyAPI (#2084) --- ngraph/core/src/type/element_type.cpp | 1 + ngraph/python/src/pyngraph/ops/constant.cpp | 4 ++-- ngraph/python/src/pyngraph/types/element_type.cpp | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ngraph/core/src/type/element_type.cpp b/ngraph/core/src/type/element_type.cpp index a807bb56a7c197..828c3b7c787760 100644 --- a/ngraph/core/src/type/element_type.cpp +++ b/ngraph/core/src/type/element_type.cpp @@ 
-357,6 +357,7 @@ namespace ngraph {"i16", element::Type_t::i16}, {"i32", element::Type_t::i32}, {"i64", element::Type_t::i64}, + {"u1", element::Type_t::u1}, {"u8", element::Type_t::u8}, {"u16", element::Type_t::u16}, {"u32", element::Type_t::u32}, diff --git a/ngraph/python/src/pyngraph/ops/constant.cpp b/ngraph/python/src/pyngraph/ops/constant.cpp index 4b3265f8ce5eaf..1f6dd6b08504af 100644 --- a/ngraph/python/src/pyngraph/ops/constant.cpp +++ b/ngraph/python/src/pyngraph/ops/constant.cpp @@ -149,7 +149,7 @@ void regclass_pyngraph_op_Constant(py::module m) { return _cast_vector(self); } - else if (element_type == ngraph::element::u8) + else if (element_type == ngraph::element::u8 || element_type == ngraph::element::u1) { return _cast_vector(self); } @@ -206,7 +206,7 @@ void regclass_pyngraph_op_Constant(py::module m) { return _get_buffer_info(self); } - else if (element_type == ngraph::element::u8) + else if (element_type == ngraph::element::u8 || element_type == ngraph::element::u1) { return _get_buffer_info(self); } diff --git a/ngraph/python/src/pyngraph/types/element_type.cpp b/ngraph/python/src/pyngraph/types/element_type.cpp index 67ed479e5e7f1a..ce72aacc7158c7 100644 --- a/ngraph/python/src/pyngraph/types/element_type.cpp +++ b/ngraph/python/src/pyngraph/types/element_type.cpp @@ -35,6 +35,7 @@ void regclass_pyngraph_Type(py::module m) type.attr("i16") = ngraph::element::i16; type.attr("i32") = ngraph::element::i32; type.attr("i64") = ngraph::element::i64; + type.attr("u1") = ngraph::element::u1; type.attr("u8") = ngraph::element::u8; type.attr("u16") = ngraph::element::u16; type.attr("u32") = ngraph::element::u32; From 6fb9bfac2a605609fcb7fd815cdf076df60cc46d Mon Sep 17 00:00:00 2001 From: Mateusz Tabaka Date: Tue, 8 Sep 2020 16:04:08 +0200 Subject: [PATCH 28/66] Update tolerance for candy model (#1967) --- ngraph/python/tests/test_onnx/test_additional_models.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/ngraph/python/tests/test_onnx/test_additional_models.py b/ngraph/python/tests/test_onnx/test_additional_models.py index 13d04f944de83e..f8665e80696829 100644 --- a/ngraph/python/tests/test_onnx/test_additional_models.py +++ b/ngraph/python/tests/test_onnx/test_additional_models.py @@ -45,6 +45,7 @@ def _get_default_additional_models_dir(): "pointilism": {"atol": 0.001, "rtol": 0.001}, "rain_princess": {"atol": 0.001, "rtol": 0.001}, "udnie": {"atol": 0.001, "rtol": 0.001}, + "candy": {"atol": 0.003, "rtol": 0.003}, } zoo_models = [] From 2c6cceeeb627ff96b584a6e4e107193cd2ef9c74 Mon Sep 17 00:00:00 2001 From: Artyom Anokhov Date: Tue, 8 Sep 2020 17:23:27 +0300 Subject: [PATCH 29/66] Added code owners for scripts folder (#2130) --- CODEOWNERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CODEOWNERS b/CODEOWNERS index 47aec85726340a..f6cef156f27196 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -13,6 +13,9 @@ azure-pipelines.yml @openvinotoolkit/openvino-admins # QA Tests: /tests/ @openvinotoolkit/openvino-tests-maintainers +# OpenVINO Scripts +/scripts/ @openvinotoolkit/openvino-admins @openvinotoolkit/openvino-scripts-maintainers + # IE Core: /inference-engine/ @openvinotoolkit/openvino-ie-maintainers /inference-engine/ie_bridges/python @openvinotoolkit/openvino-ie-python-api-maintainers From 867340e8f1801f31fd8d1d4f3916c3f90f2e4a81 Mon Sep 17 00:00:00 2001 From: Vitaliy Urusovskij Date: Wed, 9 Sep 2020 00:13:07 +0300 Subject: [PATCH 30/66] Add `runPipeline` wrapper to mandatory track full run (#2100) --- tests/time_tests/common/main.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/time_tests/common/main.cpp b/tests/time_tests/common/main.cpp index 0a05145361b9dc..e03866529a9268 100644 --- a/tests/time_tests/common/main.cpp +++ b/tests/time_tests/common/main.cpp @@ -27,6 +27,15 @@ bool parseAndCheckCommandLine(int argc, char **argv) { } +/** +* @brief Function calls `runPipeline` with mandatory time tracking of full run 
+*/ +int _runPipeline() { + SCOPED_TIMER(full_run); + return runPipeline(FLAGS_m, FLAGS_d); +} + + /** * @brief Main entry point */ @@ -34,5 +43,5 @@ int main(int argc, char **argv) { if (!parseAndCheckCommandLine(argc, argv)) return -1; - return runPipeline(FLAGS_m, FLAGS_d); + return _runPipeline(); } \ No newline at end of file From 5ad481179354e8d6697ff6bd28c16ece8bc56c11 Mon Sep 17 00:00:00 2001 From: Evgeny Latkin Date: Wed, 9 Sep 2020 03:50:40 +0300 Subject: [PATCH 31/66] [VPU][OpenCL] Update custom kernels (#2131) * [Custom CL] Updated OpenCL kernels and tests * [Custom CL] Update OpenCL compiler * Update firmware to 1365 * Disable ExpGenerateProposals tests * VPU: new firmware no. 1370 * Myriad: re-enable ExpGenerateProposals tests Co-authored-by: Maxim Kurin --- inference-engine/cmake/vpu_dependencies.cmake | 4 +- .../src/vpu/common/src/utils/simple_math.cpp | 8 +- .../src/vpu/custom_kernels/binarization.cl | 67 +++ .../vpu/custom_kernels/binary_convolution.cl | 95 +++ .../custom_kernels/binary_convolution1x1.cl | 215 +++---- .../custom_kernels/binary_convolution3x3.cl | 424 ++++++-------- .../src/vpu/custom_kernels/binary_layers.cl | 339 ----------- .../src/vpu/custom_kernels/convolution1x1.cl | 281 --------- .../vpu/custom_kernels/convolution1x1_chw.cl | 114 ++++ .../vpu/custom_kernels/convolution1x1_hwc.cl | 126 ++++ .../src/vpu/custom_kernels/convolution3x3.cl | 198 +++---- .../src/vpu/custom_kernels/correlate.cl | 552 +++++++++--------- .../src/vpu/custom_kernels/ctc.cl | 177 ++---- .../custom_kernels/customLayerBindings.xml | 216 +++---- .../src/vpu/custom_kernels/cvtu8f16.cl | 108 ++-- .../detectron_prior_grid_gen.cl | 117 ++-- .../src/vpu/custom_kernels/fakequantize.cl | 111 ++++ .../src/vpu/custom_kernels/grn.cl | 138 ++--- .../src/vpu/custom_kernels/mvn.cl | 390 ------------- .../src/vpu/custom_kernels/mvn_reduction.cl | 115 ++++ .../src/vpu/custom_kernels/mvn_scale.cl | 68 +++ .../src/vpu/custom_kernels/quantize.cl | 176 ------ 
.../src/vpu/custom_kernels/region.cl | 474 --------------- .../src/vpu/custom_kernels/region_chw.cl | 135 +++-- .../custom_kernels/region_chw_m7_branch0.cl | 58 -- .../custom_kernels/region_chw_m7_branch1.cl | 43 -- .../src/vpu/custom_kernels/region_hwc.cl | 114 ++++ .../src/vpu/custom_kernels/reorg_chw.cl | 144 ++--- .../src/vpu/custom_kernels/reorg_chw_local.cl | 40 -- .../src/vpu/custom_kernels/reorg_chw_stack.cl | 45 -- .../src/vpu/custom_kernels/reorg_hwc.cl | 144 ++--- .../src/vpu/custom_kernels/reorg_hwc_naive.cl | 53 ++ .../src/vpu/custom_kernels/resample_AA.cl | 122 ++++ .../src/vpu/custom_kernels/resample_nn.cl | 173 ------ .../src/vpu/custom_kernels/resample_noAA.cl | 112 ++++ .../custom_kernels/resample_with_antialias.cl | 245 -------- .../vpu/custom_kernels/shuffle_channels.cl | 26 +- inference-engine/src/vpu/custom_kernels/st.cl | 295 +++++----- .../include/vpu/frontend/ShaveElfMetadata.h | 188 ++++++ .../vpu/frontend/ShaveElfMetadataParser.h | 225 +++++++ .../src/frontend/ShaveElfMetadataParser.cpp | 93 +++ .../src/frontend/custom_kernel.cpp | 187 +++--- .../graph_transformer/src/stages/custom.cpp | 2 +- .../layers/myriad_layers_custom_test.cpp | 2 +- .../layers/myriad_layers_custom_test.hpp | 8 +- .../layers/myriad_layers_region_test.cpp | 19 +- .../layers/myriad_layers_reorg_test.cpp | 14 +- .../layers/myriad_layers_reorg_test.hpp | 6 + .../layers/myriad_layers_resample_test.cpp | 19 +- 49 files changed, 2950 insertions(+), 4075 deletions(-) create mode 100644 inference-engine/src/vpu/custom_kernels/binarization.cl create mode 100644 inference-engine/src/vpu/custom_kernels/binary_convolution.cl delete mode 100644 inference-engine/src/vpu/custom_kernels/binary_layers.cl delete mode 100644 inference-engine/src/vpu/custom_kernels/convolution1x1.cl create mode 100644 inference-engine/src/vpu/custom_kernels/convolution1x1_chw.cl create mode 100644 inference-engine/src/vpu/custom_kernels/convolution1x1_hwc.cl create mode 100644 
inference-engine/src/vpu/custom_kernels/fakequantize.cl delete mode 100644 inference-engine/src/vpu/custom_kernels/mvn.cl create mode 100644 inference-engine/src/vpu/custom_kernels/mvn_reduction.cl create mode 100644 inference-engine/src/vpu/custom_kernels/mvn_scale.cl delete mode 100644 inference-engine/src/vpu/custom_kernels/quantize.cl delete mode 100644 inference-engine/src/vpu/custom_kernels/region.cl delete mode 100644 inference-engine/src/vpu/custom_kernels/region_chw_m7_branch0.cl delete mode 100644 inference-engine/src/vpu/custom_kernels/region_chw_m7_branch1.cl create mode 100644 inference-engine/src/vpu/custom_kernels/region_hwc.cl delete mode 100644 inference-engine/src/vpu/custom_kernels/reorg_chw_local.cl delete mode 100644 inference-engine/src/vpu/custom_kernels/reorg_chw_stack.cl create mode 100644 inference-engine/src/vpu/custom_kernels/reorg_hwc_naive.cl create mode 100644 inference-engine/src/vpu/custom_kernels/resample_AA.cl delete mode 100644 inference-engine/src/vpu/custom_kernels/resample_nn.cl create mode 100644 inference-engine/src/vpu/custom_kernels/resample_noAA.cl delete mode 100644 inference-engine/src/vpu/custom_kernels/resample_with_antialias.cl create mode 100644 inference-engine/src/vpu/graph_transformer/include/vpu/frontend/ShaveElfMetadata.h create mode 100644 inference-engine/src/vpu/graph_transformer/include/vpu/frontend/ShaveElfMetadataParser.h create mode 100644 inference-engine/src/vpu/graph_transformer/src/frontend/ShaveElfMetadataParser.cpp diff --git a/inference-engine/cmake/vpu_dependencies.cmake b/inference-engine/cmake/vpu_dependencies.cmake index e17ada43d53a83..6433c9aad0889a 100644 --- a/inference-engine/cmake/vpu_dependencies.cmake +++ b/inference-engine/cmake/vpu_dependencies.cmake @@ -19,8 +19,8 @@ set(VPU_SUPPORTED_FIRMWARES usb-ma2450 usb-ma2x8x pcie-ma248x) # Default packages # -set(FIRMWARE_PACKAGE_VERSION 1360) -set(VPU_CLC_MA2X8X_VERSION "movi-cltools-20.02.0") +set(FIRMWARE_PACKAGE_VERSION 1370) 
+set(VPU_CLC_MA2X8X_VERSION "movi-cltools-20.09.0") # # CMake variables to override default firmware files diff --git a/inference-engine/src/vpu/common/src/utils/simple_math.cpp b/inference-engine/src/vpu/common/src/utils/simple_math.cpp index 79a8179cefbadb..d8669f6c16b356 100644 --- a/inference-engine/src/vpu/common/src/utils/simple_math.cpp +++ b/inference-engine/src/vpu/common/src/utils/simple_math.cpp @@ -65,9 +65,14 @@ void MathExpression::parse(const std::string& expression) { // parse number if (std::isdigit(*it)) { size_t len = 0; + // parse number and use its length const auto value = std::stof(&*it, &len); + (void) value; + // copy sub string that represents a number + auto substring = std::string{it, it + len}; - _parsedTokens.emplace_back(TokenType::Value, ValueType{value}, ""); + auto token = Token{TokenType::Value, ValueType{substring}, ""}; + _parsedTokens.push_back(std::move(token)); std::advance(it, len - 1); continue; @@ -84,6 +89,7 @@ void MathExpression::parse(const std::string& expression) { tokenStack.push(token); continue; } + if (_vars.find(token) != _vars.end()) { _parsedTokens.emplace_back(TokenType::Value, ValueType{_vars.at(token)}, ""); continue; diff --git a/inference-engine/src/vpu/custom_kernels/binarization.cl b/inference-engine/src/vpu/custom_kernels/binarization.cl new file mode 100644 index 00000000000000..4572d43dfb326d --- /dev/null +++ b/inference-engine/src/vpu/custom_kernels/binarization.cl @@ -0,0 +1,67 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable + +__kernel void binarization( + const __global half *__restrict src_data, + const __global half *__restrict input_low_high, + const __global half *__restrict dst_data, + int switch_out, + int input_low_high_size, + int W, + int H) +{ + __local half local_src[15 * 1024]; + __local half local_dst[15 * 1024]; + + event_t e1 
= async_work_group_copy(local_src, src_data + get_group_id(2) * W * H, W * H, 0); + wait_group_events(1, &e1); + + int c = get_global_id(2); + int C = get_global_size(2); + + half dst_low = switch_out ? 1.h : -1.h; + half dst_high = switch_out ? -1.h : 1.h; + + half s_ilow_ihigh = input_low_high_size == 1 ? input_low_high[0] : input_low_high[c]; + + for (int h = 0; h < H; h++) { + + __local const half *__restrict addr_src = local_src + h * W; + __local half *__restrict addr_dst = local_dst + h * W; + +#if 1 + for (int w = 0; w < W / 8; w++) { + + half8 h_src_val8 = (*((__local half8 *)addr_src + w)); + + short8 cond1; + cond1.s0 = (h_src_val8.s0 <= s_ilow_ihigh); + cond1.s1 = (h_src_val8.s1 <= s_ilow_ihigh); + cond1.s2 = (h_src_val8.s2 <= s_ilow_ihigh); + cond1.s3 = (h_src_val8.s3 <= s_ilow_ihigh); + cond1.s4 = (h_src_val8.s4 <= s_ilow_ihigh); + cond1.s5 = (h_src_val8.s5 <= s_ilow_ihigh); + cond1.s6 = (h_src_val8.s6 <= s_ilow_ihigh); + cond1.s7 = (h_src_val8.s7 <= s_ilow_ihigh); + + cond1 = ~(cond1 - (short8)1); + + short8 res = cond1 & as_short8((half8)dst_low) | ~cond1 & as_short8((half8)dst_high); + + *((__local half8 *)addr_dst + w) = as_half8(res); + } +#endif + for (int w = W & (~0x7); w < W; w++) { + addr_dst[w] = (addr_src[w] <= s_ilow_ihigh) ? 
dst_low : dst_high; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + event_t e2 = async_work_group_copy(dst_data + get_group_id(2) * W * H, local_dst, W * H, 0); + wait_group_events(1, &e2); +} diff --git a/inference-engine/src/vpu/custom_kernels/binary_convolution.cl b/inference-engine/src/vpu/custom_kernels/binary_convolution.cl new file mode 100644 index 00000000000000..b5ada6bff2a941 --- /dev/null +++ b/inference-engine/src/vpu/custom_kernels/binary_convolution.cl @@ -0,0 +1,95 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +int extract_weights(uchar val, int bit) { return ((val >> bit) & 1); } + +__kernel void binary_convolution( + const __global half *restrict src_data, + const __global uchar *restrict weights_data, + __global half *restrict dst_data, + float pad_value, + + int IW, + int IH, + int IC, + + int DW, + int DH, + + int GC, + + int KW, + int KH, + + int PW, + int PH, + + int SW, + int SH) +{ + int ipad_value = ((pad_value > 0.f) ? 
1 : 0); + int c = get_global_id(2); + int y = get_global_id(1); + int x = get_global_id(0); + + int OC = get_global_size(2); + int OH = get_global_size(1); + int OW = get_global_size(0); + + int KD = 1; + int SD = 0; + int DD = 0; + int PD = 0; + int ID = 1; + int OD = 1; + + int nbits = 8; + + int g = c % GC; + int oc = c / GC; + int oh = y; + int ow = x; + + for (int od = 0; od < OD; od++) { + int oidx = g * OC / GC * OD * OH * OW + oc * OD * OH * OW + od * OH * OW + oh * OW + ow; + + int res = 0; + + for (int ic = 0; ic < IC / GC; ic++) { + for (int kd = 0; kd < KD; kd++) { + for (int kh = 0; kh < KH; kh++) { + for (int kw = 0; kw < KW; kw++) { + int widx = g * OC / GC * IC / GC * KD * KH * KW + + oc * IC / GC * KD * KH * KW + ic * KD * KH * KW + kd * KH * KW + + kh * KW + kw; + + int w = extract_weights(weights_data[widx / nbits], (widx % nbits)); + + int s; + + int iw = ow * SW - PW + kw * DW; + int ih = oh * SH - PH + kh * DH; + int id = od * SD - PD + kd * DD; + + if (iw < 0 || iw >= (int)IW || ih < 0 || ih >= (int)IH || id < 0 + || id >= (int)ID) { + s = ipad_value; + } else { + int iidx = g * IC / GC * ID * IH * IW + ic * ID * IH * IW + id * IH * IW + + ih * IW + iw; + + s = ((src_data[iidx] > 0.f) ? 
1 : 0); + } + + res += s ^ w; + } + } + } + } + + dst_data[oidx] = (half)(IC / GC * KD * KH * KW - 2 * res); + } +} diff --git a/inference-engine/src/vpu/custom_kernels/binary_convolution1x1.cl b/inference-engine/src/vpu/custom_kernels/binary_convolution1x1.cl index 05bd7e75785833..500574dd6280e8 100644 --- a/inference-engine/src/vpu/custom_kernels/binary_convolution1x1.cl +++ b/inference-engine/src/vpu/custom_kernels/binary_convolution1x1.cl @@ -3,186 +3,115 @@ // #pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable -ushort extract_weights(uchar val, int bit) -{ - return ((val >> bit) & 1); -} +ushort extract_weights(uchar val, int bit) { return ((val >> bit) & 1); } __kernel void binary_convolution( - const __global half* restrict src_data, - const __global uchar* restrict weights_data, - const __global half* restrict dst_data, - float pad_value, + const __global half *restrict src_data, + const __global uchar *restrict weights_data, + __global half *restrict dst_data, + float pad_value, - int IW, - int IH, - int IC, + int IW, + int IH, + int IC, - int DW, - int DH, + int DW, + int DH, - int GC, + int GC, - int KW, - int KH, + int KW, + int KH, - int PW, - int PH, + int PW, + int PH, - int SW, - int SH, + int SW, + int SH, - int OW, - const __local half* restrict src_local, - __local half* restrict dst_local) + int OW) { - int oh = get_global_id(0); - int oc = get_global_id(1); - int OH = get_global_size(0); - int OC = get_global_size(1); + __local half src_local[32 * 1024]; + __local half dst_local[2 * 1024]; + + const int oh = get_group_id(0); + const int oc = get_group_id(1); + const int OH = get_global_size(0); + const int OC = get_global_size(1); + + const int gc = oc / (OC / GC); + + if (oh * SH >= 0 && oh * SH <= IH - 1) { + const __global half *src = src_data + (gc * IC / GC) * IW * IH + (SH * oh) * IW; + + event_t e1 = async_work_group_copy_2D2D( + src_local, // dst + src, // src + IW, // 
num_elements_per_line, + IC / GC, // num_lines, + IH * IW - IW, // src_line_stride, + 0, // dst_line_stride, + 0); + wait_group_events(1, &e1); + } half pad_value_half = convert_half(pad_value); //padding row - if (oh * SH > IH - 1) - { - __local half* dst = src_local; - for(int c = 0; c < IC/GC; c++) - { + if (oh * SH > IH - 1) { + __local half *dst = src_local; + for (int c = 0; c < IC / GC; c++) { #pragma unroll 8 - for(int j = 0; j < IW; j++) - { + for (int j = 0; j < IW; j++) { dst[j] = pad_value_half; } dst += IW; } - } - + } + int OWS = SW * OW; ushort8 in; - for (int ows8 = 0; ows8 < (OWS+7)/8; ows8++) - { + for (int ows8 = 0; ows8 < (OWS + 7) / 8; ows8++) { ushort8 val = {0, 0, 0, 0, 0, 0, 0, 0}; - for (int ic = 0; ic < IC/GC; ++ic) - { - __local half* src = (__local half*)((__local half8*)(src_local + ic * IW) + ows8); - int weight_pos = oc * IC/GC + ic; - ushort w = extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8)); - - if ((ows8 * 8) <= IW - 1) - { - in = *((__local ushort8*)(src)); + for (int ic = 0; ic < IC / GC; ++ic) { + __local half *src = (__local half *)((__local half8 *)(src_local + ic * IW) + ows8); + int weight_pos = oc * IC / GC + ic; + ushort w = + extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8)); + + if ((ows8 * 8) <= IW - 1) { + in = *((__local ushort8 *)(src)); } //padding column - if (ows8 * 8 + 7 > IW - 1) - { + if (ows8 * 8 + 7 > IW - 1) { int boundary = (IW - 1) - ows8 * 8 + 1; - boundary = boundary < 0 ? 0 : boundary; - for (int offset = boundary; offset < 8; offset++) - { - *((half*)(&in) + offset) = pad_value_half; + boundary = boundary < 0 ? 0 : boundary; + for (int offset = boundary; offset < 8; offset++) { + *((half *)(&in) + offset) = pad_value_half; } } ushort8 w8 = (ushort8)(w); - ushort8 cond = (((in) < (ushort8)0x8000) && (in > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0); - + ushort8 cond = + (((in) < (ushort8)0x8000) && (in > (ushort8)0x0000)) ? 
(ushort8)(1) : (ushort8)(0); + val += (cond ^ w8); - } - + } + ushort8 val_shift = val << 1; - int boundary = (ows8 * 8 + 7) / SW < OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1; - for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) - { - *(dst_local + ow) = (half)(IC/GC - *((ushort*)(&val_shift) + ow * SW - ows8 * 8)); + int boundary = (ows8 * 8 + 7) / SW < OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1; + for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) { + *(dst_local + ow) = (half)(IC / GC - *((ushort *)(&val_shift) + ow * SW - ows8 * 8)); } } -} - -__kernel void __dma_preload_binary_convolution( - const __global half* restrict src_data, - const __global uchar* restrict weights_data, - const __global half* restrict dst_data, - float pad_value, - - int IW, - int IH, - int IC, - - int DW, - int DH, - - int GC, - int KW, - int KH, + barrier(CLK_LOCAL_MEM_FENCE); - int PW, - int PH, - - int SW, - int SH, - - int OW, - __local half* restrict src_local, - const __local half* restrict dst_local) -{ - const int oh = get_group_id(0); - const int oc = get_group_id(1); - const int OC = get_global_size(1); - - const int gc = oc / (OC/GC); - - if (oh * SH >= 0 && oh * SH <= IH - 1) - { - const __global half* src = src_data + (gc * IC/GC) * IW * IH + (SH * oh) * IW; - WorkGroupDmaCreateStrideTransaction( - src, // src - src_local, // dst - IW * sizeof(half), // src width - IW * sizeof(half), // dst width - IH * IW * sizeof(half), // src stride - IW * sizeof(half), // dst stride - IW * IC/GC * sizeof(half), //total size - 0 - ); - } + event_t e2 = async_work_group_copy(dst_data + oc * OW * OH + oh * OW, dst_local, OW, 0); + wait_group_events(1, &e2); } -__kernel void __dma_postwrite_binary_convolution( - const __global half* restrict src_data, - const __global uchar* restrict weights_data, - __global half* restrict dst_data, - float pad_value, - - int IW, - int IH, - int IC, - - int DW, - int DH, - - int GC, - - int KW, - int KH, - - int PW, - int PH, - - int SW, - int 
SH, - - int OW, - const __local half* restrict src_local, - const __local half* restrict dst_local) -{ - const int oh = get_group_id(0); - const int oc = get_group_id(1); - const int OH = get_global_size(0); - - async_work_group_copy(dst_data + oc*OW*OH + oh*OW, dst_local, OW, 0); -} \ No newline at end of file diff --git a/inference-engine/src/vpu/custom_kernels/binary_convolution3x3.cl b/inference-engine/src/vpu/custom_kernels/binary_convolution3x3.cl index db23c37f4dda7e..7c4958663dcfea 100644 --- a/inference-engine/src/vpu/custom_kernels/binary_convolution3x3.cl +++ b/inference-engine/src/vpu/custom_kernels/binary_convolution3x3.cl @@ -3,82 +3,131 @@ // #pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable -ushort extract_weights(uchar val, int bit) -{ - return ((val >> bit) & 1); -} +ushort extract_weights(uchar val, int bit) { return ((val >> bit) & 1); } __kernel void binary_convolution( - const __global half* restrict src_data, - const __global uchar* restrict weights_data, - const __global half* restrict dst_data, - float pad_value, + const __global half *restrict src_data, + const __global uchar *restrict weights_data, + __global half *restrict dst_data, + float pad_value, - int IW, - int IH, - int IC, + int IW, + int IH, + int IC, - int DW, - int DH, + int DW, + int DH, - int GC, + int GC, - int KW, - int KH, + int KW, + int KH, - int PW, - int PH, + int PW, + int PH, - int SW, - int SH, + int SW, + int SH, - int OW, - const __local half* restrict src_local, - __local half* restrict dst_local) + int OW) { - int oh = get_global_id(0); - int oc = get_global_id(1); - int OH = get_global_size(0); - int OC = get_global_size(1); + __local half src_local[32 * 1024]; + __local half dst_local[2 * 1024]; - half pad_value_half = convert_half(pad_value); + const int oh = get_group_id(0); + const int oc = get_group_id(1); + const int OH = get_global_size(0); + const int OC = get_global_size(1); - //padding 
row - if (oh * SH - 1 < 0 || oh * SH - 1 > IH - 1) + const int gc = oc / (OC / GC); + + if (oh * SH - 1 >= 0 && oh * SH + DH + DH - 1 <= IH - 1) //dma for 3 rows { - __local half* dst = src_local; - for(int c = 0; c < IC/GC; c++) + event_t e = async_work_group_copy_3D3D( + src_local, // dst + src_data + (gc * IC / GC) * IW * IH + (SH * oh - 1) * IW, // src + IW, // num_elements_per_line + 3, // num_lines + DH * IW - IW, // src_line_stride + 0, // dst_line_stride + IC / GC, // num planes + IH * IW - 3 * IW, // src plane stride + 0, // dst plane stride + 0); + wait_group_events(1, &e); + } else { + int ih = oh * SH - 1; + if (ih >= 0 && ih <= IH - 1) //dma for first row + { + event_t e = async_work_group_copy_2D2D( + src_local, // dst + src_data + (gc * IC / GC) * IW * IH + ih * IW, // src + IW, // num_elements_per_line, + IC / GC, // num_lines, + IH * IW - IW, // src_line_stride, + 2 * IW, // dst_line_stride, + 0); + + wait_group_events(1, &e); + } + ih = oh * SH - 1 + DH; + if (ih >= 0 && ih <= IH - 1) //dma for second row + { + event_t e = async_work_group_copy_2D2D( + src_local + IW, // dst + src_data + (gc * IC / GC) * IW * IH + ih * IW, // src + IW, // num_elements_per_line, + IC / GC, // num_lines, + IH * IW - IW, // src_line_stride, + 2 * IW, // dst_line_stride, + 0); + wait_group_events(1, &e); + } + ih = oh * SH - 1 + 2 * DH; + if (ih >= 0 && ih <= IH - 1) //dma for third row { + event_t e = async_work_group_copy_2D2D( + src_local + 2 * IW, // dst + src_data + (gc * IC / GC) * IW * IH + ih * IW, // src + IW, // num_elements_per_line, + IC / GC, // num_lines, + IH * IW - IW, // src_line_stride, + 2 * IW, // dst_line_stride, + 0); + wait_group_events(1, &e); + } + } + + half pad_value_half = convert_half(pad_value); + + //padding row + if (oh * SH - 1 < 0 || oh * SH - 1 > IH - 1) { + __local half *dst = src_local; + for (int c = 0; c < IC / GC; c++) { #pragma unroll 8 - for(int j = 0; j < IW; j++) - { + for (int j = 0; j < IW; j++) { dst[j] = pad_value_half; 
} dst += 3 * IW; } } - if (oh * SH + DH - 1 > IH - 1) - { - __local half* dst = src_local + IW; - for(int c = 0; c < IC/GC; c++) - { + if (oh * SH + DH - 1 > IH - 1) { + __local half *dst = src_local + IW; + for (int c = 0; c < IC / GC; c++) { #pragma unroll 8 - for(int j = 0; j < IW; j++) - { + for (int j = 0; j < IW; j++) { dst[j] = pad_value_half; } dst += 3 * IW; } } - if (oh * SH + DH + DH - 1 > IH - 1) - { - __local half* dst = src_local + 2 * IW; - for(int c = 0; c < IC/GC; c++) - { + if (oh * SH + DH + DH - 1 > IH - 1) { + __local half *dst = src_local + 2 * IW; + for (int c = 0; c < IC / GC; c++) { #pragma unroll 8 - for(int j = 0; j < IW; j++) - { + for (int j = 0; j < IW; j++) { dst[j] = pad_value_half; } dst += 3 * IW; @@ -97,13 +146,12 @@ __kernel void binary_convolution( ushort8 in21; ushort8 in22; - for (int ows8 = 0; ows8 < (OWS+7)/8; ows8++) - { + for (int ows8 = 0; ows8 < (OWS + 7) / 8; ows8++) { ushort8 val = {0, 0, 0, 0, 0, 0, 0, 0}; - for (int ic = 0; ic < IC/GC; ++ic) - { - __local half* src = (__local half*)((__local half8*)(src_local + ic * IW * 3 + IW + DW - 1) + ows8); - int weight_pos = oc*IC/GC*3*3 + ic*3*3; + for (int ic = 0; ic < IC / GC; ++ic) { + __local half *src = + (__local half *)((__local half8 *)(src_local + ic * IW * 3 + IW + DW - 1) + ows8); + int weight_pos = oc * IC / GC * 3 * 3 + ic * 3 * 3; ushort w0 = extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8)); ushort w1 = extract_weights(weights_data[((weight_pos + 1)) / 8], ((weight_pos + 1) % 8)); ushort w2 = extract_weights(weights_data[((weight_pos + 2)) / 8], ((weight_pos + 2) % 8)); @@ -114,64 +162,55 @@ __kernel void binary_convolution( ushort w7 = extract_weights(weights_data[((weight_pos + 7)) / 8], ((weight_pos + 7) % 8)); ushort w8 = extract_weights(weights_data[((weight_pos + 8)) / 8], ((weight_pos + 8) % 8)); - if ((ows8 * 8) - 1 <= IW - 1) - { - in00 = *((__local ushort8*)(src - IW - DW)); - in01 = *((__local ushort8*)(src - IW)); - in02 
= *((__local ushort8*)(src - IW + DW)); + if ((ows8 * 8) - 1 <= IW - 1) { + in00 = *((__local ushort8 *)(src - IW - DW)); + in01 = *((__local ushort8 *)(src - IW)); + in02 = *((__local ushort8 *)(src - IW + DW)); - in10 = *((__local ushort8*)(src - DW)); - in11 = *((__local ushort8*)(src)); - in12 = *((__local ushort8*)(src + DW)); + in10 = *((__local ushort8 *)(src - DW)); + in11 = *((__local ushort8 *)(src)); + in12 = *((__local ushort8 *)(src + DW)); - in20 = *((__local ushort8*)(src + IW - DW)); - in21 = *((__local ushort8*)(src + IW)); - in22 = *((__local ushort8*)(src + IW + DW)); + in20 = *((__local ushort8 *)(src + IW - DW)); + in21 = *((__local ushort8 *)(src + IW)); + in22 = *((__local ushort8 *)(src + IW + DW)); } //padding column - if (ows8 * 8 - 1 < 0) - { + if (ows8 * 8 - 1 < 0) { int boundary = 1 - ows8 * 8; - boundary = boundary > 8 ? 8 : boundary; - for (int offset = 0; offset < boundary; offset++) - { - *((half*)(&in00) + offset) = pad_value_half; - *((half*)(&in10) + offset) = pad_value_half; - *((half*)(&in20) + offset) = pad_value_half; + boundary = boundary > 8 ? 8 : boundary; + for (int offset = 0; offset < boundary; offset++) { + *((half *)(&in00) + offset) = pad_value_half; + *((half *)(&in10) + offset) = pad_value_half; + *((half *)(&in20) + offset) = pad_value_half; } - } - if ((ows8 * 8 + 7) + DW + DW - 1 > IW - 1) - { + } + if ((ows8 * 8 + 7) + DW + DW - 1 > IW - 1) { int boundary = (IW - DW - 1 - DW + 1) - ows8 * 8 + 1; - boundary = boundary < 0 ? 0 : boundary; - for (int offset = boundary; offset < 8; offset++) - { - *((half*)(&in02) + offset) = pad_value_half; - *((half*)(&in12) + offset) = pad_value_half; - *((half*)(&in22) + offset) = pad_value_half; + boundary = boundary < 0 ? 
0 : boundary; + for (int offset = boundary; offset < 8; offset++) { + *((half *)(&in02) + offset) = pad_value_half; + *((half *)(&in12) + offset) = pad_value_half; + *((half *)(&in22) + offset) = pad_value_half; } - } - if ((ows8 * 8 + 7) + DW - 1 > IW - 1) - { + } + if ((ows8 * 8 + 7) + DW - 1 > IW - 1) { int boundary = (IW - 1 - DW + 1) - ows8 * 8 + 1; - boundary = boundary < 0 ? 0 : boundary; - for (int offset = boundary; offset < 8; offset++) - { - *((half*)(&in01) + offset) = pad_value_half; - *((half*)(&in11) + offset) = pad_value_half; - *((half*)(&in21) + offset) = pad_value_half; + boundary = boundary < 0 ? 0 : boundary; + for (int offset = boundary; offset < 8; offset++) { + *((half *)(&in01) + offset) = pad_value_half; + *((half *)(&in11) + offset) = pad_value_half; + *((half *)(&in21) + offset) = pad_value_half; } } - if ((ows8 * 8 + 7) - 1 > IW - 1) - { + if ((ows8 * 8 + 7) - 1 > IW - 1) { int boundary = (IW - 1 + 1) - ows8 * 8 + 1; - boundary = boundary < 0 ? 0 : boundary; - for (int offset = boundary; offset < 8; offset++) - { - *((half*)(&in00) + offset) = pad_value_half; - *((half*)(&in10) + offset) = pad_value_half; - *((half*)(&in20) + offset) = pad_value_half; + boundary = boundary < 0 ? 0 : boundary; + for (int offset = boundary; offset < 8; offset++) { + *((half *)(&in00) + offset) = pad_value_half; + *((half *)(&in10) + offset) = pad_value_half; + *((half *)(&in20) + offset) = pad_value_half; } } @@ -185,16 +224,34 @@ __kernel void binary_convolution( ushort8 w21 = (ushort8)(w7); ushort8 w22 = (ushort8)(w8); - ushort8 cond0 = (((in00) < (ushort8)0x8000) && (in00 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0); - ushort8 cond1 = (((in01) < (ushort8)0x8000) && (in01 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0); - ushort8 cond2 = (((in02) < (ushort8)0x8000) && (in02 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0); - ushort8 cond3 = (((in10) < (ushort8)0x8000) && (in10 > (ushort8)0x0000)) ? 
(ushort8)(1) : (ushort8)(0); - ushort8 cond4 = (((in11) < (ushort8)0x8000) && (in11 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0); - ushort8 cond5 = (((in12) < (ushort8)0x8000) && (in12 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0); - ushort8 cond6 = (((in20) < (ushort8)0x8000) && (in20 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0); - ushort8 cond7 = (((in21) < (ushort8)0x8000) && (in21 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0); - ushort8 cond8 = (((in22) < (ushort8)0x8000) && (in22 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0); - + ushort8 cond0 = (((in00) < (ushort8)0x8000) && (in00 > (ushort8)0x0000)) ? + (ushort8)(1) : + (ushort8)(0); + ushort8 cond1 = (((in01) < (ushort8)0x8000) && (in01 > (ushort8)0x0000)) ? + (ushort8)(1) : + (ushort8)(0); + ushort8 cond2 = (((in02) < (ushort8)0x8000) && (in02 > (ushort8)0x0000)) ? + (ushort8)(1) : + (ushort8)(0); + ushort8 cond3 = (((in10) < (ushort8)0x8000) && (in10 > (ushort8)0x0000)) ? + (ushort8)(1) : + (ushort8)(0); + ushort8 cond4 = (((in11) < (ushort8)0x8000) && (in11 > (ushort8)0x0000)) ? + (ushort8)(1) : + (ushort8)(0); + ushort8 cond5 = (((in12) < (ushort8)0x8000) && (in12 > (ushort8)0x0000)) ? + (ushort8)(1) : + (ushort8)(0); + ushort8 cond6 = (((in20) < (ushort8)0x8000) && (in20 > (ushort8)0x0000)) ? + (ushort8)(1) : + (ushort8)(0); + ushort8 cond7 = (((in21) < (ushort8)0x8000) && (in21 > (ushort8)0x0000)) ? + (ushort8)(1) : + (ushort8)(0); + ushort8 cond8 = (((in22) < (ushort8)0x8000) && (in22 > (ushort8)0x0000)) ? + (ushort8)(1) : + (ushort8)(0); + val += (cond0 ^ w00); val += (cond1 ^ w01); val += (cond2 ^ w02); @@ -207,150 +264,15 @@ __kernel void binary_convolution( } ushort8 val_shift = val << 1; - int boundary = (ows8 * 8 + 7) / SW <= OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1; - for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) - { - *(dst_local + ow) = (half)(IC/GC*KH*KW - *((ushort*)(&val_shift) + ow * SW - ows8 * 8)); + int boundary = (ows8 * 8 + 7) / SW <= OW - 1 ? 
(ows8 * 8 + 7) / SW : OW - 1; + for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) { + *(dst_local + ow) = + (half)(IC / GC * KH * KW - *((ushort *)(&val_shift) + ow * SW - ows8 * 8)); } } -} - -__kernel void __dma_preload_binary_convolution( - const __global half* restrict src_data, - const __global uchar* restrict weights_data, - const __global half* restrict dst_data, - float pad_value, - - int IW, - int IH, - int IC, - - int DW, - int DH, - int GC, + barrier(CLK_LOCAL_MEM_FENCE); - int KW, - int KH, - - int PW, - int PH, - - int SW, - int SH, - - int OW, - __local half* restrict src_local, - const __local half* restrict dst_local) -{ - const int oh = get_group_id(0); - const int oc = get_group_id(1); - const int OH = get_global_size(0); - const int OC = get_global_size(1); - - const int gc = oc / (OC/GC); - - if (oh * SH - 1 >= 0 && oh * SH + DH + DH - 1 <= IH - 1) //dma for 3 rows - { - const __global half* src = src_data + (gc * IC/GC) * IW * IH + (SH * oh - 1) * IW; - WorkGroupDmaCreate3DTransaction( - src, //src, - src_local, //dst, - IW * sizeof(half), //src width, - IW * sizeof(half), //dst width, - DH * IW * sizeof(half), //src stride, - IW * sizeof(half), //dst stride, - IC/GC, //num planes //hang when > 256 - IH * IW * sizeof(half), //src plane stride, - 3 * IW * sizeof(half), //dst plane stride, - 3 * IW * sizeof(half), //plane size, - 0 - ); - - } - else - { - int ih = oh * SH - 1; - if (ih >= 0 && ih <= IH - 1) //dma for first row - { - const __global half* src = src_data + (gc * IC/GC) * IW * IH + ih * IW; - __local half* dst = src_local; - WorkGroupDmaCreateStrideTransaction( - src, // src - dst, // dst - IW * sizeof(half), // src width - IW * sizeof(half), // dst width - IH * IW * sizeof(half), // src stride - 3 * IW * sizeof(half), // dst stride - IW * IC/GC * sizeof(half), //total size - 0 - ); - } - ih = oh * SH - 1 + DH; - if (ih >= 0 && ih <= IH - 1) //dma for second row - { - const __global half* src = src_data + (gc * IC/GC) * 
IW * IH + ih * IW; - __local half* dst = src_local + IW; - WorkGroupDmaCreateStrideTransaction( - src, // src - dst, // dst - IW * sizeof(half), // src width - IW * sizeof(half), // dst width - IH * IW * sizeof(half), // src stride - 3 * IW * sizeof(half), // dst stride - IW * IC/GC * sizeof(half), //total size - 0 - ); - } - ih = oh * SH - 1 + 2 * DH; - if (ih >= 0 && ih <= IH - 1) //dma for third row - { - const __global half* src = src_data + (gc * IC/GC) * IW * IH + ih * IW; - __local half* dst = src_local + 2 * IW; - WorkGroupDmaCreateStrideTransaction( - src, // src - dst, // dst - IW * sizeof(half), // src width - IW * sizeof(half), // dst width - IH * IW * sizeof(half), // src stride - 3 * IW * sizeof(half), // dst stride - IW * IC/GC * sizeof(half), //total size - 0 - ); - } - } + event_t e2 = async_work_group_copy(dst_data + oc * OW * OH + oh * OW, dst_local, OW, 0); + wait_group_events(1, &e2); } -__kernel void __dma_postwrite_binary_convolution( - const __global half* restrict src_data, - const __global uchar* restrict weights_data, - __global half* restrict dst_data, - float pad_value, - - int IW, - int IH, - int IC, - - int DW, - int DH, - - int GC, - - int KW, - int KH, - - int PW, - int PH, - - int SW, - int SH, - - int OW, - const __local half* restrict src_local, - const __local half* restrict dst_local) -{ - const int oh = get_group_id(0); - const int oc = get_group_id(1); - const int OH = get_global_size(0); - - async_work_group_copy(dst_data + oc*OW*OH + oh*OW, dst_local, OW, 0); -} \ No newline at end of file diff --git a/inference-engine/src/vpu/custom_kernels/binary_layers.cl b/inference-engine/src/vpu/custom_kernels/binary_layers.cl deleted file mode 100644 index 1924f335b228d7..00000000000000 --- a/inference-engine/src/vpu/custom_kernels/binary_layers.cl +++ /dev/null @@ -1,339 +0,0 @@ -// Copyright (C) 2018-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -int 
extract_weights(uchar val, int bit) { - return ((val >> bit) & 1); -} - -__kernel void binary_convolution(const __global half* restrict src_data, - const __global uchar* restrict weights_data, - __global half* restrict dst_data, - float pad_value, - - int IW, - int IH, - int IC, - - int DW, - int DH, - - int GC, - - int KW, - int KH, - - int PW, - int PH, - - int SW, - int SH) -{ - int ipad_value = ((pad_value > 0.f) ? 1 : 0); - int c = get_global_id(2); - int y = get_global_id(1); - int x = get_global_id(0); - - int OC = get_global_size(2); - int OH = get_global_size(1); - int OW = get_global_size(0); - - int KD = 1; - int SD = 0; - int DD = 0; - int PD = 0; - int ID = 1; - int OD = 1; - - int nbits = 8; - - int g = c % GC; - int oc = c / GC; - int oh = y; - int ow = x; - - for (int od = 0; od < OD; od++) { - int oidx = g * OC / GC * OD * OH * OW - + oc * OD * OH * OW - + od * OH * OW - + oh * OW - + ow; - - int res = 0; - - for (int ic = 0; ic < IC / GC; ic++) { - for (int kd = 0; kd < KD; kd++) { - for (int kh = 0; kh < KH; kh++) { - for (int kw = 0; kw < KW; kw++) { - int widx = g * OC / GC * IC / GC * KD * KH * KW - + oc * IC / GC * KD * KH * KW - + ic * KD * KH * KW - + kd * KH * KW - + kh * KW - + kw; - - int w = extract_weights(weights_data[widx/nbits], (widx % nbits)); - - int s; - - int iw = ow * SW - PW + kw * DW; - int ih = oh * SH - PH + kh * DH; - int id = od * SD - PD + kd * DD; - - if (iw < 0 || iw >= (int) IW || - ih < 0 || ih >= (int) IH || - id < 0 || id >= (int) ID) { - s = ipad_value; - } else { - int iidx = g * IC / GC * ID * IH * IW - + ic * ID * IH * IW - + id * IH * IW - + ih * IW - + iw; - - s = ((src_data[iidx] > 0.f) ? 
1 : 0); - } - - res += s ^ w; - } - } - } - } - - dst_data[oidx] = (half)(IC/GC*KD*KH*KW - 2*res); - } -} - -__kernel void quantize(const __global half* __restrict src, - const __global half* __restrict input_low, - const __global half* __restrict input_high, - const __global half* __restrict output_low, - const __global half* __restrict output_high, - const __global half* __restrict dst, - int levels, - int input_low_size, - int input_high_size, - int output_low_size, - int output_high_size, - int W, - int H, - const __local half* __restrict src_local, - __local half* __restrict dst_local) -{ - - int c = get_global_id(2); - int C = get_global_size(2); - - half h_ilow = (input_low_size == 1 ? input_low[0] : input_low[c]); - half h_ihigh = (input_high_size == 1 ? input_high[0] : input_high[c]); - half h_olow = (output_low_size == 1 ? output_low[0] : output_low[c]); - half h_ohigh = (output_high_size == 1 ? output_high[0] : output_high[c]); - - half const1 = (half)(0.01 > (h_ihigh - h_ilow) ? 0.0f : convert_float(levels - 1) / (convert_float(h_ihigh) - convert_float(h_ilow))); - half const2 = (half)(!(levels - 1) ? 
0.0f : (convert_float(h_ohigh) - convert_float(h_olow)) / convert_float(levels - 1)); - - for (int h = 0; h < H; h++) - { - __local const half* __restrict addr_src = src_local + h*W; - __local half* __restrict addr_dst = dst_local + h*W; - - for (int w = 0; w < W / 8; w++) - { - half8 val = *((__local half8*)addr_src + w); -#if 1 - // round is too slow =( 902 b of code - //half8 aux = round((val - (half8)h_ilow) * (half8)const1); - - half8 aux = (val - (half8)h_ilow) * (half8)const1 + (half8)0.5h; - - aux = (half8){ - (half)(short)(aux.s0), - (half)(short)(aux.s1), - (half)(short)(aux.s2), - (half)(short)(aux.s3), - (half)(short)(aux.s4), - (half)(short)(aux.s5), - (half)(short)(aux.s6), - (half)(short)(aux.s7) - }; - - aux = aux * (half8)const2 + (half8)h_olow; - - // vector comparison add 756 b of assembly, so do in manually - // short8 a = val <= (half8)h_olow; - // short8 b = val > (half8)h_ohigh; - - short8 a; - short8 b; - a.s0 = (val.s0 <= h_ilow); - a.s1 = (val.s1 <= h_ilow); - a.s2 = (val.s2 <= h_ilow); - a.s3 = (val.s3 <= h_ilow); - a.s4 = (val.s4 <= h_ilow); - a.s5 = (val.s5 <= h_ilow); - a.s6 = (val.s6 <= h_ilow); - a.s7 = (val.s7 <= h_ilow); - - b.s0 = (val.s0 > h_ihigh); - b.s1 = (val.s1 > h_ihigh); - b.s2 = (val.s2 > h_ihigh); - b.s3 = (val.s3 > h_ihigh); - b.s4 = (val.s4 > h_ihigh); - b.s5 = (val.s5 > h_ihigh); - b.s6 = (val.s6 > h_ihigh); - b.s7 = (val.s7 > h_ihigh); - - a = ~(a-(short8)1); - b = ~(b-(short8)1); - - short8 c1 = (~a & b); - short8 c2 = (~a & ~b); - - short8 res = a & as_short8((half8)h_olow) - | c1 & as_short8((half8)h_ohigh) - | c2 & as_short8(aux); - - *((__local half8*)addr_dst + w) = as_half8(res); -#else - *((__local half8*)addr_dst + w) = val; -#endif - } - for (int w = W & (~0x7); w < W; w++) - { - half val = addr_src[w]; -#if 1 - short a = val <= h_ilow; a = ~(a-1); - short b = val > h_ihigh; b = ~(b-1); - - short c1 = (~a & b); - short c2 = (~a & ~b); - - short res = a & as_short(h_olow) - | c1 & as_short(h_ohigh) - | c2 & 
as_short(((half)(round( (val - h_ilow) * const1) * const2) + h_olow)); - - addr_dst[w] = as_half(res); -#else - addr_dst[w] = val; -#endif - } - } -} -__kernel void __dma_preload_quantize(const __global half* __restrict src, - const __global half* __restrict input_low, - const __global half* __restrict input_high, - const __global half* __restrict output_low, - const __global half* __restrict output_high, - const __global half* __restrict dst, - int levels, - int input_low_size, - int input_high_size, - int output_low_size, - int output_high_size, - int W, - int H, - __local half* __restrict src_local, - const __local half* __restrict dst_local) -{ - const int sizePlane = W*H; - async_work_group_copy(src_local ,src + get_group_id(2)*sizePlane, sizePlane, 0); -} -__kernel void __dma_postwrite_quantize(const __global half* __restrict src, - const __global half* __restrict input_low, - const __global half* __restrict input_high, - const __global half* __restrict output_low, - const __global half* __restrict output_high, - __global half* __restrict dst, - int levels, - int input_low_size, - int input_high_size, - int output_low_size, - int output_high_size, - int W, - int H, - const __local half* __restrict src_local, - const __local half* __restrict dst_local) -{ - const int sizePlane = W*H; - async_work_group_copy(dst + get_group_id(2)*sizePlane ,dst_local, sizePlane, 0); -} - -__kernel void binarization(const __global half* __restrict src, - const __global half* __restrict input_low_high, - const __global half* __restrict dst, - int switch_out, - int input_low_high_size, - int W, - int H, - const __local half* __restrict src_local, - __local half* __restrict dst_local) -{ - int c = get_global_id(2); - int C = get_global_size(2); - - half dst_low = switch_out ? 1.h : -1.h; - half dst_high = switch_out ? -1.h : 1.h; - - half s_ilow_ihigh = input_low_high_size == 1 ? 
input_low_high[0] : input_low_high[c]; - - for (int h = 0; h < H; h++) { - - __local const half* __restrict addr_src = src_local + h*W; - __local half* __restrict addr_dst = dst_local + h*W; - -#if 1 - for (int w = 0; w < W / 8; w++) { - - half8 h_src_val8 = (*((__local half8*)addr_src + w)); - - short8 cond1; - cond1.s0 = (h_src_val8.s0 <= s_ilow_ihigh); - cond1.s1 = (h_src_val8.s1 <= s_ilow_ihigh); - cond1.s2 = (h_src_val8.s2 <= s_ilow_ihigh); - cond1.s3 = (h_src_val8.s3 <= s_ilow_ihigh); - cond1.s4 = (h_src_val8.s4 <= s_ilow_ihigh); - cond1.s5 = (h_src_val8.s5 <= s_ilow_ihigh); - cond1.s6 = (h_src_val8.s6 <= s_ilow_ihigh); - cond1.s7 = (h_src_val8.s7 <= s_ilow_ihigh); - - cond1 = ~(cond1-(short8)1); - - short8 res = cond1 & as_short8((half8)dst_low) | ~cond1 & as_short8((half8)dst_high); - - *((__local half8*)addr_dst + w) = as_half8(res); - } -#endif - for (int w = W & (~0x7); w < W; w++) - { - addr_dst[w] = (addr_src[w] <= s_ilow_ihigh) ? dst_low : dst_high; - } - } -} -__kernel void __dma_preload_binarization(const __global half* __restrict src, - const __global half* __restrict input_low_high, - const __global half* __restrict dst, - int switch_out, - int input_low_high_size, - int W, - int H, - __local half* __restrict src_local, - const __local half* __restrict dst_local) -{ - const int sizePlane = W*H; - async_work_group_copy(src_local ,src + get_group_id(2)*sizePlane, sizePlane, 0); -} -__kernel void __dma_postwrite_binarization(const __global half* __restrict src, - const __global half* __restrict input_low_high, - __global half* __restrict dst, - int switch_out, - int input_low_high_size, - int W, - int H, - const __local half* __restrict src_local, - const __local half* __restrict dst_local) -{ - const int sizePlane = W*H; - async_work_group_copy(dst + get_group_id(2)*sizePlane ,dst_local, sizePlane, 0); -} \ No newline at end of file diff --git a/inference-engine/src/vpu/custom_kernels/convolution1x1.cl 
b/inference-engine/src/vpu/custom_kernels/convolution1x1.cl deleted file mode 100644 index 6ae0e2cfab45e3..00000000000000 --- a/inference-engine/src/vpu/custom_kernels/convolution1x1.cl +++ /dev/null @@ -1,281 +0,0 @@ -// Copyright (C) 2018-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void Convolution1x1_NCHW( - const __global half* in, - const __global half* out, - const __global half* w, - int IW, - int IH, - int IC, - int OW, - int OH, - int OC, - const __local half* in_local, - __local half* out_local) -{ - int oh = get_global_id(0); - int oc = get_global_id(1); - - int stride; - int write_output = 0; - __global half* src; - - __global half8* w8 = (__global half8*)(&w[oc*IC]); - __global half* w1 = (__global half*)(&w[oc*IC]); - - - for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) - { - uint iw = ow; - uint ih = oh; - - half8 val8_0 = 0.0f; - - __local half8* in8_0 = (__local half8*)(&in_local[iw + 0 * IW]); - __local half8* in8_1 = (__local half8*)(&in_local[iw + 1 * IW]); - __local half8* in8_2 = (__local half8*)(&in_local[iw + 2 * IW]); - __local half8* in8_3 = (__local half8*)(&in_local[iw + 3 * IW]); - __local half8* in8_4 = (__local half8*)(&in_local[iw + 4 * IW]); - __local half8* in8_5 = (__local half8*)(&in_local[iw + 5 * IW]); - __local half8* in8_6 = (__local half8*)(&in_local[iw + 6 * IW]); - __local half8* in8_7 = (__local half8*)(&in_local[iw + 7 * IW]); - - for (uint ic = 0; ic < IC / 8; ic ++) - { - val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0); - val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1); - val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2); - val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3); - val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4); - val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5); - val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6); - val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7); - } - - for (uint ic = (IC & (~0x7)); ic < IC; ++ic) - { - val8_0 += 
*((__local half8*)(&in_local[iw + ic * IW])) * ((half8)w1[ic]); - } - *((__local half8*)&out_local[ow + 0]) = (val8_0); - } - - uint iw = (OW & (~0x7)); - uint ih = oh; - - half8 val8_0 = 0.0f; - - __local half8* in8_0 = (__local half8*)(&in_local[iw + 0 * IW]); - __local half8* in8_1 = (__local half8*)(&in_local[iw + 1 * IW]); - __local half8* in8_2 = (__local half8*)(&in_local[iw + 2 * IW]); - __local half8* in8_3 = (__local half8*)(&in_local[iw + 3 * IW]); - __local half8* in8_4 = (__local half8*)(&in_local[iw + 4 * IW]); - __local half8* in8_5 = (__local half8*)(&in_local[iw + 5 * IW]); - __local half8* in8_6 = (__local half8*)(&in_local[iw + 6 * IW]); - __local half8* in8_7 = (__local half8*)(&in_local[iw + 7 * IW]); - - for (uint ic = 0; ic < IC / 8; ic ++) - { - val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0); - val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1); - val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2); - val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3); - val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4); - val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5); - val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6); - val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7); - } - - for (uint ic = (IC & (~0x7)); ic < IC; ++ic) - { - val8_0 += *((__local half8*)(&in_local[iw + ic * IW])) * ((half8)w1[ic]); - } - for (uint ow = (OW & (~0x7)); ow < OW; ow ++) - { - out_local[ow + 0] = (val8_0[ow % 8]); - } -} -__kernel void __dma_preload_Convolution1x1_NCHW( - const __global half* in, - const __global half* out, - const __global half* w, - int IW, - int IH, - int IC, - int OW, - int OH, - int OC, - __local half* in_local, - const __local half* out_local) -{ - const int sizePlane = IW*IH; - WorkGroupDmaCreateStrideTransaction( - in + get_group_id(0)*IW, // src - in_local, // dst - IW * sizeof(half), // src width - IW * sizeof(half), // dst width - sizePlane * sizeof(half), // src stride - IW * sizeof(half), // dst stride - IW * IC * sizeof(half), //total size - 0 - ); -} 
-__kernel void __dma_postwrite_Convolution1x1_NCHW( - const __global half* in, - __global half* out, - const __global half* w, - int IW, - int IH, - int IC, - int OW, - int OH, - int OC, - const __local half* in_local, - const __local half* out_local) -{ - async_work_group_copy(out + get_group_id(1)*OW*OH + get_group_id(0)*OW, out_local, OW, 0); -} - -__kernel void Convolution1x1_NHWC( - const __global half* in, - const __global half* out, - const __global half* w, - int IW, - int IH, - int IC, - int OW, - int OH, - int OC, - const __local half* in_local, - __local half* out_local) -{ - int oh = get_global_id(0); - int oc = get_global_id(1); - - int stride; - int write_output = 0; - __global half* src; - - __global half8* w8 = (__global half8*)(&w[oc*IC]); - __global half* w1 = (__global half*)(&w[oc*IC]); - - for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) - { - uint iw = ow; - uint ih = oh; - - half8 val8_0 = 0.0f; - half8 val8_1 = 0.0f; - half8 val8_2 = 0.0f; - half8 val8_3 = 0.0f; - half8 val8_4 = 0.0f; - half8 val8_5 = 0.0f; - half8 val8_6 = 0.0f; - half8 val8_7 = 0.0f; - - __local half8* in8_0 = (__local half8*)(&in_local[(iw + 0) * IC]); - __local half8* in8_1 = (__local half8*)(&in_local[(iw + 1) * IC]); - __local half8* in8_2 = (__local half8*)(&in_local[(iw + 2) * IC]); - __local half8* in8_3 = (__local half8*)(&in_local[(iw + 3) * IC]); - __local half8* in8_4 = (__local half8*)(&in_local[(iw + 4) * IC]); - __local half8* in8_5 = (__local half8*)(&in_local[(iw + 5) * IC]); - __local half8* in8_6 = (__local half8*)(&in_local[(iw + 6) * IC]); - __local half8* in8_7 = (__local half8*)(&in_local[(iw + 7) * IC]); - - for (uint ic = 0; ic < IC / 8; ++ic) - { - val8_0 += (in8_0[ic]) * (w8[ic]); - val8_1 += (in8_1[ic]) * (w8[ic]); - val8_2 += (in8_2[ic]) * (w8[ic]); - val8_3 += (in8_3[ic]) * (w8[ic]); - val8_4 += (in8_4[ic]) * (w8[ic]); - val8_5 += (in8_5[ic]) * (w8[ic]); - val8_6 += (in8_6[ic]) * (w8[ic]); - val8_7 += (in8_7[ic]) * (w8[ic]); - } - - half val_0 = 
0.0f; - half val_1 = 0.0f; - half val_2 = 0.0f; - half val_3 = 0.0f; - half val_4 = 0.0f; - half val_5 = 0.0f; - half val_6 = 0.0f; - half val_7 = 0.0f; - for (uint ic = IC & (~0x7); ic < IC; ++ic) - { - val_0 += *((__local half*)in8_0 + ic) * (*((__global half*)w8 + ic)); - val_1 += *((__local half*)in8_1 + ic) * (*((__global half*)w8 + ic)); - val_2 += *((__local half*)in8_2 + ic) * (*((__global half*)w8 + ic)); - val_3 += *((__local half*)in8_3 + ic) * (*((__global half*)w8 + ic)); - val_4 += *((__local half*)in8_4 + ic) * (*((__global half*)w8 + ic)); - val_5 += *((__local half*)in8_5 + ic) * (*((__global half*)w8 + ic)); - val_6 += *((__local half*)in8_6 + ic) * (*((__global half*)w8 + ic)); - val_7 += *((__local half*)in8_7 + ic) * (*((__global half*)w8 + ic)); - } - out_local[ow + 0] = __builtin_shave_sau_sumx_f16_r(val8_0) + val_0; - out_local[ow + 1] = __builtin_shave_sau_sumx_f16_r(val8_1) + val_1; - out_local[ow + 2] = __builtin_shave_sau_sumx_f16_r(val8_2) + val_2; - out_local[ow + 3] = __builtin_shave_sau_sumx_f16_r(val8_3) + val_3; - out_local[ow + 4] = __builtin_shave_sau_sumx_f16_r(val8_4) + val_4; - out_local[ow + 5] = __builtin_shave_sau_sumx_f16_r(val8_5) + val_5; - out_local[ow + 6] = __builtin_shave_sau_sumx_f16_r(val8_6) + val_6; - out_local[ow + 7] = __builtin_shave_sau_sumx_f16_r(val8_7) + val_7; - } - for (uint ow = (OW & (~0x7)); ow < OW; ow ++) - { - - uint iw = ow; - uint ih = oh; - - half8 val8 = 0.0f; - - __local half8* in8 = (__local half8*)(&in_local[iw * IC]); - - for (uint ic = 0; ic < IC / 8; ++ic) - { - val8 += (in8[ic]) * (w8[ic]); - } - - half val = 0.0f; - for (uint ic = (IC & (~0x7)); ic < IC; ++ic) - { - val += (*((__local half*)in8 + ic)) * (*((__global half*)w8 + ic)); - } - out_local[ow] = __builtin_shave_sau_sumx_f16_r(val8) + val; - } -} -__kernel void __dma_preload_Convolution1x1_NHWC( - const __global half* in, - const __global half* out, - const __global half* w, - int IW, - int IH, - int IC, - int OW, - int OH, - 
int OC, - __local half* in_local, - const __local half* out_local) -{ - const int sizeAct = IW*IC; - async_work_group_copy(in_local, in + get_group_id(0)*sizeAct, sizeAct, 0); -} -__kernel void __dma_postwrite_Convolution1x1_NHWC( - const __global half* in, - __global half* out, - const __global half* w, - int IW, - int IH, - int IC, - int OW, - int OH, - int OC, - const __local half* in_local, - const __local half* out_local) -{ - async_work_group_copy(out + get_group_id(1)*OW*OH + get_group_id(0)*OW, out_local, OW, 0); -} diff --git a/inference-engine/src/vpu/custom_kernels/convolution1x1_chw.cl b/inference-engine/src/vpu/custom_kernels/convolution1x1_chw.cl new file mode 100644 index 00000000000000..9e897714bd9d13 --- /dev/null +++ b/inference-engine/src/vpu/custom_kernels/convolution1x1_chw.cl @@ -0,0 +1,114 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable + +__kernel void Convolution1x1_NCHW( + const __global half *in, + const __global half *out, + const __global half *w, + int IW, + int IH, + int IC, + int OW, + int OH, + int OC) +{ + __local half in_local[8 * 1024]; + __local half out_local[8 * 1024]; + + event_t e1 = async_work_group_copy_2D2D( + in_local, // dst + in + get_group_id(0) * IW, // src + IW, // num_elements_per_line, + IC, // num_lines, + IW * IH - IW, // src_line_stride, + 0, // dst_line_stride, + 0); + wait_group_events(1, &e1); + + int oh = get_global_id(0); + int oc = get_global_id(1); + + int stride; + int write_output = 0; + __global half *src; + + __global half8 *w8 = (__global half8 *)(&w[oc * IC]); + __global half *w1 = (__global half *)(&w[oc * IC]); + + for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) { + uint iw = ow; + uint ih = oh; + + half8 val8_0 = 0.0f; + + __local half8 *in8_0 = (__local half8 *)(&in_local[iw + 0 * IW]); + __local half8 *in8_1 = (__local half8 
*)(&in_local[iw + 1 * IW]); + __local half8 *in8_2 = (__local half8 *)(&in_local[iw + 2 * IW]); + __local half8 *in8_3 = (__local half8 *)(&in_local[iw + 3 * IW]); + __local half8 *in8_4 = (__local half8 *)(&in_local[iw + 4 * IW]); + __local half8 *in8_5 = (__local half8 *)(&in_local[iw + 5 * IW]); + __local half8 *in8_6 = (__local half8 *)(&in_local[iw + 6 * IW]); + __local half8 *in8_7 = (__local half8 *)(&in_local[iw + 7 * IW]); + + for (uint ic = 0; ic < IC / 8; ic++) { + val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0); + val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1); + val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2); + val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3); + val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4); + val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5); + val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6); + val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7); + } + + for (uint ic = (IC & (~0x7)); ic < IC; ++ic) { + val8_0 += *((__local half8 *)(&in_local[iw + ic * IW])) * ((half8)w1[ic]); + } + *((__local half8 *)&out_local[ow + 0]) = (val8_0); + } + + uint iw = (OW & (~0x7)); + uint ih = oh; + + half8 val8_0 = 0.0f; + + __local half8 *in8_0 = (__local half8 *)(&in_local[iw + 0 * IW]); + __local half8 *in8_1 = (__local half8 *)(&in_local[iw + 1 * IW]); + __local half8 *in8_2 = (__local half8 *)(&in_local[iw + 2 * IW]); + __local half8 *in8_3 = (__local half8 *)(&in_local[iw + 3 * IW]); + __local half8 *in8_4 = (__local half8 *)(&in_local[iw + 4 * IW]); + __local half8 *in8_5 = (__local half8 *)(&in_local[iw + 5 * IW]); + __local half8 *in8_6 = (__local half8 *)(&in_local[iw + 6 * IW]); + __local half8 *in8_7 = (__local half8 *)(&in_local[iw + 7 * IW]); + + for (uint ic = 0; ic < IC / 8; ic++) { + val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0); + val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1); + val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2); + val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3); + val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4); + 
val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5); + val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6); + val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7); + } + + for (uint ic = (IC & (~0x7)); ic < IC; ++ic) { + val8_0 += *((__local half8 *)(&in_local[iw + ic * IW])) * ((half8)w1[ic]); + } + for (uint ow = (OW & (~0x7)); ow < OW; ow++) { + out_local[ow + 0] = (val8_0[ow % 8]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + event_t e2 = async_work_group_copy( + out + get_group_id(1) * OW * OH + get_group_id(0) * OW, + out_local, + OW, + 0); + wait_group_events(1, &e2); +} diff --git a/inference-engine/src/vpu/custom_kernels/convolution1x1_hwc.cl b/inference-engine/src/vpu/custom_kernels/convolution1x1_hwc.cl new file mode 100644 index 00000000000000..94cbb39d51656c --- /dev/null +++ b/inference-engine/src/vpu/custom_kernels/convolution1x1_hwc.cl @@ -0,0 +1,126 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable + +__kernel void Convolution1x1_NHWC( + const __global half *in, + const __global half *out, + const __global half *w, + int IW, + int IH, + int IC, + int OW, + int OH, + int OC) +{ + + __local half in_local[8 * 1024]; + __local half out_local[8 * 1024]; + + const int sizeAct = IW * IC; + + event_t e1 = async_work_group_copy(in_local, in + get_group_id(0) * sizeAct, sizeAct, 0); + wait_group_events(1, &e1); + + int oh = get_global_id(0); + int oc = get_global_id(1); + + int stride; + int write_output = 0; + __global half *src; + + __global half8 *w8 = (__global half8 *)(&w[oc * IC]); + __global half *w1 = (__global half *)(&w[oc * IC]); + + for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) { + uint iw = ow; + uint ih = oh; + + half8 val8_0 = 0.0f; + half8 val8_1 = 0.0f; + half8 val8_2 = 0.0f; + half8 val8_3 = 0.0f; + half8 val8_4 = 0.0f; + half8 val8_5 = 0.0f; + half8 val8_6 = 0.0f; + half8 val8_7 = 0.0f; + + __local 
half8 *in8_0 = (__local half8 *)(&in_local[(iw + 0) * IC]); + __local half8 *in8_1 = (__local half8 *)(&in_local[(iw + 1) * IC]); + __local half8 *in8_2 = (__local half8 *)(&in_local[(iw + 2) * IC]); + __local half8 *in8_3 = (__local half8 *)(&in_local[(iw + 3) * IC]); + __local half8 *in8_4 = (__local half8 *)(&in_local[(iw + 4) * IC]); + __local half8 *in8_5 = (__local half8 *)(&in_local[(iw + 5) * IC]); + __local half8 *in8_6 = (__local half8 *)(&in_local[(iw + 6) * IC]); + __local half8 *in8_7 = (__local half8 *)(&in_local[(iw + 7) * IC]); + + for (uint ic = 0; ic < IC / 8; ++ic) { + val8_0 += (in8_0[ic]) * (w8[ic]); + val8_1 += (in8_1[ic]) * (w8[ic]); + val8_2 += (in8_2[ic]) * (w8[ic]); + val8_3 += (in8_3[ic]) * (w8[ic]); + val8_4 += (in8_4[ic]) * (w8[ic]); + val8_5 += (in8_5[ic]) * (w8[ic]); + val8_6 += (in8_6[ic]) * (w8[ic]); + val8_7 += (in8_7[ic]) * (w8[ic]); + } + + half val_0 = 0.0f; + half val_1 = 0.0f; + half val_2 = 0.0f; + half val_3 = 0.0f; + half val_4 = 0.0f; + half val_5 = 0.0f; + half val_6 = 0.0f; + half val_7 = 0.0f; + for (uint ic = IC & (~0x7); ic < IC; ++ic) { + val_0 += *((__local half *)in8_0 + ic) * (*((__global half *)w8 + ic)); + val_1 += *((__local half *)in8_1 + ic) * (*((__global half *)w8 + ic)); + val_2 += *((__local half *)in8_2 + ic) * (*((__global half *)w8 + ic)); + val_3 += *((__local half *)in8_3 + ic) * (*((__global half *)w8 + ic)); + val_4 += *((__local half *)in8_4 + ic) * (*((__global half *)w8 + ic)); + val_5 += *((__local half *)in8_5 + ic) * (*((__global half *)w8 + ic)); + val_6 += *((__local half *)in8_6 + ic) * (*((__global half *)w8 + ic)); + val_7 += *((__local half *)in8_7 + ic) * (*((__global half *)w8 + ic)); + } + out_local[ow + 0] = __builtin_shave_sau_sumx_f16_r(val8_0) + val_0; + out_local[ow + 1] = __builtin_shave_sau_sumx_f16_r(val8_1) + val_1; + out_local[ow + 2] = __builtin_shave_sau_sumx_f16_r(val8_2) + val_2; + out_local[ow + 3] = __builtin_shave_sau_sumx_f16_r(val8_3) + val_3; + out_local[ow + 4] = 
__builtin_shave_sau_sumx_f16_r(val8_4) + val_4; + out_local[ow + 5] = __builtin_shave_sau_sumx_f16_r(val8_5) + val_5; + out_local[ow + 6] = __builtin_shave_sau_sumx_f16_r(val8_6) + val_6; + out_local[ow + 7] = __builtin_shave_sau_sumx_f16_r(val8_7) + val_7; + } + for (uint ow = (OW & (~0x7)); ow < OW; ow++) { + + uint iw = ow; + uint ih = oh; + + half8 val8 = 0.0f; + + __local half8 *in8 = (__local half8 *)(&in_local[iw * IC]); + + for (uint ic = 0; ic < IC / 8; ++ic) { + val8 += (in8[ic]) * (w8[ic]); + } + + half val = 0.0f; + for (uint ic = (IC & (~0x7)); ic < IC; ++ic) { + val += (*((__local half *)in8 + ic)) * (*((__global half *)w8 + ic)); + } + out_local[ow] = __builtin_shave_sau_sumx_f16_r(val8) + val; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + event_t e2 = async_work_group_copy( + out + get_group_id(1) * OW * OH + get_group_id(0) * OW, + out_local, + OW, + 0); + wait_group_events(1, &e2); +} diff --git a/inference-engine/src/vpu/custom_kernels/convolution3x3.cl b/inference-engine/src/vpu/custom_kernels/convolution3x3.cl index 5c054ed1c810e8..8f0b5efc4bb742 100644 --- a/inference-engine/src/vpu/custom_kernels/convolution3x3.cl +++ b/inference-engine/src/vpu/custom_kernels/convolution3x3.cl @@ -3,64 +3,89 @@ // #pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void Convolution3x3(const __global half* in_param, - const __global half* out, - const __global half* w, - int IW, int IH, int IC, - int OW, int OH, int OC, int KX, int KY, - int stride_x, int stride_y, int pad_x, int pad_y, int dilation_x, int dilation_y, - const __local half* in_local, - __local half* out_local, - const __local half* w_local) +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable + +__kernel void Convolution3x3( + const __global half *in_param, + const __global half *out, + const __global half *w, + int IW, + int IH, + int IC, + int OW, + int OH, + int OC, + int KX, + int KY, + int stride_x, + int stride_y, + int pad_x, + int pad_y, + int dilation_x, + int dilation_y) 
{ + __local half in_local[8 * 1024]; + __local half out_local[8 * 1024]; + __local half w_local[8 * 1024]; + + const int sizePlane = IW * IH; + event_t e1 = async_work_group_copy_2D2D( + in_local, // dst + in_param + get_group_id(0) * stride_y * IW, // src + 3 * IW, // num_elements_per_line, + IC, // num_lines, + IW * IH - 3 * IW, // src_line_stride, + 0, // dst_line_stride, + 0); + wait_group_events(1, &e1); + + const int sizeWeight = IC * 3 * 3; + e1 = async_work_group_copy(w_local, w + get_group_id(1) * sizeWeight, sizeWeight, 0); + wait_group_events(1, &e1); + int oh = get_global_id(0); int oc = get_global_id(1); - __local half* in = (__local half* )in_local + 1; + __local half *in = (__local half *)in_local + 1; int stride; int write_output = 0; - __local half* src; + __local half *src; - if((stride_x == 1) && (stride_y == 1)) - { - stride = OW / 8; + if ((stride_x == 1) && (stride_y == 1)) { + stride = OW / 8; write_output = 1; } - if((stride_x == 2) && (stride_y == 2)) - { - stride = OW / 4; + if ((stride_x == 2) && (stride_y == 2)) { + stride = OW / 4; write_output = 2; } - for (int ow = 0; ow < stride; ow++) - { + for (int ow = 0; ow < stride; ow++) { float8 val = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; - for (int ic = 0; ic < IC; ++ic) - { - src = (__local half* )((__local half8*)(in + ic * IW * 3) + ow); - __local half* k = (__local half* )(w_local + ic*3*3); - - half8 aux_in00 = *((__local half8*)src - 1); - half8 aux_in01 = *((__local half8*)src + 0); - half8 aux_in02 = *((__local half8*)src + 1); - half8 aux_in10 = *((__local half8*)(src + IW) - 1); - half8 aux_in11 = *((__local half8*)(src + IW) + 0); - half8 aux_in12 = *((__local half8*)(src + IW) + 1); - half8 aux_in20 = *((__local half8*)(src + IW * 2) - 1); - half8 aux_in21 = *((__local half8*)(src + IW * 2) + 0); - half8 aux_in22 = *((__local half8*)(src + IW * 2) + 1); - - short8 in00 = *((short8*)&aux_in00); - short8 in01 = *((short8*)&aux_in01); - short8 in02 = 
*((short8*)&aux_in02); - short8 in10 = *((short8*)&aux_in10); - short8 in11 = *((short8*)&aux_in11); - short8 in12 = *((short8*)&aux_in12); - short8 in20 = *((short8*)&aux_in20); - short8 in21 = *((short8*)&aux_in21); - short8 in22 = *((short8*)&aux_in22); + for (int ic = 0; ic < IC; ++ic) { + src = (__local half *)((__local half8 *)(in + ic * IW * 3) + ow); + __local half *k = (__local half *)(w_local + ic * 3 * 3); + + half8 aux_in00 = *((__local half8 *)src - 1); + half8 aux_in01 = *((__local half8 *)src + 0); + half8 aux_in02 = *((__local half8 *)src + 1); + half8 aux_in10 = *((__local half8 *)(src + IW) - 1); + half8 aux_in11 = *((__local half8 *)(src + IW) + 0); + half8 aux_in12 = *((__local half8 *)(src + IW) + 1); + half8 aux_in20 = *((__local half8 *)(src + IW * 2) - 1); + half8 aux_in21 = *((__local half8 *)(src + IW * 2) + 0); + half8 aux_in22 = *((__local half8 *)(src + IW * 2) + 1); + + short8 in00 = *((short8 *)&aux_in00); + short8 in01 = *((short8 *)&aux_in01); + short8 in02 = *((short8 *)&aux_in02); + short8 in10 = *((short8 *)&aux_in10); + short8 in11 = *((short8 *)&aux_in11); + short8 in12 = *((short8 *)&aux_in12); + short8 in20 = *((short8 *)&aux_in20); + short8 in21 = *((short8 *)&aux_in21); + short8 in22 = *((short8 *)&aux_in22); short8 aux_aux00 = __builtin_shave_cmu_alignvec_rri_short8(in00, in01, 14); short8 aux_aux01 = in01; @@ -72,15 +97,15 @@ __kernel void Convolution3x3(const __global half* in_param, short8 aux_aux21 = in21; short8 aux_aux22 = __builtin_shave_cmu_alignvec_rri_short8(in21, in22, 2); - half8 aux00 = *((half8*)&aux_aux00); - half8 aux01 = *((half8*)&aux_aux01); - half8 aux02 = *((half8*)&aux_aux02); - half8 aux10 = *((half8*)&aux_aux10); - half8 aux11 = *((half8*)&aux_aux11); - half8 aux12 = *((half8*)&aux_aux12); - half8 aux20 = *((half8*)&aux_aux20); - half8 aux21 = *((half8*)&aux_aux21); - half8 aux22 = *((half8*)&aux_aux22); + half8 aux00 = *((half8 *)&aux_aux00); + half8 aux01 = *((half8 *)&aux_aux01); + half8 aux02 = 
*((half8 *)&aux_aux02); + half8 aux10 = *((half8 *)&aux_aux10); + half8 aux11 = *((half8 *)&aux_aux11); + half8 aux12 = *((half8 *)&aux_aux12); + half8 aux20 = *((half8 *)&aux_aux20); + half8 aux21 = *((half8 *)&aux_aux21); + half8 aux22 = *((half8 *)&aux_aux22); half8 w00 = (half8)(*(k + 0)); half8 w01 = (half8)(*(k + 1)); @@ -102,69 +127,32 @@ __kernel void Convolution3x3(const __global half* in_param, val += convert_float8(aux21) * convert_float8(w21); val += convert_float8(aux22) * convert_float8(w22); } - if(write_output == 2) - *((__local half4*)(out_local) + ow) = convert_half4(val.s0246); - if(write_output == 1) - *((__local half8*)(out_local) + ow) = convert_half8(val); + if (write_output == 2) *((__local half4 *)(out_local) + ow) = convert_half4(val.s0246); + if (write_output == 1) *((__local half8 *)(out_local) + ow) = convert_half8(val); } - for (int ow = OW & ~(0x7); ow < OW; ow++) - { + for (int ow = OW & ~(0x7); ow < OW; ow++) { float val = 0.0f; - for (int ic = 0; ic < IC; ++ic) - { - for (int ky = 0; ky < 3; ++ky) - { - for (int kx = 0; kx < 3; ++kx) - { + for (int ic = 0; ic < IC; ++ic) { + for (int ky = 0; ky < 3; ++ky) { + for (int kx = 0; kx < 3; ++kx) { int iw = ow * stride_x - pad_x + kx * dilation_x; int ih = oh * stride_y - pad_y + ky * dilation_y; - val += convert_float(in[ic*IW*3 + (ky * dilation_y)*IW + iw]) * convert_float(w_local[ic*3*3 + ky*3 + kx]); + val += convert_float(in[ic * IW * 3 + (ky * dilation_y) * IW + iw]) + * convert_float(w_local[ic * 3 * 3 + ky * 3 + kx]); } } } out_local[ow] = convert_half(val); } -} -__kernel void __dma_preload_Convolution3x3( - const __global half* in_param, - const __global half* out, - const __global half* w, - int IW, int IH, int IC, - int OW, int OH, int OC, int KX, int KY, - int stride_x, int stride_y, int pad_x, int pad_y, int dilation_x, int dilation_y, - __local half* in_local, - const __local half* out_local, - __local half* w_local) -{ - const int sizePlane = IW*IH; - 
WorkGroupDmaCreateStrideTransaction( - in_param + get_group_id(0)*stride_y*IW, // src - in_local, // dst - 3 * IW * sizeof(half), // src width - 3 * IW * sizeof(half), // dst width - sizePlane * sizeof(half), // src stride - 3 * IW * sizeof(half), // dst stride - 3 * IW * IC * sizeof(half), //total size - 0 - ); - - const int sizeWeight = IC*3*3; - async_work_group_copy(w_local, w + get_group_id(1)*sizeWeight, sizeWeight, 0); -} + barrier(CLK_LOCAL_MEM_FENCE); -__kernel void __dma_postwrite_Convolution3x3( - const __global half* in_param, - __global half* out, - const __global half* w, - int IW, int IH, int IC, - int OW, int OH, int OC, int KX, int KY, - int stride_x, int stride_y, int pad_x, int pad_y, int dilation_x, int dilation_y, - const __local half* in_local, - const __local half* out_local, - const __local half* w_local) -{ - async_work_group_copy(out + get_group_id(1)*OW*OH + get_group_id(0)*OW, out_local, OW, 0); + event_t e2 = async_work_group_copy( + out + get_group_id(1) * OW * OH + get_group_id(0) * OW, + out_local, + OW, + 0); + wait_group_events(1, &e2); } diff --git a/inference-engine/src/vpu/custom_kernels/correlate.cl b/inference-engine/src/vpu/custom_kernels/correlate.cl index 0a7b3aeeabecea..3a9d722a6c4066 100644 --- a/inference-engine/src/vpu/custom_kernels/correlate.cl +++ b/inference-engine/src/vpu/custom_kernels/correlate.cl @@ -4,112 +4,105 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable -#define MAX_OPENCL_BUFF_SIZE 64*1024 +#define MAX_OPENCL_BUFF_SIZE 64 * 1024 -// Define if runtime supports it. 
MX runtime is compatible, KMB is in WIP state -#define USE_MANUAL_DMA 1 +#define USE_DMA 1 -#if defined (USE_MANUAL_DMA) -void dmacpyLineSrcStrideStart(global half* from, private half* to, int size, int src_width, int src_stride) +#if defined(USE_DMA) +void dmacpyLineSrcStrideStart(global half *from, private half *to, int size, int src_width, int src_stride) { - item_dma_event_t copyEvent = WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_stride, src_width, size, 0); + item_dma_event_t copyEvent = + WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_stride, src_width, size, 0); WaitWorkItemDmaEvents(1, ©Event); } -void dmacpyLineDstStrideStart(private half* from, global half* to, int size, int src_width, int src_stride) +void dmacpyLineDstStrideStart(private half *from, global half *to, int size, int src_width, int src_stride) { - item_dma_event_t copyEvent = WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_width, src_stride, size, 0); + item_dma_event_t copyEvent = + WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_width, src_stride, size, 0); WaitWorkItemDmaEvents(1, ©Event); } #endif -void memzero(void * ptr, size_t num) +void memzero(void *ptr, size_t num) { - float4* line0_ = (float4*) ptr; + float4 *line0_ = (float4 *)ptr; #pragma unroll 16 - for (int i = 0; i < num/16; i++) - { + for (int i = 0; i < num / 16; i++) { line0_[i] = (float4){0.f, 0.f, 0.f, 0.f}; } - uchar* ptr_ = (uchar*) ptr; - for (int i = num/16*16; i < num; i++) - { + uchar *ptr_ = (uchar *)ptr; + for (int i = num / 16 * 16; i < num; i++) { ptr_[i] = 0; } } -void __attribute__((noinline)) crosscorrh(__private const half* restrict line0, - __private const half* restrict line1, - __private half* restrict dline, - int topwidth, - int max_displacement, - int neighborhood_grid_radius, - int kernel_size, - int padding, - int bottomwidth, - int stride1, - int stride2, - int max_channels, - int cur_subchannels) 
+void __attribute__((noinline)) crosscorrh( + __private const half *restrict line0, + __private const half *restrict line1, + __private half *restrict dline, + int topwidth, + int max_displacement, + int neighborhood_grid_radius, + int kernel_size, + int padding, + int bottomwidth, + int stride1, + int stride2, + int max_channels, + int cur_subchannels) { - if (max_channels == 64) - { - for (int i = 0; i < kernel_size; i++) - { - int x1 = max_displacement - padding + i; - int offset1 = x1 >= 0 ? 0 : (-x1 + stride1 - 1)/stride1; - x1 += offset1*stride1; - - for (int blockIdx_x = offset1; blockIdx_x < topwidth && x1 < bottomwidth; blockIdx_x++, x1 += stride1) - { - int x2 = x1 - neighborhood_grid_radius*stride2; - int offset2 = x2 >= 0 ? 0 : (-x2 + stride2 - 1)/stride2; - x2 += offset2*stride2; + if (max_channels == 64) { + for (int i = 0; i < kernel_size; i++) { + int x1 = max_displacement - padding + i; + int offset1 = x1 >= 0 ? 0 : (-x1 + stride1 - 1) / stride1; + x1 += offset1 * stride1; + + for (int blockIdx_x = offset1; blockIdx_x < topwidth && x1 < bottomwidth; blockIdx_x++, x1 += stride1) { + int x2 = x1 - neighborhood_grid_radius * stride2; + int offset2 = x2 >= 0 ? 
0 : (-x2 + stride2 - 1) / stride2; + x2 += offset2 * stride2; for (int top_channel_x = offset2 - neighborhood_grid_radius; top_channel_x <= neighborhood_grid_radius && x2 < bottomwidth; - top_channel_x++, x2 += stride2) - { + top_channel_x++, x2 += stride2) { half8 sum4 = (half8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - half8* src0 = (half8*)(line0 + x1*max_channels); - half8* src1 = (half8*)(line1 + x2*max_channels); + half8 *src0 = (half8 *)(line0 + x1 * max_channels); + half8 *src1 = (half8 *)(line1 + x2 * max_channels); #pragma unroll 8 - for (int ch = 0; ch < max_channels/8; ch++) - sum4 += (src0[ch])*(src1[ch]); + for (int ch = 0; ch < max_channels / 8; ch++) sum4 += (src0[ch]) * (src1[ch]); half sum = __builtin_shave_sau_sumx_f16_r(sum4); - dline[(top_channel_x + neighborhood_grid_radius)*topwidth + blockIdx_x] += (sum); + dline[(top_channel_x + neighborhood_grid_radius) * topwidth + blockIdx_x] += (sum); } } } - } - else - { - int neighborhood_grid_width = 2*neighborhood_grid_radius + 1; - - for (int blockIdx_x = 0; blockIdx_x < topwidth; blockIdx_x++) - { - for (int i = 0; i < kernel_size; i++) - { - int x1 = blockIdx_x*stride1 + max_displacement + i - padding; - - if ((x1 >= 0) && (x1 < bottomwidth)) - { - int o_min = - neighborhood_grid_radius*stride2; - int o_max = neighborhood_grid_width*stride2 - neighborhood_grid_radius*stride2; - if ((o_min) < ( - x1)) o_min -= ((x1 + o_min - (stride2 - 1))/stride2)*stride2; - if ((o_max) >= (bottomwidth+stride2 - x1)) o_max -= ((x1 + o_max - bottomwidth )/stride2)*stride2; + } else { + int neighborhood_grid_width = 2 * neighborhood_grid_radius + 1; + + for (int blockIdx_x = 0; blockIdx_x < topwidth; blockIdx_x++) { + for (int i = 0; i < kernel_size; i++) { + int x1 = blockIdx_x * stride1 + max_displacement + i - padding; + + if ((x1 >= 0) && (x1 < bottomwidth)) { + int o_min = -neighborhood_grid_radius * stride2; + int o_max = neighborhood_grid_width * stride2 - neighborhood_grid_radius * stride2; + if ((o_min) < 
(-x1)) { + o_min -= ((x1 + o_min - (stride2 - 1)) / stride2) * stride2; + } + if ((o_max) >= (bottomwidth + stride2 - x1)) { + o_max -= ((x1 + o_max - bottomwidth) / stride2) * stride2; + } int o = o_min; - for (; o <= o_max - 4*stride2; o += 4*stride2) - { - half8* bottom0 = (half8*)(line0 + x1*max_channels); - half8* bottom1_0 = (half8*)(line1 + (x1 + o + 0*stride2)*max_channels); - half8* bottom1_1 = (half8*)(line1 + (x1 + o + 1*stride2)*max_channels); - half8* bottom1_2 = (half8*)(line1 + (x1 + o + 2*stride2)*max_channels); - half8* bottom1_3 = (half8*)(line1 + (x1 + o + 3*stride2)*max_channels); + for (; o <= o_max - 4 * stride2; o += 4 * stride2) { + half8 *bottom0 = (half8 *)(line0 + x1 * max_channels); + half8 *bottom1_0 = (half8 *)(line1 + (x1 + o + 0 * stride2) * max_channels); + half8 *bottom1_1 = (half8 *)(line1 + (x1 + o + 1 * stride2) * max_channels); + half8 *bottom1_2 = (half8 *)(line1 + (x1 + o + 2 * stride2) * max_channels); + half8 *bottom1_3 = (half8 *)(line1 + (x1 + o + 3 * stride2) * max_channels); int c = 0; @@ -118,8 +111,7 @@ void __attribute__((noinline)) crosscorrh(__private const half* restrict line0, half8 sum42 = 0; half8 sum43 = 0; - for (; c <= cur_subchannels/8 - 4; c += 4) - { + for (; c <= cur_subchannels / 8 - 4; c += 4) { sum40 += bottom0[c + 0] * bottom1_0[c + 0]; sum40 += bottom0[c + 1] * bottom1_0[c + 1]; sum40 += bottom0[c + 2] * bottom1_0[c + 2]; @@ -141,8 +133,7 @@ void __attribute__((noinline)) crosscorrh(__private const half* restrict line0, sum43 += bottom0[c + 3] * bottom1_3[c + 3]; } - for (; c < cur_subchannels/8; c++) - { + for (; c < cur_subchannels / 8; c++) { sum40 += bottom0[c] * bottom1_0[c]; sum41 += bottom0[c] * bottom1_1[c]; sum42 += bottom0[c] * bottom1_2[c]; @@ -154,48 +145,47 @@ void __attribute__((noinline)) crosscorrh(__private const half* restrict line0, half sum2 = __builtin_shave_sau_sumx_f16_r(sum42); half sum3 = __builtin_shave_sau_sumx_f16_r(sum43); - for (c = c*8; c < cur_subchannels; c++) - { - 
sum0 += line0[x1*max_channels + c] * line1[(x1 + o + 0*stride2)*max_channels + c]; - sum1 += line0[x1*max_channels + c] * line1[(x1 + o + 1*stride2)*max_channels + c]; - sum2 += line0[x1*max_channels + c] * line1[(x1 + o + 2*stride2)*max_channels + c]; - sum3 += line0[x1*max_channels + c] * line1[(x1 + o + 3*stride2)*max_channels + c]; + for (c = c * 8; c < cur_subchannels; c++) { + sum0 += line0[x1 * max_channels + c] * line1[(x1 + o + 0 * stride2) * max_channels + c]; + sum1 += line0[x1 * max_channels + c] * line1[(x1 + o + 1 * stride2) * max_channels + c]; + sum2 += line0[x1 * max_channels + c] * line1[(x1 + o + 2 * stride2) * max_channels + c]; + sum3 += line0[x1 * max_channels + c] * line1[(x1 + o + 3 * stride2) * max_channels + c]; } - dline[blockIdx_x + (((o/stride2) + 0)*topwidth + neighborhood_grid_radius*topwidth)] += sum0; - dline[blockIdx_x + (((o/stride2) + 1)*topwidth + neighborhood_grid_radius*topwidth)] += sum1; - dline[blockIdx_x + (((o/stride2) + 2)*topwidth + neighborhood_grid_radius*topwidth)] += sum2; - dline[blockIdx_x + (((o/stride2) + 3)*topwidth + neighborhood_grid_radius*topwidth)] += sum3; + dline[blockIdx_x + (((o / stride2) + 0) * topwidth + neighborhood_grid_radius * topwidth)] += + sum0; + dline[blockIdx_x + (((o / stride2) + 1) * topwidth + neighborhood_grid_radius * topwidth)] += + sum1; + dline[blockIdx_x + (((o / stride2) + 2) * topwidth + neighborhood_grid_radius * topwidth)] += + sum2; + dline[blockIdx_x + (((o / stride2) + 3) * topwidth + neighborhood_grid_radius * topwidth)] += + sum3; } - for (; o < o_max; o += 1*stride2) - { - half8* bottom0 = (half8*)(line0 + x1*max_channels); - half8* bottom1 = (half8*)(line1 + (x1 + o)*max_channels); + for (; o < o_max; o += 1 * stride2) { + half8 *bottom0 = (half8 *)(line0 + x1 * max_channels); + half8 *bottom1 = (half8 *)(line1 + (x1 + o) * max_channels); int c = 0; half8 sum4 = 0; - for (; c <= cur_subchannels/8 - 4; c += 4) - { + for (; c <= cur_subchannels / 8 - 4; c += 4) { sum4 += 
bottom0[c + 0] * bottom1[c + 0]; sum4 += bottom0[c + 1] * bottom1[c + 1]; sum4 += bottom0[c + 2] * bottom1[c + 2]; sum4 += bottom0[c + 3] * bottom1[c + 3]; } - for (; c < cur_subchannels/8; c++) - { + for (; c < cur_subchannels / 8; c++) { sum4 += bottom0[c] * bottom1[c]; } half sum = __builtin_shave_sau_sumx_f16_r(sum4); - for (c = c*8; c < cur_subchannels; c++) - { - sum += line0[x1*max_channels + c] * line1[(x1 + o)*max_channels + c]; + for (c = c * 8; c < cur_subchannels; c++) { + sum += line0[x1 * max_channels + c] * line1[(x1 + o) * max_channels + c]; } - dline[blockIdx_x + (((o + neighborhood_grid_radius*stride2)/stride2)*topwidth)] += sum; + dline[blockIdx_x + (((o + neighborhood_grid_radius * stride2) / stride2) * topwidth)] += sum; } } } @@ -203,243 +193,257 @@ void __attribute__((noinline)) crosscorrh(__private const half* restrict line0, } } - -__kernel void correlate2_half(__global const half* restrict bottom0, - __global const half* restrict bottom1, - __global half* restrict top, - int topwidth, - int topheight, - int bottomwidth, - int bottomheight, - int bottomchannels, - int max_displacement, - int padding, - int neighborhood_grid_radius, - int neighborhood_grid_width, - int kernel_size, - int stride1, - int stride2) +__kernel void correlate2_half( + __global const half *restrict bottom0, + __global const half *restrict bottom1, + __global half *restrict top, + int topwidth, + int topheight, + int bottomwidth, + int bottomheight, + int bottomchannels, + int max_displacement, + int padding, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_size, + int stride1, + int stride2) { - int max_channels = (MAX_OPENCL_BUFF_SIZE/sizeof(half) - topwidth*neighborhood_grid_width) / (3*bottomwidth); + int max_channels = (MAX_OPENCL_BUFF_SIZE / sizeof(half) - topwidth * neighborhood_grid_width) / (3 * bottomwidth); if (max_channels > 64) max_channels = 64; int subchannels_count = (bottomchannels + max_channels - 1) / max_channels; - int 
subchannels = (bottomchannels + subchannels_count-1) / subchannels_count; + int subchannels = (bottomchannels + subchannels_count - 1) / subchannels_count; if (subchannels < max_channels) subchannels = max_channels; - const int sumelems = kernel_size*kernel_size*bottomchannels; + const int sumelems = kernel_size * kernel_size * bottomchannels; - __private half cmx[MAX_OPENCL_BUFF_SIZE/sizeof(half)]; + __private half cmx[MAX_OPENCL_BUFF_SIZE / sizeof(half)]; - __private half* line0 = cmx; - __private half* line1 = line0 + bottomwidth*subchannels; - __private half* dline = line1 + bottomwidth*subchannels; + __private half *line0 = cmx; + __private half *line1 = line0 + bottomwidth * subchannels; + __private half *dline = line1 + bottomwidth * subchannels; int blockIdx_y = get_global_id(0); -#if defined(USE_MANUAL_DMA) - __private half* dmabuf = dline + topwidth*neighborhood_grid_width; +#if defined(USE_DMA) + __private half *dmabuf = dline + topwidth * neighborhood_grid_width; #endif - int y1 = blockIdx_y*stride1 + max_displacement; + int y1 = blockIdx_y * stride1 + max_displacement; - for (int j = 0; j < kernel_size; j++) - { - for (int bottomchannel = 0; bottomchannel < bottomchannels; bottomchannel += subchannels) - { + for (int j = 0; j < kernel_size; j++) { + for (int bottomchannel = 0; bottomchannel < bottomchannels; bottomchannel += subchannels) { // configure channel batching int startchannel = bottomchannel; int endchannel = startchannel + subchannels > bottomchannels ? 
bottomchannels : startchannel + subchannels; - int deltachannels = endchannel-startchannel; + int deltachannels = endchannel - startchannel; // load line form blob 0 with repackaging - if (y1+j-padding >= 0 && y1+j-padding < bottomheight) - { -#if defined(USE_MANUAL_DMA) - __global const half* curr = bottom0 + startchannel*bottomheight*bottomwidth + (y1+j-padding)*bottomwidth; - dmacpyLineSrcStrideStart(curr, - dmabuf, - bottomwidth*deltachannels*sizeof(half), - bottomwidth*sizeof(half), - bottomwidth*bottomheight*sizeof(half)); - - for (int ch = 0; ch < deltachannels; ch++) - { - for (int blockIdx_x = 0; blockIdx_x < bottomwidth/8; blockIdx_x++) - { - half8 val = ((half8*)(dmabuf + ch*bottomwidth))[blockIdx_x]; - line0[(blockIdx_x*8 + 0)*max_channels+ch] = val[0]; - line0[(blockIdx_x*8 + 1)*max_channels+ch] = val[1]; - line0[(blockIdx_x*8 + 2)*max_channels+ch] = val[2]; - line0[(blockIdx_x*8 + 3)*max_channels+ch] = val[3]; - - line0[(blockIdx_x*8 + 4)*max_channels+ch] = val[4]; - line0[(blockIdx_x*8 + 5)*max_channels+ch] = val[5]; - line0[(blockIdx_x*8 + 6)*max_channels+ch] = val[6]; - line0[(blockIdx_x*8 + 7)*max_channels+ch] = val[7]; + if (y1 + j - padding >= 0 && y1 + j - padding < bottomheight) { +#if defined(USE_DMA) + __global const half *curr = + bottom0 + startchannel * bottomheight * bottomwidth + (y1 + j - padding) * bottomwidth; + dmacpyLineSrcStrideStart( + curr, + dmabuf, + bottomwidth * deltachannels * sizeof(half), + bottomwidth * sizeof(half), + bottomwidth * bottomheight * sizeof(half)); + + for (int ch = 0; ch < deltachannels; ch++) { + for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) { + half8 val = ((half8 *)(dmabuf + ch * bottomwidth))[blockIdx_x]; + line0[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0]; + line0[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1]; + line0[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2]; + line0[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3]; + + line0[(blockIdx_x * 8 + 4) * 
max_channels + ch] = val[4]; + line0[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5]; + line0[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6]; + line0[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7]; } - for (int blockIdx_x = bottomwidth/8*8; blockIdx_x < bottomwidth; blockIdx_x++) - { - line0[(blockIdx_x)*max_channels+ch] = dmabuf[blockIdx_x + ch*bottomwidth]; + for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) { + line0[(blockIdx_x)*max_channels + ch] = dmabuf[blockIdx_x + ch * bottomwidth]; } } if (deltachannels < subchannels) for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++) - memzero(line0 + blockIdx_x*max_channels+deltachannels, (subchannels-deltachannels)*sizeof(half)); + memzero( + line0 + blockIdx_x * max_channels + deltachannels, + (subchannels - deltachannels) * sizeof(half)); #else - for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++) - { + for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++) { for (int ch = 0; ch < deltachannels; ch++) - line0[blockIdx_x*max_channels+ch] - = bottom0[(ch+startchannel)*bottomheight*bottomwidth + (y1+j-padding)*bottomwidth + blockIdx_x]; + line0[blockIdx_x * max_channels + ch] = bottom0 + [(ch + startchannel) * bottomheight * bottomwidth + (y1 + j - padding) * bottomwidth + + blockIdx_x]; if (deltachannels < subchannels) - memzero(line0 + blockIdx_x*max_channels+deltachannels, (subchannels-deltachannels)*sizeof(half)); + memzero( + line0 + blockIdx_x * max_channels + deltachannels, + (subchannels - deltachannels) * sizeof(half)); } #endif - } - else - memzero(line0, max_channels*bottomwidth*sizeof(half)); + } else + memzero(line0, max_channels * bottomwidth * sizeof(half)); - for (int top_channel_y = 0; top_channel_y < neighborhood_grid_width; top_channel_y++) - { + for (int top_channel_y = 0; top_channel_y < neighborhood_grid_width; top_channel_y++) { int y2 = y1 + (top_channel_y - neighborhood_grid_radius) * stride2; - // load line form 
blob 1 with repackaging according to the line we work on now - if (y2+j-padding >= 0 && y2+j-padding < bottomheight) - { -#if defined(USE_MANUAL_DMA) - __global const half* curr = bottom1 + startchannel*bottomheight*bottomwidth + (y2+j-padding)*bottomwidth; - dmacpyLineSrcStrideStart(curr, - dmabuf, - bottomwidth*deltachannels*sizeof(half), - bottomwidth*sizeof(half), - bottomwidth*bottomheight*sizeof(half)); - - for (int ch = 0; ch < deltachannels; ch++) - { - for (int blockIdx_x = 0; blockIdx_x < bottomwidth/8; blockIdx_x++) - { - half8 val = ((half8*)(dmabuf + ch*bottomwidth))[blockIdx_x]; - line1[(blockIdx_x*8 + 0)*max_channels+ch] = val[0]; - line1[(blockIdx_x*8 + 1)*max_channels+ch] = val[1]; - line1[(blockIdx_x*8 + 2)*max_channels+ch] = val[2]; - line1[(blockIdx_x*8 + 3)*max_channels+ch] = val[3]; - - line1[(blockIdx_x*8 + 4)*max_channels+ch] = val[4]; - line1[(blockIdx_x*8 + 5)*max_channels+ch] = val[5]; - line1[(blockIdx_x*8 + 6)*max_channels+ch] = val[6]; - line1[(blockIdx_x*8 + 7)*max_channels+ch] = val[7]; + if (y2 + j - padding >= 0 && y2 + j - padding < bottomheight) { +#if defined(USE_DMA) + __global const half *curr = + bottom1 + startchannel * bottomheight * bottomwidth + (y2 + j - padding) * bottomwidth; + dmacpyLineSrcStrideStart( + curr, + dmabuf, + bottomwidth * deltachannels * sizeof(half), + bottomwidth * sizeof(half), + bottomwidth * bottomheight * sizeof(half)); + + for (int ch = 0; ch < deltachannels; ch++) { + for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) { + half8 val = ((half8 *)(dmabuf + ch * bottomwidth))[blockIdx_x]; + line1[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0]; + line1[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1]; + line1[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2]; + line1[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3]; + + line1[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4]; + line1[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5]; + line1[(blockIdx_x * 8 + 6) * 
max_channels + ch] = val[6]; + line1[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7]; } - for (int blockIdx_x = bottomwidth/8*8; blockIdx_x < bottomwidth; blockIdx_x++) - { - line1[(blockIdx_x)*max_channels+ch] = dmabuf[blockIdx_x + ch*bottomwidth]; + for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) { + line1[(blockIdx_x)*max_channels + ch] = dmabuf[blockIdx_x + ch * bottomwidth]; } } #else - for (int ch = 0; ch < deltachannels; ch++) - { - for (int blockIdx_x = 0; blockIdx_x < bottomwidth/8; blockIdx_x++) - { - half8 val = ((__global half8*)(bottom1 + (ch+startchannel)*bottomheight*bottomwidth + (y2+j-padding)*bottomwidth))[blockIdx_x]; - line1[(blockIdx_x*8 + 0)*max_channels+ch] = val[0]; - line1[(blockIdx_x*8 + 1)*max_channels+ch] = val[1]; - line1[(blockIdx_x*8 + 2)*max_channels+ch] = val[2]; - line1[(blockIdx_x*8 + 3)*max_channels+ch] = val[3]; - - line1[(blockIdx_x*8 + 4)*max_channels+ch] = val[4]; - line1[(blockIdx_x*8 + 5)*max_channels+ch] = val[5]; - line1[(blockIdx_x*8 + 6)*max_channels+ch] = val[6]; - line1[(blockIdx_x*8 + 7)*max_channels+ch] = val[7]; + for (int ch = 0; ch < deltachannels; ch++) { + for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) { + half8 val = (( + __global half8 + *)(bottom1 + (ch + startchannel) * bottomheight * bottomwidth + (y2 + j - padding) * bottomwidth)) + [blockIdx_x]; + line1[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0]; + line1[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1]; + line1[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2]; + line1[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3]; + + line1[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4]; + line1[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5]; + line1[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6]; + line1[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7]; } - for (int blockIdx_x = bottomwidth/8*8; blockIdx_x < bottomwidth; blockIdx_x++) - { - half val = (bottom1 + 
(ch+startchannel)*bottomheight*bottomwidth + (y2+j-padding)*bottomwidth)[blockIdx_x]; - line1[(blockIdx_x)*max_channels+ch] = val; + for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) { + half val = + (bottom1 + (ch + startchannel) * bottomheight * bottomwidth + + (y2 + j - padding) * bottomwidth)[blockIdx_x]; + line1[(blockIdx_x)*max_channels + ch] = val; } } #endif - for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++) - { + for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++) { if (deltachannels < subchannels) - memzero(line1 + blockIdx_x*max_channels+deltachannels, (subchannels-deltachannels)*sizeof(half)); + memzero( + line1 + blockIdx_x * max_channels + deltachannels, + (subchannels - deltachannels) * sizeof(half)); } - } - else - memzero(line1, max_channels*bottomwidth*sizeof(half)); - - if(j == 0 && startchannel == 0) - { - memzero(dline, neighborhood_grid_width*topwidth*sizeof(half)); - } - else - { -#if defined(USE_MANUAL_DMA) - dmacpyLineSrcStrideStart(top + top_channel_y*neighborhood_grid_width*topheight*topwidth + blockIdx_y*topwidth, - dline, - topwidth*neighborhood_grid_width*sizeof(half), - topwidth*sizeof(half), - topwidth*topheight*sizeof(half)); + } else + memzero(line1, max_channels * bottomwidth * sizeof(half)); + + if (j == 0 && startchannel == 0) { + memzero(dline, neighborhood_grid_width * topwidth * sizeof(half)); + } else { +#if defined(USE_DMA) + dmacpyLineSrcStrideStart( + top + top_channel_y * neighborhood_grid_width * topheight * topwidth + blockIdx_y * topwidth, + dline, + topwidth * neighborhood_grid_width * sizeof(half), + topwidth * sizeof(half), + topwidth * topheight * sizeof(half)); #else - for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) - { - for (int blockIdx_x = 0; blockIdx_x < topwidth/8; blockIdx_x++) - { - half8 val = ((__global half8*)(top + ((top_channel_y*neighborhood_grid_width+top_channel_x)*topheight*topwidth + 
blockIdx_y*topwidth)))[blockIdx_x]; - ((half8*)(dline + top_channel_x*topwidth))[blockIdx_x] = val; + for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) { + for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) { + half8 val = (( + __global half8 + *)(top + ((top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth + blockIdx_y * topwidth))) + [blockIdx_x]; + ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] = val; } - for (int blockIdx_x = (topwidth/8)*8; blockIdx_x < topwidth; blockIdx_x++) - { - dline[top_channel_x*topwidth+blockIdx_x] = - top[(top_channel_y*neighborhood_grid_width+top_channel_x)*topheight*topwidth + blockIdx_y*topwidth+blockIdx_x]; + for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) { + dline[top_channel_x * topwidth + blockIdx_x] = + top[(top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth + + blockIdx_y * topwidth + blockIdx_x]; } } #endif } - if (y1+j-padding >= 0 && y1+j-padding < bottomheight && y2+j-padding >= 0 && y2+j-padding < bottomheight) - { - crosscorrh(line0, line1, dline, topwidth, max_displacement, neighborhood_grid_radius, - kernel_size, padding, bottomwidth, stride1, stride2, max_channels, subchannels); + if (y1 + j - padding >= 0 && y1 + j - padding < bottomheight && y2 + j - padding >= 0 + && y2 + j - padding < bottomheight) { + crosscorrh( + line0, + line1, + dline, + topwidth, + max_displacement, + neighborhood_grid_radius, + kernel_size, + padding, + bottomwidth, + stride1, + stride2, + max_channels, + subchannels); } - if (j == kernel_size-1 && endchannel == bottomchannels) - { - half8 scale = (half8){(half)sumelems, (half)sumelems, (half)sumelems, (half)sumelems, (half)sumelems, (half)sumelems, (half)sumelems, (half)sumelems}; - for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) - { - for (int blockIdx_x = 0; blockIdx_x < topwidth/8; blockIdx_x++) - { - 
((half8*)(dline + top_channel_x*topwidth))[blockIdx_x] = - ((half8*)(dline + top_channel_x*topwidth))[blockIdx_x] / scale; + if (j == kernel_size - 1 && endchannel == bottomchannels) { + half8 scale = (half8){ + (half)sumelems, + (half)sumelems, + (half)sumelems, + (half)sumelems, + (half)sumelems, + (half)sumelems, + (half)sumelems, + (half)sumelems}; + for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) { + for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) { + ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] = + ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] / scale; } - for (int blockIdx_x = (topwidth/8)*8; blockIdx_x < topwidth; blockIdx_x++) - { - dline[top_channel_x*topwidth+blockIdx_x] = dline[top_channel_x*topwidth+blockIdx_x]/(half)sumelems; + for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) { + dline[top_channel_x * topwidth + blockIdx_x] = + dline[top_channel_x * topwidth + blockIdx_x] / (half)sumelems; } } } -#if defined(USE_MANUAL_DMA) - dmacpyLineDstStrideStart(dline, - top + top_channel_y*neighborhood_grid_width*topheight*topwidth + blockIdx_y*topwidth, - topwidth*neighborhood_grid_width*sizeof(half), - topwidth*sizeof(half), - topwidth*topheight*sizeof(half)); +#if defined(USE_DMA) + dmacpyLineDstStrideStart( + dline, + top + top_channel_y * neighborhood_grid_width * topheight * topwidth + blockIdx_y * topwidth, + topwidth * neighborhood_grid_width * sizeof(half), + topwidth * sizeof(half), + topwidth * topheight * sizeof(half)); #else - for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) - { - for (int blockIdx_x = 0; blockIdx_x < topwidth/8; blockIdx_x++) - { - ((__global half8*)(top + ((top_channel_y*neighborhood_grid_width+top_channel_x)*topheight*topwidth + blockIdx_y*topwidth)))[blockIdx_x] = - ((half8*)(dline + top_channel_x*topwidth))[blockIdx_x] + (half8) {0, 0, 0, 0, 0, 0, 0, 0}; + for (int top_channel_x = 
0; top_channel_x < neighborhood_grid_width; top_channel_x++) { + for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) { + ((__global half8 + *)(top + ((top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth + blockIdx_y * topwidth))) + [blockIdx_x] = ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] + + (half8){0, 0, 0, 0, 0, 0, 0, 0}; } - for (int blockIdx_x = (topwidth/8)*8; blockIdx_x < topwidth; blockIdx_x++) - { - top[(top_channel_y*neighborhood_grid_width+top_channel_x)*topheight*topwidth + blockIdx_y*topwidth+blockIdx_x] - = dline[top_channel_x*topwidth+blockIdx_x] + (half)0; + for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) { + top[(top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth + + blockIdx_y * topwidth + blockIdx_x] = + dline[top_channel_x * topwidth + blockIdx_x] + (half)0; } } #endif diff --git a/inference-engine/src/vpu/custom_kernels/ctc.cl b/inference-engine/src/vpu/custom_kernels/ctc.cl index 609fc00251e5d1..5dbbe4eb94038e 100644 --- a/inference-engine/src/vpu/custom_kernels/ctc.cl +++ b/inference-engine/src/vpu/custom_kernels/ctc.cl @@ -3,10 +3,12 @@ // #pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable -__global half *find(__global const half *begin, __global const half *end, half value) { +__global half *find(__global const half *begin, __global const half *end, half value) +{ while (begin != end) { - if (*begin == value) { + if (*begin == value) { return begin; } ++begin; @@ -14,160 +16,79 @@ __global half *find(__global const half *begin, __global const half *end, half v return end; } -#define USE_MANUAL_DMA - -#ifdef USE_MANUAL_DMA - -__kernel void __dma_preload_CTCDecoder(__global half *probabilities, - __global half *sequence_indicators, - __global half *output_sequences, - int width, - int height, - int channels, - __local half *local_src, - __local half *local_dst) 
+__kernel void CTCDecoder( + __global half *restrict probabilities, + __global half *restrict sequence_indicators, + __global half *restrict output, + int width, + int height, + int channels) { - WorkGroupDmaCreateStrideTransaction( - probabilities, // src + __local half local_src[88 * 1 * 77]; + __local half local_dst[88 * 1]; + + event_t e1 = async_work_group_copy_2D2D( local_src, // dst - width * sizeof(half), // src_width, - width * sizeof(half), // dst_width, - width * height * sizeof(half), // src_stride, - width * sizeof(half), // dst_stride, - width * height * channels * sizeof(half), // size + probabilities, // src + width, // num_elements_per_line, + height * channels, // num_lines, + width * (height - 1), // src_line_stride, + width * (height - 1), // dst_line_stride, 0); -} -__kernel void __dma_postwrite_CTCDecoder(__global half *probabilities, - __global half *sequence_indicators, - __global half *output_sequences, - int width, - int height, - int channels, - __local half *local_src, - __local half *local_dst) -{ - WorkGroupDmaCreateStrideTransaction( - local_dst, // src - output_sequences, // dst - channels * sizeof(half), // src_width, - channels * sizeof(half), // dst_width, - channels * sizeof(half), // src_stride, - channels * sizeof(half), // dst_stride, - channels * height * sizeof(half), // size - 0); -} + wait_group_events(1, &e1); -__kernel void CTCDecoder(__global half *probabilities, - __global half *sequence_indicators, - __global half *output_sequences, - int width, - int height, - int channels, - __local half *local_src, - __local half *local_dst) -{ - const int T = channels; - const int B = height; - const int C = width; + const int T = channels; // Time + const int B = height; // Batches + const int C = width; // Chars - for (int i = 0; i < B*T; i++) - { + #pragma unroll 4 + for (int i = 0; i < B * T; i++) { local_dst[i] = -1.h; } int output_index = 0; - for (int b = 0; b < B; ++b) - { - __global const half *seq_ind = 
sequence_indicators + b*T; + for (int b = 0; b < B; ++b) { + __global const half *restrict seq_ind = sequence_indicators + b * T; const int seq_len = find(seq_ind + 1, seq_ind + T, 0.h) - seq_ind; - const int time = min(seq_len, T); + const int time = min(seq_len, T); int prev_class_idx = -1; - for (int t = 0; t < time; ++t) - { - __local const half *probs = local_src + b*C + t*C*B; - int max_class_idx = 0; - half max_prob = probs[0]; + #pragma unroll 4 + for (int t = 0; t < time; ++t) { + __local const half *restrict probs = local_src + b * C + t * C * B; - for (int c = 1; c < C; ++c) - { + int max_class_idx = 0; + half max_prob = probs[0]; + for (int c = 1; c < C; ++c) { const half prob = probs[c]; - if (prob > max_prob) - { + if (prob > max_prob) { max_class_idx = c; - max_prob = prob; + max_prob = prob; } } - if (max_class_idx < C-1 && max_class_idx != prev_class_idx) - { - local_dst[b*T + output_index] = (half)max_class_idx; + if (max_class_idx < C - 1 && max_class_idx != prev_class_idx) { + local_dst[b * T + output_index] = (half)max_class_idx; output_index++; } prev_class_idx = max_class_idx; } } -} - -#else - -__kernel void CTCDecoder(__global half *probabilities, - __global half *sequence_indicators, - __global half *output_sequences, - int width, - int height, - int channels, - __local half *local_src, - __local half *local_dst) -{ - const int T = channels; - const int B = height; - const int C = width; - - for (int i = 0; i < B*T; i++) - { - output_sequences[i] = -1.h; - } - int output_index = 0; - - for (int b = 0; b < B; ++b) - { - __global const half *seq_ind = sequence_indicators + b*T; - const int seq_len = find(seq_ind + 1, seq_ind + T, 0.h) - seq_ind; - const int time = min(seq_len, T); - - int prev_class_idx = -1; - - for (int t = 0; t < time; ++t) - { - __global const half *probs = probabilities + b*C + t*C*B; - int max_class_idx = 0; - half max_prob = probs[0]; - - for (int c = 1; c < C; ++c) - { - const half prob = probs[c]; - if (prob > 
max_prob) - { - max_class_idx = c; - max_prob = prob; - } - } + barrier(CLK_LOCAL_MEM_FENCE); - if (max_class_idx < C-1 && max_class_idx != prev_class_idx) - { - output_sequences[b*T + output_index] = (half)max_class_idx; - output_index++; - } + event_t e2 = async_work_group_copy_2D2D( + output, // dst + local_dst, // src + channels, // num_elements_per_line, + height, // num_lines, + 0, // src_line_stride, + 0, // dst_line_stride, + 0); - prev_class_idx = max_class_idx; - } - } + wait_group_events(1, &e2); } - -#endif diff --git a/inference-engine/src/vpu/custom_kernels/customLayerBindings.xml b/inference-engine/src/vpu/custom_kernels/customLayerBindings.xml index 929be758fdf99b..8a27ff52cb04e4 100644 --- a/inference-engine/src/vpu/custom_kernels/customLayerBindings.xml +++ b/inference-engine/src/vpu/custom_kernels/customLayerBindings.xml @@ -1,6 +1,6 @@ - + @@ -8,15 +8,12 @@ - - - @@ -26,22 +23,18 @@ - - - + - + - - @@ -50,82 +43,74 @@ - + - - - - - - - - - - - - - - - - - - - +--> + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + - + - - - - + + @@ -136,7 +121,7 @@ - + @@ -144,12 +129,11 @@ - - + @@ -160,8 +144,6 @@ - - @@ -174,12 +156,10 @@ - + - - @@ -204,64 +184,36 @@ - - + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - + + - + - + - + @@ -301,9 +253,6 @@ - - - @@ -331,9 +280,6 @@ - - - @@ -343,7 +289,7 @@ - + @@ -369,12 +315,10 @@ - + - - @@ -389,12 +333,10 @@ - + - - @@ -409,7 +351,7 @@ - + @@ -429,10 +371,6 @@ - - - - @@ -441,7 +379,7 @@ - + @@ -461,9 +399,6 @@ - - - @@ -509,8 +444,6 @@ - - @@ -530,8 +463,6 @@ - - @@ -570,7 +501,6 @@ - diff --git a/inference-engine/src/vpu/custom_kernels/cvtu8f16.cl b/inference-engine/src/vpu/custom_kernels/cvtu8f16.cl index 33d7d2f891eab2..5684268e62e629 100644 --- a/inference-engine/src/vpu/custom_kernels/cvtu8f16.cl +++ b/inference-engine/src/vpu/custom_kernels/cvtu8f16.cl @@ -3,88 
+3,46 @@ // #pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable -#define USE_MANUAL_DMA 1 - -#if defined (USE_MANUAL_DMA) - -__kernel void __dma_preload_cvtu8f16( - __global uchar* restrict src, - __global half* restrict dst, - float scale, - float bias, - __local uchar* restrict local_src, - __local half* restrict local_dst) +__kernel void cvtu8f16(__global const uchar *restrict src, __global half *restrict dst, float scale, float bias) { - WorkGroupDmaCreate3DTransaction( - src + get_group_id(0)*get_local_size(0) - + get_group_id(1)*get_local_size(1)*get_global_size(0) - + get_group_id(2)*get_local_size(2)*get_global_size(0)*get_global_size(1), // src + __local uchar local_src[8 * 1024]; + __local half local_dst[8 * 1024]; + + event_t e1 = async_work_group_copy_3D3D( local_src, // dst - get_local_size(0) * sizeof(uchar), // src width - get_local_size(0) * sizeof(uchar), // dst width - get_global_size(0) * sizeof(uchar), // src stride - get_local_size(0) * sizeof(uchar), // dst stride + src + get_group_id(0) * get_local_size(0) + get_group_id(1) * get_local_size(1) * get_global_size(0) + + get_group_id(2) * get_local_size(2) * get_global_size(0) * get_global_size(1), // src + get_local_size(0), // num_elements_per_line + get_local_size(0) * get_local_size(1) / (get_local_size(0)), // num_lines + get_global_size(0) - get_local_size(0), // src_line_stride + 0, // dst_line_stride get_local_size(2), // num planes - get_global_size(0) * get_global_size(1) * sizeof(uchar), // src plane stride - get_local_size(0) * get_local_size(1) * sizeof(uchar), // dst plane stride - get_local_size(0) * get_local_size(1) * sizeof(uchar), // plane size + get_global_size(0) * (get_global_size(1) - get_local_size(1)), // src plane stride + 0, // dst plane stride 0); -} + wait_group_events(1, &e1); -__kernel void __dma_postwrite_cvtu8f16( - __global uchar* restrict src, - __global half* restrict dst, - float scale, - float bias, - 
__local uchar* restrict local_src, - __local half* restrict local_dst) -{ - WorkGroupDmaCreate3DTransaction( - local_dst, // src - dst + get_group_id(0)*get_local_size(0) - + get_group_id(1)*get_local_size(1)*get_global_size(0) - + get_group_id(2)*get_local_size(2)*get_global_size(0)*get_global_size(1), // dst - get_local_size(0) * sizeof(half), // src width - get_local_size(0) * sizeof(half), // dst width - get_local_size(0) * sizeof(half), // src stride - get_global_size(0) * sizeof(half), // dst stride - get_local_size(2), // num planes - get_local_size(0) * get_local_size(1) * sizeof(half), // src plane stride - get_global_size(0) * get_global_size(1) * sizeof(half), // dst plane stride - get_local_size(0) * get_local_size(1) * sizeof(half), // plane size - 0); -} + size_t idx = get_local_id(0) + + get_local_id(1) * get_local_size(0) + + get_local_id(2) * get_local_size(0) * get_local_size(1); -__kernel void cvtu8f16( - __global uchar* restrict src, - __global half* restrict dst, - float scale, - float bias, - __local uchar* restrict local_src, - __local half* restrict local_dst) -{ - size_t idx = get_local_id(0) + - get_local_id(1)*get_local_size(0) + - get_local_id(2)*get_local_size(0)*get_local_size(1); - local_dst[idx] = convert_half(local_src[idx])*(half)scale+(half)bias; -} + local_dst[idx] = convert_half(local_src[idx]) * (half)scale + (half)bias; -#else // defined (USE_MANUAL_DMA) + barrier(CLK_LOCAL_MEM_FENCE); -__kernel void cvtu8f16( - __global uchar* restrict src, - __global half* restrict dst, - float scale, - float bias, - __local uchar* restrict local_src, // unused, added for compatibility with DMA variant - __local half* restrict local_dst) // unused, added for compatibility with DMA variant -{ - int idx = get_global_id(0) + - get_global_id(1) * get_global_size(0) + - get_global_id(2) * get_global_size(0) * get_global_size(1); - dst[idx] = convert_half(src[idx])*(half)scale+(half)bias; + event_t e2 = async_work_group_copy_3D3D( + dst + 
get_group_id(0) * get_local_size(0) + get_group_id(1) * get_local_size(1) * get_global_size(0) + + get_group_id(2) * get_local_size(2) * get_global_size(0) * get_global_size(1), // dst + local_dst, // src + get_local_size(0), // num_elements_per_line + get_local_size(1), // num_lines + 0, // src_line_stride + get_global_size(0) - get_local_size(0), // dst_line_stride + get_local_size(2), // num_planes + 0, // src_plane_stride + get_global_size(0) * (get_global_size(1) - get_local_size(1)), // dst_plane_stride + 0); + wait_group_events(1, &e2); } - -#endif // defined (USE_MANUAL_DMA) - diff --git a/inference-engine/src/vpu/custom_kernels/detectron_prior_grid_gen.cl b/inference-engine/src/vpu/custom_kernels/detectron_prior_grid_gen.cl index e92d3c6afb7fa4..0f73395934bf19 100644 --- a/inference-engine/src/vpu/custom_kernels/detectron_prior_grid_gen.cl +++ b/inference-engine/src/vpu/custom_kernels/detectron_prior_grid_gen.cl @@ -3,102 +3,63 @@ // #pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable -__kernel void __dma_preload_experimental_detectron_prior_grid_generator( - __global const half* restrict input_priors, - __global const half* restrict input_feature_map, - __global const half* restrict input_rois, - __global half* restrict output, - __local half* restrict local_input_priors, - __local half* restrict local_output, +__kernel void experimental_detectron_prior_grid_generator( + __global const half *restrict input_priors, + __global const half *restrict input_feature_map, + __global const half *restrict input_rois, + __global half *restrict output, int grid_h, int grid_w, float stride_h, float stride_w, int num_priors, - int num_anchors_per_prior) { + int num_anchors_per_prior) +{ + __local half local_input_priors[8 * 1024]; + __local half local_output[8 * 1024]; - // Move input_priors to local memory. 
- WorkGroupDmaCreateStrideTransaction( - input_priors, // src - local_input_priors, // dst - num_anchors_per_prior * num_priors * sizeof(half), // src_width - num_anchors_per_prior * num_priors * sizeof(half), // dst_width - num_anchors_per_prior * num_priors * sizeof(half), // src_stride - num_anchors_per_prior * num_priors * sizeof(half), // dst_stride - num_anchors_per_prior * num_priors * sizeof(half), // total_size + event_t e1 = async_work_group_copy( + local_input_priors, + input_priors, + num_anchors_per_prior * num_priors, 0); -} + wait_group_events(1, &e1); -__kernel void __dma_postwrite_experimental_detectron_prior_grid_generator( - __global const half* restrict input_priors, - __global const half* restrict input_feature_map, - __global const half* restrict input_rois, - __global half* restrict output, - __local half* restrict local_input_priors, - __local half* restrict local_output, - int grid_h, - int grid_w, - float stride_h, - float stride_w, - int num_priors, - int num_anchors_per_prior) { - - int local_width = get_local_size(0); int width_start = get_group_id(0) * get_local_size(0); - int width_end = min(width_start + local_width, grid_w); - int width = width_end - width_start; - - WorkGroupDmaCreateStrideTransaction( - local_output, // src - output + get_group_id(0) * get_local_size(0) * - num_anchors_per_prior * num_priors - + get_group_id(1) * get_local_size(1) * grid_w * - num_anchors_per_prior * num_priors, // dst - width * num_anchors_per_prior * num_priors * sizeof(half), // src_width - width * num_anchors_per_prior * num_priors * sizeof(half), // dst_width - grid_w * num_anchors_per_prior * num_priors * sizeof(half), // src_stride - grid_w * num_anchors_per_prior * num_priors * sizeof(half), // dst_stride - width * num_anchors_per_prior * num_priors * sizeof(half), // total_size - 0); -} + int width_end = min(width_start + get_local_size(0), (unsigned)grid_w); + int width = width_end - width_start; -__kernel void 
experimental_detectron_prior_grid_generator( - __global const half* restrict input_priors, - __global const half* restrict input_feature_map, - __global const half* restrict input_rois, - __global half* restrict output, - __local half* restrict local_input_priors, - __local half* restrict local_output, - int grid_h, - int grid_w, - float stride_h, - float stride_w, - int num_priors, - int num_anchors_per_prior) { - - int workgroup_width = get_local_size(0); - int width_start = get_group_id(0) * workgroup_width; - int width_end = min(width_start + workgroup_width, grid_w); - int width = width_end - width_start; - - int h = get_group_id(1); - int w_idx = get_group_id(0) * workgroup_width; + int h = get_group_id(1); + int w_idx = get_group_id(0) * get_local_size(0); for (int w = 0; w < width; ++w) { #pragma unroll 4 for (int p = 0; p < num_priors; ++p) { local_output[(w * num_priors + p) * num_anchors_per_prior + 0] = - local_input_priors[4 * p + 0] + - convert_half(stride_w) * (convert_half(w_idx + w) + 0.5); + local_input_priors[4 * p + 0] + + convert_half(stride_w) * (convert_half(w_idx + w) + 0.5); local_output[(w * num_priors + p) * num_anchors_per_prior + 1] = - local_input_priors[4 * p + 1] + - convert_half(stride_h) * (convert_half(h) + 0.5); + local_input_priors[4 * p + 1] + convert_half(stride_h) * (convert_half(h) + 0.5); local_output[(w * num_priors + p) * num_anchors_per_prior + 2] = - local_input_priors[4 * p + 2] + - convert_half(stride_w) * (convert_half(w_idx + w) + 0.5); + local_input_priors[4 * p + 2] + + convert_half(stride_w) * (convert_half(w_idx + w) + 0.5); local_output[(w * num_priors + p) * num_anchors_per_prior + 3] = - local_input_priors[4 * p + 3] + - convert_half(stride_h) * (convert_half(h) + 0.5); + local_input_priors[4 * p + 3] + convert_half(stride_h) * (convert_half(h) + 0.5); } } + + barrier(CLK_LOCAL_MEM_FENCE); + + event_t e2 = async_work_group_copy_2D2D( + output + get_group_id(0) * get_local_size(0) * num_anchors_per_prior * 
num_priors + + get_group_id(1) * get_local_size(1) * grid_w * num_anchors_per_prior + * num_priors, // dst + local_output, // src + width * num_anchors_per_prior * num_priors, // num_elements_per_line + 1, // num_lines + (grid_w - width) * num_anchors_per_prior * num_priors, // src_line_stride + (grid_w - width) * num_anchors_per_prior * num_priors, // dst_line_stride + 0); + wait_group_events(1, &e2); } diff --git a/inference-engine/src/vpu/custom_kernels/fakequantize.cl b/inference-engine/src/vpu/custom_kernels/fakequantize.cl new file mode 100644 index 00000000000000..58fa1ee35c94cf --- /dev/null +++ b/inference-engine/src/vpu/custom_kernels/fakequantize.cl @@ -0,0 +1,111 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable + +__kernel void quantize( + __global const half *restrict src_data, + __global const half *restrict input_low, + __global const half *restrict input_high, + __global const half *restrict output_low, + __global const half *restrict output_high, + __global half *restrict dst_data, + int levels, + int input_low_size, + int input_high_size, + int output_low_size, + int output_high_size, + int W, + int H) +{ + __local half local_src[15 * 1024]; + __local half local_dst[15 * 1024]; + + event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(2) * W * H, W * H, 0); + wait_group_events(1, &e1); + + int c = get_group_id(2); + + half h_ilow = (input_low_size == 1 ? input_low[0] : input_low[c]); + half h_ihigh = (input_high_size == 1 ? input_high[0] : input_high[c]); + half h_olow = (output_low_size == 1 ? output_low[0] : output_low[c]); + half h_ohigh = (output_high_size == 1 ? output_high[0] : output_high[c]); + + half const1 = (half)( + !(h_ihigh - h_ilow) ? 0.0f : convert_float(levels - 1) / (convert_float(h_ihigh) - convert_float(h_ilow))); + half const2 = + (half)(!(levels - 1) ? 
0.0f : (convert_float(h_ohigh) - convert_float(h_olow)) / convert_float(levels - 1)); + + __local const half *restrict src = local_src + W * get_local_id(1); + __local half *restrict dst = local_dst + W * get_local_id(1); + + for (int w = 0; w < W / 8; w++) { + half8 val = *((__local half8 *)src + w); + half8 aux = (val - (half8)h_ilow) * (half8)const1 + (half8)0.5h; + + aux = (half8){ + (half)(short)(aux.s0), + (half)(short)(aux.s1), + (half)(short)(aux.s2), + (half)(short)(aux.s3), + (half)(short)(aux.s4), + (half)(short)(aux.s5), + (half)(short)(aux.s6), + (half)(short)(aux.s7)}; + + aux = aux * (half8)const2 + (half8)h_olow; + + short8 a; + short8 b; + a.s0 = (val.s0 <= h_ilow); + a.s1 = (val.s1 <= h_ilow); + a.s2 = (val.s2 <= h_ilow); + a.s3 = (val.s3 <= h_ilow); + a.s4 = (val.s4 <= h_ilow); + a.s5 = (val.s5 <= h_ilow); + a.s6 = (val.s6 <= h_ilow); + a.s7 = (val.s7 <= h_ilow); + + b.s0 = (val.s0 > h_ihigh); + b.s1 = (val.s1 > h_ihigh); + b.s2 = (val.s2 > h_ihigh); + b.s3 = (val.s3 > h_ihigh); + b.s4 = (val.s4 > h_ihigh); + b.s5 = (val.s5 > h_ihigh); + b.s6 = (val.s6 > h_ihigh); + b.s7 = (val.s7 > h_ihigh); + + a = ~(a - (short8)1); + b = ~(b - (short8)1); + + short8 c1 = (~a & b); + short8 c2 = (~a & ~b); + + short8 res = (a & as_short8((half8)h_olow)) | (c1 & as_short8((half8)h_ohigh)) | (c2 & as_short8(aux)); + + *((__local half8 *)dst + w) = as_half8(res); + } + + for (int w = W & (~0x7); w < W; w++) { + half val = src[w]; + short a = val <= h_ilow; + a = ~(a - 1); + short b = val > h_ihigh; + b = ~(b - 1); + + short c1 = (~a & b); + short c2 = (~a & ~b); + + short res = (a & as_short(h_olow)) | (c1 & as_short(h_ohigh)) + | (c2 & as_short(((half)(round((val - h_ilow) * const1) * const2) + h_olow))); + + dst[w] = as_half(res); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + event_t e2 = async_work_group_copy(dst_data + get_group_id(2) * W * H, local_dst, W * H, 0); + wait_group_events(1, &e2); +} diff --git a/inference-engine/src/vpu/custom_kernels/grn.cl 
b/inference-engine/src/vpu/custom_kernels/grn.cl index 88cebb83caa81b..2ae5a0ff5c0dbf 100644 --- a/inference-engine/src/vpu/custom_kernels/grn.cl +++ b/inference-engine/src/vpu/custom_kernels/grn.cl @@ -3,111 +3,61 @@ // #pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable -#define USE_MANUAL_DMA 1 - -#if defined (USE_MANUAL_DMA) - -__kernel void __dma_preload_grn_NCHW( - __global const half* restrict src, - __global half* restrict dst, - __local half* restrict local_src, - __local half* restrict local_dst, - int C, - float bias) -{ - WorkGroupDmaCreate3DTransaction( - src + get_group_id(0)*get_local_size(0) - + get_group_id(1)*get_local_size(1)*get_global_size(0), // src - local_src, // dst - get_local_size(0) * sizeof(half), // src width - get_local_size(0) * sizeof(half), // dst width - get_global_size(0) * sizeof(half), // src stride - get_local_size(0) * sizeof(half), // dst stride - C, // num planes - get_global_size(0) * get_global_size(1) * sizeof(half), // src plane stride - get_local_size(0) * get_local_size(1) * sizeof(half), // dst plane stride - get_local_size(0) * get_local_size(1) * sizeof(half), // plane size - 0); -} - -__kernel void __dma_postwrite_grn_NCHW( - __global const half* restrict src, - __global half* restrict dst, - __local const half* restrict local_src, - __local half* restrict local_dst, - int C, - float bias) +__kernel void grn(__global const half *restrict src_data, __global half *restrict dst_data, int C, float bias) { - WorkGroupDmaCreate3DTransaction( - local_dst, // src - dst + get_group_id(0)*get_local_size(0) - + get_group_id(1)*get_local_size(1)*get_global_size(0), // dst - get_local_size(0) * sizeof(half), // src width - get_local_size(0) * sizeof(half), // dst width - get_local_size(0) * sizeof(half), // src stride - get_global_size(0) * sizeof(half), // dst stride - C, // num planes - get_local_size(0) * get_local_size(1) * sizeof(half), // src plane stride - 
get_global_size(0) * get_global_size(1) * sizeof(half), // dst plane stride - get_local_size(0) * get_local_size(1) * sizeof(half), // plane size + __local half src[8 * 1024]; + __local half dst[8 * 1024]; + + const size_t index = get_group_id(0) * get_local_size(0) + get_group_id(1) * get_local_size(1) * get_global_size(0); + + event_t e1 = async_work_group_copy_3D3D( + src, // dst + src_data + index, // src + get_local_size(0), // num_elements_per_line, + get_local_size(1), // num_lines, + get_global_size(0) - get_local_size(0), // src_line_stride, + 0, // dst_line_stride, + C, // num_planes, + get_global_size(0) * (get_global_size(1) - get_local_size(1)), // src_plane_stride + 0, // dst_plane_stride 0); -} + wait_group_events(1, &e1); -__kernel void grn_NCHW( - __global const half* restrict src, - __global half* restrict dst, - __local half* restrict local_src, - __local half* restrict local_dst, - int C, - float bias) -{ float variance = bias + 1e-9f; #pragma unroll 8 - for (int c = 0; c < C; c++) - { - float val = (float) local_src[c*get_local_size(1)*get_local_size(0) + get_local_id(1)*get_local_size(0) + get_local_id(0)]; - variance += val*val; + for (int c = 0; c < C; c++) { + float val = (float)src[c * get_local_size(1) * get_local_size(0) + + get_local_id(1) * get_local_size(0) + + get_local_id(0)]; + variance += val * val; } - half hvariance = (half)(native_rsqrt((half)(variance/16.f))*0.25f); + half hvariance = (half)(native_rsqrt((half)(variance / 16.f)) * 0.25f); #pragma unroll 8 - for (int c = 0; c < C; c++) - { - local_dst[c*get_local_size(1)*get_local_size(0) + get_local_id(1)*get_local_size(0) + get_local_id(0)] - = local_src[c*get_local_size(1)*get_local_size(0) + get_local_id(1)*get_local_size(0) + get_local_id(0)] * hvariance; + for (int c = 0; c < C; c++) { + dst[c * get_local_size(1) * get_local_size(0) + + get_local_id(1) * get_local_size(0) + + get_local_id(0)] = + src[c * get_local_size(1) * get_local_size(0) + + get_local_id(1) * 
get_local_size(0) + get_local_id(0)] * hvariance; } -} - -#else // defined (USE_MANUAL_DMA) -__kernel void grn_NCHW( - __global const half* restrict src, - __global half* restrict dst, - __local half* restrict local_src, // unused, added for compatibility with DMA variant - __local half* restrict local_dst, // unused, added for compatibility with DMA variant - int C, - float bias) -{ - float variance = bias + 1e-9f; - - #pragma unroll 4 - for (int c = 0; c < C; c++) - { - float val = (float) src[c*get_global_size(1)*get_global_size(0) + get_global_id(1)*get_global_size(0) + get_global_id(0)]; - variance += val*val; - } - - half hvariance = (half)(native_rsqrt((half)(variance/16.f))*0.25f); - - #pragma unroll 4 - for (int c = 0; c < C; c++) - { - dst[c*get_global_size(1)*get_global_size(0) + get_global_id(1)*get_global_size(0) + get_global_id(0)] - = src[c*get_global_size(1)*get_global_size(0) + get_global_id(1)*get_global_size(0) + get_global_id(0)] * hvariance; - } + barrier(CLK_LOCAL_MEM_FENCE); + + event_t e2 = async_work_group_copy_3D3D( + dst_data + index, // dst + dst, // src + get_local_size(0), // num_elements_per_line, + get_local_size(1), // num_lines, + 0, // src_line_stride, + get_global_size(0) - get_local_size(0), // dst_line_stride, + C, // num_planes, + 0, // src_plane_stride + get_global_size(0) * (get_global_size(1) - get_local_size(1)), // dst_plane_stride + 0); + wait_group_events(1, &e2); } - -#endif // defined (USE_MANUAL_DMA) diff --git a/inference-engine/src/vpu/custom_kernels/mvn.cl b/inference-engine/src/vpu/custom_kernels/mvn.cl deleted file mode 100644 index 9c5499c38485fc..00000000000000 --- a/inference-engine/src/vpu/custom_kernels/mvn.cl +++ /dev/null @@ -1,390 +0,0 @@ -// Copyright (C) 2018-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -// Define if runtime supports it.
MX runtime is compatible, KMB is in WIP state -#define USE_MANUAL_DMA 1 - -// Set to 1 if only output is zerroed before kernel execution -#define USE_ATOMICS 0 - -void atomic_add_global(volatile __global float *source, const float operand) { - union { - unsigned int intVal; - float floatVal; - } newVal; - union { - unsigned int intVal; - float floatVal; - } prevVal; - - do { - prevVal.floatVal = *source; - newVal.floatVal = prevVal.floatVal + operand; - } while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal); -} - -#if defined (USE_MANUAL_DMA) - -__kernel void __dma_preload_reduction_mean(const __global half* restrict src, - __global float* restrict mean, - __global float* restrict variance, - int W, - int H, - int across_channels, - __local half* restrict src_line) -{ - WorkGroupDmaCreateStrideTransaction( - src + get_group_id(1)*get_local_size(1)*W + - get_group_id(2)*get_local_size(2)*W*get_global_size(1), // src - src_line, // dst - W*get_local_size(1) * sizeof(half), // src width - W*get_local_size(1) * sizeof(half), // dst width - W*get_global_size(1) * sizeof(half), // src stride - W*get_local_size(1) * sizeof(half), // dst stride - W*get_local_size(1)*get_local_size(2)*sizeof(half), //total size - 0 - ); -} - -__kernel void reduction_mean(const __global half* restrict src, - __global float* restrict mean, - __global float* restrict variance, - int W, - int H, - int across_channels, - __local half* restrict src_line) -{ - int h = get_global_id(1); - int c = get_global_id(2); - - const int MAX_LOCAL_SIZE = 8; - - __local float mbuf[MAX_LOCAL_SIZE]; - __local float vbuf[MAX_LOCAL_SIZE]; - - mbuf[get_local_id(1)] = 0; - vbuf[get_local_id(1)] = 0; - - if (h < H) - { - float sum = 0.f; - float sum2 = 0.f; - - float8 sum4 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - float8 sum24 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - - const __local half8* lsrc = ((const __local half8*)(src_line + 
get_local_id(1)*W) ); - - #pragma unroll 16 - for (size_t w = 0; w < W/8; w++) - { - half8 sh = lsrc[w]; - float8 valf = convert_float8(sh); - - sum4 += valf; - sum24 += valf*valf; - } - - for (size_t w = W/8*8; w < W; w++) - { - float val = (float)src_line[get_local_id(1)*W + w]; - sum += val; - sum2 += val*val; - } - - mbuf[get_local_id(1)] = sum4.s0 + sum4.s1 + sum4.s2 + sum4.s3 + sum4.s4 + sum4.s5 + sum4.s6 + sum4.s7 + sum; - vbuf[get_local_id(1)] = sum24.s0 + sum24.s1 + sum24.s2 + sum24.s3 + sum24.s4 + sum24.s5 + sum24.s6 + sum24.s7 + sum2; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if (get_local_id(1) == 0) - { - float res = 0; - float res2 = 0; - - for (int i = 0; i < get_local_size(1); i++) - { - res += mbuf[i]; - res2 += vbuf[i]; - } - -// requires memory reset before layer execution -#if USE_ATOMICS - int idx = (across_channels == 0) ? c : 0; - - atomic_add_global(mean + idx, res); - atomic_add_global(variance + idx, res2); -#else - int idx = c*get_num_groups(1) + get_group_id(1); - - mean[idx] = res; - variance[idx] = res2; -#endif - } -} - -__kernel void __dma_preload_mvn_scale(const __global half * restrict src, - __global half * restrict dst, - __global float * restrict mean_part, - __global float * restrict power_mean, - int W, - int H1, - int across_channels, - int normalize_variance, - int nparts, - __local half * restrict src_line, - __local half * restrict dst_line - ) -{ - WorkGroupDmaCreateStrideTransaction( - src + get_group_id(1)*get_local_size(1)*W + - get_group_id(2)*get_local_size(2)*W*get_global_size(1), // src - src_line, // dst - W*get_local_size(1) * sizeof(half), // src width - W*get_local_size(1) * sizeof(half), // dst width - W*get_global_size(1) * sizeof(half), // src stride - W*get_local_size(1) * sizeof(half), // dst stride - W*get_local_size(1)*get_local_size(2)*sizeof(half), //total size - 0 - ); -} - -__kernel void __dma_postwrite_mvn_scale(const __global half * restrict src, - __global half * restrict dst, - __global float * 
restrict mean_part, - __global float * restrict power_mean, - int W, - int H1, - int across_channels, - int normalize_variance, - int nparts, - __local half * restrict src_line, - __local half * restrict dst_line) -{ - WorkGroupDmaCreateStrideTransaction( - dst_line, // src - dst + get_group_id(1)*get_local_size(1)*W + - get_group_id(2)*get_local_size(2)*W*get_global_size(1), // dst - W*get_local_size(1) * sizeof(half), // src width - W*get_local_size(1) * sizeof(half), // dst width - W*get_local_size(1) * sizeof(half), // dst stride - W*get_global_size(1) * sizeof(half), // src stride - W*get_local_size(1)*get_local_size(2)*sizeof(half), //total size - 0 - ); -} - -__kernel void mvn_scale(const __global half * restrict src, - __global half * restrict dst, - __global float * restrict mean_part, - __global float * restrict power_mean, - int W, - int H1, - int across_channels, - int normalize_variance, - int nparts, - __local half * restrict src_line, - __local half * restrict dst_line) -{ - int h = get_global_id(1); - int H = get_global_size(1); - - // can we avoid this check and use min/max? We can pass number of groups just as a param. -//#if !USE_ATOMICS -// if (h >= H1) return; -//#endif - - int c = get_global_id(2); - int C = get_global_size(2); - - int idx = (across_channels == 0) ? nparts*c : 0; - float scale = (across_channels == 0) ? H*W : H*W*C; - -#if USE_ATOMICS - float mean = mean_part[idx]; - float variance = power_mean[idx]; -#else - - int total = (across_channels == 0) ? nparts : nparts*C; - float mean = 0.f; - float variance = 0.f; - - for (int i = 0; i < total; i++) - { - mean += mean_part[idx+i]; - variance += power_mean[idx+i]; - } -#endif - - mean = mean/scale; - variance = variance/scale; - variance = variance - mean*mean; - variance = native_sqrt(variance) + 1e-9f; - - half hmean = mean; - half hvariance = (normalize_variance == 0) ? 
1.f : (1.f / variance); - - const __local half8 * restrict src_data8 = (const __local half8 * restrict)(src_line + get_local_id(1)*W); - __local half8 * restrict dst_data8 = (__local half8 * restrict)(dst_line + get_local_id(1)*W); - - #pragma unroll 16 - for (size_t w = 0; w < W/8; w++) - { - dst_data8[w] = (src_data8[w] - hmean) * hvariance; - } - for (size_t w = W/8*8; w < W; w++) - { - dst_line[get_local_id(1)*W + w] = (src_line[get_local_id(1)*W + w] - hmean) * hvariance; - } -} - -#else - -__kernel void reduction_mean(const __global half* restrict src, - __global float* restrict mean, - __global float* restrict variance, - int W, - int H, - int across_channels, - __local half* restrict src_line) // for compatimility with DMA kernel -{ - int h = get_global_id(1); - int c = get_global_id(2); - - const int MAX_LOCAL_SIZE = 8; - - __local float mbuf[MAX_LOCAL_SIZE]; - __local float vbuf[MAX_LOCAL_SIZE]; - - mbuf[get_local_id(1)] = 0; - vbuf[get_local_id(1)] = 0; - - if (h < H) - { - float sum = 0.f; - float sum2 = 0.f; - - float8 sum4 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - float8 sum24 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - - const __global half8* src_line = (const __global half8 *)(src + c*H*W + h*W); - - #pragma unroll 16 - for (size_t w = 0; w < W/8; w++) - { - half8 sh = src_line[w]; - float8 valf = convert_float8(sh); - - sum4 += valf; - sum24 += valf*valf; - } - - for (size_t w = W/8*8; w < W; w++) - { - float val = (float)src[c*H*W + h*W + w]; - - sum += val; - sum2 += val*val; - } - - mbuf[get_local_id(1)] = sum4.s0 + sum4.s1 + sum4.s2 + sum4.s3 + sum4.s4 + sum4.s5 + sum4.s6 + sum4.s7 + sum; - vbuf[get_local_id(1)] = sum24.s0 + sum24.s1 + sum24.s2 + sum24.s3 + sum24.s4 + sum24.s5 + sum24.s6 + sum24.s7 + sum2; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if (get_local_id(1) == 0) - { - float res = 0; - float res2 = 0; - - for (int i = 0; i < get_local_size(1); i++) - { - res += mbuf[i]; - res2 += vbuf[i]; - } - -// requires 
memory reset before layer execution -#if USE_ATOMICS - int idx = (across_channels == 0) ? c : 0; - - atomic_add_global(mean + idx, res); - atomic_add_global(variance + idx, res2); -#else - int idx = c*get_num_groups(1) + get_group_id(1); - - mean[idx] = res; - variance[idx] = res2; -#endif - } -} - -__kernel void mvn_scale(const __global half * restrict src_data, - __global half * restrict dst_data, - __global float * restrict mean_part, - __global float * restrict power_mean, - int W, - int H1, - int across_channels, - int normalize_variance, - int nparts, - __local half * restrict src_line, - __local half * restrict dst_line) -{ - int h = get_global_id(1); - int H = get_global_size(1); - - // can we avoid this check and use min/max? We can pass number of groups just as a param. -//#if !USE_ATOMICS -// if (h >= H1) return; -//#endif - - int c = get_global_id(2); - int C = get_global_size(2); - - int idx = (across_channels == 0) ? nparts*c : 0; - float scale = (across_channels == 0) ? H*W : H*W*C; - -#if USE_ATOMICS - float mean = mean_part[idx]; - float variance = power_mean[idx]; -#else - - int total = (across_channels == 0) ? nparts : nparts*C; - float mean = 0.f; - float variance = 0.f; - - for (int i = 0; i < total; i++) - { - mean += mean_part[idx+i]; - variance += power_mean[idx+i]; - } -#endif - - mean = mean/scale; - variance = variance/scale; - variance = variance - mean*mean; - variance = native_sqrt(variance) + 1e-9f; - - half hmean = mean; - half hvariance = (normalize_variance == 0) ? 
1.f : (1.f / variance); - - const __global half8 * restrict src_data8 = (const __global half8 * restrict)(src_data + c*H*W + h*W); - __global half8 * restrict dst_data8 = (__global half8 * restrict)(dst_data + c*H*W + h*W); - - #pragma unroll 16 - for (size_t w = 0; w < W/8; w++) - { - dst_data8[w] = (src_data8[w] - hmean) * hvariance; - } - for (size_t w = W/8*8; w < W; w++) - { - dst_data[c*H*W + h*W + w] = (src_data[c*H*W + h*W + w] - hmean) * hvariance; - } -} - -#endif // USE_MANUAL_DMA diff --git a/inference-engine/src/vpu/custom_kernels/mvn_reduction.cl b/inference-engine/src/vpu/custom_kernels/mvn_reduction.cl new file mode 100644 index 00000000000000..ef61b489db81a2 --- /dev/null +++ b/inference-engine/src/vpu/custom_kernels/mvn_reduction.cl @@ -0,0 +1,115 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable + +// Set to 1 only if output is zeroed before kernel execution +#define USE_ATOMICS 0 + +void atomic_add_global(volatile __global float *source, const float operand) +{ + union { + unsigned int intVal; + float floatVal; + } newVal; + union { + unsigned int intVal; + float floatVal; + } prevVal; + + do { + prevVal.floatVal = *source; + newVal.floatVal = prevVal.floatVal + operand; + } while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal); +} + +__kernel void reduction_mean( + __global const half *restrict src, + __global float *restrict mean, + __global float *restrict variance, + int W, + int H, + int across_channels) +{ + __local half src_line[4 * 1024]; + event_t e; + + e = async_work_group_copy_2D2D( + src_line, // dst + src + get_group_id(1) * get_local_size(1) * W + + get_group_id(2) * get_local_size(2) * W * get_global_size(1), // src + W * get_local_size(1), // num_elements_per_line, + get_local_size(2), // num_lines, + W *
(get_global_size(1) - get_local_size(1)), // src_line_stride, + 0, // dst_line_stride, + 0); + + wait_group_events(1, &e); + + int h = get_global_id(1); + int c = get_global_id(2); + + const int MAX_LOCAL_SIZE = 8; + + __local float mbuf[MAX_LOCAL_SIZE]; + __local float vbuf[MAX_LOCAL_SIZE]; + + mbuf[get_local_id(1)] = 0; + vbuf[get_local_id(1)] = 0; + + if (h < H) { + float sum = 0.f; + float sum2 = 0.f; + + float8 sum4 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + float8 sum24 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + + const __local half8 *restrict lsrc = ((const __local half8 *)(src_line + get_local_id(1) * W)); + + #pragma unroll 16 + for (size_t w = 0; w < W / 8; w++) { + half8 sh = lsrc[w]; + float8 valf = convert_float8(sh); + + sum4 += valf; + sum24 += valf * valf; + } + + for (size_t w = W / 8 * 8; w < W; w++) { + float val = (float)src_line[get_local_id(1) * W + w]; + sum += val; + sum2 += val * val; + } + + mbuf[get_local_id(1)] = sum4.s0 + sum4.s1 + sum4.s2 + sum4.s3 + sum4.s4 + sum4.s5 + sum4.s6 + sum4.s7 + sum; + vbuf[get_local_id(1)] = + sum24.s0 + sum24.s1 + sum24.s2 + sum24.s3 + sum24.s4 + sum24.s5 + sum24.s6 + sum24.s7 + sum2; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (get_local_id(1) == 0) { + float res = 0; + float res2 = 0; + + for (int i = 0; i < get_local_size(1); i++) { + res += mbuf[i]; + res2 += vbuf[i]; + } + +// requires memory reset before layer execution +#if USE_ATOMICS + int idx = (across_channels == 0) ? 
c : 0; + + atomic_add_global(mean + idx, res); + atomic_add_global(variance + idx, res2); +#else + int idx = c * get_num_groups(1) + get_group_id(1); + + mean[idx] = res; + variance[idx] = res2; +#endif + } +} diff --git a/inference-engine/src/vpu/custom_kernels/mvn_scale.cl b/inference-engine/src/vpu/custom_kernels/mvn_scale.cl new file mode 100644 index 00000000000000..6f3d4658d30e49 --- /dev/null +++ b/inference-engine/src/vpu/custom_kernels/mvn_scale.cl @@ -0,0 +1,68 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable + +// Set to 1 only if output is zeroed before kernel execution +#define USE_ATOMICS 0 + +__attribute__((reqd_work_group_size(1, 1, 1))) __kernel void mvn_scale( + const __global half *restrict src, + __global float *restrict mean_part, + __global float *restrict power_mean, + __global half *restrict dst, + int W, + int H1, + int across_channels, + int normalize_variance, + int nparts) +{ + __local half src_line[4 * 1024]; + __local half dst_line[4 * 1024]; + + int c = get_group_id(2); + int C = get_global_size(2); + + int h = get_group_id(1); + int H = get_global_size(1); + + event_t e1 = async_work_group_copy(src_line, src + c * H * W + h * W, W, 0); + wait_group_events(1, &e1); + + int idx = (across_channels == 0) ? nparts * c : 0; + float scale = (across_channels == 0) ? H * W : H * W * C; + +#if USE_ATOMICS + float mean = mean_part[idx]; + float variance = power_mean[idx]; +#else + + int total = (across_channels == 0) ?
nparts : nparts * C; + float mean = 0.f; + float variance = 0.f; + + for (int i = 0; i < total; i++) { + mean += mean_part[idx + i]; + variance += power_mean[idx + i]; + } +#endif + + mean = mean / scale; + variance = variance / scale; + variance = variance - mean * mean; + variance = native_sqrt(variance) + 1e-9f; + + half hmean = mean; + half hvariance = (normalize_variance == 0) ? 1.f : (1.f / variance); + + for (size_t w = 0; w < W; w++) { + dst_line[w] = (src_line[w] - hmean) * hvariance; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + event_t e2 = async_work_group_copy(dst + c * H * W + h * W, dst_line, W, 0); + wait_group_events(1, &e2); +} diff --git a/inference-engine/src/vpu/custom_kernels/quantize.cl b/inference-engine/src/vpu/custom_kernels/quantize.cl deleted file mode 100644 index dd225877bff35d..00000000000000 --- a/inference-engine/src/vpu/custom_kernels/quantize.cl +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright (C) 2018-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void __dma_preload_quantize(__global half const *const restrict src, - __global half const *const restrict input_low, - __global half const *const restrict input_high, - __global half const *const restrict output_low, - __global half const *const restrict output_high, - __global half *const restrict dst, - int levels, - int input_low_size, - int input_high_size, - int output_low_size, - int output_high_size, - int W, - int C, - __local half *const restrict local_src, - __local half const *const restrict local_dst) -{ - WorkGroupDmaCreateStrideTransaction( - src + get_group_id(1) * get_local_size(1) * W, // src - local_src, // dst - W * sizeof(half), // src_width, - W * sizeof(half), // dst_width, - get_global_size(1) * W * sizeof(half), // src_stride, - W * sizeof(half), // dst_stride, - W * C * sizeof(half), // size - 0); -} - -__kernel void __dma_postwrite_quantize(__global half const *const restrict src, - 
__global half const *const restrict input_low, - __global half const *const restrict input_high, - __global half const *const restrict output_low, - __global half const *const restrict output_high, - __global half *const restrict dst, - int levels, - int input_low_size, - int input_high_size, - int output_low_size, - int output_high_size, - int W, - int C, - __local half const *const restrict local_src, - __local half const *const restrict local_dst) -{ - WorkGroupDmaCreateStrideTransaction( - local_dst, // src - dst + get_group_id(1) * get_local_size(1) * W, // dst - W * sizeof(half), // src_width, - W * sizeof(half), // dst_width, - W * sizeof(half), // src_stride, - get_global_size(1) * W * sizeof(half), // dst_stride, - W * C * sizeof(half), // size - 0); -} - -__kernel void quantize(__global half const *const restrict src, - __global half const *const restrict input_low, - __global half const *const restrict input_high, - __global half const *const restrict output_low, - __global half const *const restrict output_high, - __global half const *const restrict dst, - int levels, - int input_low_size, - int input_high_size, - int output_low_size, - int output_high_size, - int W, - int C, - __local half const *const restrict local_src, - __local half *const restrict local_dst) -{ - int h = get_global_id(1); - int H = get_global_size(1); - - for (int c = 0; c < C; c++) - { - half h_ilow = (input_low_size == 1 ? input_low[0] : input_low[c]); - half h_ihigh = (input_high_size == 1 ? input_high[0] : input_high[c]); - half h_olow = (output_low_size == 1 ? output_low[0] : output_low[c]); - half h_ohigh = (output_high_size == 1 ? output_high[0] : output_high[c]); - - half const1 = (half)(!(h_ihigh - h_ilow) ? 0.0f : convert_float(levels - 1) / (convert_float(h_ihigh) - convert_float(h_ilow))); - half const2 = (half)(!(levels - 1) ? 
0.0f : (convert_float(h_ohigh) - convert_float(h_olow)) / convert_float(levels - 1)); - - __local const half* restrict addr_src = local_src + c*W; - __local half* restrict addr_dst = local_dst + c*W; - - for (int w = 0; w < W / 8; w++) - { - half8 val = *((__local half8*)addr_src + w); -#if 1 - // round is too slow =( 902 b of code - //half8 aux = round((val - (half8)h_ilow) * (half8)const1); - - half8 aux = (val - (half8)h_ilow) * (half8)const1 + (half8)0.5h; - - aux = (half8){ - (half)(short)(aux.s0), - (half)(short)(aux.s1), - (half)(short)(aux.s2), - (half)(short)(aux.s3), - (half)(short)(aux.s4), - (half)(short)(aux.s5), - (half)(short)(aux.s6), - (half)(short)(aux.s7) - }; - - aux = aux * (half8)const2 + (half8)h_olow; - - // vector comparison add 756 b of assembly, so do in manually - // short8 a = val <= (half8)h_olow; - // short8 b = val > (half8)h_ohigh; - - short8 a; - short8 b; - a.s0 = (val.s0 <= h_ilow); - a.s1 = (val.s1 <= h_ilow); - a.s2 = (val.s2 <= h_ilow); - a.s3 = (val.s3 <= h_ilow); - a.s4 = (val.s4 <= h_ilow); - a.s5 = (val.s5 <= h_ilow); - a.s6 = (val.s6 <= h_ilow); - a.s7 = (val.s7 <= h_ilow); - - b.s0 = (val.s0 > h_ihigh); - b.s1 = (val.s1 > h_ihigh); - b.s2 = (val.s2 > h_ihigh); - b.s3 = (val.s3 > h_ihigh); - b.s4 = (val.s4 > h_ihigh); - b.s5 = (val.s5 > h_ihigh); - b.s6 = (val.s6 > h_ihigh); - b.s7 = (val.s7 > h_ihigh); - - a = ~(a-(short8)1); - b = ~(b-(short8)1); - - short8 c1 = (~a & b); - short8 c2 = (~a & ~b); - - short8 res = a & as_short8((half8)h_olow) - | c1 & as_short8((half8)h_ohigh) - | c2 & as_short8(aux); - - *((__local half8*)addr_dst + w) = as_half8(res); -#else - *((__local half8*)addr_dst + w) = val; -#endif - } - - for (int w = W & (~0x7); w < W; w++) - //for (int w = 0 ; w < W; w++) - { - half val = addr_src[w]; -#if 1 - short a = val <= h_ilow; a = ~(a-1); - short b = val > h_ihigh; b = ~(b-1); - - short c1 = (~a & b); - short c2 = (~a & ~b); - - short res = a & as_short(h_olow) - | c1 & as_short(h_ohigh) - | c2 & 
as_short(((half)(round( (val - h_ilow) * const1) * const2) + h_olow)); - - addr_dst[w] = as_half(res); -#else - addr_dst[w] = val; -#endif - } - } -} diff --git a/inference-engine/src/vpu/custom_kernels/region.cl b/inference-engine/src/vpu/custom_kernels/region.cl deleted file mode 100644 index d04b7383c60132..00000000000000 --- a/inference-engine/src/vpu/custom_kernels/region.cl +++ /dev/null @@ -1,474 +0,0 @@ -// Copyright (C) 2018-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__constant static half log_2_e = (half)1.442695040888963; // log2(exp(1.0)) - -#define ALLOW_EARLY_RETURN 1 - -#define USE_MANUAL_DMA 1 - -#if USE_MANUAL_DMA - -static void inline logistic_activate(__local const half* restrict src, - __local half* restrict dst, - int offset) -{ - half val = src[offset]; - val = 1.0h / (1.0h + exp2(val * -log_2_e)); - dst[offset] = val; -} - -__kernel void __dma_preload_region_chw( - __global const half* restrict src, - __global half* restrict _0, - __local half* restrict local_src, - __local half* restrict _1, - int W, /* 13 */ - int H, /* 13 */ - int classes, /* 20 */ - int coords, /* 4 */ - int num, /* 5 */ - int maskSize, - int doSoftmax - ) -{ - const int local_C = classes + coords + 1; - const int c = get_group_id(1)*local_C; - const int h = get_group_id(0); - - WorkGroupDmaCreateStrideTransaction( - src + c*H*W + h*W, // src - local_src, // dst - W*sizeof(half), // src_width, - W*sizeof(half), // dst_width, - W*H*sizeof(half), // src_stride, - W*sizeof(half), // dst_stride, - W*local_C*sizeof(half), // size - 0); -} - -__kernel void __dma_postwrite_region_chw( - __global half* restrict _0, - __global half* restrict dst, - __local half* restrict _1, - __local const half* restrict local_dst, - int W, /* 13 */ - int H, /* 13 */ - int classes, /* 20 */ - int coords, /* 4 */ - int num, /* 5 */ - int maskSize, - int doSoftmax - ) -{ - const int local_C = classes + coords + 1; - const 
int c = get_group_id(1)*local_C; - const int h = get_group_id(0); - - WorkGroupDmaCreateStrideTransaction( - local_dst, // src - dst + c*H*W + h*W, // dst - W*sizeof(half), // src_width, - W*sizeof(half), // dst_width, - W*sizeof(half), // src_stride, - W*H*sizeof(half), // dst_stride, - W*local_C*sizeof(half), // size - 0); -} - -__kernel void region_chw( - __global half* restrict src_data, - __global half* restrict dst_data, - __local const half* restrict local_src, - __local half* restrict local_dst, - int W, /* 13 */ - int H, /* 13 */ - int classes, /* 20 */ - int coords, /* 4 */ - int num, /* 5 */ - int maskSize, - int doSoftmax - ) -{ - const int w = get_local_id(0); - -#if ALLOW_EARLY_RETURN - if (w >= W) return; -#endif - - __local const half *restrict src = local_src + w; - __local half *restrict dst = local_dst + w; - - const int stride = W; - logistic_activate(src, dst, 0*stride); - logistic_activate(src, dst, 1*stride); - - //copy plane 2 and 3 - dst[2*stride] = src[2*stride]; - dst[3*stride] = src[3*stride]; - - logistic_activate(src, dst, 4*stride); - - src += (coords + 1)*stride; - dst += (coords + 1)*stride; - - if (doSoftmax) - { - half max_val = src[0]; - #pragma unroll 4 - for (int c = 0; c < classes; c++) - { - max_val = max(max_val, src[c*stride]); - } - - half expSum = 0.0h; - #pragma unroll 4 - for (int c = 0; c < classes; c++) - { - const half e = src[c*stride] - max_val; - const half tmp = exp2(e * log_2_e); - dst[c*stride] = tmp; - expSum += tmp; - } - - const half invExpSum = 1.0h / expSum; - #pragma unroll 4 - for (int c = 0; c < classes; c++) - { - dst[c*stride] *= invExpSum; - } - } - else - { - #pragma unroll 4 - for (int c = 0; c < classes; c++) - { - logistic_activate(src, dst, c*stride); - } - } -} - -__kernel void __dma_preload_region_hwc( - __global const half* restrict src, - __global half* restrict _0, - __local half* restrict local_src, - __local half* restrict _1, - int W, /* 13 */ - int H, /* 13 */ - int classes, /* 20 */ - 
int coords, /* 4 */ - int num, /* 5 */ - int maskSize, - int doSoftmax - ) -{ - const int local_C = classes + coords + 1; - const int c = get_group_id(1)*local_C; - const int h = get_group_id(0); - if (!doSoftmax) num = maskSize; - const int C = local_C*num; - - WorkGroupDmaCreateStrideTransaction( - src + h*W*C + c, // src - local_src, // dst - local_C*sizeof(half), // src_width, - local_C*sizeof(half), // dst_width, - C*sizeof(half), // src_stride, - local_C*sizeof(half), // dst_stride, - local_C*W*sizeof(half), // size - 0); -} - -__kernel void __dma_postwrite_region_hwc( - __global half* restrict _0, - __global half* restrict dst, - __local half* restrict _1, - __local const half* restrict local_dst, - int W, /* 13 */ - int H, /* 13 */ - int classes, /* 20 */ - int coords, /* 4 */ - int num, /* 5 */ - int maskSize, - int doSoftmax - ) -{ - // Region always outputs in CHW layout; same as postwrite_chw - const int local_C = classes + coords + 1; - const int c = get_group_id(1)*local_C; - const int h = get_group_id(0); - - WorkGroupDmaCreateStrideTransaction( - local_dst, // src - dst + c*H*W + h*W, // dst - W*sizeof(half), // src_width, - W*sizeof(half), // dst_width, - W*sizeof(half), // src_stride, - W*H*sizeof(half), // dst_stride, - W*local_C*sizeof(half), // size - 0); -} - -static void inline logistic_activate_hwc(__local const half* restrict src, - __local half* restrict dst, - int offset, - int stride) -{ - half val = src[offset]; - val = 1.0h / (1.0h + exp2(val * -log_2_e)); - dst[offset*stride] = val; -} - -__kernel void region_hwc( - __global half* restrict src_data, - __global half* restrict dst_data, - __local const half* restrict local_src, - __local half* restrict local_dst, - int W, /* 13 */ - int H, /* 13 */ - int classes, /* 20 */ - int coords, /* 4 */ - int num, /* 5 */ - int maskSize, - int doSoftmax - ) -{ - const int w = get_local_id(0); - -#if ALLOW_EARLY_RETURN - if (w >= W) return; -#endif - - const int local_C = classes + coords + 1; - - 
__local const half *restrict src = local_src + w*local_C; - __local half *restrict dst = local_dst + w; - - const int stride = W; - logistic_activate_hwc(src, dst, 0, stride); - logistic_activate_hwc(src, dst, 1, stride); - - //copy plane 2 and 3 - dst[2*stride] = src[2]; - dst[3*stride] = src[3]; - - logistic_activate_hwc(src, dst, 4, stride); - - src += coords + 1; - dst += (coords + 1)*stride; - - if (doSoftmax) - { - half max_val = src[0]; - #pragma unroll 4 - for (int c = 0; c < classes; c++) - { - max_val = max(max_val, src[c]); - } - - half expSum = 0.0h; - #pragma unroll 4 - for (int c = 0; c < classes; c++) - { - const half e = src[c] - max_val; - const half tmp = exp2(e * log_2_e); - dst[c*stride] = tmp; - expSum += tmp; - } - - const half invExpSum = 1.0h / expSum; - #pragma unroll 4 - for (int c = 0; c < classes; c++) - { - dst[c*stride] *= invExpSum; - } - } - else - { - #pragma unroll 4 - for (int c = 0; c < classes; c++) - { - logistic_activate_hwc(src, dst, c, stride); - } - } -} - -#else // defined (USE_MANUAL_DMA) - -#define NUM_CLASSES 80 - -static void inline logistic_activate(__global const half* restrict src, - __global half* restrict dst, - int offset) -{ - half val = src[offset]; - val = 1.0h / (1.0h + exp2(val * -log_2_e)); - dst[offset] = val; -} - -__kernel void region_chw( - __global const half* restrict global_src, - __global half* restrict global_dst, - __local half* restrict _0, - __local half* restrict _1, - int W, /* 13 */ - int H, /* 13 */ - int classes, /* 20 */ - int coords, /* 4 */ - int num, /* 5 */ - int maskSize, - int doSoftmax - ) -{ - const int w = get_local_id(0); - -#if ALLOW_EARLY_RETURN - if (w >= W) return; -#endif - - const int local_C = classes + coords + 1; - const int c = get_group_id(1)*local_C; - const int h = get_group_id(0); - - __global const half *restrict src = global_src + c*H*W + h*W + w; - __global half *restrict dst = global_dst + c*H*W + h*W + w; - - const int stride = H*W; - logistic_activate(src, 
dst, 0*stride); - logistic_activate(src, dst, 1*stride); - - //copy plane 2 and 3 - dst[2*stride] = src[2*stride]; - dst[3*stride] = src[3*stride]; - - logistic_activate(src, dst, 4*stride); - - src += (coords + 1)*stride; - dst += (coords + 1)*stride; - - if (doSoftmax) - { - __private half data[NUM_CLASSES]; - - half max_val = src[0]; - for (int c = 0; c < classes; c++) - { - half tmp = src[c*stride]; - data[c] = tmp; - max_val = max(max_val, tmp); - } - - half expSum = 0.0h; - for (int c = 0; c < classes; c++) - { - half tmp = half_exp(data[c] - max_val); - data[c] = tmp; - expSum += tmp; - } - - for (int c = 0; c < classes; c++) - { - dst[c*stride] = data[c] / expSum; - } - } - else - { - #pragma unroll 4 - for (int c = 0; c < classes; c++) - { - logistic_activate(src, dst, c*stride); - } - } -} - -static void inline logistic_activate_hwc(__global const half* restrict src, - __global half* restrict dst, - int offset, - int stride) -{ - half val = src[offset]; - val = 1.0h / (1.0h + exp2(val * -log_2_e)); - dst[offset*stride] = val; -} - - -__kernel void region_hwc( - __global const half* restrict global_src, - __global half* restrict global_dst, - __local half* restrict _0, - __local half* restrict _1, - int W, /* 13 */ - int H, /* 13 */ - int classes, /* 20 */ - int coords, /* 4 */ - int num, /* 5 */ - int maskSize, - int doSoftmax - ) -{ - const int w = get_local_id(0); - -#if ALLOW_EARLY_RETURN - if (w >= W) return; -#endif - - const int local_C = classes + coords + 1; - const int c = get_group_id(1)*local_C; - const int h = get_group_id(0); - const int C = num*local_C; - - __global const half *restrict src = global_src + h*W*C + w*C + c; - __global half *restrict dst = global_dst + c*H*W + h*W + w; - - const int stride = H*W; - logistic_activate_hwc(src, dst, 0, stride); - logistic_activate_hwc(src, dst, 1, stride); - - //copy plane 2 and 3 - dst[2*stride] = src[2]; - dst[3*stride] = src[3]; - - logistic_activate_hwc(src, dst, 4, stride); - - src += coords 
+ 1; - dst += (coords + 1)*stride; - - if (doSoftmax) - { - __private half data[NUM_CLASSES]; - - half max_val = src[0]; - for (int c = 0; c < classes; c++) - { - half tmp = src[c]; - data[c] = tmp; - max_val = max(max_val, tmp); - } - - half expSum = 0.0h; - for (int c = 0; c < classes; c++) - { - half tmp = half_exp(data[c] - max_val); - data[c] = tmp; - expSum += tmp; - } - - for (int c = 0; c < classes; c++) - { - dst[c*stride] = data[c] / expSum; - } - } - else - { - #pragma unroll 4 - for (int c = 0; c < classes; c++) - { - logistic_activate_hwc(src, dst, c, stride); - } - } -} - -#endif // defined (USE_MANUAL_DMA) diff --git a/inference-engine/src/vpu/custom_kernels/region_chw.cl b/inference-engine/src/vpu/custom_kernels/region_chw.cl index c728042fe85158..dba752e48b8cb4 100644 --- a/inference-engine/src/vpu/custom_kernels/region_chw.cl +++ b/inference-engine/src/vpu/custom_kernels/region_chw.cl @@ -3,75 +3,106 @@ // #pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable -#define NUM_CLASSES 80 +__constant static half log_2_e = (half)1.442695040888963; // log2(exp(1.0)) -#define nlog_2_e ((half)(-1.442695040888963)) +#define ALLOW_EARLY_RETURN 1 -static void logistic_activate(__global const half* restrict src_data, - __global half* restrict dst_data, - int offset) +static void inline logistic_activate(__local const half *restrict src, __local half *restrict dst, int offset) { - half val = src_data[offset]; - val = 1.f/(1.f + __builtin_shave_sau_exp2_f16_l_r(val*nlog_2_e)); - dst_data[offset] = val; + half val = src[offset]; + val = 1.0h / (1.0h + exp2(val * -log_2_e)); + dst[offset] = val; } -__kernel void region_ocl(__global const half* restrict src_data, - __global half* restrict dst_data, - int W, - int H, - int classes, - int coords, - int num, - int maskSize, - int doSoftmax) +__kernel void region_chw( + __global const half *restrict src_data, + __global half *restrict dst_data, + int W, + int H, + 
int classes, + int coords, + int num, + int maskSize, + int doSoftmax) { - int box_sz = H * W * (classes + coords + 1); - int pixel_pos =  min((int)get_global_id(0), H*W); - int box = get_global_id(1); + __local half local_src[13 * 13 * (4 + 1 + 80)]; + __local half local_dst[13 * 13 * (4 + 1 + 80)]; - //if (pixel_pos >= H*W) return; + const int box_sz = W * H * (classes + coords + 1); + event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(1) * box_sz, box_sz, 0); + wait_group_events(1, &e1); - logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 0*H*W); - logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 1*H*W); + const int pixel_pos = get_local_id(0); + const int stride = W * H; - //copy plane 2 and 3 - dst_data[box * box_sz + pixel_pos + 2*H*W] = src_data[box * box_sz + pixel_pos + 2*H*W]; - dst_data[box * box_sz + pixel_pos + 3*H*W] = src_data[box * box_sz + pixel_pos + 3*H*W]; +#if ALLOW_EARLY_RETURN + if (pixel_pos < W * H) +#endif + { + __local const half *restrict src = local_src + pixel_pos; + __local half *restrict dst = local_dst + pixel_pos; - logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 4*H*W); + logistic_activate(src, dst, 0 * stride); + logistic_activate(src, dst, 1 * stride); - int data_offset = box * box_sz + (coords + 1) * W * H; + //copy plane 2 and 3 + dst[2 * stride] = src[2 * stride]; + dst[3 * stride] = src[3 * stride]; - __private half data[NUM_CLASSES]; + logistic_activate(src, dst, 4 * stride); - if (doSoftmax) { - half max_val = src_data[data_offset + 0*H*W + pixel_pos]; - for (int c = 0; c < classes; c++) { - half tmp = src_data[data_offset + c*H*W + pixel_pos]; - data[c] = tmp; - max_val = max( max_val, tmp); - } + src += (coords + 1) * stride; + dst += (coords + 1) * stride; - half expSum = 0.0f; + if (doSoftmax) { + half max_val = src[0]; + #pragma unroll 4 + for (int c = 1; c < classes; c++) { + max_val = max(max_val, src[c * stride]); + } - for (int c = 0; c < classes; 
c++) { - half tmp = half_exp(data[c] - max_val); - data[c] = tmp; - expSum += tmp; - } - for (int c = 0; c < classes; c++) { - data[c] = data[c] / expSum; - } + half expSum = 0.0h; + #pragma unroll 4 + for (int c = 0; c < classes; c++) { + const half e = src[c * stride] - max_val; + const half tmp = exp2(e * log_2_e); + dst[c * stride] = tmp; + expSum += tmp; + } - for (int c = 0; c < classes; c++) { - dst_data[data_offset + c*H*W + pixel_pos + 0] = data[c]; - } - } - else { - for (int i = 0; i < classes; i++) { - logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + (5 + i)*H*W); + const half recip = 1.h / expSum; + int c = 0; + for (; c < (classes & ~0x3); c += 4) { + const half t0 = dst[(c + 0) * stride]; + const half t1 = dst[(c + 1) * stride]; + const half t2 = dst[(c + 2) * stride]; + const half t3 = dst[(c + 3) * stride]; + + const half e0 = t0 * recip; + const half e1 = t1 * recip; + const half e2 = t2 * recip; + const half e3 = t3 * recip; + + dst[(c + 0) * stride] = e0; + dst[(c + 1) * stride] = e1; + dst[(c + 2) * stride] = e2; + dst[(c + 3) * stride] = e3; + } + for (; c < classes; c++) { + dst[c * stride] *= recip; + } + } else { + #pragma unroll 4 + for (int c = 0; c < classes; c++) { + logistic_activate(src, dst, c * stride); + } } } + + barrier(CLK_LOCAL_MEM_FENCE); + + event_t e2 = async_work_group_copy(dst_data + get_group_id(1) * box_sz, local_dst, box_sz, 0); + wait_group_events(1, &e2); } diff --git a/inference-engine/src/vpu/custom_kernels/region_chw_m7_branch0.cl b/inference-engine/src/vpu/custom_kernels/region_chw_m7_branch0.cl deleted file mode 100644 index f83e8149cad85d..00000000000000 --- a/inference-engine/src/vpu/custom_kernels/region_chw_m7_branch0.cl +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (C) 2018-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -#define NUM_CLASSES 80 - -static void logistic_activate(__global const half* restrict src_data, - 
__global half* restrict dst_data, - int offset) -{ - half val = src_data[offset]; - val = 1.0f/(1.0f + native_exp(-val)); - dst_data[offset] = val; -} - -__kernel void region_ocl(__global const half* restrict src_data, - __global half* restrict dst_data, - int W, - int H, - int classes, - int coords) -{ - const int box_sz = H * W * (classes + coords + 1); - const int pixel_pos = min((int)get_global_id(0), ((H*W) - 1)); - const int box = get_global_id(1); - - logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 0*H*W); - logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 1*H*W); - - //copy plane 2 and 3 - dst_data[box * box_sz + pixel_pos + 2*H*W] = src_data[box * box_sz + pixel_pos + 2*H*W]; - dst_data[box * box_sz + pixel_pos + 3*H*W] = src_data[box * box_sz + pixel_pos + 3*H*W]; - - logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 4*H*W); - int data_offset = box * box_sz + (coords + 1) * W * H; - - __private half data[NUM_CLASSES]; - - half max_val = src_data[data_offset + 0*H*W + pixel_pos]; - for (int c = 0; c < classes; c++) { - half tmp = src_data[data_offset + c*H*W + pixel_pos]; - data[c] = tmp; - max_val = max( max_val, tmp); - } - - half expSum = 0.0f; - - for (int c = 0; c < classes; c++) { - half tmp = half_exp(data[c] - max_val); - data[c] = tmp; - expSum += tmp; - } - for (int c = 0; c < classes; c++) { - dst_data[data_offset + c*H*W + pixel_pos + 0] = data[c] / expSum; - } -} diff --git a/inference-engine/src/vpu/custom_kernels/region_chw_m7_branch1.cl b/inference-engine/src/vpu/custom_kernels/region_chw_m7_branch1.cl deleted file mode 100644 index 16298d53beb7e4..00000000000000 --- a/inference-engine/src/vpu/custom_kernels/region_chw_m7_branch1.cl +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (C) 2018-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -#define NUM_CLASSES 80 - -static void logistic_activate(__global const half* restrict 
src_data, - __global half* restrict dst_data, - int offset) -{ - half val = src_data[offset]; - val = 1.0f/(1.0f + native_exp(-val)); - dst_data[offset] = val; -} - -__kernel void region_ocl(__global const half* restrict src_data, - __global half* restrict dst_data, - int W, - int H, - int classes, - int coords) -{ - int box_sz = H * W * (classes + coords + 1); - int pixel_pos = min((int)get_global_id(0), ((H*W) - 1)); - int box = get_global_id(1); - - logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 0*H*W); - logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 1*H*W); - - //copy plane 2 and 3 - dst_data[box * box_sz + pixel_pos + 2*H*W] = src_data[box * box_sz + pixel_pos + 2*H*W]; - dst_data[box * box_sz + pixel_pos + 3*H*W] = src_data[box * box_sz + pixel_pos + 3*H*W]; - - logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 4*H*W); - - int data_offset = box * box_sz + (coords + 1) * W * H; - - for (int i = 0; i < classes; i++) { - logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + (5 + i)*H*W); - } -} diff --git a/inference-engine/src/vpu/custom_kernels/region_hwc.cl b/inference-engine/src/vpu/custom_kernels/region_hwc.cl new file mode 100644 index 00000000000000..5db751a7c88498 --- /dev/null +++ b/inference-engine/src/vpu/custom_kernels/region_hwc.cl @@ -0,0 +1,114 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable + +__constant static half log_2_e = (half)1.442695040888963; // log2(exp(1.0)) + +#define ALLOW_EARLY_RETURN 1 + +static void inline logistic_activate_hwc( + __local const half *restrict src, + __local half *restrict dst, + int offset, + int stride) +{ + half val = src[offset]; + val = 1.0h / (1.0h + exp2(val * -log_2_e)); + dst[offset * stride] = val; +} + +__kernel void region_hwc( + __global const half *restrict src, + __global half 
*restrict dst, + int W, + int H, + int classes, + int coords, + int num, + int maskSize, + int doSoftmax) +{ + __local half local_src[13 * 13 * (4 + 1 + 80)]; + __local half local_dst[13 * 13 * (4 + 1 + 80)]; + + const int pixel_pos = get_local_id(0); + + const int local_C = classes + coords + 1; + const int c = get_group_id(1) * local_C; + const int h = get_group_id(0); + + num = (doSoftmax != 0) * num + (doSoftmax == 0) * maskSize; + const int C = local_C * num; + + event_t e1 = async_work_group_copy_2D2D( + local_src, // dst + src + h * W * C + c, // src + local_C, // num_elements_per_line, + H * W, // num_lines, + C - local_C, // src_line_stride, + 0, // dst_line_stride, + 0); + + wait_group_events(1, &e1); + +#if ALLOW_EARLY_RETURN + if (pixel_pos < W * H) +#endif + { + const int w = pixel_pos % W; + const int h = pixel_pos / W; + + __local const half *restrict src = local_src + h * W * local_C + w * local_C; + __local half *restrict dst = local_dst + h * W + w; + + const int stride = H * W; + logistic_activate_hwc(src, dst, 0, stride); + logistic_activate_hwc(src, dst, 1, stride); + + //copy plane 2 and 3 + dst[2 * stride] = src[2]; + dst[3 * stride] = src[3]; + + logistic_activate_hwc(src, dst, 4, stride); + + src += coords + 1; + dst += (coords + 1) * stride; + + if (doSoftmax) { + half max_val = src[0]; + #pragma unroll 4 + for (int c = 1; c < classes; c++) { + max_val = max(max_val, src[c]); + } + + half expSum = 0.0h; + #pragma unroll 4 + for (int c = 0; c < classes; c++) { + const half e = src[c] - max_val; + const half tmp = exp2(e * log_2_e); + dst[c * stride] = tmp; + expSum += tmp; + } + + const half invExpSum = 1.0h / expSum; + #pragma unroll 4 + for (int c = 0; c < classes; c++) { + dst[c * stride] *= invExpSum; + } + } else { + #pragma unroll 4 + for (int c = 0; c < classes; c++) { + logistic_activate_hwc(src, dst, c, stride); + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + const int box_sz = W * H * (classes + coords + 1); + event_t e2 = 
async_work_group_copy(dst + get_group_id(1) * box_sz, local_dst, box_sz, 0); + wait_group_events(1, &e2); +} diff --git a/inference-engine/src/vpu/custom_kernels/reorg_chw.cl b/inference-engine/src/vpu/custom_kernels/reorg_chw.cl index 6cd2b7890e6189..1b4ac7e69bd1f2 100644 --- a/inference-engine/src/vpu/custom_kernels/reorg_chw.cl +++ b/inference-engine/src/vpu/custom_kernels/reorg_chw.cl @@ -3,119 +3,65 @@ // #pragma OPENCL EXTENSION cl_khr_fp16 : enable - -#define USE_MANUAL_DMA - -#if defined (USE_MANUAL_DMA) - -__kernel void __dma_preload_reorg_chw(__global half const *restrict src, - __global half *restrict dst, - int W, - int H, - int C, - int stride, - __local half *restrict local_src, - __local half *restrict local_dst - ) +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable + +__kernel void reorg_chw( + __global const half *restrict src, + __global half *restrict dst, + int W, + int H, + int C, + int stride) { - const int stride_y = get_group_id(1); + __local half local_src[8 * 1024]; + __local half local_dst[8 * 1024]; - const int srcIdx = stride_y*W*stride + W*stride*stride*get_group_id(0); - - WorkGroupDmaCreateStrideTransaction( - src + srcIdx, // src + event_t e1 = async_work_group_copy_2D2D( local_src, // dst - W * stride * sizeof(half), // src width - W * stride * sizeof(half), // dst width - W * stride * stride * get_num_groups(0) * sizeof(half), // src stride - W * stride * sizeof(half), // dst stride - W * stride * get_local_size(0) * sizeof(half), //total size - 0); -} - -__kernel void __dma_postwrite_reorg_chw(__global half const *restrict src, - __global half *restrict dst, - int W, - int H, - int C, - int stride, - __local half *restrict local_src, - __local half const *restrict local_dst - ) -{ - const int stride_y = get_group_id(1); - - const int dstIdx = stride_y*W*stride*get_global_size(0) + get_group_id(0)*W; - - WorkGroupDmaCreateStrideTransaction( - local_dst, // src - dst + dstIdx, // dst - W * sizeof(half), // src width - 
W * sizeof(half), // dst width - W * sizeof(half), // src stride - W * get_num_groups(0) * sizeof(half), // dst stride - get_local_size(0) * W * stride * sizeof(half), //total size + src + get_group_id(1) * W * stride + + get_group_id(0) * W * stride * stride, // src + W * stride, // num_elements_per_line, + get_local_size(0), // num_lines, + W * stride * (stride * get_num_groups(0) - 1), // src_line_stride, + 0, // dst_line_stride, 0); -} + wait_group_events(1, &e1); -__kernel void reorg_chw(__global half const *restrict src, - __global half *restrict dst, - int W, - int H, - int C, - int stride, - __local half *restrict local_src, - __local half *restrict local_dst - ) -{ - const int c = get_local_id(0); + const int c = get_local_id(0); const int stride_x = get_local_id(1); - const int srcIdx = stride_x + c*W*stride; - const int dstIdx = stride_x*W*get_local_size(0) + c*W; + const int srcIdx = stride_x + c * W * stride; + const int dstIdx = stride_x * W * get_local_size(0) + c * W; int x = 0; for (; x <= W - 8; x += 8) { - half8 data = (half8) { - local_src[srcIdx + (x + 0)*stride], local_src[srcIdx + (x + 1)*stride], - local_src[srcIdx + (x + 2)*stride], local_src[srcIdx + (x + 3)*stride], - local_src[srcIdx + (x + 4)*stride], local_src[srcIdx + (x + 5)*stride], - local_src[srcIdx + (x + 6)*stride], local_src[srcIdx + (x + 7)*stride] - }; - - *((__local half8*)(&local_dst[dstIdx + x])) = data; + half8 data = (half8){ + local_src[srcIdx + (x + 0) * stride], + local_src[srcIdx + (x + 1) * stride], + local_src[srcIdx + (x + 2) * stride], + local_src[srcIdx + (x + 3) * stride], + local_src[srcIdx + (x + 4) * stride], + local_src[srcIdx + (x + 5) * stride], + local_src[srcIdx + (x + 6) * stride], + local_src[srcIdx + (x + 7) * stride]}; + + *((__local half8 *)(&local_dst[dstIdx + x])) = data; } for (; x < W; x++) { - local_dst[dstIdx + x] = local_src[srcIdx + x*stride]; + local_dst[dstIdx + x] = local_src[srcIdx + x * stride]; } -} - -#else - -__kernel void 
reorg_chw(__global half const *restrict src, - __global half *restrict dst, - int W, - int H, - int C, - int stride, - __local half const *restrict _0, - __local half *restrict _1 - ) -{ - const int stride_x = get_local_id(1); - const int stride_y = get_group_id(1); - const int N = get_global_size(0); - const int c = get_local_id(0)*get_num_groups(0) + get_group_id(0); - const int srcIdx = c*W*stride*stride + stride_x + stride_y*W*stride; - const int dstIdx = c*W + stride_x*W*N + stride_y*W*N*stride; + barrier(CLK_LOCAL_MEM_FENCE); - #pragma unroll 8 - for (int x = 0; x < W; x++) { - dst[dstIdx + x] = src[srcIdx + x*stride]; - } + event_t e2 = async_work_group_copy_2D2D( + dst + get_group_id(0) * W + + get_group_id(1) * W * stride * get_global_size(0), // dst + local_dst, // src + W, // num_elements_per_line + get_local_size(0) * stride, // num_lines + 0, // src_line_stride + W * (get_num_groups(0) - 1), // dst_line_stride + 0); + wait_group_events(1, &e2); } - -#endif - diff --git a/inference-engine/src/vpu/custom_kernels/reorg_chw_local.cl b/inference-engine/src/vpu/custom_kernels/reorg_chw_local.cl deleted file mode 100644 index 35032cf9223c7c..00000000000000 --- a/inference-engine/src/vpu/custom_kernels/reorg_chw_local.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (C) 2018-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -// kernel with local memory buffer -__kernel void reorg(__global const half* restrict src, - __global half* restrict out, - __local half* restrict tmp, - int H, - int W, - int stride) -{ - int h = min((int)get_global_id(0), H-1); - - int c = get_global_id(1); - int C = get_global_size(1); - int C2 = C/(stride*stride); - - int offset = c / C2; - - int c2 = c - C2 * offset; - - int H2 = H*stride; - int W2 = W*stride; - - for (int w = 0; w < W; ++w) - { - int h2 = h*stride + offset / stride; - int w2 = w*stride + offset - stride * (offset / stride); - - 
tmp[get_local_id(1)*get_local_size(0)*W + get_local_id(0)*W + w] = src[W2*H2*c2 + W2*h2 + w2]; - } - - for (int w = 0; w < W; ++w) - { - out[W*H*c + W*h + w] = tmp[get_local_id(1)*get_local_size(0)*W + get_local_id(0)*W + w]; - } -} diff --git a/inference-engine/src/vpu/custom_kernels/reorg_chw_stack.cl b/inference-engine/src/vpu/custom_kernels/reorg_chw_stack.cl deleted file mode 100644 index 3e0932e7cd638b..00000000000000 --- a/inference-engine/src/vpu/custom_kernels/reorg_chw_stack.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (C) 2018-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -#define MAX_W 512 - -// kernel that uses private memory on stack -__kernel void reorg(__global const half* restrict src, - __global half* restrict out, - int H, - int W, - int stride) -{ - int h = min((int)get_global_id(0), H-1); - - int c = get_global_id(1); - int C = get_global_size(1); - int C2 = C/(stride*stride); - - int offset = c / C2; - - int c2 = c - C2 * offset; - - int b = get_global_id(2); - - __private half tmp[MAX_W]; - - int H2 = H*stride; - int W2 = W*stride; - - for (int w = 0; w < W; ++w) - { - int h2 = h*stride + offset / stride; - int w2 = w*stride + offset - stride * (offset / stride); - - tmp[w] = src[W2*H2*C2*b + W2*H2*c2 + W2*h2 + w2]; - } - - for (int w = 0; w < W; ++w) - { - out[W*H*C*b + W*H*c + W*h + w] = tmp[w]; - } -} diff --git a/inference-engine/src/vpu/custom_kernels/reorg_hwc.cl b/inference-engine/src/vpu/custom_kernels/reorg_hwc.cl index 6bbddc08f9af0e..6937bd96cfce25 100644 --- a/inference-engine/src/vpu/custom_kernels/reorg_hwc.cl +++ b/inference-engine/src/vpu/custom_kernels/reorg_hwc.cl @@ -3,66 +3,32 @@ // #pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void __dma_preload_reorg_hwc(__global half const *restrict src, - __global half *restrict _0, - int W, - int H, - int C, - int stride, - __local half *restrict local_src, - __local half *restrict _1 - ) +#pragma 
OPENCL EXTENSION cl_khr_extended_async_copies : enable + +__kernel void reorg_hwc( + __global half const *restrict src, + __global half *restrict dst, + int W, + int H, + int C, + int stride) { - const int stride_x = get_group_id(1); + __local half local_src[8 * 1024]; + __local half local_dst[8 * 1024]; - WorkGroupDmaCreateStrideTransaction( - src + get_group_id(0) * stride + stride_x * C, // src + event_t e1 = async_work_group_copy_2D2D( local_src, // dst - stride * sizeof(half), // src_width, - stride * sizeof(half), // dst_width, - C * stride * sizeof(half), // src_stride, - stride * sizeof(half), // dst_stride, - H * W * sizeof(half), // size + src + get_group_id(0) * stride + get_group_id(1) * C, // src + stride, // num_elements_per_line + H * W / stride, // num_lines + (C - 1) * stride, // src_line_stride + 0, // dst_line_stride 0); -} - -__kernel void __dma_postwrite_reorg_hwc(__global half const *restrict _0, - __global half *restrict dst, - int W, - int H, - int C, - int stride, - __local half *restrict _1, - __local half *restrict local_dst - ) -{ - const int stride_x = get_group_id(1); + wait_group_events(1, &e1); - WorkGroupDmaCreateStrideTransaction( - local_dst, // src - dst + stride_x * C + get_group_id(0) * stride, // dst - stride * sizeof(half), // src_width, - stride * sizeof(half), // dst_width, - stride * sizeof(half), // src_stride, - C * stride * sizeof(half), // dst_stride, - W * H * sizeof(half), // size - 0); -} - -__kernel void reorg_hwc(__global half const *restrict src, - __global half *restrict dst, - int W, - int H, - int C, - int stride, - __local half *restrict local_src, - __local half *restrict local_dst - ) -{ const int stride_y = get_local_id(1); - const int blocks = get_local_size(0); - const int b = get_local_id(0); + const int blocks = get_local_size(0); + const int b = get_local_id(0); const int OC = stride * stride; const int OH = H / stride; @@ -73,67 +39,27 @@ __kernel void reorg_hwc(__global half const *restrict src, for 
(int block_h = 0; block_h < stride; block_h++) { const int src_line = b * stride * stride + stride_y * stride + block_h; - const int c = src_line / IH; - const int h = src_line % IH; + const int c = src_line / IH; + const int h = src_line % IH; const int dst_line = b * stride + stride_y * blocks * stride + block_h; - const int oc = dst_line / OH; - const int oh = dst_line % OH; + const int oc = dst_line / OH; + const int oh = dst_line % OH; for (int w = 0; w < W / stride; w++) { - local_dst[oh*OW*OC + w*OC + oc] = local_src[h*IW*IC + w*IC + c]; + local_dst[oh * OW * OC + w * OC + oc] = local_src[h * IW * IC + w * IC + c]; } } -} -__kernel void reorg_hwc_naive(__global half const *restrict src, - __global half *restrict dst, - int W, - int H, - int C, - int stride, - __local half *restrict local_src, - __local half *restrict local_dst - ) -{ - const int out_c = C / (stride * stride); - const int oc = C * (stride * stride); - const int oh = H / stride; - const int ow = W / stride; + barrier(CLK_LOCAL_MEM_FENCE); - const int c = get_global_id(0); - - for (int h = 0; h < H; ++h) - { - int in_index = W * (h + H*c) + (0); - int new_z = in_index / (oh*ow); - int new_y = (in_index %(oh*ow)) / ow; - int new_x = (in_index %(oh*ow)) % ow; - int new_index = new_z + new_x * oc + new_y * oc * ow; - - in_index++; - - int c2 = c % out_c; - int offset = c / out_c; - int w2 = 0 * stride + offset % stride; - int h2 = h * stride + offset / stride; - int out_index = w2 + W * stride * (h2 + H * stride * c2); - - #pragma unroll 2 - for(int i = 0; i < W; ++i, out_index+=stride, in_index++) - { - // repacking coordinates - int k0 = out_index / (H*W); - int j0 = (out_index % (H*W)) / W; - int i0 = (out_index % (H*W)) % W; - int out_index_repack = k0 + C * i0 + C * W * j0; - - dst[new_index] = src[out_index_repack]; - - int new_z = in_index / (oh*ow); - int new_y = (in_index %(oh*ow)) / ow; - int new_x = (in_index %(oh*ow)) % ow; - new_index = new_z + new_x * oc + new_y * oc * ow; - } - } + 
event_t e2 = async_work_group_copy_2D2D( + dst + get_group_id(1) * C + get_group_id(0) * stride, // dst + local_dst, // src + stride, // num_elements_per_line + W * H / stride, // num_lines + 0, // src_line_stride + C * stride - stride, // dst_line_stride + 0); + wait_group_events(1, &e2); } diff --git a/inference-engine/src/vpu/custom_kernels/reorg_hwc_naive.cl b/inference-engine/src/vpu/custom_kernels/reorg_hwc_naive.cl new file mode 100644 index 00000000000000..72841984916d61 --- /dev/null +++ b/inference-engine/src/vpu/custom_kernels/reorg_hwc_naive.cl @@ -0,0 +1,53 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__kernel void reorg_hwc_naive( + __global half const *restrict src, + __global half *restrict dst, + int W, + int H, + int C, + int stride) +{ + const int out_c = C / (stride * stride); + const int oc = C * (stride * stride); + const int oh = H / stride; + const int ow = W / stride; + + const int c = get_global_id(0); + + for (int h = 0; h < H; ++h) { + int in_index = W * (h + H * c) + (0); + int new_z = in_index / (oh * ow); + int new_y = (in_index % (oh * ow)) / ow; + int new_x = (in_index % (oh * ow)) % ow; + int new_index = new_z + new_x * oc + new_y * oc * ow; + + in_index++; + + int c2 = c % out_c; + int offset = c / out_c; + int w2 = 0 * stride + offset % stride; + int h2 = h * stride + offset / stride; + int out_index = w2 + W * stride * (h2 + H * stride * c2); + + #pragma unroll 2 + for (int i = 0; i < W; ++i, out_index += stride, in_index++) { + // repacking coordinates + int k0 = out_index / (H * W); + int j0 = (out_index % (H * W)) / W; + int i0 = (out_index % (H * W)) % W; + int out_index_repack = k0 + C * i0 + C * W * j0; + + dst[new_index] = src[out_index_repack]; + + int new_z = in_index / (oh * ow); + int new_y = (in_index % (oh * ow)) / ow; + int new_x = (in_index % (oh * ow)) % ow; + new_index = new_z + new_x * oc + new_y * oc * ow; + } + 
} +} diff --git a/inference-engine/src/vpu/custom_kernels/resample_AA.cl b/inference-engine/src/vpu/custom_kernels/resample_AA.cl new file mode 100644 index 00000000000000..905eb4e928c47e --- /dev/null +++ b/inference-engine/src/vpu/custom_kernels/resample_AA.cl @@ -0,0 +1,122 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable + +#define USE_OPTIMIZED_ROUND + +#ifdef USE_OPTIMIZED_ROUND +#define ROUND(x) ((int)((x) + 0.5f)) +#else +#define ROUND(x) (int)(round(x)) +#endif + +inline int out_to_in(float ox, float f) +{ +#ifdef USE_OPTIMIZED_ROUND + return (int)((ox + 0.5f) / f); +#else + return ROUND((ox + 0.5f) / f - 0.5f); +#endif +} + +static inline float triangleCoeff(float x) { return 1.0f - fabs(x); } + +static inline float4 triangleCoeff4(float4 x) { return 1.0f - fabs(x); } + +__kernel void resample_with_antialias( + __global const half *restrict src, + __global half *restrict dst, + int iw, + int ih, + float factor, + int ow, + int oh, + int channels) +{ + __local half local_src[20 * 1024]; + __local half local_dst[8 * 1024]; + + const int r = (factor > 1.0f) ? 
2 : ceil(1.0f / factor); + const int oy_first = get_group_id(1) * get_local_size(1); + const int oy_last = (get_group_id(1) + 1) * get_local_size(1) - 1; + const int iy_first = max(out_to_in(oy_first, factor) - r, 0); + const int iy_last = min(out_to_in(oy_last, factor) + r, ih - 1); + const int iy_size = iy_last - iy_first + 1; + + event_t e1 = async_work_group_copy_2D2D( + local_src, // dst + src + get_group_id(2) * get_local_size(2) * ih * iw + iy_first * iw, // src + iy_size * iw, // num_elements_per_line, + get_local_size(2), // num_lines, + (ih - iy_size) * iw, // src_line_stride, + 0, // dst_line_stride, + 0); + wait_group_events(1, &e1); + + const int oy = get_global_id(1); + const float iy_f = ((oy + 0.5f) / factor - 0.5f) - iy_first; + const int iy = ROUND(iy_f); + + __local half const *restrict start_src = + local_src + iw * get_local_id(1) + iw * iy_size * get_local_id(2); + __local half *restrict start_dst = + local_dst + ow * get_local_id(1) + ow * get_local_size(1) * get_local_id(2); + + for (int ox = 0; ox < ow; ox++) { + const float ix_f = (float)((ox + 0.5f) / factor) - 0.5f; + const int ix_i = ROUND(ix_f); + + float4 v_sum = 0.f; + float4 v_wsum = 0.f; + for (int y = 0; y < iy_size; y++) { + float dy = iy_f - y; + int x = max(ix_i - r, 0); + int end_x = min(ix_i + r, iw - 1); + + float4 dx; + for (int i = 0; i < 4; i++) dx[i] = ix_f - x - i; + + for (; x < end_x - 3; x += 4, dx -= 4) { + float4 w = + factor * triangleCoeff4(factor * dx) * factor * triangleCoeff(factor * dy); + float4 src_vec = { + start_src[y * iw + x + 0], + start_src[y * iw + x + 1], + start_src[y * iw + x + 2], + start_src[y * iw + x + 3]}; + + v_sum += w * src_vec; + v_wsum += w; + } + + for (; x <= end_x; x++) { + float dx = ix_f - x; + float w = factor * triangleCoeff(factor * dx) * factor * triangleCoeff(factor * dy); + + v_sum[0] += w * start_src[y * iw + x]; + v_wsum[0] += w; + } + } + + v_sum[0] = v_sum[0] + v_sum[1] + v_sum[2] + v_sum[3]; + v_wsum[0] = v_wsum[0] + 
v_wsum[1] + v_wsum[2] + v_wsum[3]; + + start_dst[get_local_id(1) * ow + ox] = (!v_wsum[0]) ? 0.0f : (half)(v_sum[0] / v_wsum[0]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + event_t e2 = async_work_group_copy_2D2D( + dst + get_group_id(2) * get_local_size(2) * get_global_size(1) * ow + + get_group_id(1) * get_local_size(1) * ow, // dst + local_dst, // src + get_local_size(1) * ow, // num_elements_per_line, + get_local_size(2), // num_lines, + 0, // src_line_stride, + (get_global_size(1) - get_local_size(1)) * ow, // dst_line_stride, + 0); + wait_group_events(1, &e2); +} diff --git a/inference-engine/src/vpu/custom_kernels/resample_nn.cl b/inference-engine/src/vpu/custom_kernels/resample_nn.cl deleted file mode 100644 index 9584cb2518f340..00000000000000 --- a/inference-engine/src/vpu/custom_kernels/resample_nn.cl +++ /dev/null @@ -1,173 +0,0 @@ -// Copyright (C) 2018-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -#define USE_OPTIMIZED_ROUND - -#ifdef USE_OPTIMIZED_ROUND - #define ROUND(x) ((int)((x) + 0.5f)) -#else - #define ROUND(x) (int)(round(x)) -#endif - -inline int out_to_in(float ox, float f) { - return (int)((ox + 0.5f) * f); -} - -#define USE_MANUAL_DMA - -#if defined (USE_MANUAL_DMA) - -void interpolationCHW_nn(__local half* psrc, __local half* pdst, int OW, int IW, int C, float rw, float rh) -{ - float alpha = rh / 2.0f - 0.5f; - - for (int w = 0; w < OW/8; w++) - { - float fw0 = rw*(w*8+0) + alpha; - float fw1 = rw*(w*8+1) + alpha; - float fw2 = rw*(w*8+2) + alpha; - float fw3 = rw*(w*8+3) + alpha; - - float fw4 = rw*(w*8+4) + alpha; - float fw5 = rw*(w*8+5) + alpha; - float fw6 = rw*(w*8+6) + alpha; - float fw7 = rw*(w*8+7) + alpha; - - int iw0 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw0), IW-1); - int iw1 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw1), IW-1); - int iw2 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw2), IW-1); - int iw3 = 
__builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw3), IW-1); - - int iw4 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw4), IW-1); - int iw5 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw5), IW-1); - int iw6 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw6), IW-1); - int iw7 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw7), IW-1); - - for (int c = 0; c < C; c++) - { - half8 val = { - *((__local half*)(psrc + c * IW + iw0)), - *((__local half*)(psrc + c * IW + iw1)), - - *((__local half*)(psrc + c * IW + iw2)), - *((__local half*)(psrc + c * IW + iw3)), - - *((__local half*)(psrc + c * IW + iw4)), - *((__local half*)(psrc + c * IW + iw5)), - - *((__local half*)(psrc + c * IW + iw6)), - *((__local half*)(psrc + c * IW + iw7)), - }; - *((__local half8*)(pdst + c * OW + w*8)) = val; - } - } - - for (int w = OW/8*8; w < OW; w++) - { - float fw = rw*w + alpha; - int iw0 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw), IW-1); - - for (int c = 0; c < C; c++) - { - *((__local half*)(pdst + c * OW + w)) = *((__local half*)(psrc + c * IW + iw0)); - } - } -} - -__kernel void __dma_preload_resample_nearest(__global const half* restrict src, - __global half* restrict _0, - __local half* restrict local_src, - __local half* restrict _1, - int iw, - int ih, - float factor, - int ow, - int oh, - int channels) -{ - const int oy_first = get_group_id(1) * get_local_size(1); - const int oy_last = (get_group_id(1) + 1) * get_local_size(1) - 1; - const int iy_first = out_to_in(oy_first, 1.0 / factor); - const int iy_last = out_to_in(oy_last, 1.0 /factor); - const int iy_size = iy_last - iy_first + 1; - - WorkGroupDmaCreateStrideTransaction( - src + get_group_id(2)*channels*ih*iw + iy_first*iw, // src - local_src, // dst - iy_size * iw * sizeof(half), // src_width, - iy_size * iw * sizeof(half), // dst_width, - ih * iw * sizeof(half), // src_stride, - iy_size * iw * sizeof(half), // dst_stride, - channels * iy_size * iw * sizeof(half), // size - 0); -} - -__kernel void 
__dma_postwrite_resample_nearest(__global const half* restrict _0, - __global half* restrict dst, - __local half* restrict _1, - __local half* restrict local_dst, - int iw, - int ih, - float factor, - int ow, - int oh, - int channels) -{ - - WorkGroupDmaCreateStrideTransaction( - local_dst, // src - dst + get_group_id(2)*channels*get_global_size(1)*ow + get_group_id(1)*get_local_size(1)*ow, // dst - get_local_size(1) * ow * sizeof(half), // src_width, - get_local_size(1) * ow * sizeof(half), // dst_width, - get_local_size(1) * ow * sizeof(half), // src_stride, - get_global_size(1) * ow * sizeof(half), // dst_stride, - channels * get_local_size(1) * ow * sizeof(half), // size - 0); -} - -kernel void resample_nearest(__global const half* restrict src, - __global half* restrict dst, - __local half* restrict local_src, - __local half* restrict local_dst, - int iw, - int ih, - float factor, - int ow, - int oh, - int channels) -{ - interpolationCHW_nn(local_src, local_dst, ow, iw, channels, 1.0 / factor, 1.0 / factor); -} - -#else // defined (USE_MANUAL_DMA) - -kernel void resample_nearest(__global const half* restrict src, - __global half* restrict dst, - __local half* restrict local_src, - __local half* restrict local_dst, - int iw, - int ih, - float factor, - int ow, - int oh, - int channels) -{ - const float inv_factor = 1.0f / factor; - const int iy = out_to_in(get_global_id(1), inv_factor); - - __global half* dst_data = dst + get_global_id(1)*ow; - __global half* src_data = src + iy*iw; - - for (int ox = 0; ox < ow; ++ox) - { - const int ix = out_to_in(ox, inv_factor); - for (int c = 0; c < channels; c++) { - dst_data[c*oh*ow + ox] = src_data[c*ih*iw + ix]; - } - } -} - -#endif // defined (USE_MANUAL_DMA) diff --git a/inference-engine/src/vpu/custom_kernels/resample_noAA.cl b/inference-engine/src/vpu/custom_kernels/resample_noAA.cl new file mode 100644 index 00000000000000..77885b6a40c5cb --- /dev/null +++ b/inference-engine/src/vpu/custom_kernels/resample_noAA.cl 
@@ -0,0 +1,112 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable + +#define USE_OPTIMIZED_ROUND + +#ifdef USE_OPTIMIZED_ROUND +#define ROUND(x) ((int)((x) + 0.5f)) +#else +#define ROUND(x) (int)(round(x)) +#endif + +inline int out_to_in(float ox, float f) { return (int)((ox + 0.5f) * f); } + +void interpolationCHW_nn(__local half *psrc, __local half *pdst, int OW, int IW, int C, float rw, float rh) +{ + float alpha = rh / 2.0f - 0.5f; + + for (int w = 0; w < OW / 8; w++) { + float fw0 = rw * (w * 8 + 0) + alpha; + float fw1 = rw * (w * 8 + 1) + alpha; + float fw2 = rw * (w * 8 + 2) + alpha; + float fw3 = rw * (w * 8 + 3) + alpha; + + float fw4 = rw * (w * 8 + 4) + alpha; + float fw5 = rw * (w * 8 + 5) + alpha; + float fw6 = rw * (w * 8 + 6) + alpha; + float fw7 = rw * (w * 8 + 7) + alpha; + + int iw0 = min((int)ROUND(fw0), IW - 1); + int iw1 = min((int)ROUND(fw1), IW - 1); + int iw2 = min((int)ROUND(fw2), IW - 1); + int iw3 = min((int)ROUND(fw3), IW - 1); + + int iw4 = min((int)ROUND(fw4), IW - 1); + int iw5 = min((int)ROUND(fw5), IW - 1); + int iw6 = min((int)ROUND(fw6), IW - 1); + int iw7 = min((int)ROUND(fw7), IW - 1); + + for (int c = 0; c < C; c++) { + half8 val = { + *((__local half *)(psrc + c * IW + iw0)), + *((__local half *)(psrc + c * IW + iw1)), + *((__local half *)(psrc + c * IW + iw2)), + *((__local half *)(psrc + c * IW + iw3)), + + *((__local half *)(psrc + c * IW + iw4)), + *((__local half *)(psrc + c * IW + iw5)), + *((__local half *)(psrc + c * IW + iw6)), + *((__local half *)(psrc + c * IW + iw7)), + }; + *((__local half8 *)(pdst + c * OW + w * 8)) = val; + } + } + + for (int w = OW / 8 * 8; w < OW; w++) { + float fw = rw * w + alpha; + int iw0 = min((int)ROUND(fw), IW - 1); + + for (int c = 0; c < C; c++) { + *((__local half *)(pdst + c * OW + w)) = *((__local half *)(psrc + c * IW + iw0)); 
+ } + } +} + +kernel void resample_nearest( + __global const half *restrict src, + __global half *restrict dst, + int iw, + int ih, + float factor, + int ow, + int oh, + int channels) +{ + __local half local_src[14 * 1024]; + __local half local_dst[14 * 1024]; + + const int oy_first = get_group_id(1) * get_local_size(1); + const int oy_last = (get_group_id(1) + 1) * get_local_size(1) - 1; + const int iy_first = out_to_in(oy_first, 1.0 / factor); + const int iy_last = out_to_in(oy_last, 1.0 / factor); + + const int iy_size = iy_last - iy_first + 1; + + event_t e1 = async_work_group_copy_2D2D( + local_src, // dst + src + get_group_id(2) * channels * ih * iw + iy_first * iw, // src + iy_size * iw, // num_elements_per_line, + channels, // num_lines, + ih * iw - iy_size * iw, // src_line_stride, + 0, // dst_line_stride, + 0); + + wait_group_events(1, &e1); + + interpolationCHW_nn(local_src, local_dst, ow, iw, channels, 1.0 / factor, 1.0 / factor); + + event_t e2 = async_work_group_copy_2D2D( + dst + get_group_id(2) * channels * get_global_size(1) * ow + get_group_id(1) * get_local_size(1) * ow, // dst + local_dst, // src + get_local_size(1) * ow, // size_t num_elements_per_line, + channels, // size_t num_lines, + 0, // size_t src_line_stride, + get_global_size(1) * ow - get_local_size(1) * ow, // size_t dst_line_stride, + 0); + + wait_group_events(1, &e2); +} diff --git a/inference-engine/src/vpu/custom_kernels/resample_with_antialias.cl b/inference-engine/src/vpu/custom_kernels/resample_with_antialias.cl deleted file mode 100644 index 26d310dc3405d3..00000000000000 --- a/inference-engine/src/vpu/custom_kernels/resample_with_antialias.cl +++ /dev/null @@ -1,245 +0,0 @@ -// Copyright (C) 2018-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -#define USE_OPTIMIZED_ROUND - -#ifdef USE_OPTIMIZED_ROUND - #define ROUND(x) ((int)((x) + 0.5f)) -#else - #define ROUND(x) (int)(round(x)) -#endif - - -inline int 
out_to_in(float ox, float f) { -#ifdef USE_OPTIMIZED_ROUND - return (int)((ox + 0.5f) / f); -#else - return ROUND((ox + 0.5f) / f - 0.5f); -#endif -} - -static inline float triangleCoeff(float x) -{ - return 1.0f - fabs(x); -} - -static inline float4 triangleCoeff4(float4 x) -{ - return 1.0f - fabs(x); -} - -static inline half triangleCoeffHalf(half x) -{ - return 1.0h - fabs(x); -} - -static inline half4 triangleCoeffHalf4(half4 x) -{ - return 1.0h - fabs(x); -} - -static inline half8 triangleCoeffHalf8(half8 x) -{ - return 1.0h - fabs(x); -} - -#define USE_MANUAL_DMA - -#if defined (USE_MANUAL_DMA) - -__kernel void __dma_preload_resample_with_antialias(__global const half* restrict src, - __global half* restrict _0, - __local half* restrict local_src, - __local half* restrict _1, - int iw, - int ih, - float factor, - int ow, - int oh, - int channels) -{ - const int r = (factor > 1.0f) ? 2 : ceil(1.0f / factor); - const int oy_first = get_group_id(1) * get_local_size(1); - const int oy_last = (get_group_id(1) + 1) * get_local_size(1) - 1; - const int iy_first = max(out_to_in(oy_first, factor) - r, 0); - const int iy_last = min(out_to_in(oy_last, factor) + r, ih - 1); - const int iy_size = iy_last - iy_first + 1; - - WorkGroupDmaCreateStrideTransaction( - src + get_group_id(2)*get_local_size(2)*ih*iw + iy_first*iw, // src - local_src, // dst - iy_size * iw * sizeof(half), // src_width, - iy_size * iw * sizeof(half), // dst_width, - ih * iw * sizeof(half), // src_stride, - iy_size * iw * sizeof(half), // dst_stride, - get_local_size(2) * iy_size * iw * sizeof(half), // size - 0); -} - -__kernel void __dma_postwrite_resample_with_antialias(__global const half* restrict _0, - __global half* restrict dst, - __local half* restrict _1, - __local half* restrict dst_local, - int iw, - int ih, - float factor, - int ow, - int oh, - int channels) -{ - WorkGroupDmaCreateStrideTransaction( - dst_local, // src - dst + get_group_id(2)*get_local_size(2)*get_global_size(1)*ow + 
get_group_id(1)*get_local_size(1)*ow, // dst - get_local_size(1) * ow * sizeof(half), // src_width, - get_local_size(1) * ow * sizeof(half), // dst_width, - get_local_size(1) * ow * sizeof(half), // src_stride, - get_global_size(1) * ow * sizeof(half), // dst_stride, - get_local_size(2) * get_local_size(1) * ow * sizeof(half), // size - 0); -} - -__kernel void resample_with_antialias(const __global half* restrict src, - __global half* restrict dst, - __local half* restrict local_src, - __local half* restrict local_dst, - int iw, - int ih, - float factor, - int ow, - int oh, - int channels) -{ - const int r = (factor > 1.0f) ? 2 : ceil(1.0f / factor); - const int oy_first = get_group_id(1) * get_local_size(1); - const int oy_last = (get_group_id(1) + 1) * get_local_size(1) - 1; - const int iy_first = max(out_to_in(oy_first, factor) - r, 0); - const int iy_last = min(out_to_in(oy_last, factor) + r, ih - 1); - const int iy_size = iy_last - iy_first + 1; - const int oy = get_global_id(1); - const float iy_f = ((oy + 0.5f) / factor - 0.5f) - iy_first; - const int iy = ROUND(iy_f); - - __local half const *restrict start_src = local_src + iw * get_local_id(1) + iw * iy_size * get_local_id(2); - __local half *restrict start_dst = local_dst + ow * get_local_id(1) + ow * get_local_size(1) * get_local_id(2); - - for (int ox = 0; ox < ow; ox++) - { - const float ix_f = (float)((ox + 0.5f) / factor) - 0.5f; - const int ix_i = ROUND(ix_f); - - float4 v_sum = 0.f; - float4 v_wsum = 0.f; - for (int y = 0; y < iy_size; y++) - { - float dy = iy_f - y; - int x = max(ix_i - r, 0); - int end_x = min(ix_i + r, iw - 1); - - float4 dx; - for (int i = 0; i < 4; i++) - dx[i] = ix_f - x - i; - - for (; x < end_x - 3; x += 4, dx -= 4) - { - float4 w = factor*triangleCoeff4(factor*dx) * factor*triangleCoeff(factor*dy); - float4 src_vec = { start_src[y*iw + x + 0], - start_src[y*iw + x + 1], - start_src[y*iw + x + 2], - start_src[y*iw + x + 3] }; - - v_sum += w * src_vec; - v_wsum += w; - } - - 
for (; x <= end_x; x++) - { - float dx = ix_f - x; - float w = factor*triangleCoeff(factor*dx) * factor*triangleCoeff(factor*dy); - - v_sum[0] += w * start_src[y*iw + x]; - v_wsum[0] += w; - } - } - - v_sum[0] = v_sum[0] + v_sum[1] + v_sum[2] + v_sum[3]; - v_wsum[0] = v_wsum[0] + v_wsum[1] + v_wsum[2] + v_wsum[3]; - - start_dst[get_local_id(1)*ow + ox] = (!v_wsum[0]) ? 0.0f : (half)(v_sum[0] / v_wsum[0]); - } -} - -#else - -__kernel void resample_with_antialias(const __global half* restrict src, - __global half* restrict dst, - __local half* restrict _0, - __local half* restrict _1, - int iw, - int ih, - float factor, - int ow, - int oh, - int channels) -{ - int oy = get_global_id(1); - int c = get_global_id(2); - - int r = (factor > 1.0f) ? 2 : ceil((1.0f)/factor); - - const __global half* restrict start_src = src + iw * ih * c; - __global half* restrict start_dst = dst + ow * oh * c; - - float iy_f = (oy + 0.5) / factor - 0.5f; - int iy_i = ROUND(iy_f); - - for (int ox = 0; ox < ow; ox++) - { - float ix_f = (ox + 0.5) / factor - 0.5f; - int ix_i = ROUND(ix_f); - - float4 v_sum = 0.f; - float4 v_wsum = 0.f; - - for (int y = max(iy_i - r, 0); y <= min(iy_i + r, (int)ih - 1); y++) - { - float dy = iy_f - y; - int x = max(ix_i - r, 0); - int end_x = min(ix_i + r, (int)iw - 1); - - float4 dx; - for (int i = 0; i < 4; i++) - dx[i] = ix_f - x - i; - - for (; x <= end_x - 3; x += 4, dx -= 4) - { - float4 w = factor*triangleCoeff4(factor*dx) * factor*triangleCoeff(factor*dy); - float4 src_vec = { start_src[y*iw + x + 0], - start_src[y*iw + x + 1], - start_src[y*iw + x + 2], - start_src[y*iw + x + 3] }; - - v_sum += w * src_vec; - v_wsum += w; - } - - for (; x <= end_x; x++) - { - float dx = ix_f - x; - float w = factor*triangleCoeff(factor*dx) * factor*triangleCoeff(factor*dy); - - v_sum[0] += w * start_src[y*iw + x]; - v_wsum[0] += w; - } - } - - v_sum[0] = v_sum[0] + v_sum[1] + v_sum[2] + v_sum[3]; - v_wsum[0] = v_wsum[0] + v_wsum[1] + v_wsum[2] + v_wsum[3]; - - 
start_dst[oy*ow + ox] = (!v_wsum[0]) ? (half)0.0f : (half)(v_sum[0] / v_wsum[0]); - } -} - -#endif diff --git a/inference-engine/src/vpu/custom_kernels/shuffle_channels.cl b/inference-engine/src/vpu/custom_kernels/shuffle_channels.cl index 237e26fe4d6060..3a54d5ecd6e076 100644 --- a/inference-engine/src/vpu/custom_kernels/shuffle_channels.cl +++ b/inference-engine/src/vpu/custom_kernels/shuffle_channels.cl @@ -4,12 +4,13 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void ShuffleChannel(__global const half* restrict src_data, - __global half* restrict dst_data, - int C, - int H, - int W, - int G) +__kernel void ShuffleChannel( + __global const half *restrict src_data, + __global half *restrict dst_data, + int C, + int H, + int W, + int G) { int c = get_global_id(0); if (c >= C) return; @@ -18,16 +19,15 @@ __kernel void ShuffleChannel(__global const half* restrict src_data, int cy = c % G; int cx = c / G; - __global const half8* src_line = ((__global const half8*)(src_data + cy*CX*H*W + cx*H*W)); - __global half8* dst_line = ((__global half8*)(dst_data + cx*CY*H*W + cy*H*W)); + __global const half8 *src_line = + ((__global const half8 *)(src_data + cy * CX * H * W + cx * H * W)); + __global half8 *dst_line = ((__global half8 *)(dst_data + cx * CY * H * W + cy * H * W)); - for (int i = 0; i < W*H/8; i++) - { + for (int i = 0; i < W * H / 8; i++) { dst_line[i] = src_line[i]; } - for (int i = W*H/8*8; i < W*H; i++) - { - dst_data[cx*CY*H*W + cy*H*W + i] = src_data[cy*CX*H*W + cx*H*W + i]; + for (int i = W * H / 8 * 8; i < W * H; i++) { + dst_data[cx * CY * H * W + cy * H * W + i] = src_data[cy * CX * H * W + cx * H * W + i]; } } diff --git a/inference-engine/src/vpu/custom_kernels/st.cl b/inference-engine/src/vpu/custom_kernels/st.cl index bac1606edbc11d..fdef731654492f 100644 --- a/inference-engine/src/vpu/custom_kernels/st.cl +++ b/inference-engine/src/vpu/custom_kernels/st.cl @@ -3,51 +3,29 @@ // #pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma 
OPENCL EXTENSION cl_khr_extended_async_copies : enable #define MAX_WIDTH 512 -#define MIN(a, b) ((a) < (b)) ? (a) : (b); - -__kernel void __dma_postwrite_ocl_st(__global half const *const restrict src_data, - __global half const *const restrict theta, - __global half *const restrict dst_data, - int C, - int W, - __local half const *const restrict local_dst) -{ - const int x0 = get_global_id(0) * MAX_WIDTH; - const int x1 = MIN(x0 + MAX_WIDTH, W); - const int length = x1 - x0; - WorkGroupDmaCreate3DTransaction( - local_dst, // src - dst_data + get_global_id(1) * W + x0, // dst - length * sizeof(half), // src width - length * sizeof(half), // dst width - length * sizeof(half), // src stride - W * sizeof(half), // dst stride - C, // num planes - get_local_size(1) * length * sizeof(half), // src plane stride - get_global_size(1) * W * sizeof(half), // dst plane stride - get_local_size(1) * length * sizeof(half), // plane size - 0); -} - -__attribute__((noinline)) -void calcInd(__global half const *const restrict theta, - half *const restrict weight, - int *const restrict ind, - int y, int H, int x0, int length, int step, int W) +__attribute__((noinline)) void calcInd( + __global const half *restrict theta, + __local half *restrict weight, + __local int *restrict ind, + int y, + int H, + int x0, + int length, + int step, + int W) { float a = (float)y * 1.0f / H * 2 - 1; int x = 0; - float8 va = (float8) {a, a, a, a, a, a, a, a}; - float8 vxy = (float8) {x0 + 0, x0 + 1, x0 + 2, x0 + 3, - x0 + 4, x0 + 5, x0 + 6, x0 + 7}; + float8 va = (float8){a, a, a, a, a, a, a, a}; + float8 vxy = (float8){x0 + 0, x0 + 1, x0 + 2, x0 + 3, x0 + 4, x0 + 5, x0 + 6, x0 + 7}; - for (; x <= length - 8; x += 8, vxy += 8) - { + for (; x <= length - 8; x += 8, vxy += 8) { float8 va1 = vxy * 1.0f / W * 2 - 1.f; float8 vx = (va * theta[0] + va1 * theta[1] + theta[2] + 1.f) / 2.f * H; @@ -61,21 +39,27 @@ void calcInd(__global half const *const restrict theta, float8 bx = 1.f - ax; float8 by = 1.f - 
ay; - union {int8 d; uint8 i; } check_x; + union { + int8 d; + uint8 i; + } check_x; check_x.d = ix; - int8 b01 = check_x.i < (uint8)H; + int8 b01 = check_x.i < (uint8)H; check_x.d = ix + 1; - int8 b45 = check_x.i < (uint8)H; + int8 b45 = check_x.i < (uint8)H; - union {int8 d; uint8 i; } check_y; + union { + int8 d; + uint8 i; + } check_y; check_y.d = iy; - int8 b23 = check_y.i < (uint8)W; + int8 b23 = check_y.i < (uint8)W; check_y.d = iy + 1; - int8 b67 = check_y.i < (uint8)W; + int8 b67 = check_y.i < (uint8)W; int8 b0123 = b01 & b23; int8 b0167 = b01 & b67; @@ -87,33 +71,48 @@ void calcInd(__global half const *const restrict theta, int8 TR_id = ((ix + 0) * W + (iy + 1)) * (b0167 & 1); int8 BR_id = ((ix + 1) * W + (iy + 1)) * (b4567 & 1); - union {float8 f; int8 i;} w0; w0.f = bx * by; - union {float8 f; int8 i;} w1; w1.f = ax * by; - union {float8 f; int8 i;} w2; w2.f = bx * ay; - union {float8 f; int8 i;} w3; w3.f = ax * ay; + union { + float8 f; + int8 i; + } w0; + w0.f = bx * by; + union { + float8 f; + int8 i; + } w1; + w1.f = ax * by; + union { + float8 f; + int8 i; + } w2; + w2.f = bx * ay; + union { + float8 f; + int8 i; + } w3; + w3.f = ax * ay; w0.i = w0.i & b0123; w1.i = w1.i & b4523; w2.i = w2.i & b0167; w3.i = w3.i & b4567; - *((half8*)(weight + x + 0*step)) = convert_half8(w0.f); - *((half8*)(weight + x + 1*step)) = convert_half8(w1.f); - *((half8*)(weight + x + 2*step)) = convert_half8(w2.f); - *((half8*)(weight + x + 3*step)) = convert_half8(w3.f); + *((__local half8 *)(weight + x + 0 * step)) = convert_half8(w0.f); + *((__local half8 *)(weight + x + 1 * step)) = convert_half8(w1.f); + *((__local half8 *)(weight + x + 2 * step)) = convert_half8(w2.f); + *((__local half8 *)(weight + x + 3 * step)) = convert_half8(w3.f); - *((int8*)(ind + x + 0*step)) = TL_id; - *((int8*)(ind + x + 1*step)) = BL_id; - *((int8*)(ind + x + 2*step)) = TR_id; - *((int8*)(ind + x + 3*step)) = BR_id; + *((__local int8 *)(ind + x + 0 * step)) = TL_id; + *((__local int8 
*)(ind + x + 1 * step)) = BL_id; + *((__local int8 *)(ind + x + 2 * step)) = TR_id; + *((__local int8 *)(ind + x + 3 * step)) = BR_id; } - for (; x < length; x++) - { + for (; x < length; x++) { float a1 = (float)(x0 + x) * 1.0f / W * 2 - 1; - float fx = (a * theta[0] + a1 * theta[1] + theta[2] + 1)/2 * H; - float fy = (a * theta[3] + a1 * theta[4] + theta[5] + 1)/2 * W; + float fx = (a * theta[0] + a1 * theta[1] + theta[2] + 1) / 2 * H; + float fy = (a * theta[3] + a1 * theta[4] + theta[5] + 1) / 2 * W; const int ix = (int)(fx) - (fx < 0); const int iy = (int)(fy) - (fy < 0); @@ -123,15 +122,15 @@ void calcInd(__global half const *const restrict theta, float bx = 1 - ax; float by = 1 - ay; - int b0 = ix >= 0; + int b0 = ix >= 0; int b4 = ix >= -1; - int b1 = ix < H; - int b5 = ix < H-1; + int b1 = ix < H; + int b5 = ix < H - 1; - int b2 = iy >= 0; + int b2 = iy >= 0; int b6 = iy >= -1; - int b3 = iy < W; - int b7 = iy < W-1; + int b3 = iy < W; + int b7 = iy < W - 1; int b01 = b0 & b1; int b23 = b2 & b3; @@ -148,69 +147,79 @@ void calcInd(__global half const *const restrict theta, int TR_id = ((ix + 0) * W + (iy + 1)) * b0167; int BR_id = ((ix + 1) * W + (iy + 1)) * b4567; - half w0 = bx*by*b0123; - half w1 = ax*by*b4523; - half w2 = bx*ay*b0167; - half w3 = ax*ay*b4567; + half w0 = bx * by * b0123; + half w1 = ax * by * b4523; + half w2 = bx * ay * b0167; + half w3 = ax * ay * b4567; - weight[x + 0*step] = w0; - weight[x + 1*step] = w1; - weight[x + 2*step] = w2; - weight[x + 3*step] = w3; + weight[x + 0 * step] = w0; + weight[x + 1 * step] = w1; + weight[x + 2 * step] = w2; + weight[x + 3 * step] = w3; - ind[x + 0*step] = TL_id; - ind[x + 1*step] = BL_id; - ind[x + 2*step] = TR_id; - ind[x + 3*step] = BR_id; + ind[x + 0 * step] = TL_id; + ind[x + 1 * step] = BL_id; + ind[x + 2 * step] = TR_id; + ind[x + 3 * step] = BR_id; } } -__attribute__((noinline)) -void apply(__global half const *const restrict src, - half const *const restrict weight, - int const *const 
restrict ind, - __local half *const restrict dst, - int length, - int step) +__attribute__((noinline)) void apply( + __global half const *restrict src, + __local half const *restrict weight, + __local int const *restrict ind, + __local half *restrict dst, + int src_stride, + int step) { int x = 0; - for(; x <= length - 8; x += 8) - { - int8 TL_id = *((int8*)(ind + x + 0*step)); - int8 BL_id = *((int8*)(ind + x + 1*step)); - int8 TR_id = *((int8*)(ind + x + 2*step)); - int8 BR_id = *((int8*)(ind + x + 3*step)); - - half8 w00 = *((half8*)(weight + x + 0*step)); - half8 w01 = *((half8*)(weight + x + 1*step)); - half8 w02 = *((half8*)(weight + x + 2*step)); - half8 w03 = *((half8*)(weight + x + 3*step)); - - half8 TL = (half8){src[TL_id[0]], src[TL_id[1]], src[TL_id[2]], src[TL_id[3]], - src[TL_id[4]], src[TL_id[5]], src[TL_id[6]], src[TL_id[7]]}; - half8 TR = (half8){src[TR_id[0]], src[TR_id[1]], src[TR_id[2]], src[TR_id[3]], - src[TR_id[4]], src[TR_id[5]], src[TR_id[6]], src[TR_id[7]]}; - half8 BL = (half8){src[BL_id[0]], src[BL_id[1]], src[BL_id[2]], src[BL_id[3]], - src[BL_id[4]], src[BL_id[5]], src[BL_id[6]], src[BL_id[7]]}; - half8 BR = (half8){src[BR_id[0]], src[BR_id[1]], src[BR_id[2]], src[BR_id[3]], - src[BR_id[4]], src[BR_id[5]], src[BR_id[6]], src[BR_id[7]]}; - - half8 res = w00 * TL + w01 * BL + w02 * TR + w03 * BR; - - *((__local half8*)(dst + x)) = res; + for (; x <= src_stride - 8; x += 8) { + int8 TL_id = *((__local int8 *)(ind + x + 0 * step)); + int8 BL_id = *((__local int8 *)(ind + x + 1 * step)); + int8 TR_id = *((__local int8 *)(ind + x + 2 * step)); + int8 BR_id = *((__local int8 *)(ind + x + 3 * step)); + + half8 w00 = *((__local half8 *)(weight + x + 0 * step)); + half8 w01 = *((__local half8 *)(weight + x + 1 * step)); + half8 w02 = *((__local half8 *)(weight + x + 2 * step)); + half8 w03 = *((__local half8 *)(weight + x + 3 * step)); + + half8 TL = (half8){ + src[TL_id[0]], src[TL_id[1]], + src[TL_id[2]], src[TL_id[3]], + src[TL_id[4]], 
src[TL_id[5]], + src[TL_id[6]], src[TL_id[7]]}; + half8 TR = (half8){ + src[TR_id[0]], src[TR_id[1]], + src[TR_id[2]], src[TR_id[3]], + src[TR_id[4]], src[TR_id[5]], + src[TR_id[6]], src[TR_id[7]]}; + half8 BL = (half8){ + src[BL_id[0]], src[BL_id[1]], + src[BL_id[2]], src[BL_id[3]], + src[BL_id[4]], src[BL_id[5]], + src[BL_id[6]], src[BL_id[7]]}; + half8 BR = (half8){ + src[BR_id[0]], src[BR_id[1]], + src[BR_id[2]], src[BR_id[3]], + src[BR_id[4]], src[BR_id[5]], + src[BR_id[6]], src[BR_id[7]]}; + + half8 res = w00 * TL + w01 * BL + w02 * TR + w03 * BR; + + *((__local half8 *)(dst + x)) = res; } - for (; x < length; x++) - { - int TL_id = ind[x + 0*step]; - int BL_id = ind[x + 1*step]; - int TR_id = ind[x + 2*step]; - int BR_id = ind[x + 3*step]; + for (; x < src_stride; x++) { + int TL_id = ind[x + 0 * step]; + int BL_id = ind[x + 1 * step]; + int TR_id = ind[x + 2 * step]; + int BR_id = ind[x + 3 * step]; - half w00 = weight[x + 0*step]; - half w01 = weight[x + 1*step]; - half w02 = weight[x + 2*step]; - half w03 = weight[x + 3*step]; + half w00 = weight[x + 0 * step]; + half w01 = weight[x + 1 * step]; + half w02 = weight[x + 2 * step]; + half w03 = weight[x + 3 * step]; half TL = src[TL_id]; half TR = src[TR_id]; @@ -218,36 +227,52 @@ void apply(__global half const *const restrict src, half BR = src[BR_id]; half res = w00 * TL + w01 * BL + w02 * TR + w03 * BR; + dst[x] = res; } } -__kernel void ocl_st(__global half const *const restrict src_data, - __global half const *const restrict theta, - __global half const *const restrict dst_data, - int C, - int W, - __local half *const restrict local_dst) +__kernel void ocl_st( + __global half const *const restrict src_data, + __global half const *const restrict theta, + __global half *const restrict dst_data, + int C, + int W) { + __local int ind[4 * MAX_WIDTH] __attribute__((aligned(16))); + __local half weight[4 * MAX_WIDTH] __attribute__((aligned(16))); + __local half local_dst[4 * 1024]; + int w = get_group_id(0); 
int y = get_global_id(1); int H = get_global_size(1); - __private int ind[4][MAX_WIDTH] __attribute__((aligned(16))); - __private half weight[4][MAX_WIDTH] __attribute__((aligned(16))); - - const int x0 = w * MAX_WIDTH; - const int x1 = MIN(x0 + MAX_WIDTH, W); - const int length = x1 - x0; + const int x0 = w * MAX_WIDTH; + const int x1 = min(x0 + MAX_WIDTH, W); + const int src_stride = x1 - x0; - calcInd(theta, weight, ind, y, H, x0, length, MAX_WIDTH, W); + calcInd(theta, weight, ind, y, H, x0, src_stride, MAX_WIDTH, W); - for (int c = 0; c < C; c++) - { - __global half const *const restrict src = src_data + c*H*W; - __local half *const restrict dst = local_dst + c*get_local_size(1)*length + get_local_id(1)*length; + for (int c = 0; c < C; c++) { + __global half const *restrict src = src_data + c * H * W; + __local half *restrict dst = local_dst + c * get_local_size(1) * src_stride + get_local_id(1) * src_stride; - apply(src, weight, ind, dst, length, MAX_WIDTH); + apply(src, weight, ind, dst, src_stride, MAX_WIDTH); } + + barrier(CLK_LOCAL_MEM_FENCE); + + event_t e = async_work_group_copy_3D3D( + dst_data + get_group_id(1) * get_local_size(1) * W + x0, // dst + local_dst, // src + src_stride, // num_elements_per_line + get_local_size(1), // num_lines + 0, // src_line_stride + W - src_stride, // dst_line_stride + C, // num planes + 0, // src plane stride + W * (get_global_size(1) - get_local_size(1)), // dst plane stride + 0); + wait_group_events(1, &e); } diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/frontend/ShaveElfMetadata.h b/inference-engine/src/vpu/graph_transformer/include/vpu/frontend/ShaveElfMetadata.h new file mode 100644 index 00000000000000..f6d0645a43d5cc --- /dev/null +++ b/inference-engine/src/vpu/graph_transformer/include/vpu/frontend/ShaveElfMetadata.h @@ -0,0 +1,188 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef SHAVE_METADATA_H_INCLUDED +#define 
SHAVE_METADATA_H_INCLUDED + +#include + + +enum { + md_invalid_index = ~0u, +}; + +enum md_version_t { + md_version_1_0 = 0x00010000, // version 1.0 + md_version_1_1 = 0x00010001, // version 1.1 + md_version_1_2 = 0x00010002, // version 1.2 + md_version_latest = md_version_1_2 +}; + +struct md_header_t { + uint32_t version; // 0xFFFF0000 = Major 0x0000FFFF = Minor + + // md_kernel_descriptor_t array info + uint32_t kernel_count; // number of kernels in the .metadata + uint32_t kernel_first; // absolute byte offset to first + // md_kernel_descriptor_t from start of .metadata + + // md_kernel_argument_t array info + uint32_t arg_count; // number of arguments in the .metadata + uint32_t arg_first; // absolute byte offset to first + // md_kernel_argument_t from start of .metadata + + // md_kernel_sipp_info_t array info + uint32_t sipp_info_count; // number of sipp dma infos in .metadata + uint32_t sipp_info_first; // absolute byte offset to first + // md_kernel_sipp_info_t from start of .metadata + + // md_expr_t array info + uint32_t expr_count; // number of expressions in .metadata + uint32_t expr_first; // absolute byte offset to first + // kernel_expr_t from start of .metadata + + // md_expr_node_t array info + uint32_t expr_node_count; // number of expression nodes in .metadata + uint32_t expr_node_first; // absolute byte offset to first md_expr_node_t + // from start of .metadata + + // function table + uint32_t func_count; // number of functions in the function table + uint32_t func_first; // absolute byte offset to the first md_function_t +}; + +struct md_function_t { + uint32_t load_address; // runtime address of a kernel function +}; + +struct md_kernel_variant_t { + uint32_t name; // offset into the string table of the kernel name + uint32_t factor; // vector width / unroll factor + uint32_t func; // index into the kernel function table +}; + +enum md_kernel_variant_type_t { + md_variant_scalar = 0, // basic scalar kernel + md_variant_vectorized, // kernel 
has been vectorized + md_variant_unrolled, // kernel has been loop unrolled + md_variant_sipp_dma, // sipp dma kernel + md_variant_sipp_dma_vectorized, // vectorized sipp dma kernel + md_variant_dma_preload, // kernel preload function + md_variant_dma_postwrite, // kernel postwrite function + md_variant_dma_fallback, // kernel fallback function + md_VARIANT_COUNT +}; + +constexpr int kVariantCount = md_VARIANT_COUNT; + +enum md_kernel_flags_t { + md_kernel_flags_ddr_write = 1u, // kernel writes to DDR memory + md_kernel_flags_ddr_read = 2u, // kernel reads from DDR memory + md_kernel_flags_generated_prepost = 4u, // kernel has an autogenerated prepost +}; + +struct md_kernel_descriptor_t { + uint32_t flags; // combination of md_kernel_flags_t + + uint32_t arg_count; // number of arguments for this kernel + uint32_t arg_index; // index of first kernel_argument_t + + uint32_t sipp_dma_in_count; // number of SIPP dma input arguments (or 0 if no SIPP dma) + uint32_t sipp_dma_out_count; // number of SIPP dma output arguments (or 0 if no SIPP dma) + uint32_t sipp_info_index; // index into the kernel_sipp_info_t list + + uint32_t name; // metadata string table offset for kernel name + + uint32_t stack_size_wg; // estimate of stack usage per work group (fixed) + uint32_t stack_size_wi; // estimate of stack usage per work item + + // kernel variant list + md_kernel_variant_t variant[kVariantCount]; +}; + +enum md_arg_addr_space_t { + md_addr_space_private = 0, + md_addr_space_global, // global address space (ddr) + md_addr_space_constant, // + md_addr_space_local, // local address space (cmx) + + md_addr_space_undef, // none of the others +}; + +enum md_arg_flags_t { + md_arg_flags_dma_input = 1u, // local argument is being read from + md_arg_flags_dma_output = 2u, // local argument is being written to + md_arg_flags_dma_double_buffer = 4u, // local argument should be double buffered + md_arg_flags_generated_prepost = 8u, // preload and post write are auto generated +}; + 
+struct md_kernel_argument_t { + uint32_t flags; // bitfield of md_arg_flags_t + uint32_t name; // argument name + uint32_t array_size_expr; // index to a `kernel_expr_t` type for evaluating total number of element + uint32_t size_elm; // size in bytes of the underlying element + md_arg_addr_space_t addr_space; // the arguments address space + uint32_t alignment; // alignment require in bytes + uint32_t arg_pack_offset; // offset into the argument pack +}; + +struct md_kernel_sipp_info_t { + uint32_t num_dims; // number of dimensions of the dma + uint32_t span_x; + uint32_t span_y; + + // below are all indexes to a 'kernel_expr_t' + uint32_t elm_size; // size in bytes of the element + uint32_t stride_y; // stride in elm_size in y axis + uint32_t stride_z; // z + uint32_t base; // address of the base of the buffer + uint32_t size_x; // size in elements for x dim + uint32_t size_y; // y + uint32_t size_z; // z + uint32_t max_x; // max work item index in x dim + uint32_t max_y; // y + uint32_t max_z; // z +}; + +enum md_expr_node_type_t { + md_type_global_size = 0, // global work size + md_type_local_size, // local work size + md_type_param, // kernel parameter + md_type_immediate, // uint32_t immediate value + + md_type_op_umul, // unsigned multiply + md_type_op_udiv, // unsigned divide + + md_type_op_add, // add + md_type_op_sub, // subtract + + md_type_op_min, // signed min + md_type_op_max, // signed max + md_type_op_umin, // unsigned min + md_type_op_umax, // unsigned max + + md_type_op_and, // bitwise and + md_type_op_or, // bitwise or + md_type_op_xor, // bitwise xor + + md_type_op_shl, // left shift + md_type_op_lshr, // right shift + + // more operators as needed + // ... 
+}; + +struct md_expr_node_t { + md_expr_node_type_t type; // type of this expression node + uint32_t value; // immediate or operand +}; + +struct md_expr_t { + uint32_t node_count; // number of md_expr_node_t's that make up this + // expression + uint32_t node_first; // index of the first md_expr_node_t that + // is part of this expression +}; + +#endif // SHAVE_METADATA_H_INCLUDED diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/frontend/ShaveElfMetadataParser.h b/inference-engine/src/vpu/graph_transformer/include/vpu/frontend/ShaveElfMetadataParser.h new file mode 100644 index 00000000000000..51b7800a4bc4dc --- /dev/null +++ b/inference-engine/src/vpu/graph_transformer/include/vpu/frontend/ShaveElfMetadataParser.h @@ -0,0 +1,225 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef SHAVE_METADATA_PARSER_H_INCLUDED +#define SHAVE_METADATA_PARSER_H_INCLUDED + +#include +#include +#include +#include + +#include "ShaveElfMetadata.h" + + +struct md_parser_t { + md_parser_t(const uint8_t *data, size_t data_size, + const char *strtab, + size_t strtab_size) + : hdr(reinterpret_cast(data)), + kernel_descriptor(reinterpret_cast( + data + hdr->kernel_first)), + kernel_argument(reinterpret_cast( + data + hdr->arg_first)), + kernel_sipp_info(reinterpret_cast( + data + hdr->sipp_info_first)), + expr_node(reinterpret_cast( + data + hdr->expr_node_first)), + expr(reinterpret_cast(data + hdr->expr_first)), + func(reinterpret_cast(data + hdr->func_first)), + strtab(strtab), strtab_size(strtab_size) { + (void)data_size; + (void)strtab_size; + assert(hdr->version == md_version_latest); + } + + // Return the metadata version + // + md_version_t get_version() const { + return static_cast(hdr->version); + } + + // Get a kernel by name + // + const md_kernel_descriptor_t *get_kernel(const std::string &name) const { + for (uint32_t i=0; i < hdr->kernel_count; ++i) { + const md_kernel_descriptor_t *d = get_kernel(i); + 
const char *n = get_name(d); + if (name == n) { + return d; + } + } + return nullptr; + } + + // Get a kernel id by name + // + int get_kernel_id(const std::string& name) const { + for (uint32_t i = 0; i < hdr->kernel_count; ++i) { + const md_kernel_descriptor_t* d = get_kernel(i); + const char* n = get_name(d); + if (name == n) { + return i; + } + } + return -1; + } + + // Return true if a kernel has a specific variant + // + bool kernel_has_variant(const md_kernel_descriptor_t *kernel, + md_kernel_variant_type_t variant) const { + const auto &v = kernel->variant[ variant ]; + return v.name != md_invalid_index && + v.func != md_invalid_index; + } + + // return the load address of a kernel variant + // + uint32_t get_kernel_load_addr(const md_kernel_descriptor_t *kernel, const md_kernel_variant_type_t variant) { + if (!kernel_has_variant(kernel, variant)) { + return 0; + } + const auto &v = kernel->variant[ variant ]; + const md_function_t &f = func[v.func]; + return f.load_address; + } + + // Get a rough stack size estimate for a kernel variant + // + uint32_t get_kernel_stack_estimate(const md_kernel_descriptor_t *kernel, + md_kernel_variant_type_t variant, + const uint32_t local_size[3]) const { + const uint32_t local_area = local_size[0] * local_size[1] * local_size[2]; + const uint32_t per_wi = local_area * kernel->stack_size_wi; + const uint32_t per_wg = kernel->stack_size_wg; + const uint32_t factor = kernel->variant[variant].factor; + switch (variant) { + case md_variant_vectorized: + case md_variant_unrolled: return per_wg + per_wi * factor; + case md_variant_scalar: + default: return per_wg + per_wi; + } + } + + // Return the number of local arguments a kernel has + // + uint32_t get_num_local_args(const md_kernel_descriptor_t *kernel) const { + uint32_t out = 0; + for (uint32_t i = 0; i < kernel->arg_count; ++i) { + const md_kernel_argument_t *arg = get_argument(kernel->arg_index + i); + out += arg->addr_space == md_addr_space_local; + } + return out; + 
} + + // Get the number of distinct kernels in this file + // + uint32_t get_kernel_count() const { + return hdr->kernel_count; + } + + // Get a function by index + // + const md_function_t *get_func_ptr(uint32_t index) const { + assert(index != md_invalid_index && index < hdr->func_count); + return func + index; + } + + // Get a kernel by load address + // + const md_kernel_descriptor_t *get_kernel_by_addr(uint32_t addr) const { + for (uint32_t i = 0; i < hdr->kernel_count; ++i) { + const md_kernel_descriptor_t *desc = get_kernel(i); + for (uint32_t j = 0; j < md_VARIANT_COUNT; ++j) { + const uint32_t index = desc->variant[j].func; + if (index == md_invalid_index) { + continue; + } + const md_function_t *ptr = get_func_ptr(index); + if (ptr->load_address == addr) { + return desc; + } + } + } + return nullptr; + } + + // Get a kernel by index + // + const md_kernel_descriptor_t *get_kernel(uint32_t index) const { + assert(index < hdr->kernel_count); + return kernel_descriptor + index; + } + + // Get an argument by index + // + const md_kernel_argument_t *get_argument(uint32_t index) const { + assert(index < hdr->arg_count); + return kernel_argument + index; + } + + // Get SIPP info by index + // + const md_kernel_sipp_info_t *get_sipp_info(uint32_t index) const { + assert(index < hdr->sipp_info_count); + return kernel_sipp_info + index; + } + + // Get an expression node by index + // + const md_expr_node_t *get_expr_node(uint32_t index) const { + assert(index < hdr->expr_node_count); + return expr_node + index; + } + + // Get an expression by index + // + const md_expr_t *get_expr(uint32_t index) const { + assert(index < hdr->expr_count); + return expr + index; + } + + // Get a kernel argument for a specific kernel by position + // + const md_kernel_argument_t *get_argument(const md_kernel_descriptor_t *kernel, uint32_t index) const { + assert(index < kernel->arg_count); + return get_argument(kernel->arg_index + index); + } + + // Return the name of a kernel + // + 
const char *get_name(const md_kernel_descriptor_t *kernel) const { + return strtab + kernel->name; + } + + // Return the name of an argument + // + const char *get_name(const md_kernel_argument_t *arg) const { + return strtab + arg->name; + } + + // Evaluate an arbitary expression + // + uint32_t evaluate_expr(const md_expr_t *expression, + const uint32_t local_size[3], + const uint32_t global_size[3], + const uint32_t *param, + uint32_t param_count) const; + +protected: + // structure parsers + const md_header_t *hdr; + const md_kernel_descriptor_t *kernel_descriptor; + const md_kernel_argument_t *kernel_argument; + const md_kernel_sipp_info_t *kernel_sipp_info; + const md_expr_node_t *expr_node; + const md_expr_t *expr; + const md_function_t *func; + // string table + const char *strtab; + const size_t strtab_size; +}; + +#endif // SHAVE_METADATA_PARSER_H_INCLUDED diff --git a/inference-engine/src/vpu/graph_transformer/src/frontend/ShaveElfMetadataParser.cpp b/inference-engine/src/vpu/graph_transformer/src/frontend/ShaveElfMetadataParser.cpp new file mode 100644 index 00000000000000..d8c14661f06d66 --- /dev/null +++ b/inference-engine/src/vpu/graph_transformer/src/frontend/ShaveElfMetadataParser.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpu/frontend/ShaveElfMetadataParser.h" +#include + +namespace { + +// two operand operator evaluation +uint32_t md_eval_expression_type_op_2( + const md_expr_node_type_t type, + const uint32_t lhs, + const uint32_t rhs) { + switch (type) { + case md_type_op_umul: return lhs * rhs; + case md_type_op_udiv: return lhs / rhs; + case md_type_op_add: return (int32_t)lhs + (int32_t)rhs; + case md_type_op_sub: return (int32_t)lhs - (int32_t)rhs; + case md_type_op_min: return std::min((int32_t)lhs, (int32_t)rhs); + case md_type_op_max: return std::max((int32_t)lhs, (int32_t)rhs); + case md_type_op_umin: return std::min(lhs, rhs); + case md_type_op_umax: 
return std::max(lhs, rhs); + case md_type_op_and: return lhs & rhs; + case md_type_op_or: return lhs | rhs; + case md_type_op_xor: return lhs ^ rhs; + case md_type_op_shl: return lhs << rhs; + case md_type_op_lshr: return lhs >> rhs; + default: + assert(!"unknown node type"); + return 0; + } +} +} // namespace + +uint32_t md_parser_t::evaluate_expr(const md_expr_t *expression, + const uint32_t local_size[3], + const uint32_t global_size[3], + const uint32_t *param, + uint32_t param_count) const { + // find the nodes for the given expr_index + assert(expression->node_first < hdr->expr_node_count); + const md_expr_node_t *node = expr_node + expression->node_first; + // the intermediate value stack + std::vector values; + // for all of the nodes in this expression + for (uint32_t i = 0; i < expression->node_count; ++i) { + // get the node + const md_expr_node_t &v = node[i]; + // dispatch the opcode + switch (v.type) { + case md_type_immediate: + values.push_back(v.value); + break; + case md_type_op_umul: { + case md_type_op_udiv: + case md_type_op_add: + case md_type_op_sub: + case md_type_op_min: + case md_type_op_max: + case md_type_op_umin: + case md_type_op_umax: + case md_type_op_and: + case md_type_op_or: + case md_type_op_xor: + case md_type_op_shl: + case md_type_op_lshr: + uint32_t rhs = values.rbegin()[0]; + uint32_t lhs = values.rbegin()[1]; + values.pop_back(); + values.back() = md_eval_expression_type_op_2(v.type, lhs, rhs); + } + break; + case md_type_global_size: + assert(v.value < 3); + values.push_back(global_size[v.value]); + break; + case md_type_local_size: + assert(v.value < 3); + values.push_back(local_size[v.value]); + break; + case md_type_param: + assert(v.value < param_count); + values.push_back(param[v.value]); + break; + default: + assert(!"unknown node type"); + } + } + // should only be one value remaining which is the result + assert(values.size() == 1); + return values.back(); +} diff --git 
a/inference-engine/src/vpu/graph_transformer/src/frontend/custom_kernel.cpp b/inference-engine/src/vpu/graph_transformer/src/frontend/custom_kernel.cpp index c95750cc024438..da70641421e951 100644 --- a/inference-engine/src/vpu/graph_transformer/src/frontend/custom_kernel.cpp +++ b/inference-engine/src/vpu/graph_transformer/src/frontend/custom_kernel.cpp @@ -2,20 +2,30 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include #include +#include +#include +#include #include +#include namespace vpu { +VPU_PACKED(Elf32Shdr { + uint32_t shName; + uint32_t pad0[3]; + uint32_t shOffset; + uint32_t shSize; + uint32_t pad1[4]; +};) + VPU_PACKED(Elf32Ehdr { - uint8_t offs1[28]; - uint32_t ePhoff; // Program header offset - uint32_t eShoff; // Section header offset - uint8_t offs2[12]; - uint16_t eShnum; // Number of sections - uint16_t offs3; + uint32_t pad0[7]; + uint32_t ePhoff; + uint32_t eShoff; + uint32_t pad1[3]; + uint16_t eShnum; + uint16_t eShstrndx; };) VPU_PACKED(Elf32Section { @@ -95,111 +105,66 @@ std::pair findSymbolTable( return std::make_pair(strShdr, symShdr); } -SmallVector deduceKernelParameters( - const char* ELFData, - uint32_t kernelAddress) { - IE_ASSERT(ELFData != nullptr); - const auto cmp = ie::details::CaselessEq{}; - - auto ehdr = reinterpret_cast(ELFData); - auto phdr = reinterpret_cast(ELFData + ehdr->ePhoff); - auto shdr = reinterpret_cast(ELFData + ehdr->eShoff); - - const Elf32Section* strShdr = nullptr; - const Elf32Section* symShdr = nullptr; - std::tie(strShdr, symShdr) = findSymbolTable(ELFData); - IE_ASSERT(symShdr != nullptr && strShdr != nullptr); - - auto numSymEntries = symShdr->shSize / symShdr->shEntsize; - auto sym = reinterpret_cast(ELFData + symShdr->shOffset); - auto firstStr = ELFData + strShdr->shOffset; - - const char* kernelArgStrings = nullptr; - for (size_t i = 0; i < numSymEntries; i++) { - if (cmp(firstStr + sym[i].stName, "opencl.kernelArgs.strings")) { - kernelArgStrings = ELFData + 
shdr[sym[i].stShndx].shOffset; - break; +SmallVector deduceKernelParameters(const md_parser_t& parser, int kernelId) { + const auto kernelDesc = parser.get_kernel(kernelId); + IE_ASSERT(kernelDesc != nullptr); + // Number of elements we get from parser is always greater by one + const auto argCount = kernelDesc->arg_count - 1; + + auto arguments = SmallVector{}; + arguments.reserve(argCount); + for (size_t i = 0; i < argCount; i++) { + const auto arg = parser.get_argument(kernelDesc, i); + VPU_THROW_UNLESS(arg, "Error while parsing custom layer elf file."); + + // skip hoisted buffers + if (arg->flags & md_arg_flags_generated_prepost) { + continue; } - } - IE_ASSERT(kernelArgStrings != nullptr); - - SmallVector parameters; - for (size_t i = 0; i < numSymEntries; i++) { - if (cmp(firstStr + sym[i].stName, "opencl.kernelArgs.info")) { - auto ptr = ELFData + shdr[sym[i].stShndx].shOffset; - auto numKernels = *reinterpret_cast(ptr); - - auto metaOffset = sizeof(int); - for (int k = 0; k < numKernels; k++) { - auto kHdr = reinterpret_cast(ptr + metaOffset); - if (kHdr->address-phdr->pVaddr == kernelAddress) { - auto aHdr = reinterpret_cast( - reinterpret_cast(&(kHdr->argOffset)) + sizeof(kHdr->argOffset) + kHdr->argOffset); - - auto numArgs = reinterpret_cast(aHdr)[-1]; - for (int n = 0; n < numArgs; n++, aHdr++) { - parameters.push_back(kernelArgStrings + aHdr->stringOffset); - } - - break; - } - - metaOffset += kHdr->sectionSize + sizeof(kHdr->address) + sizeof(kHdr->flags); - } - } + const auto argName = parser.get_name(arg); + arguments.emplace_back(argName); } - return parameters; + return arguments; } -int32_t getKernelId( - const char* ELFData, - uint32_t kernelAddress) { - IE_ASSERT(ELFData != nullptr); - const auto cmp = ie::details::CaselessEq{}; +static const Elf32Shdr *get_elf_section_with_name(const uint8_t *elf_data, const char* section_name) { + IE_ASSERT(elf_data); + IE_ASSERT(section_name); - auto ehdr = reinterpret_cast(ELFData); - auto phdr = 
reinterpret_cast(ELFData + ehdr->ePhoff); - auto shdr = reinterpret_cast(ELFData + ehdr->eShoff); + const auto *ehdr = reinterpret_cast(elf_data); + IE_ASSERT(0 != ehdr->eShoff); + IE_ASSERT(0 != ehdr->ePhoff); - const Elf32Section* strShdr = nullptr; - const Elf32Section* symShdr = nullptr; - std::tie(strShdr, symShdr) = findSymbolTable(ELFData); - IE_ASSERT(symShdr != nullptr && strShdr != nullptr); + // Pointer to the first section header + const Elf32Shdr *shdr = reinterpret_cast(elf_data + ehdr->eShoff); - auto numSymEntries = symShdr->shSize / symShdr->shEntsize; - auto sym = reinterpret_cast(ELFData + symShdr->shOffset); - auto firstStr = ELFData + strShdr->shOffset; + // Pointer to section header string table header + const Elf32Shdr *strShdr = &shdr[ehdr->eShstrndx]; - const char* kernelArgStrings = nullptr; - for (size_t i = 0; i < numSymEntries; i++) { - if (cmp(firstStr + sym[i].stName, "opencl.kernelArgs.strings")) { - kernelArgStrings = ELFData + shdr[sym[i].stShndx].shOffset; - break; - } + // We couldn't find sections for the symbol string names and for the symbols + // entries + if (!strShdr) { + return nullptr; } - IE_ASSERT(kernelArgStrings != nullptr); - - for (size_t i = 0; i < numSymEntries; i++) { - if (cmp(firstStr + sym[i].stName, "opencl.kernelArgs.info")) { - auto ptr = ELFData + shdr[sym[i].stShndx].shOffset; - auto numKernels = *reinterpret_cast(ptr); - auto metaOffset = sizeof(int); - for (int k = 0; k < numKernels; k++) { - auto kHdr = reinterpret_cast(ptr + metaOffset); + // The string at index 0, which corresponds to the first byte, is a null + // character + const char *firstStr = reinterpret_cast(elf_data + strShdr->shOffset); - if (kHdr->address-phdr->pVaddr == kernelAddress) { - return k; - } + // Find the section with the custom SHAVEComputeAorta data + for (uint16_t i = 0; i < ehdr->eShnum; i++) { + const char *currentSectionName = firstStr + shdr[i].shName; - metaOffset += kHdr->sectionSize + sizeof(kHdr->address) + 
sizeof(kHdr->flags); - } + if (0 == strcmp(currentSectionName, section_name)) { + return shdr + i; } } - return -1; + // If we reached this point, it means that there wasn't a section with + // the name we were looking for + return nullptr; } uint32_t getKernelEntry(const char* ELFData, const std::string& kernelName) { @@ -230,8 +195,9 @@ uint32_t getKernelEntry(const char* ELFData, const std::string& kernelName) { CustomKernel::CustomKernel(const pugi::xml_node& kernel, std::string configDir): _configDir {std::move(configDir)} { _maxShaves = XMLParseUtils::GetIntAttr(kernel, "max-shaves", 0); + std::string fileName; for (auto source = kernel.child("Source"); !source.empty(); source = source.next_sibling("Source")) { - auto fileName = _configDir + "/" + XMLParseUtils::GetStrAttr(source, "filename", ""); + fileName = _configDir + "/" + XMLParseUtils::GetStrAttr(source, "filename", ""); std::ifstream inputFile(fileName, std::ios::binary); if (!inputFile.is_open()) { @@ -244,9 +210,30 @@ CustomKernel::CustomKernel(const pugi::xml_node& kernel, std::string configDir): } const auto kernelEntryName = XMLParseUtils::GetStrAttr(kernel, "entry"); - const auto kernelEntry = getKernelEntry(&_kernelBinary[0], kernelEntryName); - _parameters = deduceKernelParameters(&_kernelBinary[0], kernelEntry); - _kernelId = getKernelId(&_kernelBinary[0], kernelEntry); + + const auto elf = reinterpret_cast(_kernelBinary.data()); + const Elf32Shdr *neoMetadataShdr = get_elf_section_with_name(elf, ".neo_metadata"); + VPU_THROW_UNLESS(neoMetadataShdr, "Error while parsing custom layer elf: Couldn't find .neo_metadata section"); + + const uint8_t *neoMetadata = elf + neoMetadataShdr->shOffset; + const size_t neoMetadataSize = neoMetadataShdr->shSize; + + const Elf32Shdr *neoMetadataStrShdr = get_elf_section_with_name(elf, ".neo_metadata.str"); + VPU_THROW_UNLESS(neoMetadataStrShdr, "Error while parsing custom layer elf: Couldn't find .neo_metadata.str section"); + + const char *neoMetadataStr = 
reinterpret_cast(elf + neoMetadataStrShdr->shOffset); + const size_t neoMetadataStrSize = neoMetadataStrShdr->shSize; + + const auto parser = md_parser_t{neoMetadata, neoMetadataSize, neoMetadataStr, neoMetadataStrSize}; + _kernelId = parser.get_kernel_id(kernelEntryName); + VPU_THROW_UNLESS(_kernelId != -1, "Failed to find kernel with name `%l`", kernelEntryName); + + VPU_THROW_UNLESS(parser.get_kernel_count() == 1, + "Failed to load kernel binary '%l'\n" + "\tReason: binary should contain only one kernel, but contains %l", + fileName, parser.get_kernel_count()); + + _parameters = deduceKernelParameters(parser, _kernelId); processParametersNode(kernel); processWorkSizesNode(kernel); diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/custom.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/custom.cpp index 27cc40086a0d19..bc4e34652dd605 100644 --- a/inference-engine/src/vpu/graph_transformer/src/stages/custom.cpp +++ b/inference-engine/src/vpu/graph_transformer/src/stages/custom.cpp @@ -136,7 +136,7 @@ class CustomStage final : public StageNode { case CustomParamType::OutputBuffer: case CustomParamType::Data: { VPU_THROW_UNLESS(ports.find(kp) != ports.end(), - "XML specification for %s layer has no definition for %s parameter. Layer name: %s", + "XML specification for %s layer has no definition for '%s' parameter. 
Layer name: %s", origLayer()->type, kp, origLayer()->name); int id = ports.find(kp)->second; diff --git a/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_custom_test.cpp b/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_custom_test.cpp index a8352dbfa5243f..3ad912198dac0a 100644 --- a/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_custom_test.cpp +++ b/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_custom_test.cpp @@ -20,7 +20,7 @@ INSTANTIATE_TEST_CASE_P(accuracy, myriadLayersTestsFakeQuantize_smoke, INSTANTIATE_TEST_CASE_P(accuracy, myriadLayersTestsQuantizeBinarize_smoke, ::testing::Combine( ::testing::ValuesIn(s_QuantizeTensors), - ::testing::ValuesIn(s_QuantizeLevels), + ::testing::Values(2), ::testing::ValuesIn(s_QuantizeSwitchOut), ::testing::ValuesIn(s_CustomConfig))); diff --git a/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_custom_test.hpp b/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_custom_test.hpp index a446a710f55671..20c18a2496028a 100644 --- a/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_custom_test.hpp +++ b/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_custom_test.hpp @@ -799,7 +799,7 @@ TEST_P(myriadLayersTestsQuantizeBinarize_smoke, Quantize_Binarization) { - + @@ -1057,6 +1057,10 @@ TEST_P(myriadLayersTestsBinaryConvolution_smoke, BinaryConvolution) { } _config[InferenceEngine::MYRIAD_CUSTOM_LAYERS] = customConfig; + if (kernel.x == 3 && kernel.y == 3 && dilations == 2) { + GTEST_SKIP() << "Computing wrong after hoisting"; + } + SetInputTensor(dims); auto dimsOutput = dims; dimsOutput.h = (dims.h) / strides; @@ -1112,7 +1116,7 @@ static std::vector s_BinaryConvolutionGroup = { static std::vector s_BinaryConvolutionKernel = { {{1, 1}}, {{1, 3}}, - {{3, 3}}, + {{3, 3}} }; static std::vector 
s_BinaryConvolutionStrides = { 1, 2 diff --git a/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_region_test.cpp b/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_region_test.cpp index 50eb4eb5541dcc..f81be4a08fc93b 100644 --- a/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_region_test.cpp +++ b/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_region_test.cpp @@ -14,5 +14,22 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values(1, 0), ::testing::Values(vpu::LayoutPreference::ChannelMajor, vpu::LayoutPreference::ChannelMinor), ::testing::Values(IRVersion::v7, IRVersion::v10), - ::testing::ValuesIn(s_CustomConfig) + ::testing::Values("") )); + +#ifdef VPU_HAS_CUSTOM_KERNELS + +INSTANTIATE_TEST_CASE_P( + accuracy_custom, myriadLayersTestsRegionYolo_smoke, + ::testing::Combine( + ::testing::Values(4), + ::testing::Values(20), + ::testing::Values(5, 10), + ::testing::Values(3), + ::testing::Values(1, 0), + ::testing::Values(vpu::LayoutPreference::ChannelMajor, vpu::LayoutPreference::ChannelMinor), + ::testing::Values(IRVersion::v7, IRVersion::v10), + ::testing::Values(s_CustomConfig[1]) +)); + +#endif diff --git a/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_reorg_test.cpp b/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_reorg_test.cpp index d60a7d4b55ae2c..d46d0c1061f272 100644 --- a/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_reorg_test.cpp +++ b/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_reorg_test.cpp @@ -9,5 +9,17 @@ INSTANTIATE_TEST_CASE_P(accuracy, myriadLayersTestsReorg_smoke, ::testing::Combi ::testing::Values(2), ::testing::Values(vpu::LayoutPreference::ChannelMinor, vpu::LayoutPreference::ChannelMajor), ::testing::Values(IRVersion::v7, IRVersion::v10), - ::testing::ValuesIn(s_CustomConfig) + ::testing::Values({}) )); + 
+#ifdef VPU_HAS_CUSTOM_KERNELS + +INSTANTIATE_TEST_CASE_P(accuracy_custom, myriadLayersTestsReorg_smoke, ::testing::Combine( + ::testing::ValuesIn(s_ReorgInputs_CustomLayer), + ::testing::Values(2), + ::testing::Values(vpu::LayoutPreference::ChannelMinor, vpu::LayoutPreference::ChannelMajor), + ::testing::Values(IRVersion::v7, IRVersion::v10), + ::testing::Values(s_CustomConfig[1]) +)); + +#endif diff --git a/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_reorg_test.hpp b/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_reorg_test.hpp index 372d6155b346dc..3f27835e0b2344 100644 --- a/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_reorg_test.hpp +++ b/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_reorg_test.hpp @@ -111,3 +111,9 @@ static std::vector s_ReorgInputs = { {1, 192, 6 * 26, 6 * 26}, {1, 4, 6, 6} }; + +static std::vector s_ReorgInputs_CustomLayer = { + {1, 64, 26, 26}, + {1, 64, 128, 128}, + {1, 4, 6, 6} +}; diff --git a/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_resample_test.cpp b/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_resample_test.cpp index 6030976b32ea24..97d81cfe68dca7 100644 --- a/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_resample_test.cpp +++ b/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_resample_test.cpp @@ -4,13 +4,26 @@ #include "myriad_layers_resample_test.hpp" -// #-31522 INSTANTIATE_TEST_CASE_P( - DISABLED_accuracy, myriadResampleLayerTests_smoke, + accuracy, myriadResampleLayerTests_smoke, ::testing::Combine( ::testing::ValuesIn(s_ResampleInput), ::testing::Values(2.0f, 0.5f), + ::testing::Values(false), + ::testing::Values(false, true), + ::testing::Values("")) +); + +#ifdef VPU_HAS_CUSTOM_KERNELS + +INSTANTIATE_TEST_CASE_P( + accuracy_custom, myriadResampleLayerTests_smoke, + 
::testing::Combine( + ::testing::ValuesIn(s_ResampleInput), + ::testing::Values(2.0f), ::testing::Values(false, true), ::testing::Values(false, true), - ::testing::ValuesIn(s_CustomConfig)) + ::testing::Values(s_CustomConfig[1])) ); + +#endif From 82e15a5a648de300b177fc7a32e9cc49d8fee840 Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Wed, 9 Sep 2020 08:34:43 +0300 Subject: [PATCH 32/66] Support python 3.8 by the Model Optimizer tool in default configuration (#2078) * Support python 3.8 by the Model Optimizer tool in default configuration * Fix after review #1 * Fix after the second round review --- model-optimizer/mo/utils/versions_checker.py | 89 ++++++++++++++++--- .../mo/utils/versions_checker_test.py | 41 ++++++++- model-optimizer/requirements.txt | 3 +- model-optimizer/requirements_tf.txt | 3 +- 4 files changed, 118 insertions(+), 18 deletions(-) diff --git a/model-optimizer/mo/utils/versions_checker.py b/model-optimizer/mo/utils/versions_checker.py index d98a8ddd5ec719..0b532819e19b65 100644 --- a/model-optimizer/mo/utils/versions_checker.py +++ b/model-optimizer/mo/utils/versions_checker.py @@ -44,17 +44,24 @@ def check_python_version(): return 1 -def parse_versions_list(required_fw_versions, version_list): +def parse_and_filter_versions_list(required_fw_versions, version_list, env_setup): """ Please do not add parameter type annotations (param:type). Because we import this file while checking Python version. Python 2.x will fail with no clear message on type annotations. - Parsing requirements versions + Parses the version requirements for a dependency and keeps them only when the + environment marker of the requirement (python_version, etc.) is satisfied by the environment setup. + A requirement whose environment marker 
is not satisfied is skipped. :param required_fw_versions: String with fw versions from requirements file :param version_list: List for append + :param env_setup: a dictionary with environment setup :return: list of tuples of strings like (name_of_module, sign, version) + Examples of required_fw_versions: + 'tensorflow>=1.15.2,<2.0; python_version < "3.8"' + 'tensorflow>=2.0' + Returned object is: [('tensorflow', '>=', '1.2.0'), ('networkx', '==', '2.1'), ('numpy', None, None)] """ @@ -62,26 +69,57 @@ def parse_versions_list(required_fw_versions, version_list): line = required_fw_versions.strip('\n') line = line.strip(' ') if line == '': - return [] - splited_versions_by_conditions = re.split(r"==|>=|<=|>|<", line) + return version_list + splited_requirement = line.split(";") + + # check environment marker + if len(splited_requirement) > 1: + env_req = splited_requirement[1] + splited_env_req = re.split(r"==|>=|<=|>|<", env_req) + splited_env_req = [l.strip(',') for l in splited_env_req] + env_marker = splited_env_req[0].strip(' ') + if env_marker == 'python_version' and env_marker in env_setup: + installed_python_version = env_setup['python_version'] + env_req_version_list = [] + splited_required_versions = re.split(r",", env_req) + for i, l in enumerate(splited_required_versions): + for comparison in ['==', '>=', '<=', '<', '>']: + if comparison in l: + required_version = splited_env_req[i + 1].strip(' ').replace('"', '') + env_req_version_list.append((env_marker, comparison, required_version)) + break + not_satisfied_list = [] + for name, key, required_version in env_req_version_list: + version_check(name, installed_python_version, required_version, + key, not_satisfied_list, 0) + if len(not_satisfied_list) > 0: + # the python_version marker is not satisfied by the current environment, + # so the requirement for this dependency is skipped + return version_list + else: + log.error("{} is unsupported environment marker and it will be ignored".format(env_marker), + 
extra={'is_warning': True}) + + # parse a requirement for a dependency + requirement = splited_requirement[0] + splited_versions_by_conditions = re.split(r"==|>=|<=|>|<", requirement) splited_versions_by_conditions = [l.strip(',') for l in splited_versions_by_conditions] if len(splited_versions_by_conditions) == 0: - return [] + return version_list if len(splited_versions_by_conditions) == 1: version_list.append((splited_versions_by_conditions[0], None, None)) else: - splited_required_versions= re.split(r",", line) + splited_required_versions= re.split(r",", requirement) for i, l in enumerate(splited_required_versions): - comparisons = ['==', '>=', '<=', '<', '>'] - for comparison in comparisons: + for comparison in ['==', '>=', '<=', '<', '>']: if comparison in l: version_list.append((splited_versions_by_conditions[0], comparison, splited_versions_by_conditions[i + 1])) break return version_list -def get_module_version_list_from_file(file_name): +def get_module_version_list_from_file(file_name, env_setup): """ Please do not add parameter type annotations (param:type). Because we import this file while checking Python version. @@ -89,6 +127,7 @@ def get_module_version_list_from_file(file_name): Reads file with requirements :param file_name: Name of the requirements file + :param env_setup: a dictionary with environment setup elements :return: list of tuples of strings like (name_of_module, sign, version) File content example: @@ -102,7 +141,7 @@ def get_module_version_list_from_file(file_name): req_dict = list() with open(file_name) as f: for line in f: - req_dict = parse_versions_list(line, req_dict) + req_dict = parse_and_filter_versions_list(line, req_dict, env_setup) return req_dict @@ -113,7 +152,7 @@ def version_check(name, installed_v, required_v, sign, not_satisfied_v, exit_cod Python 2.x will fail with no clear message on type annotations. Evaluates comparison of installed and required versions according to requirements file of one module. 
- If installed version does not satisfy requirements appends this module to not_stisfied_v list. + If installed version does not satisfy requirements appends this module to not_satisfied_v list. :param name: module name :param installed_v: installed version of module :param required_v: required version of module @@ -146,6 +185,25 @@ def version_check(name, installed_v, required_v, sign, not_satisfied_v, exit_cod return exit_code +def get_environment_setup(): + """ + Get environment setup such as Python version, TensorFlow version + :return: a dictionary of environment variables + """ + env_setup = dict() + python_version = "{}.{}.{}".format(sys.version_info.major, + sys.version_info.minor, + sys.version_info.micro) + env_setup['python_version'] = python_version + try: + exec("import tensorflow") + env_setup['tensorflow'] = sys.modules["tensorflow"].__version__ + exec("del tensorflow") + except (AttributeError, ImportError): + pass + return env_setup + + def check_requirements(framework=None): """ Please do not add parameter type annotations (param:type). 
@@ -158,13 +216,20 @@ def check_requirements(framework=None): :param framework: framework name :return: exit code (0 - execution successful, 1 - error) """ + env_setup = get_environment_setup() if framework is None: framework_suffix = "" + elif framework == "tf": + if "tensorflow" in env_setup and env_setup["tensorflow"] >= LooseVersion("2.0.0"): + framework_suffix = "_tf2" + else: + framework_suffix = "_tf" else: framework_suffix = "_{}".format(framework) + file_name = "requirements{}.txt".format(framework_suffix) requirements_file = os.path.realpath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, file_name)) - requirements_list = get_module_version_list_from_file(requirements_file) + requirements_list = get_module_version_list_from_file(requirements_file, env_setup) not_satisfied_versions = [] exit_code = 0 for name, key, required_version in requirements_list: diff --git a/model-optimizer/mo/utils/versions_checker_test.py b/model-optimizer/mo/utils/versions_checker_test.py index 227b74ee0086d8..35346d8fba4ac2 100644 --- a/model-optimizer/mo/utils/versions_checker_test.py +++ b/model-optimizer/mo/utils/versions_checker_test.py @@ -18,7 +18,7 @@ import unittest.mock as mock from unittest.mock import mock_open -from mo.utils.versions_checker import get_module_version_list_from_file, parse_versions_list +from mo.utils.versions_checker import get_module_version_list_from_file, parse_and_filter_versions_list class TestingVersionsChecker(unittest.TestCase): @@ -30,18 +30,51 @@ def test_get_module_version_list_from_file(self, mock_open): ref_list =[('mxnet', '>=', '1.0.0'), ('mxnet', '<=', '1.3.1'), ('networkx', '>=', '1.11'), ('numpy', '==', '1.12.0'), ('defusedxml', '<=', '0.5.0')] - version_list = get_module_version_list_from_file('mock_file') + version_list = get_module_version_list_from_file('mock_file', {}) self.assertEqual(len(version_list), 5) for i, version_dict in enumerate(version_list): self.assertTupleEqual(ref_list[i], version_dict) + 
@mock.patch('builtins.open', new_callable=mock_open, create=True) + def test_get_module_version_list_from_file2(self, mock_open): + mock_open.return_value.__enter__ = mock_open + mock_open.return_value.__iter__ = mock.Mock( + return_value=iter(['tensorflow>=1.15.2,<2.0; python_version < "3.8"', + 'tensorflow>=2.0; python_version >= "3.8"', + 'numpy==1.12.0', + 'defusedxml<=0.5.0'])) + ref_list =[('tensorflow', '>=', '1.15.2'), + ('tensorflow', '<', '2.0'), + ('numpy', '==', '1.12.0'), + ('defusedxml', '<=', '0.5.0')] + version_list = get_module_version_list_from_file('mock_file', {'python_version': '3.7.0'}) + self.assertEqual(len(version_list), 4) + for i, version_dict in enumerate(version_list): + self.assertTupleEqual(ref_list[i], version_dict) + + @mock.patch('builtins.open', new_callable=mock_open, create=True) + def test_get_module_version_list_from_file3(self, mock_open): + mock_open.return_value.__enter__ = mock_open + mock_open.return_value.__iter__ = mock.Mock( + return_value=iter(['tensorflow>=1.15.2,<2.0; python_version < "3.8"', + 'tensorflow>=2.0; python_version >= "3.8"', + 'numpy==1.12.0', + 'defusedxml<=0.5.0'])) + ref_list =[('tensorflow', '>=', '2.0'), + ('numpy', '==', '1.12.0'), + ('defusedxml', '<=', '0.5.0')] + version_list = get_module_version_list_from_file('mock_file', {'python_version': '3.8.1'}) + self.assertEqual(len(version_list), 3) + for i, version_dict in enumerate(version_list): + self.assertTupleEqual(ref_list[i], version_dict) + @mock.patch('builtins.open', new_callable=mock_open, create=True) def test_get_module_version_list_from_file_with_fw_name(self, mock_open): mock_open.return_value.__enter__ = mock_open mock_open.return_value.__iter__ = mock.Mock( return_value=iter(['mxnet'])) ref_list = [('mxnet', None, None)] - version_list = get_module_version_list_from_file('mock_file') + version_list = get_module_version_list_from_file('mock_file', {}) self.assertEqual(len(version_list), 1) for i, version_dict in 
enumerate(version_list): self.assertTupleEqual(ref_list[i], version_dict) @@ -49,7 +82,7 @@ def test_get_module_version_list_from_file_with_fw_name(self, mock_open): def test_append_version_list(self): v1 = 'mxnet>=1.0.0,<=1.3.1' req_list = list() - parse_versions_list(v1, req_list) + parse_and_filter_versions_list(v1, req_list, {}) ref_list = [('mxnet', '>=', '1.0.0'), ('mxnet', '<=', '1.3.1')] for i, v in enumerate(req_list): diff --git a/model-optimizer/requirements.txt b/model-optimizer/requirements.txt index e8069df734d5d8..137b4113b3c82c 100644 --- a/model-optimizer/requirements.txt +++ b/model-optimizer/requirements.txt @@ -1,4 +1,5 @@ -tensorflow>=1.15.2,<2.0 +tensorflow>=1.15.2,<2.0; python_version < "3.8" +tensorflow>=2.0; python_version >= "3.8" mxnet>=1.0.0,<=1.5.1 networkx>=1.11 numpy>=1.13.0 diff --git a/model-optimizer/requirements_tf.txt b/model-optimizer/requirements_tf.txt index ef7e24ed235ac1..a22cd69ac7b731 100644 --- a/model-optimizer/requirements_tf.txt +++ b/model-optimizer/requirements_tf.txt @@ -1,4 +1,5 @@ -tensorflow>=1.15.2,<2.0 +tensorflow>=1.15.2,<2.0; python_version < "3.8" +tensorflow>=2.0; python_version >= "3.8" networkx>=1.11 numpy>=1.13.0 test-generator==0.1.1 From 64bcbe59c7cce19a88d88c0e75cf18b51c517dc2 Mon Sep 17 00:00:00 2001 From: Anton Voronov Date: Wed, 9 Sep 2020 09:17:57 +0300 Subject: [PATCH 33/66] [CPU] gemm convolution: added post ops JIT implementation (#1817) --- inference-engine/thirdparty/mkl-dnn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference-engine/thirdparty/mkl-dnn b/inference-engine/thirdparty/mkl-dnn index ae3c03550796c2..683bea673b4e51 160000 --- a/inference-engine/thirdparty/mkl-dnn +++ b/inference-engine/thirdparty/mkl-dnn @@ -1 +1 @@ -Subproject commit ae3c03550796c2131dfb683a8eefb286cf7e8db3 +Subproject commit 683bea673b4e510eb150e4b338aeeeb366ba17f6 From 60627367513639bfca775244c03c3f8a6e768f64 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Wed, 9 Sep 2020 09:57:38 
+0300 Subject: [PATCH 34/66] [IE CLDNN] Fix eltwise activation types mismatch (#2090) --- .../core/cl_kernels/generic_eltwise_ref.cl | 4 +- .../tests/test_cases/fusings_gpu_test.cpp | 46 ++++++++++++++++++- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/generic_eltwise_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/generic_eltwise_ref.cl index b9987833cf0859..99d8a2ed643804 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/generic_eltwise_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/generic_eltwise_ref.cl @@ -120,9 +120,9 @@ KERNEL(eltwise)( FUSED_OPS; OUTPUT_TYPE out = FUSED_OPS_RESULT; #elif QUANTIZATION_TERM && !OUTPUT_IS_FP - OUTPUT_TYPE out = ACTIVATION(TO_OUTPUT_TYPE(res), ACTIVATION_PARAMS); + OUTPUT_TYPE out = TO_OUTPUT_TYPE(ACTIVATION(res, ACTIVATION_PARAMS)); #else - OUTPUT_TYPE out = ACTIVATION_TYPED(TO_OUTPUT_TYPE(res), ACTIVATION_PARAMS_TYPED); + OUTPUT_TYPE out = TO_OUTPUT_TYPE(ACTIVATION_TYPED(res, ACTIVATION_PARAMS_TYPED)); #endif #if QUANTIZATION_TERM && !OUTPUT_IS_FP diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp index 0a0d949059c94e..9a10f31d5112a2 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp @@ -5377,7 +5377,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, scatter_update_quantize, scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_4, 2, 3 }, scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_5, 2, 3 }, - + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_1, 2, 3 }, scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_2, 2, 3 }, scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_3, 2, 3 }, @@ -5421,7 +5421,7 @@ 
INSTANTIATE_TEST_CASE_P(fusings_gpu, scatter_update_scale_activation, scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_4, 2, 4 }, scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_5, 2, 4 }, - + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_1, 2, 4 }, scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_2, 2, 4 }, scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_3, 2, 4 }, @@ -6253,6 +6253,48 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, eltwise_test_params{CASE_ELTWISE_U8_FP16_3}, }), ); +class eltwise_activation : public EltwiseFusingTest {}; +TEST_P(eltwise_activation, basic) { + auto p = GetParam(); + create_topologies(input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + eltwise("eltwise", {"input", "input2"}, p.mode, p.default_type), + activation("activation", "eltwise", activation_func::relu, {6.0f, 0.0f}), + reorder("out", "activation", p.default_format, data_types::f32)); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(eltwise_activation, fp16_out) { + auto p = GetParam(); + create_topologies(input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + eltwise("eltwise", {"input", "input2"}, p.mode, data_types::f16), + activation("activation", "eltwise", activation_func::relu, {6.0f, 0.0f}), + reorder("out", "activation", p.default_format, data_types::f32)); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_CASE_P(fusings_gpu, + eltwise_activation, + ::testing::ValuesIn(std::vector{ + eltwise_test_params{CASE_ELTWISE_FP16_1}, + eltwise_test_params{CASE_ELTWISE_FP16_2}, + eltwise_test_params{CASE_ELTWISE_FP16_3}, + eltwise_test_params{CASE_ELTWISE_FP32_1}, + eltwise_test_params{CASE_ELTWISE_FP32_2}, + eltwise_test_params{CASE_ELTWISE_FP32_3}, + eltwise_test_params{CASE_ELTWISE_FP32_FP16_1}, + eltwise_test_params{CASE_ELTWISE_FP32_FP16_2}, + eltwise_test_params{CASE_ELTWISE_FP32_FP16_3}, + eltwise_test_params{CASE_ELTWISE_FP16_FP32_1}, + 
eltwise_test_params{CASE_ELTWISE_FP16_FP32_2}, + eltwise_test_params{CASE_ELTWISE_FP16_FP32_3} + }), ); + /* ----------------------------------------------------------------------------------------------------- */ /* ---------------------------------------- Reduce cases ----------------------------------------------- */ /* ----------------------------------------------------------------------------------------------------- */ From b8a8fe7c0dfb97bf16a2499994aec52bec685a7a Mon Sep 17 00:00:00 2001 From: Andrey Somsikov Date: Wed, 9 Sep 2020 10:10:22 +0300 Subject: [PATCH 35/66] Sort memcheck timeline report for failures (#2088) --- .../memcheck-template/timeline_report.html | 4 ++-- tests/stress_tests/scripts/memcheck_upload.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/tests/stress_tests/scripts/memcheck-template/timeline_report.html b/tests/stress_tests/scripts/memcheck-template/timeline_report.html index d1c5cee0e13487..d6f3132e584bf8 100644 --- a/tests/stress_tests/scripts/memcheck-template/timeline_report.html +++ b/tests/stress_tests/scripts/memcheck-template/timeline_report.html @@ -14,8 +14,8 @@

Memcheck report

{% for timeline in timelines %}

{{ timeline['device'][0]|e }} {{ timeline['model'][0]|e }} {{ timeline['test_name'][0]|e }}

-
- PASS +
+ {{ "PASS" if timeline['status'] else "FAIL" }}
diff --git a/tests/stress_tests/scripts/memcheck_upload.py b/tests/stress_tests/scripts/memcheck_upload.py index 9f8a7a510e3b8e..38c05270198d3a 100644 --- a/tests/stress_tests/scripts/memcheck_upload.py +++ b/tests/stress_tests/scripts/memcheck_upload.py @@ -192,6 +192,18 @@ def _transpose_dicts(items, template=None): def query_timeline(records, db_url, db_collection, max_items=20, similarity=TIMELINE_SIMILARITY): """ Query database for similar memcheck items committed previously """ + def timeline_key(item): + """ Defines order for timeline report entries + """ + if len(item['metrics']['vmhwm']) <= 1: + return 1 + order = item['metrics']['vmhwm'][-1] - item['metrics']['vmhwm'][-2] + \ + item['metrics']['vmrss'][-1] - item['metrics']['vmrss'][-2] + if not item['status']: + # ensure failed cases are always on top + order += sys.maxsize/2 + return order + client = MongoClient(db_url) collection = client[DATABASE][db_collection] result = [] @@ -213,7 +225,11 @@ def query_timeline(records, db_url, db_collection, max_items=20, similarity=TIME pass # keep only the record if timeline failed to generate items += [record] timeline = _transpose_dicts(items, template=record) + timeline['status'] = bool(timeline['metrics']['vmrss'][-1] < timeline['ref_metrics']['vmrss'][-1] and + timeline['metrics']['vmhwm'][-1] < timeline['ref_metrics']['vmhwm'][-1]) result += [timeline] + + result.sort(key=timeline_key, reverse=True) return result From ddaceb047d3f77f118764ec52a3fe17e0e4d109e Mon Sep 17 00:00:00 2001 From: Katarzyna Mitrus Date: Wed, 9 Sep 2020 09:50:27 +0200 Subject: [PATCH 36/66] Use element type instead of F32 for constants (#2121) --- ngraph/frontend/onnx_import/src/op/image_scaler.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ngraph/frontend/onnx_import/src/op/image_scaler.cpp b/ngraph/frontend/onnx_import/src/op/image_scaler.cpp index 4f62246eb02f1a..c047fb9fda4b55 100644 --- a/ngraph/frontend/onnx_import/src/op/image_scaler.cpp +++ 
b/ngraph/frontend/onnx_import/src/op/image_scaler.cpp @@ -47,10 +47,10 @@ namespace ngraph data_shape[1].get_length()); const auto scale_const = - default_opset::Constant::create(element::f32, Shape{}, {scale}); + default_opset::Constant::create(data.get_element_type(), Shape{}, {scale}); - const auto bias_const = - default_opset::Constant::create(element::f32, {1, bias.size(), 1, 1}, bias); + const auto bias_const = default_opset::Constant::create( + data.get_element_type(), {1, bias.size(), 1, 1}, bias); const auto scaler = std::make_shared( std::make_shared(data, scale_const), bias_const); From 14e2b0be20d694938e166296285fdd86c75d276a Mon Sep 17 00:00:00 2001 From: Alexander Perepelkin Date: Wed, 9 Sep 2020 11:47:40 +0300 Subject: [PATCH 37/66] Use metadata from executable network when fetching results of the infer request; Use user provided output precision when it was supplied (#2111) --- .../functional_test_utils/layer_test_utils.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/inference-engine/tests/ie_test_utils/functional_test_utils/layer_test_utils.cpp b/inference-engine/tests/ie_test_utils/functional_test_utils/layer_test_utils.cpp index 07088187c131fc..b03361f7e5e213 100644 --- a/inference-engine/tests/ie_test_utils/functional_test_utils/layer_test_utils.cpp +++ b/inference-engine/tests/ie_test_utils/functional_test_utils/layer_test_utils.cpp @@ -126,7 +126,7 @@ void LayerTestsCommon::Infer() { inferRequest = executableNetwork.CreateInferRequest(); inputs.clear(); - for (const auto &input : cnnNetwork.getInputsInfo()) { + for (const auto &input : executableNetwork.GetInputsInfo()) { const auto &info = input.second; auto blob = GenerateInput(*info); inferRequest.SetBlob(info->name(), blob); @@ -134,7 +134,7 @@ void LayerTestsCommon::Infer() { } if (configuration.count(InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_ENABLED) && configuration.count(InferenceEngine::PluginConfigParams::YES)) { - auto batchSize = 
cnnNetwork.getInputsInfo().begin()->second->getTensorDesc().getDims()[0] / 2; + auto batchSize = executableNetwork.GetInputsInfo().begin()->second->getTensorDesc().getDims()[0] / 2; inferRequest.SetBatch(batchSize); } inferRequest.Infer(); @@ -160,8 +160,13 @@ std::vector> LayerTestsCommon::CalculateRefs() { std::copy(buffer, buffer + inputSize, referenceInput.data()); } - const auto &actualOutputs = GetOutputs(); - const auto &convertType = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(actualOutputs[0]->getTensorDesc().getPrecision()); + auto ieOutPrc = outPrc; + if (outPrc == InferenceEngine::Precision::UNSPECIFIED) { + const auto &actualOutputs = GetOutputs(); + ieOutPrc = actualOutputs[0]->getTensorDesc().getPrecision(); + } + + const auto &convertType = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(ieOutPrc); std::vector> expectedOutputs; switch (refMode) { case INTERPRETER: { @@ -195,7 +200,7 @@ std::vector> LayerTestsCommon::CalculateRefs() { std::vector LayerTestsCommon::GetOutputs() { auto outputs = std::vector{}; - for (const auto &output : cnnNetwork.getOutputsInfo()) { + for (const auto &output : executableNetwork.GetOutputsInfo()) { const auto &name = output.first; outputs.push_back(inferRequest.GetBlob(name)); } From 030e0f46feec6e88101fb455caec1c0459b8b9ff Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 9 Sep 2020 12:34:16 +0300 Subject: [PATCH 38/66] Hide itt.hpp to src folders (#2091) --- .../{include/low_precision_transformations => src}/itt.hpp | 0 .../src/low_precision_transformations/src/transformer.cpp | 2 +- inference-engine/src/transformations/CMakeLists.txt | 3 ++- .../transformations/{include => src}/transformations/itt.hpp | 0 4 files changed, 3 insertions(+), 2 deletions(-) rename inference-engine/src/low_precision_transformations/{include/low_precision_transformations => src}/itt.hpp (100%) rename inference-engine/src/transformations/{include => src}/transformations/itt.hpp (100%) diff --git 
a/inference-engine/src/low_precision_transformations/include/low_precision_transformations/itt.hpp b/inference-engine/src/low_precision_transformations/src/itt.hpp similarity index 100% rename from inference-engine/src/low_precision_transformations/include/low_precision_transformations/itt.hpp rename to inference-engine/src/low_precision_transformations/src/itt.hpp diff --git a/inference-engine/src/low_precision_transformations/src/transformer.cpp b/inference-engine/src/low_precision_transformations/src/transformer.cpp index 48e3f8f08cd15d..68683ca51ed142 100644 --- a/inference-engine/src/low_precision_transformations/src/transformer.cpp +++ b/inference-engine/src/low_precision_transformations/src/transformer.cpp @@ -4,7 +4,7 @@ #include "low_precision_transformations/transformer.hpp" #include "low_precision_transformations/network_helper.hpp" -#include "low_precision_transformations/itt.hpp" +#include "itt.hpp" #include diff --git a/inference-engine/src/transformations/CMakeLists.txt b/inference-engine/src/transformations/CMakeLists.txt index 4b83b9d4bd68f7..f7c579bf16b5d0 100644 --- a/inference-engine/src/transformations/CMakeLists.txt +++ b/inference-engine/src/transformations/CMakeLists.txt @@ -26,7 +26,8 @@ add_library(${TARGET_NAME} SHARED ${LIBRARY_SRC} ${PUBLIC_HEADERS}) target_link_libraries(${TARGET_NAME} PUBLIC ${NGRAPH_LIBRARIES} PRIVATE openvino::itt ngraph::builder) -target_include_directories(${TARGET_NAME} PUBLIC ${PUBLIC_HEADERS_DIR}) +target_include_directories(${TARGET_NAME} PUBLIC ${PUBLIC_HEADERS_DIR} + PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src") add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME}) diff --git a/inference-engine/src/transformations/include/transformations/itt.hpp b/inference-engine/src/transformations/src/transformations/itt.hpp similarity index 100% rename from inference-engine/src/transformations/include/transformations/itt.hpp rename to inference-engine/src/transformations/src/transformations/itt.hpp From 
40923893b607d1cba59d8e05bd856817c49eb0ea Mon Sep 17 00:00:00 2001 From: Alexander Peskov Date: Mon, 31 Aug 2020 15:50:20 +0300 Subject: [PATCH 39/66] [NGRAPH] Fix ReduceSum decompose pass Signed-off-by: Alexander Peskov --- .../convert_reduce_to_pooling.hpp | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/inference-engine/src/transformations/include/transformations/convert_reduce_to_pooling.hpp b/inference-engine/src/transformations/include/transformations/convert_reduce_to_pooling.hpp index 94386d03a36d56..23dffdbdf5a912 100644 --- a/inference-engine/src/transformations/include/transformations/convert_reduce_to_pooling.hpp +++ b/inference-engine/src/transformations/include/transformations/convert_reduce_to_pooling.hpp @@ -231,21 +231,34 @@ ngraph::matcher_pass_callback ConvertReduceBase::convert_reduce_to_pooling() { input.get_node_shared_ptr()->set_friendly_name(reduce->get_friendly_name() + "/pool"); new_ops.push_back(input.get_node_shared_ptr()); } else if (std::is_same()) { + // Fallback to real type because of potential data loss in case of integer AVG Pool + bool fallback_to_real = input.get_element_type().is_integral(); + + if (fallback_to_real) { + input = std::make_shared(input, ngraph::element::f32); + new_ops.push_back(input.get_node_shared_ptr()); + } + input = std::make_shared(input, - strides, - pads_begin, - pads_end, - kernel, - true, - ngraph::op::RoundingType::FLOOR); + strides, + pads_begin, + pads_end, + kernel, + true, + ngraph::op::RoundingType::FLOOR); input.get_node_shared_ptr()->set_friendly_name(reduce->get_friendly_name() + "/pool"); new_ops.push_back(input.get_node_shared_ptr()); input = std::make_shared(input, - ngraph::opset1::Constant::create(reduce->input(0).get_element_type(), ngraph::Shape{1}, {reduction_dims_count})); + ngraph::opset1::Constant::create(input.get_element_type(), ngraph::Shape{1}, {reduction_dims_count})); 
input.get_node_shared_ptr()->set_friendly_name(reduce->get_friendly_name() + "/mul"); new_ops.push_back(input.get_node_shared_ptr()); + + if (fallback_to_real) { + input = std::make_shared(input, reduce->output(0).get_element_type()); + new_ops.push_back(input.get_node_shared_ptr()); + } } else { return false; } From 3dc27c8be863c86a342e22a93f06bd7a4b9c13f3 Mon Sep 17 00:00:00 2001 From: Alexander Peskov Date: Mon, 7 Sep 2020 02:45:15 +0300 Subject: [PATCH 40/66] [TEST] Fix blob util broadcast filler for non fp32 types Signed-off-by: Alexander Peskov --- .../common_test_utils/data_utils.cpp | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/inference-engine/tests/ie_test_utils/common_test_utils/data_utils.cpp b/inference-engine/tests/ie_test_utils/common_test_utils/data_utils.cpp index a878de1d803468..12ae4eebc53e38 100644 --- a/inference-engine/tests/ie_test_utils/common_test_utils/data_utils.cpp +++ b/inference-engine/tests/ie_test_utils/common_test_utils/data_utils.cpp @@ -123,6 +123,40 @@ void fill_data_with_broadcast(InferenceEngine::Blob::Ptr& blob, InferenceEngine: } } +template +void copy_with_convert(InferenceEngine::Blob::Ptr& src_blob, InferenceEngine::Blob::Ptr& dst_blob) { + using SRC_TYPE = typename InferenceEngine::PrecisionTrait::value_type; + using DST_TYPE = typename InferenceEngine::PrecisionTrait::value_type; + + auto src_lock_m = src_blob->as()->rwmap(); + auto src_ptr = src_lock_m.as(); + auto src_size = src_blob->size(); + + auto dst_lock_m = dst_blob->as()->rwmap(); + auto dst_ptr = dst_lock_m.as(); + + std::copy(src_ptr, src_ptr + src_size, dst_ptr); +} + +InferenceEngine::Blob::Ptr make_with_precision_convert(InferenceEngine::Blob::Ptr& blob, InferenceEngine::Precision prc) { + IE_ASSERT(isDenseBlob(blob)); + auto td = blob->getTensorDesc(); + td.setPrecision(prc); + + auto new_blob = make_blob_with_precision(td); + new_blob->allocate(); + +#define CASE(_PRC) case InferenceEngine::Precision::_PRC: \ + 
copy_with_convert (blob, new_blob); break + switch (prc) { + CASE(FP32); CASE(I64); CASE(U64); CASE(I32); CASE(U32); CASE(I16); CASE(U16); CASE(I8); CASE(U8); + default: THROW_IE_EXCEPTION << "Unsupported precision case"; + } +#undef CASE + + return new_blob; +} + void fill_data_with_broadcast(InferenceEngine::Blob::Ptr& blob, size_t axis, std::vector values) { InferenceEngine::SizeVector value_dims(blob->getTensorDesc().getDims().size() - axis, 1); value_dims.front() = values.size(); @@ -130,7 +164,14 @@ void fill_data_with_broadcast(InferenceEngine::Blob::Ptr& blob, size_t axis, std auto layout = InferenceEngine::TensorDesc::getLayoutByDims(value_dims); InferenceEngine::TensorDesc value_tdesc(prc, value_dims, layout); - auto values_blob = make_blob_with_precision(value_tdesc, values.data()); + InferenceEngine::Blob::Ptr values_blob; + if (prc == InferenceEngine::Precision::FP32) { + values_blob = make_blob_with_precision(value_tdesc, values.data()); + } else { + values_blob = make_blob_with_precision(value_tdesc, values.data()); + values_blob = make_with_precision_convert(values_blob, prc); + } + fill_data_with_broadcast(blob, values_blob); } From ad74204402e235b964dfedb4dfb85c750d9ef9ba Mon Sep 17 00:00:00 2001 From: Alexander Peskov Date: Mon, 7 Sep 2020 02:47:14 +0300 Subject: [PATCH 41/66] [TEST] One more ReduceSUM func test Special test case with input values which cannot be correctly processed via decomposition with int AVG pool layer. 
Signed-off-by: Alexander Peskov --- .../single_layer_tests/reduce_ops.cpp | 16 +++++++++++++ .../include/single_layer_tests/reduce_ops.hpp | 5 ++++ .../src/single_layer_tests/reduce_ops.cpp | 24 ++++++++++++++++++- 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/reduce_ops.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/reduce_ops.cpp index 370879808efe8f..0c10a69f239ed1 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/reduce_ops.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/reduce_ops.cpp @@ -152,4 +152,20 @@ INSTANTIATE_TEST_CASE_P( params_ReductionTypes, ReduceOpsLayerTest::getTestCaseName ); + +INSTANTIATE_TEST_CASE_P( + Reduce, + ReduceOpsLayerWithSpecificInputTest, + testing::Combine( + testing::ValuesIn(decltype(axes) {{0}, {1}}), + testing::Values(opTypes[1]), + testing::Values(true), + testing::Values(ngraph::helpers::ReductionType::Sum), + testing::Values(InferenceEngine::Precision::FP32, + InferenceEngine::Precision::I32), + testing::Values(std::vector {2, 10}), + testing::Values(CommonTestUtils::DEVICE_CPU)), + ReduceOpsLayerWithSpecificInputTest::getTestCaseName +); + } // namespace diff --git a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/reduce_ops.hpp b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/reduce_ops.hpp index ccbdad916c1c36..0b5ce1b23a7fc9 100644 --- a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/reduce_ops.hpp +++ b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/reduce_ops.hpp @@ -34,4 +34,9 @@ class ReduceOpsLayerTest : public testing::WithParamInterface, void SetUp() override; }; +class ReduceOpsLayerWithSpecificInputTest : public ReduceOpsLayerTest { +protected: + 
InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo &info) const override; +}; + } // namespace LayerTestsDefinitions \ No newline at end of file diff --git a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/reduce_ops.cpp b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/reduce_ops.cpp index df6266551358ef..7191be4060a274 100644 --- a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/reduce_ops.cpp +++ b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/reduce_ops.cpp @@ -80,6 +80,28 @@ void ReduceOpsLayerTest::SetUp() { TEST_P(ReduceOpsLayerTest, CompareWithRefs) { Run(); -}; +} + +InferenceEngine::Blob::Ptr ReduceOpsLayerWithSpecificInputTest::GenerateInput(const InferenceEngine::InputInfo &info) const { + auto axis_vec = std::get<0>(GetParam()); + IE_ASSERT(axis_vec.size() == 1); + + auto axis = axis_vec[0]; + auto td = info.getTensorDesc(); + auto dims = td.getDims(); + + // Slice of tensor through axis is {1, 0, 0, ....}, the mean value is 1/slice_size + auto raw_values = std::vector(dims[axis], 0); + raw_values[0] = 1; + + auto blob = make_blob_with_precision(td); + blob->allocate(); + CommonTestUtils::fill_data_with_broadcast(blob, axis, raw_values); + return blob; +} + +TEST_P(ReduceOpsLayerWithSpecificInputTest, CompareWithRefs) { + Run(); +} } // namespace LayerTestsDefinitions \ No newline at end of file From fd02b384a73aca6678f681e06755a1528f067171 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Do=C5=82bniak?= Date: Wed, 9 Sep 2020 12:04:33 +0200 Subject: [PATCH 42/66] Clang compilation error fix for Linux (#2126) --- .../tests/functional/plugin/cpu/bfloat16/bfloat16_helpers.hpp | 2 +- inference-engine/tests_deprecated/helpers/tests_common_func.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/bfloat16_helpers.hpp 
b/inference-engine/tests/functional/plugin/cpu/bfloat16/bfloat16_helpers.hpp index 11d5eb106f7b92..d09f50c76f2a18 100644 --- a/inference-engine/tests/functional/plugin/cpu/bfloat16/bfloat16_helpers.hpp +++ b/inference-engine/tests/functional/plugin/cpu/bfloat16/bfloat16_helpers.hpp @@ -51,7 +51,7 @@ class BFloat16Helpers { static float getMaxAbsValue(const float* data, size_t size) { float maxVal = 0.f; for (size_t i = 0; i < size; i++) { - if (fabs(data[i] > maxVal)) { + if (fabs(data[i]) > maxVal) { maxVal = fabs(data[i]); } } diff --git a/inference-engine/tests_deprecated/helpers/tests_common_func.cpp b/inference-engine/tests_deprecated/helpers/tests_common_func.cpp index 2087abbd6dd81d..92778836245b09 100644 --- a/inference-engine/tests_deprecated/helpers/tests_common_func.cpp +++ b/inference-engine/tests_deprecated/helpers/tests_common_func.cpp @@ -255,7 +255,7 @@ bool TestsCommonFunc::compareTop( #endif for (size_t i = 0; i < blob.size(); ++i) { - if (abs(ref_top[i].second - buffer[i]) > threshold) { + if (std::abs(ref_top[i].second - buffer[i]) > threshold) { return false; } } From fe4d720b69e2cd5944265e9358540ed49740146d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Karzy=C5=84ski?= <4430709+postrational@users.noreply.github.com> Date: Wed, 9 Sep 2020 12:07:02 +0200 Subject: [PATCH 43/66] Temporarily disable ONNX Loop operator for the 2021.1 release (#2110) --- ngraph/frontend/onnx_import/src/ops_bridge.cpp | 2 +- ngraph/test/runtime/ie/unit_test.manifest | 11 +++++++++++ ngraph/test/runtime/interpreter/unit_test.manifest | 11 +++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/ngraph/frontend/onnx_import/src/ops_bridge.cpp b/ngraph/frontend/onnx_import/src/ops_bridge.cpp index 3f4f8ad4427c11..97b62622c06745 100644 --- a/ngraph/frontend/onnx_import/src/ops_bridge.cpp +++ b/ngraph/frontend/onnx_import/src/ops_bridge.cpp @@ -310,7 +310,7 @@ namespace ngraph REGISTER_OPERATOR("Less", 1, less); REGISTER_OPERATOR("Log", 1, log); 
REGISTER_OPERATOR("LogSoftmax", 1, log_softmax); - REGISTER_OPERATOR("Loop", 1, loop); + // REGISTER_OPERATOR("Loop", 1, loop); // Loop operator disabled for the 2021.1 release REGISTER_OPERATOR("LpNormalization", 1, lp_norm); REGISTER_OPERATOR("LRN", 1, lrn); REGISTER_OPERATOR("LSTM", 1, lstm); diff --git a/ngraph/test/runtime/ie/unit_test.manifest b/ngraph/test/runtime/ie/unit_test.manifest index 783fe8c99877c8..d4b037f9657272 100644 --- a/ngraph/test/runtime/ie/unit_test.manifest +++ b/ngraph/test/runtime/ie/unit_test.manifest @@ -1457,3 +1457,14 @@ IE_GPU.onnx_model_fake_quantize_nonconst_inputs_infer # ONNX Loop onnx_controlflow_loop_2d_add_execution + +# ONNX Loop - tests disabled temporarily +onnx_controlflow_loop_2d_add_check_model +onnx_controlflow_loop_scalars_check_model +onnx_controlflow_loop_add_initializer_from_parent_scope +onnx_controlflow_loop_add_input_from_parent_scope +onnx_controlflow_loop_add_node_from_parent_scope +onnx_controlflow_loop_add_value_access_to_body_scope_exception +onnx_controlflow_loop_add_value_the_same_node_from_parent_and_subgraph +onnx_controlflow_loop_2d_add_exception_if_no_identity_cond +onnx_controlflow_loop_2d_add_const_cond diff --git a/ngraph/test/runtime/interpreter/unit_test.manifest b/ngraph/test/runtime/interpreter/unit_test.manifest index 2c864ef151a2ef..4502e97d0dccd6 100644 --- a/ngraph/test/runtime/interpreter/unit_test.manifest +++ b/ngraph/test/runtime/interpreter/unit_test.manifest @@ -24,6 +24,17 @@ INTERPRETER.onnx_resize11_sizes_nearest_asymmetric_floor # ONNX Loop onnx_controlflow_loop_2d_add_execution +# ONNX Loop - tests disabled temporarily +onnx_controlflow_loop_2d_add_check_model +onnx_controlflow_loop_scalars_check_model +onnx_controlflow_loop_add_initializer_from_parent_scope +onnx_controlflow_loop_add_input_from_parent_scope +onnx_controlflow_loop_add_node_from_parent_scope +onnx_controlflow_loop_add_value_access_to_body_scope_exception 
+onnx_controlflow_loop_add_value_the_same_node_from_parent_and_subgraph +onnx_controlflow_loop_2d_add_exception_if_no_identity_cond +onnx_controlflow_loop_2d_add_const_cond + # Disabled tests for disabled reference implementations INTERPRETER.onnx_model_qlinear_matmul_3d INTERPRETER.onnx_dyn_shapes_expand_uint16_dyn_shape From 685e8f8e7e0d0290fb381f34dcb248f3296db579 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Wed, 9 Sep 2020 13:21:55 +0300 Subject: [PATCH 44/66] [IE TESTS] Remove normilizer tests (#2098) --- .../tests_deprecated/unit/CMakeLists.txt | 1 - .../normalizer/supported_fusions_test.cpp | 420 ------------------ 2 files changed, 421 deletions(-) delete mode 100644 inference-engine/tests_deprecated/unit/engines/mkldnn/normalizer/supported_fusions_test.cpp diff --git a/inference-engine/tests_deprecated/unit/CMakeLists.txt b/inference-engine/tests_deprecated/unit/CMakeLists.txt index 0c7a6c38b8ccb4..c99007e8908af7 100644 --- a/inference-engine/tests_deprecated/unit/CMakeLists.txt +++ b/inference-engine/tests_deprecated/unit/CMakeLists.txt @@ -59,7 +59,6 @@ if (ENABLE_MKL_DNN) file(GLOB MKLDNN_TESTS engines/mkldnn/*.cpp - engines/mkldnn/normalizer/*.cpp engines/mkldnn/graph/layers/extensions/*.cpp engines/mkldnn/graph/layers/internal/*.cpp engines/mkldnn/graph/structure/*.cpp diff --git a/inference-engine/tests_deprecated/unit/engines/mkldnn/normalizer/supported_fusions_test.cpp b/inference-engine/tests_deprecated/unit/engines/mkldnn/normalizer/supported_fusions_test.cpp deleted file mode 100644 index a0936c5039e352..00000000000000 --- a/inference-engine/tests_deprecated/unit/engines/mkldnn/normalizer/supported_fusions_test.cpp +++ /dev/null @@ -1,420 +0,0 @@ -// Copyright (C) 2018-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include - -#include -#include -#include -#include - -#include "tests_common.hpp" -#include "common_test_utils/xml_net_builder/xml_net_builder.hpp" -#include "common_test_utils/common_layers_params.hpp" 
-#include "common_test_utils/data_utils.hpp" -#include "common_test_utils/common_utils.hpp" - -struct conv_eltwise_params { - std::vector in1; - std::vector in2; - - CommonTestUtils::conv_common_params conv; - CommonTestUtils::eltwise_common_params eltwise; -}; - -struct in_conv_in_conv_eltwise_params { - std::vector in1; - std::vector in2; - - CommonTestUtils::conv_common_params conv1; - CommonTestUtils::conv_common_params conv2; - CommonTestUtils::eltwise_common_params eltwise; -}; - -struct conv_conv_eltwise_conv_pooling_params { - std::vector in1; - std::vector in2; - - CommonTestUtils::conv_common_params conv1; - CommonTestUtils::conv_common_params conv2; - CommonTestUtils::conv_common_params conv3; - CommonTestUtils::eltwise_common_params eltwise; - CommonTestUtils::pool_common_params pool; -}; - -class ConvSum: public TestsCommon, public ::testing::WithParamInterface { - std::string getModel(conv_eltwise_params p) { - std::string precision = "FP32"; - std::vector convOutShape(p.in1.size()); - getConvOutShape(p.in1, p.conv, convOutShape); - - std::vector min_stat(p.in1[1]); - std::vector max_stat(p.in1[1]); - CommonTestUtils::fill_data_sine(min_stat.data(), p.in1[1], -1, 1, 1); - CommonTestUtils::fill_data_sine(max_stat.data(), p.in1[1], 1, 1, -1); - std::vector conv_min_stat(convOutShape[1]); - std::vector conv_max_stat(convOutShape[1]); - CommonTestUtils::fill_data_sine(conv_min_stat.data(), convOutShape[1], -1, 1, 1); - CommonTestUtils::fill_data_sine(conv_max_stat.data(), convOutShape[1], 1, 1, -1); - - std::map elt_params = { - {"operation", "sum"} - }; - std::vector> edges = { {"0,0", "2,2"}, {"2,3", "3,4"}, {"1,1", "3,5"} }; - - return CommonTestUtils::DefaultNetBuilder::buildNetworkWithOneInput( - "Fusion_conv_sum", p.in1, precision) - .addInputLayer(precision, convOutShape) - .convolutionLayer(precision, {{p.in1}, {convOutShape}}, p.conv) - .addLayer("Eltwise", precision, &elt_params, {{convOutShape, convOutShape}, {convOutShape}}, 0, 0, "data", "") 
- .finish(&edges); - } - -protected: - virtual void TearDown() { - } - - virtual void SetUp() { - try { - TestsCommon::SetUp(); - conv_eltwise_params p = ::testing::WithParamInterface::GetParam(); - std::string model = getModel(p); - printf("model:\n%s", model.c_str()); - - InferenceEngine::Core ie; - auto network = ie.ReadNetwork(model, getConvWeightsBlob(p.in1, p.conv)); - std::shared_ptr score_engine(new MKLDNNPlugin::Engine()); - InferenceEngine::ExecutableNetwork exeNetwork1; - ASSERT_NO_THROW(exeNetwork1 = score_engine->LoadNetwork(network, {})); - - auto conv = CommonTestUtils::getLayerByName(network, "Convolution2"); - auto eltwise = CommonTestUtils::getLayerByName(network, "Eltwise3"); - - ASSERT_EQ(conv->precision, InferenceEngine::Precision::I8); - ASSERT_EQ(conv->outData[0]->getPrecision(), InferenceEngine::Precision::I8); - ASSERT_EQ(eltwise->precision, InferenceEngine::Precision::I8); - ASSERT_EQ(eltwise->outData[0]->getPrecision(), InferenceEngine::Precision::I8); - } catch (const InferenceEngine::details::InferenceEngineException &e) { - FAIL() << e.what(); - } - } -}; - -class ConvSumReLU: public TestsCommon, public ::testing::WithParamInterface { - std::string getModel(conv_eltwise_params p) { - std::string precision = "FP32"; - std::vector convOutShape(p.in1.size()); - getConvOutShape(p.in1, p.conv, convOutShape); - - std::vector min_stat(p.in1[1]); - std::vector max_stat(p.in1[1]); - CommonTestUtils::fill_data_sine(min_stat.data(), p.in1[1], -1, 1, 1); - CommonTestUtils::fill_data_sine(max_stat.data(), p.in1[1], 1, 1, -1); - std::vector conv_min_stat(convOutShape[1]); - std::vector conv_max_stat(convOutShape[1]); - CommonTestUtils::fill_data_sine(conv_min_stat.data(), convOutShape[1], -1, 1, 1); - CommonTestUtils::fill_data_sine(conv_max_stat.data(), convOutShape[1], 1, 1, -1); - - std::map elt_params = { - {"operation", "sum"} - }; - std::map relu_params = {}; - std::vector> edges = { {"0,0", "2,2"}, {"2,3", "3,4"}, {"1,1", "3,5"}, {"3,6", 
"4,7"} }; - return CommonTestUtils::DefaultNetBuilder::buildNetworkWithOneInput( - "Fusion_conv_sum", p.in1, precision) - .addInputLayer(precision, convOutShape) - .convolutionLayer(precision, {{p.in1}, {convOutShape}}, p.conv) - .addLayer("Eltwise", precision, &elt_params, {{convOutShape, convOutShape}, {convOutShape}}, 0, 0, "data", "") - .addLayer("ReLU", precision, &relu_params, {{convOutShape, convOutShape}, {convOutShape}}, 0, 0, "data", "") - .finish(&edges); - } - -protected: - virtual void TearDown() { - } - - virtual void SetUp() { - try { - TestsCommon::SetUp(); - conv_eltwise_params p = ::testing::WithParamInterface::GetParam(); - std::string model = getModel(p); - printf("model:\n%s", model.c_str()); - - Core ie; - auto network = ie.ReadNetwork(model, getConvWeightsBlob(p.in1, p.conv)); - - std::shared_ptr score_engine(new MKLDNNPlugin::Engine()); - InferenceEngine::ExecutableNetwork exeNetwork1; - ASSERT_NO_THROW(exeNetwork1 = score_engine->LoadNetwork(network, { })); - - auto conv = CommonTestUtils::getLayerByName(network, "Convolution2"); - auto eltwise = CommonTestUtils::getLayerByName(network, "Eltwise3"); - auto relu4 = CommonTestUtils::getLayerByName(network, "ReLU4"); - - ASSERT_EQ(conv->precision, InferenceEngine::Precision::I8); - ASSERT_EQ(conv->outData[0]->getPrecision(), InferenceEngine::Precision::I8); - ASSERT_EQ(eltwise->precision, InferenceEngine::Precision::I8); - ASSERT_EQ(eltwise->outData[0]->getPrecision(), InferenceEngine::Precision::I8); - ASSERT_EQ(relu4->precision, InferenceEngine::Precision::I8); - } catch (const InferenceEngine::details::InferenceEngineException &e) { - FAIL() << e.what(); - } - } -}; - -class ConvConvSum: public TestsCommon, public ::testing::WithParamInterface { - std::string getModel(conv_eltwise_params p) { - std::string precision = "FP32"; - std::vector convOutShape(p.in1.size()); - getConvOutShape(p.in1, p.conv, convOutShape); - - std::vector min_stat(p.in1[1]); - std::vector max_stat(p.in1[1]); - 
CommonTestUtils::fill_data_sine(min_stat.data(), p.in1[1], -1, 1, 1); - CommonTestUtils::fill_data_sine(max_stat.data(), p.in1[1], 1, 1, -1); - std::vector conv_min_stat(convOutShape[1]); - std::vector conv_max_stat(convOutShape[1]); - CommonTestUtils::fill_data_sine(conv_min_stat.data(), convOutShape[1], -1, 1, 1); - CommonTestUtils::fill_data_sine(conv_max_stat.data(), convOutShape[1], 1, 1, -1); - - std::map elt_params = { - {"operation", "sum"} - }; - std::vector> edges = { {"0,0", "2,2"}, {"2,3", "4,6"}, {"1,1", "3,4"}, {"3,5", "4,7"} }; - return CommonTestUtils::DefaultNetBuilder::buildNetworkWithOneInput( - "Fusion_conv_sum", p.in1, precision) - .addInputLayer(precision, p.in1) - .convolutionLayer(precision, {{p.in1}, {convOutShape}}, p.conv) - .convolutionLayer(precision, {{p.in1}, {convOutShape}}, p.conv) - .addLayer("Eltwise", precision, &elt_params, {{convOutShape, convOutShape}, {convOutShape}}, 0, 0, "data", "") - .finish(&edges); - } - -protected: - virtual void TearDown() { - } - - virtual void SetUp() { - try { - TestsCommon::SetUp(); - conv_eltwise_params p = ::testing::WithParamInterface::GetParam(); - std::string model = getModel(p); - printf("model:\n%s", model.c_str()); - - Core ie; - auto network = ie.ReadNetwork(model, getConvWeightsBlob(p.in1, p.conv)); - - std::shared_ptr score_engine(new MKLDNNPlugin::Engine()); - InferenceEngine::ExecutableNetwork exeNetwork1; - ASSERT_NO_THROW(exeNetwork1 = score_engine->LoadNetwork(network, { })); - - auto conv2 = CommonTestUtils::getLayerByName(network, "Convolution2"); - auto conv3 = CommonTestUtils::getLayerByName(network, "Convolution3"); - auto eltwise = CommonTestUtils::getLayerByName(network, "Eltwise3"); - - ASSERT_EQ(conv2->precision, InferenceEngine::Precision::I8); - ASSERT_EQ(conv2->outData[0]->getPrecision(), InferenceEngine::Precision::I8); - ASSERT_EQ(conv3->precision, InferenceEngine::Precision::I8); - ASSERT_EQ(conv3->outData[0]->getPrecision(), InferenceEngine::Precision::I8); - 
ASSERT_EQ(eltwise->precision, InferenceEngine::Precision::I8); - ASSERT_EQ(eltwise->outData[0]->getPrecision(), InferenceEngine::Precision::I8); - } catch (const InferenceEngine::details::InferenceEngineException &e) { - FAIL() << e.what(); - } - } -}; - -class ConvConvSumReLU: public TestsCommon, public ::testing::WithParamInterface { - std::string getModel(in_conv_in_conv_eltwise_params p) { - std::string precision = "FP32"; - std::vector convOutShape1(p.in1.size()); - std::vector convOutShape2(p.in2.size()); - getConvOutShape(p.in1, p.conv1, convOutShape1); - getConvOutShape(p.in2, p.conv2, convOutShape2); - - std::map elt_params = { - {"operation", "sum"} - }; - std::map relu_params = {}; - std::vector> edges = { {"0,0", "2,2"}, {"2,3", "4,6"}, {"1,1", "3,4"}, {"3,5", "4,7"}, {"4,8", "5,9"} }; - return CommonTestUtils::DefaultNetBuilder::buildNetworkWithOneInput( - "Fusion_conv_sum", p.in1, precision) - .addInputLayer(precision, p.in2) - .convolutionLayer(precision, {{p.in1}, {convOutShape1}}, p.conv1) - .convolutionLayer(precision, {{p.in2}, {convOutShape2}}, p.conv2) - .addLayer("Eltwise", precision, &elt_params, {{convOutShape1, convOutShape2}, {convOutShape1}}, 0, 0, "data", "") - .addLayer("ReLU", precision, &relu_params, {{convOutShape1}, {convOutShape1}}, 0, 0, "data", "") - .finish(&edges); - } - -protected: - virtual void TearDown() { - } - - virtual void SetUp() { - try { - TestsCommon::SetUp(); - in_conv_in_conv_eltwise_params p = ::testing::WithParamInterface::GetParam(); - std::string model = getModel(p); - printf("model:\n%s", model.c_str()); - - Core ie; - size_t weight_size = getConvWeightsSize(p.in1, p.conv1, "FP32") + getConvBiasesSize(p.conv1, "FP32") + - getConvWeightsSize(p.in2, p.conv2, "FP32") + getConvBiasesSize(p.conv2, "FP32"); - auto network = ie.ReadNetwork(model, CommonTestUtils::getWeightsBlob(weight_size)); - - std::shared_ptr score_engine(new MKLDNNPlugin::Engine()); - InferenceEngine::ExecutableNetwork exeNetwork1; - 
ASSERT_NO_THROW(exeNetwork1 = score_engine->LoadNetwork(network, { })); - - auto conv2 = CommonTestUtils::getLayerByName(network, "Convolution2"); - auto conv3 = CommonTestUtils::getLayerByName(network, "Convolution3"); - auto eltwise = CommonTestUtils::getLayerByName(network, "Eltwise3"); - auto relu5 = CommonTestUtils::getLayerByName(network, "ReLU5"); - - ASSERT_EQ(conv2->precision, InferenceEngine::Precision::I8); - ASSERT_EQ(conv2->outData[0]->getPrecision(), InferenceEngine::Precision::I8); - ASSERT_EQ(conv3->precision, InferenceEngine::Precision::I8); - ASSERT_EQ(conv3->outData[0]->getPrecision(), InferenceEngine::Precision::I8); - ASSERT_EQ(eltwise->precision, InferenceEngine::Precision::I8); - ASSERT_EQ(eltwise->outData[0]->getPrecision(), InferenceEngine::Precision::I8); - ASSERT_EQ(relu5->precision, InferenceEngine::Precision::I8); - } catch (const InferenceEngine::details::InferenceEngineException &e) { - FAIL() << e.what(); - } - } -}; - -class ConvConvSumReLUPoolConv: public TestsCommon, public ::testing::WithParamInterface { - std::string getModel(conv_conv_eltwise_conv_pooling_params p) { - std::string precision = "FP32"; - std::vector convOutShape1(p.in1.size()); - std::vector convOutShape2(p.in2.size()); - std::vector convOutShape3(p.in1.size()); - std::vector poolOutShape(p.in2.size()); - getConvOutShape(p.in1, p.conv1, convOutShape1); - getConvOutShape(p.in2, p.conv2, convOutShape2); - getConvOutShape(convOutShape1, p.conv3, convOutShape3); - getPoolOutShape(convOutShape1, p.pool, poolOutShape); - - std::map elt_params = { - {"operation", "sum"} - }; - std::map relu_params = {}; - std::vector> edges = { {"0,0", "2,2"}, - {"2,3", "4,6"}, - {"1,1", "3,4"}, - {"3,5", "4,7"}, - {"4,8", "5,9"}, - {"5,10", "7,13"}, - {"4,8", "6,11"} }; - return CommonTestUtils::DefaultNetBuilder::buildNetworkWithOneInput( - "Fusion_conv_sum", p.in1, precision) - .addInputLayer(precision, p.in2) - .convolutionLayer(precision, {{p.in1}, {convOutShape1}}, p.conv1) - 
.convolutionLayer(precision, {{p.in2}, {convOutShape2}}, p.conv2) - .addLayer("Eltwise", precision, &elt_params, {{convOutShape1, convOutShape2}, {convOutShape1}}, 0, 0, "data", "") - .addLayer("ReLU", precision, &relu_params, {{convOutShape1}, {convOutShape1}}, 0, 0, "data", "") - .convolutionLayer(precision, {{convOutShape1}, {convOutShape3}}, p.conv3) - .addLayer("Pooling", precision, &relu_params, {{convOutShape1}, {poolOutShape}}, 0, 0, "data", "") - .finish(&edges); - } - -protected: - virtual void TearDown() { - } - - virtual void SetUp() { - try { - TestsCommon::SetUp(); - conv_conv_eltwise_conv_pooling_params p = - ::testing::WithParamInterface::GetParam(); - std::string model = getModel(p); - printf("model:\n%s", model.c_str()); - - Core ie; - std::vector convOutShape3(p.in1.size()); - size_t weight_size = getConvWeightsSize(p.in1, p.conv1, "FP32") + getConvBiasesSize(p.conv1, "FP32") + - getConvWeightsSize(p.in2, p.conv2, "FP32") + getConvBiasesSize(p.conv2, "FP32") + - getConvWeightsSize(convOutShape3, p.conv3, "FP32") + getConvBiasesSize(p.conv3, "FP32"); - auto network = ie.ReadNetwork(model, CommonTestUtils::getWeightsBlob(weight_size)); - - std::shared_ptr score_engine(new MKLDNNPlugin::Engine()); - InferenceEngine::ExecutableNetwork exeNetwork1; - ASSERT_NO_THROW(exeNetwork1 = score_engine->LoadNetwork(network, {})); - - auto conv2 = CommonTestUtils::getLayerByName(network, "Convolution2"); - auto conv3 = CommonTestUtils::getLayerByName(network, "Convolution3"); - auto eltwise = CommonTestUtils::getLayerByName(network, "Eltwise3"); - auto relu5 = CommonTestUtils::getLayerByName(network, "ReLU5"); - - ASSERT_EQ(conv2->precision, InferenceEngine::Precision::I8); - ASSERT_EQ(conv2->outData[0]->getPrecision(), InferenceEngine::Precision::I8); - ASSERT_EQ(conv3->precision, InferenceEngine::Precision::I8); - ASSERT_EQ(conv3->outData[0]->getPrecision(), InferenceEngine::Precision::I8); - ASSERT_EQ(eltwise->precision, InferenceEngine::Precision::I8); - 
ASSERT_EQ(eltwise->outData[0]->getPrecision(), InferenceEngine::Precision::I8); - ASSERT_EQ(relu5->precision, InferenceEngine::Precision::I8); - } catch (const InferenceEngine::details::InferenceEngineException &e) { - FAIL() << e.what(); - } - } -}; - - -// there is no o-scale in Input1 -TEST_P(ConvSum, DISABLED_TestsNormalizerSupportedFusions) {} -INSTANTIATE_TEST_CASE_P( - TestsNormalizerSupportedFusions, ConvSum, - ::testing::Values( - conv_eltwise_params{{1, 16, 4, 4}, {1, 16, 4, 4}, - { {1, 1}, {1, 1}, {0, 0}, {0, 0}, {1, 1}, "", 1, 32, true, "I8" }, - {"sum", {}} } - )); - -TEST_P(ConvSumReLU, DISABLED_TestsNormalizerSupportedFusions) {} -INSTANTIATE_TEST_CASE_P( - TestsNormalizerSupportedFusions, ConvSumReLU, - ::testing::Values( - conv_eltwise_params{{1, 16, 4, 4}, {1, 16, 4, 4}, - { {1, 1}, {1, 1}, {0, 0}, {0, 0}, {1, 1}, "", 1, 32, true, "I8" }, - {"sum", {}} } - )); - -// there is no oi-scale in Convolution3 -TEST_P(ConvConvSum, DISABLED_TestsNormalizerSupportedFusions) {} -INSTANTIATE_TEST_CASE_P( - TestsNormalizerSupportedFusions, ConvConvSum, - ::testing::Values( - conv_eltwise_params{{1, 16, 4, 4}, {1, 16, 4, 4}, - { {1, 1}, {1, 1}, {0, 0}, {0, 0}, {1, 1}, "", 1, 32, true, "I8" }, - {"sum", {}} } - )); - -TEST_P(ConvConvSumReLU, DISABLED_TestsNormalizerSupportedFusions) {} -INSTANTIATE_TEST_CASE_P( - TestsNormalizerSupportedFusions, ConvConvSumReLU, - ::testing::Values( - in_conv_in_conv_eltwise_params{{1, 16, 4, 4}, {1, 16, 4, 4}, - { {1, 1}, {1, 1}, {0, 0}, {0, 0}, {1, 1}, "", 1, 32, true, "I8" }, - { {1, 1}, {1, 1}, {0, 0}, {0, 0}, {1, 1}, "", 1, 32, true, "I8" }, - {"sum", {}} }, - in_conv_in_conv_eltwise_params{{1, 48, 40, 20}, {1, 32, 40, 20}, - { {1, 1}, {1, 1}, {0, 0}, {0, 0}, {1, 1}, "", 1, 64, true, "I8" }, - { {1, 1}, {1, 1}, {0, 0}, {0, 0}, {1, 1}, "", 1, 64, true, "I8" }, - {"sum", {}} } - )); - -TEST_P(ConvConvSumReLUPoolConv, DISABLED_TestsNormalizerSupportedFusions) {} -INSTANTIATE_TEST_CASE_P( - TestsNormalizerSupportedFusions, 
ConvConvSumReLUPoolConv, - ::testing::Values( - conv_conv_eltwise_conv_pooling_params{{1, 16, 4, 4}, {1, 16, 4, 4}, - { {1, 1}, {1, 1}, {0, 0}, {0, 0}, {1, 1}, "", 1, 32, true, "I8" }, - { {1, 1}, {1, 1}, {0, 0}, {0, 0}, {1, 1}, "", 1, 32, true, "I8" }, - { {1, 1}, {1, 1}, {0, 0}, {0, 0}, {1, 1}, "", 1, 32, true, "I8" }, - {"sum", {}}, - { {1, 1}, {1, 1}, {0, 0}, {0, 0} } } - )); - From d43c9cfa0e6933df29d5d5840db3ee962f2578fc Mon Sep 17 00:00:00 2001 From: "Gladilov, Gleb" Date: Wed, 9 Sep 2020 13:22:29 +0300 Subject: [PATCH 45/66] [IE][Tests]: Fixes dangling reference access in nGraph function comparator (#2105) Usage reference to front of the queue after pop is UB Signed-off-by: Gladilov, Gleb --- .../tests/ngraph_functions/src/utils/ngraph_helpers.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/inference-engine/tests/ngraph_functions/src/utils/ngraph_helpers.cpp b/inference-engine/tests/ngraph_functions/src/utils/ngraph_helpers.cpp index 2ae552c7590b4c..ac00d88c6b210c 100644 --- a/inference-engine/tests/ngraph_functions/src/utils/ngraph_helpers.cpp +++ b/inference-engine/tests/ngraph_functions/src/utils/ngraph_helpers.cpp @@ -211,9 +211,8 @@ void CompareFunctions(const Function& actual, const Function& expected) { std::queue nodes; nodes.emplace(actualResult, expectedResult); while (!nodes.empty()) { - const auto& checkingNodes = nodes.front(); - const auto& actualNode = checkingNodes.first; - const auto& expectedNode = checkingNodes.second; + const auto actualNode = nodes.front().first; + const auto expectedNode = nodes.front().second; nodes.pop(); CompareNodes(*actualNode, *expectedNode); From 3a1667c35c11fa644fd93438cce247d84611e2fe Mon Sep 17 00:00:00 2001 From: Lukasz Debski Date: Wed, 9 Sep 2020 12:35:31 +0200 Subject: [PATCH 46/66] [IE CLDNN] fs_b_yx_fsv32 addition in quantize_scale_shift kernel and reorders removal fix (#2124) --- .../quantize_kernel_scale_shift_opt.cpp | 2 ++ .../remove_redundant_reorders.cpp | 25 
+++++++++++-------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_scale_shift_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_scale_shift_opt.cpp index 6b5214231787f0..8023c56722fecb 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_scale_shift_opt.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_scale_shift_opt.cpp @@ -41,6 +41,7 @@ ParamsKey QuantizeKernelScaleShift::GetSupportedKey() const { k.EnableInputLayout(DataLayout::b_fs_yx_fsv32); k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32); k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16); + k.EnableInputLayout(DataLayout::fs_b_yx_fsv32); k.EnableOutputLayout(DataLayout::bfyx); k.EnableOutputLayout(DataLayout::yxfb); k.EnableOutputLayout(DataLayout::bfzyx); @@ -50,6 +51,7 @@ ParamsKey QuantizeKernelScaleShift::GetSupportedKey() const { k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32); k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv32); k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16); + k.EnableOutputLayout(DataLayout::fs_b_yx_fsv32); k.EnableTensorOffset(); k.EnableTensorPitches(); k.EnableBatching(); diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp index ac9cc4205372d5..530c37df9ff8e4 100644 --- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp @@ -315,26 +315,31 @@ void remove_redundant_reorders::run(program_impl& p) { } } - // This pass removed reorder if the next node supports reorder's input format + // This pass removed reorder if the next node supports reorder's 
input format and data type doesn't change itr = p.get_processing_order().begin(); while (itr != p.get_processing_order().end()) { - auto& node = *itr++; - if (!node->is_type() || !node->is_in_data_flow() || node->get_users().size() != 1 || node->get_dependencies().size() != 1) + auto& node_ptr = *itr++; + if (!node_ptr->is_type() || !node_ptr->is_in_data_flow() || node_ptr->get_users().size() != 1 || node_ptr->get_dependencies().size() != 1) continue; - auto& usr = node->get_users().front(); - auto& dep = node->get_dependency(0); + auto& usr = node_ptr->get_users().front(); + auto& dep = node_ptr->get_dependency(0); if (!usr->is_type() || (dep.get_output_layout().format != format::b_fs_yx_fsv16 && dep.get_output_layout().format != format::fs_b_yx_fsv32 && dep.get_output_layout().format != format::bfyx)) continue; - dep.merge_output_padding(node->get_output_layout().data_padding); - p.replace_all_usages(*node, dep); - p.add_optimized_primitive_info(node->id()); - p.remove_all_connections(*node); - p.remove_if_dangling(*node); + auto& node = node_ptr->as(); + auto same_data_type = node.input().get_output_layout().data_type == node.get_output_layout().data_type; + if (!same_data_type) + continue; + + dep.merge_output_padding(node.get_output_layout().data_padding); + p.replace_all_usages(node, dep); + p.add_optimized_primitive_info(node.id()); + p.remove_all_connections(node); + p.remove_if_dangling(node); } // This pass removes reorder for Convolution BFYX -> FS_B_YX_FSV32 From 135ae12b0db73e25b4b8f4eac6bb897bb6318f50 Mon Sep 17 00:00:00 2001 From: Ilya Churaev Date: Wed, 9 Sep 2020 14:31:12 +0300 Subject: [PATCH 47/66] Fixed AddressSanitizer issue (#2122) --- ngraph/core/src/op/variadic_split.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ngraph/core/src/op/variadic_split.cpp b/ngraph/core/src/op/variadic_split.cpp index 1279f2e3cc64ba..c010eabf25a4d6 100644 --- a/ngraph/core/src/op/variadic_split.cpp +++ 
b/ngraph/core/src/op/variadic_split.cpp @@ -245,7 +245,8 @@ namespace output->set_shape(output_shape); evaluate(data_tensor, output, lower_bounds, upper_bounds); lower_bounds.at(axis) = upper_bounds.at(axis); - upper_bounds.at(axis) += split_lengths[split_pos]; + if (split_pos < split_lengths.size()) + upper_bounds.at(axis) += split_lengths[split_pos]; } return true; From 8b87e1a47786b24f7fd4f912f28df3115c19d5ad Mon Sep 17 00:00:00 2001 From: Bartosz Sochacki Date: Wed, 9 Sep 2020 13:55:07 +0200 Subject: [PATCH 48/66] [GNA] Fix for concat layer with >2 inputs (#1475) * Fix for concat layer with more than 2 inputs Signed-off-by: Bartosz Sochacki * Fixed check if affine is used for crop layer Signed-off-by: Bartosz Sochacki * code cleanup for fix affine layer check Signed-off-by: Bartosz Sochacki * added test for concat layer with multiple inputs * simplified test to use less number of layers * fixed code style * fixed coding style * addressed review comments and one more issue that appeared during testing * fixed code style errors * scale factor propagation for concat layer with multiple inputs * fix for a case when all inputs to concat are activation layers * fix for linux compilation - C++14 is not enabled and fails on lambda with auto parameters * corrected current year in headers in concat multi input tests * fixes for code review issues raised by Denis Orlov * enabled integer mode computation in GNA concat multi input test * removed 1 space per review comment * a fix to fail when not all scale factors are equal * added GNA_DEVICE_MODE config to concat multi input test * corrected searching for a next input to concat layer * changed selection of 2nd candidate for source quant value * code style fix - else and brackets should be in the same line * small code improvement * fix for mixing line endings * addressed with endless requantization loop and fixed failing tests --- .../gna_plugin/frontend/scale_factor_calc.hpp | 203 +++++++++++------- 
.../src/gna_plugin/gna_graph_compiler.cpp | 8 +- .../src/gna_plugin/layers/gna_layer_info.hpp | 9 +- .../gna_plugin/optimizer/gna_pass_manager.cpp | 141 ++++++------ .../subgraph_tests/concat_multi_input.cpp | 43 ++++ .../subgraph_tests/concat_multi_input.hpp | 34 +++ .../src/subgraph_tests/concat_multi_input.cpp | 88 ++++++++ 7 files changed, 375 insertions(+), 151 deletions(-) create mode 100644 inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/concat_multi_input.cpp create mode 100644 inference-engine/tests/functional/plugin/shared/include/subgraph_tests/concat_multi_input.hpp create mode 100644 inference-engine/tests/functional/plugin/shared/src/subgraph_tests/concat_multi_input.cpp diff --git a/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp b/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp index 5da54c811b6dd5..037dc357653a7e 100644 --- a/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp +++ b/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp @@ -170,7 +170,7 @@ class ScaleFactorPerLayer { if (!fp32eq(quantSibling->_dst_quant.scale, 1)) { // means we already restarted propagation input memory layer // need to search for requantiseable layer prior memory output layer - InferenceEngine::CNNLayerPtr restartedLayer; + InferenceEngine::CNNLayerPtr restartedLayer; gnalog() << "Memory layer :"<< input->name << " scale factor: " << quantSibling->_dst_quant.scale << " doesn't match its outputs counterpart: " << cnnLayer->name << " scale factor: " << inputQuant->_dst_quant.scale << "\n"; @@ -382,119 +382,166 @@ class ScaleFactorPerLayer { if ( !concatLayer ) { THROW_GNA_EXCEPTION << "Incorrect Concat Layer pointer \n"; } - auto in0 = InferenceEngine::CNNNetPrevLayer(concatLayer, 0); - auto in1 = InferenceEngine::CNNNetPrevLayer(concatLayer, 1); - auto infoIn0 = LayerInfo(in0); - auto infoIn1 = LayerInfo(in1); - auto quantParams0 = InferenceEngine::getInjectedData(in0); - auto 
quantParams1 = InferenceEngine::getInjectedData(in1); - GNAPluginNS::QuantizedLayerParams* sourceQuantParams = NULL; - auto quantData = InferenceEngine::getInjectedData(*concatLayer); + + if (concatLayer->insData.size() < 2) { + THROW_GNA_EXCEPTION << "Concat layer has unsupported number of incoming layers."; + } auto fp32eq = [](float p1, float p2) -> bool { return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2))); }; - // if both inputs have same quant value - trivial propagation - if (fp32eq(quantParams0->_dst_quant.scale, quantParams1->_dst_quant.scale)) { + auto quantData = InferenceEngine::getInjectedData(*concatLayer); + std::vector inputLayers; + for (auto input_idx = 0; input_idx != concatLayer->insData.size(); input_idx++) { + inputLayers.push_back(InferenceEngine::CNNNetPrevLayer(concatLayer, input_idx)); + } + + // if all inputs have same quant value - trivial propagation + auto in0 = inputLayers.front(); + auto quantParams0 = InferenceEngine::getInjectedData(in0); + auto scaleFactor = quantParams0->_dst_quant.scale; + auto scaleFactorCheck = [scaleFactor, &fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) { + auto quantParams = InferenceEngine::getInjectedData(inputLayer); + return fp32eq(quantParams->_dst_quant.scale, scaleFactor); + }; + + if (std::find_if_not(inputLayers.begin() + 1, inputLayers.end(), scaleFactorCheck) == inputLayers.end()) { quantData->_dst_quant.scale = quantParams0->_dst_quant.scale; quantData->_src_quant.scale = quantParams0->_dst_quant.scale; return true; } - // support only cases when one of input is network input - if (infoIn0.isInput() && infoIn1.isInput()) { - THROW_GNA_EXCEPTION << "Two Input layers " << in0->name << "and" << in1->name << " has different scales in concat!!! 
\n"; - } - int concatIdxToUpdate = -1; + // check if all inputs have the same quant value + auto inputLayerCheck = [](InferenceEngine::CNNLayerPtr& inputLayer) { + auto info = LayerInfo(inputLayer); + return info.isInput(); + }; - if (infoIn0.isInput()) { - sourceQuantParams = quantParams0; - } else if (infoIn1.isInput()) { - concatIdxToUpdate = 0; - sourceQuantParams = quantParams1; + GNAPluginNS::QuantizedLayerParams* sourceQuantParams = nullptr; + auto firstInputIt = std::find_if(inputLayers.begin(), inputLayers.end(), inputLayerCheck); + if (firstInputIt != inputLayers.end()) { + auto quantParamsFirst = InferenceEngine::getInjectedData(*firstInputIt); + auto nextInputIt = firstInputIt + 1; + while ((nextInputIt = std::find_if(nextInputIt, inputLayers.end(), inputLayerCheck)) != inputLayers.end()) { + auto quantParamsSecond = InferenceEngine::getInjectedData(*nextInputIt); + if (!fp32eq(quantParamsSecond->_dst_quant.scale, quantParamsFirst->_dst_quant.scale)) { + THROW_GNA_EXCEPTION << "Two Input layers " << (*firstInputIt)->name + << " and " << (*nextInputIt)->name << " have different scales in concat!!! 
\n"; + } + } + } - // possible case when some of the concat inputs are free to select scale ex: const->concat<-affine - if (quantParams1->_dst_quant.scale == 1.0) { - quantParams1->_weights_quant = quantParams0->_dst_quant; - quantParams1->_dst_quant = quantParams0->_dst_quant; + // find a source quant value + // - 1st candidate - non-activation layer with non-1 scale factor + // - 2nd candidate - 1st layer with non-1 scale factor + auto sourceLayerCheck = [&fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) { + auto quantParams = InferenceEngine::getInjectedData(inputLayer); + LayerInfo info(inputLayer); + return !info.isActivation() && !fp32eq(quantParams->_dst_quant.scale, 1.0f); + }; - sourceQuantParams = quantParams0; + static std::map restarted_counter; + auto restartedCountIt = restarted_counter.find(concatLayer->name); + if (restartedCountIt == restarted_counter.end()) { + auto pos = restarted_counter.insert({ concatLayer->name, 0 }); // key by the real layer name so each concat keeps its own counter + restartedCountIt = pos.first; } - if (quantParams0->_dst_quant.scale == 1.0) { - quantParams0->_weights_quant = quantParams1->_dst_quant; - quantParams0->_dst_quant = quantParams1->_dst_quant; - sourceQuantParams = quantParams1; + if (restartedCountIt->second % 2 == 1) { + std::reverse(inputLayers.begin(), inputLayers.end()); } + ++restartedCountIt->second; - if (!sourceQuantParams) { - auto in0LayerInfo = LayerInfo(in0); - auto in1LayerInfo = LayerInfo(in1); - if (in0LayerInfo.isActivation()) { - quantParams0->_weights_quant = quantParams1->_dst_quant; - quantParams0->_dst_quant = quantParams1->_dst_quant; - sourceQuantParams = quantParams1; - } else if (in1LayerInfo.isActivation()) { - quantParams1->_weights_quant = quantParams0->_dst_quant; - quantParams1->_dst_quant = quantParams0->_dst_quant; - sourceQuantParams = quantParams0; - } else { - THROW_GNA_LAYER_EXCEPTION(concatLayer) << "Concat quantization for " << in0->type << ": " << in0->name - << " and " << in1->type << ": " << in1->name - << " as inputs needs to be 
implemented! None of these inputs is an activation.\n"; + auto sourceLayerIt = std::find_if(inputLayers.begin(), inputLayers.end(), sourceLayerCheck); + if (sourceLayerIt == inputLayers.end()) { + auto nonDefaultScaleFactor = [&fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) { + auto quantParams = InferenceEngine::getInjectedData(inputLayer); + return !fp32eq(quantParams->_dst_quant.scale, 1.0f); + }; + + sourceLayerIt = std::find_if(inputLayers.begin(), inputLayers.end(), nonDefaultScaleFactor); + } + + std::set concatIdxToUpdate; + if (sourceLayerIt != inputLayers.end()) { + auto quantParams = InferenceEngine::getInjectedData(*sourceLayerIt); + auto scaleFactor = quantParams->_dst_quant.scale; + sourceQuantParams = quantParams; + + for (auto it = inputLayers.begin(); it != inputLayers.end(); ++it) { + auto quantParamsIn = InferenceEngine::getInjectedData(*it); + if (fp32eq(quantParamsIn->_dst_quant.scale, scaleFactor)) { + continue; + } + + // possible case when some of the concat inputs are free to select scale ex: const->concat<-affine + if (!fp32eq(quantParamsIn->_dst_quant.scale, 1.0f) && !LayerInfo(*it).isActivation()) { + concatIdxToUpdate.insert(std::distance(inputLayers.begin(), it)); + } + + quantParamsIn->_weights_quant = quantParams->_dst_quant; + quantParamsIn->_dst_quant = quantParams->_dst_quant; } } - if (!fp32eq(quantParams0->_dst_quant.scale, quantParams1->_dst_quant.scale) && concatIdxToUpdate == -1) { + auto updatedScaleFactor = InferenceEngine::getInjectedData(in0)->_dst_quant.scale; + auto equalScaleFactor = [updatedScaleFactor, &fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) { + auto quantParams = InferenceEngine::getInjectedData(inputLayer); + return fp32eq(quantParams->_dst_quant.scale, updatedScaleFactor); + }; + + auto layerIt = std::find_if_not(inputLayers.begin() + 1, inputLayers.end(), equalScaleFactor); + if (layerIt != inputLayers.end()) { THROW_GNA_EXCEPTION << "layers entered into concat have different scale factors" << 
concatLayer->name; } quantData->_dst_quant.scale = sourceQuantParams->_dst_quant.scale; quantData->_src_quant.scale = sourceQuantParams->_dst_quant.scale; - if (fp32eq(quantParams0->_dst_quant.scale, quantParams1->_dst_quant.scale) || concatIdxToUpdate == -1) { + if (layerIt == inputLayers.end() && concatIdxToUpdate.empty()) { return true; } - auto destinationQuantParams = InferenceEngine::getInjectedData(*concatLayer); - destinationQuantParams->_dst_quant.scale = sourceQuantParams->_dst_quant.scale; - - InferenceEngine::CNNLayerPtr restartedLayer; - // making a link activation possible without extra layer if first input to concat not a parent / indirect parent of second input - // using ufs - upper first search - gnalog() << "[UFS] searching for quantizeable layer prior: "<< concatLayer->name << ", via " << concatIdxToUpdate << "\n"; - - CNNNetDFS(InferenceEngine::CNNLayerPtr(concatLayer, [](InferenceEngine::CNNLayer *) {}), - [&restartedLayer, concatLayer](InferenceEngine::CNNLayerPtr layer) { - gnalog() << "[UFS] from : " << concatLayer->name << " reached: " << layer->name; - // found that direct input to concat is a indirect parent of align filter - so no link required - auto info = LayerInfo(layer); - if (!info.isWeightable() && !info.isActivation()) { - gnalog() << "... skipped\n"; - return; - } - restartedLayer = layer; - gnalog() << "... 
OK, need requantize\n"; - }, true, [&restartedLayer, &concatLayer, &concatIdxToUpdate](InferenceEngine::CNNLayer *from) { + for (auto& layerIdToUpdate : concatIdxToUpdate) { + auto destinationQuantParams = InferenceEngine::getInjectedData(*concatLayer); + destinationQuantParams->_dst_quant.scale = sourceQuantParams->_dst_quant.scale; + + InferenceEngine::CNNLayerPtr restartedLayer; + // making a link activation possible without extra layer if first input to concat not a parent / indirect parent of second input + // using ufs - upper first search + gnalog() << "[UFS] searching for quantizeable layer prior: " << concatLayer->name << ", via " << layerIdToUpdate << "\n"; + + CNNNetDFS(InferenceEngine::CNNLayerPtr(concatLayer, [](InferenceEngine::CNNLayer*) {}), + [&restartedLayer, concatLayer](InferenceEngine::CNNLayerPtr layer) { + gnalog() << "[UFS] from : " << concatLayer->name << " reached: " << layer->name; + // found that direct input to concat is a indirect parent of align filter - so no link required + auto info = LayerInfo(layer); + if (!info.isWeightable() && !info.isActivation()) { + gnalog() << "... skipped\n"; + return; + } + restartedLayer = layer; + gnalog() << "... OK, need requantize\n"; + }, true, [&restartedLayer, &concatLayer, &layerIdToUpdate](InferenceEngine::CNNLayer* from) { // aborting UFS once found functional layer, and using only specified input of concat return make_upstream_order(restartedLayer == nullptr ? from : nullptr, - from == concatLayer ? concatIdxToUpdate : -1); + from == concatLayer ? 
layerIdToUpdate : -1); }); - if (restartedLayer == nullptr) { - THROW_GNA_EXCEPTION << "cannot requantize " << concatIdxToUpdate << "input to concat: " << concatLayer->name; - } - auto quantDataForConCatInput = InferenceEngine::getInjectedData(*restartedLayer); + if (restartedLayer == nullptr) { + THROW_GNA_EXCEPTION << "cannot requantize " << layerIdToUpdate << "input to concat: " << concatLayer->name; + } + auto quantDataForConCatInput = InferenceEngine::getInjectedData(*restartedLayer); - auto restarLayerInfo = LayerInfo(restartedLayer); - if (restarLayerInfo.isActivation()) { - // requantize activation by just changing it's output scale factor - quantDataForConCatInput->_dst_quant.scale = sourceQuantParams->_dst_quant.scale; - } + auto restarLayerInfo = LayerInfo(restartedLayer); + if (restarLayerInfo.isActivation()) { + // requantize activation by just changing it's output scale factor + quantDataForConCatInput->_dst_quant.scale = sourceQuantParams->_dst_quant.scale; + } - result = ScaleFactorUpdateResult(restartedLayer.get()); + result = ScaleFactorUpdateResult(restartedLayer.get()); + } return true; } diff --git a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp index 31372b5d4aab38..3c463b51580f11 100644 --- a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp +++ b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp @@ -110,6 +110,12 @@ void GNAGraphCompiler::fillConcatConnections(InferenceEngine::CNNLayerPtr layer) InferenceEngine::details::product(begin(dataInput->getDims()), end(dataInput->getDims())) * dataInput->getPrecision().size(); + // concat align layer can have additional padding, so the size of layer needs to be calculated + // based on original number of rows + if (ptrConcatLayerInput->CheckParamPresence("original_num_rows")) { + layer_size = ptrConcatLayerInput->GetParamAsInt("original_num_rows") * dataInput->getPrecision().size(); + } + 
layerInfoItem.concatInputLayers.emplace_back(GNAConcatLayer::ConcatConnectedLayerInfo{ptrConcatLayerInput->name, concat_size, layer_size}); concat_size += layer_size; @@ -848,7 +854,7 @@ void GNAGraphCompiler::CropPrimitive(InferenceEngine::CNNLayerPtr layer) { size_t cropOffset = offset.front() * cropLayer->precision.size(); size_t cropOutputSize = dim.front() * cropLayer->precision.size(); - if (ALIGN64(cropOffset) == cropOffset) { + if (!LayerInfo(cropLayer).isCropAffined()) { // leave crop as it is GNAPluginNS::GNACropLayer cropLayerInfoItem(layer); std::string& id = layer->name; diff --git a/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp b/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp index 8137fbac4c014b..fc3c44e1e23b37 100644 --- a/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp +++ b/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp @@ -260,10 +260,11 @@ class LayerInfo { bool isCropAffined() const noexcept { auto cropLayer = dynamic_cast (layer); if (cropLayer != nullptr && !cropLayer->offset.empty()) { - try { - size_t cropOffset = cropLayer->offset.back() * cropLayer->precision.size(); - return (ALIGN64(cropOffset) != cropOffset); - } catch (InferenceEngine::details::InferenceEngineException) {} + // currently crop layer only supports 2 bytes in int16 and int8 mode. 
+ // In fp32 mode this is not necessary but is useful for testing + auto bytesPerCropElement = 2; + size_t cropOffset = cropLayer->offset.back() * bytesPerCropElement; + return (ALIGN64(cropOffset) != cropOffset); } return false; } diff --git a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp index 760838b3ab81a2..4cd259de84f3bd 100644 --- a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp +++ b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp @@ -703,14 +703,10 @@ void InsertCopyLayerPass::run() { if (LayerInfo(l).isConcat() && LayerInfo(prevIndirectLayer).isCrop()) { bInsert = true; } if (bInsert) { - if (LayerInfo(prevIndirectLayer).isCrop()) { - auto cropLayer = LayerInfo(prevIndirectLayer).as(); - size_t cropOffset = cropLayer->offset.back() * cropLayer->precision.size(); - if (ALIGN(cropOffset, 8) != cropOffset) { - // The crop will be replaced by affine. - // Copy layer insertion is not required - continue; - } + if (LayerInfo(prevIndirectLayer).isCropAffined()) { + // The crop will be replaced by affine. + // Copy layer insertion is not required + continue; } auto prevLayer = CNNNetPrevLayer(l, i); InsertCopyLayer(prevLayer, l, i, getPassManager()); @@ -788,8 +784,9 @@ void InsertConcatAligningFilterPass::run() { size_t num_rows_out = num_rows_padded + num_rows_in; // encodes offset to beginning of split layer input + size_t bytesOffset = (aligned64_offset / bytesPerConcatElement) * (quantized ? bytesPerConcatElement : 4); concatAligningFilter->params["output_offset"] = - std::to_string((aligned64_offset / bytesPerConcatElement) * (quantized ? 
bytesPerConcatElement : 4)); + std::to_string(bytesOffset); // for padded rows we cannot use copy layer - TBD how to implement concatAligningFilter->params["num_rows_padded"] = std::to_string(num_rows_padded); @@ -843,84 +840,92 @@ void ReorderConcatInputsPass::run() { } int numOfLinkLayers = 0; - for (auto & l : *pLayers) { - // 1st stage locate concat align filter + for (auto& l : *pLayers) { + // 1st stage locate concat LayerInfo info(l); - if (!info.isConcatAlignFilter()) { + if (!info.isConcat()) { continue; } - // 2rd locating concat - if (l->outData.size() != 1) { - THROW_GNA_EXCEPTION << "no concat layer after concat aligning layer" << l->name; + // 2nd stage locate first input in concat + if (l->insData.size() < 2) { + THROW_GNA_EXCEPTION << "Concat layer has unsupported number of incoming layers: " << l->name; } - auto nextLayers = getInputTo(l->outData.front()); - if (nextLayers.size() != 1) { - THROW_GNA_EXCEPTION << "Invalid concat connection in align filter : " << l->name; - } - auto concat = nextLayers.begin()->second; - if (!LayerInfo(concat).isConcat()) { - THROW_GNA_EXCEPTION << "no concat layer after concat-aligning layer" << l->name << ", but was: " << concat->type; - } - // 3stage locate first input in concat - if (concat->insData.size() < 2) { - THROW_GNA_EXCEPTION << "Concat layer has unsupported number of incoming layers: " << concat->name; - } - auto inputsToConcatFirst = CNNNetGetPrevLayersSkip(concat, [](CNNLayerPtr origin){ - return !LayerInfo(origin).isNonFunctional() && !LayerInfo(origin).isSplit(); - }, 0); + auto concatLayer = info.as(); + auto getLayerByIndex = [&concatLayer](int idx) { + auto input = concatLayer->insData[idx]; + auto lockedInput = input.lock(); + if (!lockedInput) { + THROW_GNA_EXCEPTION << "cannot get insdata : " << idx << " for layer: " << concatLayer->name; + } + return lockedInput; + }; - if (inputsToConcatFirst.empty()) { - THROW_GNA_EXCEPTION << "cannot locate first input into concat layer: " << l; - } + for 
(auto input_idx = 1; input_idx != concatLayer->insData.size(); input_idx++) { + auto concatInput = getLayerByIndex(input_idx); + auto currConcatLayer = getCreatorLayer(concatInput).lock(); - auto firstInputToConcat = inputsToConcatFirst.front().first; + LayerInfo infoConcatInput(currConcatLayer); + if (!infoConcatInput.isConcatAlignFilter()) { + continue; + } - // concat has first input of concat align filter - dont need to reorder it - if (firstInputToConcat == l) { - continue; - } + auto inputsToConcatPrev = CNNNetGetPrevLayersSkip(l, [](CNNLayerPtr origin) { + return !LayerInfo(origin).isNonFunctional() && !LayerInfo(origin).isSplit(); + }, input_idx - 1); - bool bFinish = false; - // making a link activation possible without extra layer if first input to concat not a parent / indirect parent of second input - // using ufs - upper first search - gnalog() << "[UFS] searching for: "<< firstInputToConcat->name << "\n"; - - CNNNetDFS(l, [&l, &firstInputToConcat, &bFinish](CNNLayerPtr layer) { - gnalog() << "[UFS] from : "<< l->name <<" reached: " << layer->name << "\n"; - // found that direct input to concat is a indirect parent of align filter - so no link required - if (layer.get() == firstInputToConcat.get() || LayerInfo(firstInputToConcat).isInput()) { - gnalog() << "[UFS] copy layer insertion needed\n"; - bFinish = true; + if (inputsToConcatPrev.empty()) { + THROW_GNA_EXCEPTION << "cannot locate first input into concat layer: " << currConcatLayer; } - }, true, [&bFinish](InferenceEngine::CNNLayer* from) { - // aborting UFS once link not need - return make_upstream_order(!bFinish ? 
from : nullptr); - }); - auto linkName = std::string("link_") + std::to_string(numOfLinkLayers++); + auto prevInputToConcat = inputsToConcatPrev.front().first; + + // concat has first input of concat align filter - dont need to reorder it + if (prevInputToConcat == currConcatLayer) { + continue; + } + + bool bFinish = false; + // making a link activation possible without extra layer if first input to concat not a parent / indirect parent of second input + // using ufs - upper first search + gnalog() << "[UFS] searching for: " << prevInputToConcat->name << "\n"; + + CNNNetDFS(currConcatLayer, [&currConcatLayer, &prevInputToConcat, &bFinish](CNNLayerPtr layer) { + gnalog() << "[UFS] from : " << currConcatLayer->name << " reached: " << layer->name << "\n"; + // found that direct input to concat is a indirect parent of align filter - so no link required + if (layer.get() == prevInputToConcat.get() || LayerInfo(prevInputToConcat).isInput()) { + gnalog() << "[UFS] copy layer insertion needed\n"; + bFinish = true; + } + }, true, [&bFinish](InferenceEngine::CNNLayer* from) { + // aborting UFS once link not needed + return make_upstream_order(!bFinish ? from : nullptr); + }); + + auto linkName = std::string("link_") + std::to_string(numOfLinkLayers++); - auto linkWithoutQuant = std::make_shared(LayerParams({linkName, "link", Precision::FP32})); + auto linkWithoutQuant = std::make_shared(LayerParams({ linkName, "link", Precision::FP32 })); - auto link = quantized ? - InferenceEngine::injectData(linkWithoutQuant) : - linkWithoutQuant; + auto link = quantized ? 
+ InferenceEngine::injectData(linkWithoutQuant) : + linkWithoutQuant; - auto linkOutData = std::make_shared(linkName, - TensorDesc(Precision::FP32, - SizeVector({1}), - Layout::C)); - getCreatorLayer(linkOutData) = link; + auto linkOutData = std::make_shared(linkName, + TensorDesc(Precision::FP32, + SizeVector({ 1 }), + Layout::C)); + getCreatorLayer(linkOutData) = link; - link->outData.push_back(linkOutData); - link->insData.push_back(l->outData.front()); + link->outData.push_back(linkOutData); + link->insData.push_back(currConcatLayer->outData.front()); - getInputTo(linkOutData)[firstInputToConcat->name + ".via.link"] = firstInputToConcat; - firstInputToConcat->insData.push_back(linkOutData); + getInputTo(linkOutData)[prevInputToConcat->name + ".via.link"] = prevInputToConcat; + prevInputToConcat->insData.push_back(linkOutData); - getInputTo(l->outData.front())[linkName] = link; + getInputTo(currConcatLayer->outData.front())[linkName] = link; + } } } diff --git a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/concat_multi_input.cpp b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/concat_multi_input.cpp new file mode 100644 index 00000000000000..8d3f05cfd0b7c8 --- /dev/null +++ b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/concat_multi_input.cpp @@ -0,0 +1,43 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "subgraph_tests/concat_multi_input.hpp" +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; + +namespace { + +std::vector>> inShapes = { + {{1, 8}, {1, 8}}, + {{1, 3}, {1, 3}, {1, 3}}, + {{1, 16}, {1, 16}, {1, 16}}, + {{1, 16}, {1, 16}, {1, 16}, {1, 16}}, + {{1, 32}, {1, 32}, {1, 32}, {1, 32}}, + {{1, 16}, {1, 32}, {1, 16}, {1, 32}, {1, 16}, {1, 32}}, +}; + +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32, + 
InferenceEngine::Precision::FP16, +}; + +std::map additional_config = { + {"GNA_DEVICE_MODE", "GNA_SW_EXACT"}, + {"GNA_COMPACT_MODE", "NO"}, + {"GNA_SCALE_FACTOR_0", "2048"}, + {"GNA_PRECISION", "I16"}, +}; + +INSTANTIATE_TEST_CASE_P(concat_multi_input, ConcatMultiInput, + ::testing::Combine( + ::testing::ValuesIn(inShapes), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_GNA), + ::testing::Values(additional_config)), + ConcatMultiInput::getTestCaseName); + +} //namespace diff --git a/inference-engine/tests/functional/plugin/shared/include/subgraph_tests/concat_multi_input.hpp b/inference-engine/tests/functional/plugin/shared/include/subgraph_tests/concat_multi_input.hpp new file mode 100644 index 00000000000000..8f0e06d41e41c3 --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/include/subgraph_tests/concat_multi_input.hpp @@ -0,0 +1,34 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + +#include "functional_test_utils/layer_test_utils.hpp" +#include "ngraph_functions/utils/ngraph_helpers.hpp" +#include "ngraph_functions/builders.hpp" + +typedef std::tuple< + std::vector>, // Input shapes + InferenceEngine::Precision, // Network Precision + std::string, // Target Device + std::map // Config +> concatMultiParams; + +namespace LayerTestsDefinitions { + +class ConcatMultiInput : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/shared/src/subgraph_tests/concat_multi_input.cpp b/inference-engine/tests/functional/plugin/shared/src/subgraph_tests/concat_multi_input.cpp new file mode 100644 index 00000000000000..1d70dfe0448122 --- /dev/null +++ 
b/inference-engine/tests/functional/plugin/shared/src/subgraph_tests/concat_multi_input.cpp @@ -0,0 +1,88 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include + +#include "common_test_utils/common_utils.hpp" +#include "functional_test_utils/plugin_cache.hpp" +#include "functional_test_utils/layer_test_utils.hpp" +#include "functional_test_utils/blob_utils.hpp" + +#include "ngraph_functions/pass/convert_prc.hpp" + +#include "subgraph_tests/concat_multi_input.hpp" + + +namespace LayerTestsDefinitions { + + +std::string ConcatMultiInput::getTestCaseName(testing::TestParamInfo obj) { + std::vector> inputShapes; + InferenceEngine::Precision netPrecision; + std::string targetDevice; + std::map additional_config; + std::tie(inputShapes, netPrecision, targetDevice, additional_config) = obj.param; + + std::ostringstream result; + result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_"; + result << "netPRC=" << netPrecision.name() << "_"; + result << "targetDevice=" << targetDevice; + + return result.str(); +} + +void ConcatMultiInput::SetUp() { + std::vector> inputShapes; + InferenceEngine::Precision netPrecision; + std::map additional_config; + std::tie(inputShapes, netPrecision, targetDevice, additional_config) = this->GetParam(); + configuration.insert(additional_config.begin(), additional_config.end()); + + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + std::vector paramSize = { 1, 0 }; + for (const auto& val : inputShapes) { + paramSize[1] += val[1]; + } + auto params = ngraph::builder::makeParams(ngPrc, { paramSize }); + auto stride = std::make_shared(ngraph::element::i64, ngraph::Shape{ 2 }, std::vector{ 1, 1 }); + + std::vector newAxis = { 0, 0 }; + std::vector begin_mask = { 0, 0 }; + std::vector end_mask = { 0, 0 }; + std::vector> ssArray; + ngraph::OutputVector concatInput; + + auto relu = std::make_shared(params[0]); + std::vector 
startOffset = { 0, 0 }; + for (size_t i = 0; i < inputShapes.size(); ++i) { + std::vector shape = { static_cast(inputShapes[i][0]), + static_cast(inputShapes[i][1]) }; + std::vector endoffset = { static_cast(inputShapes[i][0]) + startOffset[0], + static_cast(inputShapes[i][1]) + startOffset[1]}; + auto begin = std::make_shared(ngraph::element::i64, ngraph::Shape{ 2 }, startOffset); + auto end = std::make_shared(ngraph::element::i64, ngraph::Shape{ 2 }, endoffset); + auto ss = std::make_shared(relu, begin, end, stride, begin_mask, end_mask, newAxis); + ssArray.push_back(ss); + concatInput.push_back(ssArray[i]); + + startOffset[1] += shape[1]; + } + + auto concat = std::make_shared(concatInput, 1); + + ngraph::ResultVector results{ std::make_shared(concat) }; + function = std::make_shared(results, params, "ConcatMultiInput"); +} + +TEST_P(ConcatMultiInput, CompareWithRefImpl) { + Run(); +}; + + +} // namespace LayerTestsDefinitions From f86d930e3f105a252b1e5d27cbe64187dd723565 Mon Sep 17 00:00:00 2001 From: Anton Potapov Date: Wed, 9 Sep 2020 15:30:08 +0300 Subject: [PATCH 49/66] [PP GAPI] - Generic precision conversion kernel; support for U8 (#2076) - added U8 support - tests are extended --- .../src/preprocessing/ie_preprocess_gapi_kernels.cpp | 12 +++++++----- .../src/preprocessing/ie_preprocess_gapi_kernels.hpp | 4 ++-- .../ie_preprocess_gapi_kernels_impl.hpp | 11 ++++++++++- .../fluid_preproc/cpu/fluid_tests_cpu.cpp | 4 ++-- 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp index 654b625d3fd323..e104dcfdb0e856 100644 --- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp +++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp @@ -2247,25 +2247,27 @@ GAPI_FLUID_KERNEL(FConvertDepth, ConvertDepth, false) { static const int Window = 1; static void run(const cv::gapi::fluid::View& src, 
int depth, cv::gapi::fluid::Buffer& dst) { - GAPI_Assert(src.meta().depth == CV_16U || src.meta().depth == CV_32F); - GAPI_Assert(dst.meta().depth == CV_32F || dst.meta().depth == CV_16U); + GAPI_Assert(src.meta().depth == CV_8U || src.meta().depth == CV_32F || src.meta().depth == CV_16U); + GAPI_Assert(dst.meta().depth == CV_8U || dst.meta().depth == CV_32F || dst.meta().depth == CV_16U); GAPI_Assert(src.meta().chan == 1); GAPI_Assert(dst.meta().chan == 1); GAPI_Assert(src.length() == dst.length()); - constexpr unsigned supported_types_n = 2; + constexpr unsigned supported_types_n = 3; using p_f = void (*)( const uint8_t* src, uint8_t* dst, const int width); using table_string_t = std::array; constexpr std::array func_table = { - table_string_t{convert_precision, convert_precision}, - table_string_t{convert_precision, convert_precision} + table_string_t{convert_precision, convert_precision, convert_precision}, + table_string_t{convert_precision, convert_precision, convert_precision}, + table_string_t{convert_precision, convert_precision, convert_precision} }; auto depth_to_index = [](int depth){ switch (depth) { case CV_16U: return 0; case CV_32F: return 1; + case CV_8U: return 2; default: GAPI_Assert(!"not supported depth"); return -1; } }; diff --git a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.hpp b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.hpp index 7b6acdbe29f410..685eae4db966b8 100644 --- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.hpp +++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.hpp @@ -145,8 +145,8 @@ namespace gapi { G_TYPED_KERNEL(ConvertDepth, , "com.intel.ie.ConvertDepth") { static cv::GMatDesc outMeta(const cv::GMatDesc& in, int depth) { - GAPI_Assert(in.depth == CV_16U || in.depth == CV_32F); - GAPI_Assert(depth == CV_32F || depth == CV_16U); + GAPI_Assert(in.depth == CV_8U || in.depth == CV_16U || in.depth == CV_32F); + GAPI_Assert(depth == CV_8U || depth == CV_32F || 
depth == CV_16U); return in.withDepth(depth); } diff --git a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_impl.hpp b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_impl.hpp index 6d5161f7f33aa8..bce88e1206ca70 100644 --- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_impl.hpp +++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_impl.hpp @@ -24,6 +24,7 @@ #include #include +#include #if defined(__GNUC__) && (__GNUC__ <= 5) #include @@ -38,12 +39,20 @@ template<> inline short saturate_cast(int x) { return (std::min)(SHRT_MAX, (std: template<> inline short saturate_cast(float x) { return saturate_cast(static_cast(std::rint(x))); } template<> inline float saturate_cast(float x) { return x; } template<> inline short saturate_cast(short x) { return x; } + template<> inline uint16_t saturate_cast(uint16_t x) { return x; } template<> inline float saturate_cast(uint16_t x) { return x; } + template<> inline uint16_t saturate_cast(int x) { return (std::min)(USHRT_MAX, (std::max)(0, x)); } template<> inline uint16_t saturate_cast(float x) { return saturate_cast(static_cast(std::rint(x))); } -template<> inline uchar saturate_cast(int v) { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); } +template<> inline uchar saturate_cast(int v) { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? 
UCHAR_MAX : 0); } + +template<> inline uint16_t saturate_cast(uint8_t x) { return x; } +template<> inline float saturate_cast(uint8_t x) { return x; } +template<> inline uint8_t saturate_cast(uint8_t x) { return x; } +template<> inline uint8_t saturate_cast(uint16_t x) { using lim = std::numeric_limits; return std::min(static_cast(lim::max()), std::max(static_cast(lim::min()), x));} +template<> inline uint8_t saturate_cast(float x) { return saturate_cast(static_cast(std::rint(x))); } //------------------------------------------------------------------------------ constexpr static const int ONE = 1 << 15; diff --git a/inference-engine/tests_deprecated/fluid_preproc/cpu/fluid_tests_cpu.cpp b/inference-engine/tests_deprecated/fluid_preproc/cpu/fluid_tests_cpu.cpp index 954bff98cae665..1e21638df5bbf5 100644 --- a/inference-engine/tests_deprecated/fluid_preproc/cpu/fluid_tests_cpu.cpp +++ b/inference-engine/tests_deprecated/fluid_preproc/cpu/fluid_tests_cpu.cpp @@ -171,8 +171,8 @@ INSTANTIATE_TEST_CASE_P(I420toRGBTestFluid, I420toRGBTestGAPI, Values(0))); INSTANTIATE_TEST_CASE_P(ConvertDepthFluid, ConvertDepthTestGAPI, - Combine(Values(CV_16U, CV_32F), - Values(CV_32F, CV_16U), + Combine(Values(CV_16U, CV_32F, CV_8U), + Values(CV_32F, CV_16U, CV_8U), Values(cv::Size(3840, 2160), cv::Size(1920, 1080), cv::Size(1280, 720), From 4bd05c5364eb0b8202a9bf25f5dd1773d5b1e6e8 Mon Sep 17 00:00:00 2001 From: Vitaliy Urusovskij Date: Wed, 9 Sep 2020 15:30:23 +0300 Subject: [PATCH 50/66] Implement statistics collection: (#2056) 1. Add `-s` CLI key to get statistics file path 2. 
Implement `StatisticsWriter` singleton to manage handle to this file --- tests/time_tests/common/cli.h | 8 +++ tests/time_tests/common/main.cpp | 5 ++ tests/time_tests/common/statistics_writer.h | 55 +++++++++++++++++++++ tests/time_tests/common/timer.h | 5 +- 4 files changed, 71 insertions(+), 2 deletions(-) create mode 100644 tests/time_tests/common/statistics_writer.h diff --git a/tests/time_tests/common/cli.h b/tests/time_tests/common/cli.h index 7f19b6a51781a9..b21758d80ee612 100644 --- a/tests/time_tests/common/cli.h +++ b/tests/time_tests/common/cli.h @@ -21,6 +21,9 @@ static const char target_device_message[] = "Required. Specify a target device t "Use \"-d MULTI:\" format to specify MULTI plugin. " \ "The application looks for a suitable plugin for the specified device."; +/// @brief message for statistics path argument +static const char statistics_path_message[] = "Required. Path to a file to write statistics."; + /// @brief Define flag for showing help message
DEFINE_bool(h, false, help_message); @@ -35,6 +38,10 @@ DEFINE_string(m, "", model_message); /// It is a required parameter DEFINE_string(d, "", target_device_message); +/// @brief Define parameter for setting the path to a file to write statistics
+/// It is a required parameter +DEFINE_string(s, "", statistics_path_message); + /** * @brief This function show a help message */ @@ -46,4 +53,5 @@ static void showUsage() { std::cout << " -h, --help " << help_message << std::endl; std::cout << " -m \"\" " << model_message << std::endl; std::cout << " -d \"\" " << target_device_message << std::endl; + std::cout << " -s \"\" " << statistics_path_message << std::endl; } diff --git a/tests/time_tests/common/main.cpp b/tests/time_tests/common/main.cpp index e03866529a9268..164394f0909dfe 100644 --- a/tests/time_tests/common/main.cpp +++ b/tests/time_tests/common/main.cpp @@ -3,6 +3,7 @@ // #include "cli.h" +#include "statistics_writer.h" #include "../ftti_pipeline/ftti_pipeline.h" #include @@ -23,6 +24,9 @@ bool parseAndCheckCommandLine(int argc, char **argv) { if (FLAGS_d.empty()) throw std::logic_error("Device is required but not set. Please set -d option."); + if (FLAGS_s.empty()) + throw std::logic_error("Statistics file path is required but not set. Please set -s option."); + return true; } @@ -43,5 +47,6 @@ int main(int argc, char **argv) { if (!parseAndCheckCommandLine(argc, argv)) return -1; + StatisticsWriter::Instance().setFile(FLAGS_s); return _runPipeline(); } \ No newline at end of file diff --git a/tests/time_tests/common/statistics_writer.h b/tests/time_tests/common/statistics_writer.h new file mode 100644 index 00000000000000..f698ff16a1310d --- /dev/null +++ b/tests/time_tests/common/statistics_writer.h @@ -0,0 +1,55 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + + +/** + * @brief Class response for writing provided statistics + * + * Object of the class is writing provided statistics to a specified + * file in YAML format. 
+ */ +class StatisticsWriter { +private: + std::ofstream statistics_file; + + StatisticsWriter() = default; + StatisticsWriter(const StatisticsWriter&) = delete; + StatisticsWriter& operator=(const StatisticsWriter&) = delete; +public: + /** + * @brief Creates StatisticsWriter singleton object + */ + static StatisticsWriter& Instance(){ + static StatisticsWriter writer; + return writer; + } + + /** + * @brief Specifies, opens and validates statistics path for writing + */ + void setFile(const std::string &statistics_path) { + statistics_file.open(statistics_path); + if (!statistics_file.good()) { + std::stringstream err; + err << "Statistic file \"" << statistics_path << "\" can't be used for writing"; + throw std::runtime_error(err.str()); + } + } + + /** + * @brief Writes provided statistics in YAML format. + */ + void write(const std::pair &record) { + if (!statistics_file) + throw std::runtime_error("Statistic file path isn't set"); + statistics_file << record.first << ": " << record.second << "\n"; + } +}; diff --git a/tests/time_tests/common/timer.h b/tests/time_tests/common/timer.h index e92d944bf932f9..dc7a17abebf4de 100644 --- a/tests/time_tests/common/timer.h +++ b/tests/time_tests/common/timer.h @@ -5,11 +5,12 @@ #pragma once #include -#include #include #include #include +#include "statistics_writer.h" + using time_point = std::chrono::high_resolution_clock::time_point; /** @@ -38,7 +39,7 @@ class Timer { ~Timer(){ float duration = std::chrono::duration_cast( std::chrono::high_resolution_clock::now() - start_time).count(); - std::cout << name << ":" << duration << "\n"; // TODO: replace with writer + StatisticsWriter::Instance().write({name, duration}); } }; From dca30b4522f94e3f7fc6914db406f8f9ab3b25cc Mon Sep 17 00:00:00 2001 From: Vladimir Gavrilov Date: Wed, 9 Sep 2020 16:28:52 +0300 Subject: [PATCH 51/66] Extend MO for support of Interpolate-4 (#2026) * Commit. * Added opset4 version in the class Interpolate. 
* Added class ONNXResize11Op to read ONNX Resize with opset version >= 11. * Added support for Interpolate-4 into transformations TestInterpolateReshapeWA and InterpolateConcat. * Added support for Interpolate-4 into transformation InterpolateWithConcat. * Deleted redundant checks from the transformation UpsampleToResample. * Reverted last changes. * Changed ONNX Resize extractor to support Interpolate-4. * Added conversion of ONNXResize11Op into Interpolate-4. * Added support for Interpolate-4 into the transformation InterpolateSequenceToInterpolate. * Small fix for formatting. * Wrote tests for MO version of Interpolate-4 with shape_calculation_mode = sizes. * Wrote tests for infer function of Interpolate-4. * Now transformations InterpolateWithConcat, InterpolateConcat, InterpolateReshapeWA skip Interpolate-4. * Used create_op_with_const_inputs in the transformation InterpolateSequenceToInterpolate. * The transformation ONNXResize11ToInterpolate4 was rewritten using find_and_replace_pattern. * Now the dictionary infers (dictionary of infer functions of Interpolate) is a class static attribute. * Deleted unused variable. * Restored original logic of find_and_replace_pattern method of the class InterpolateReshapeWA. * Used create_op_with_const_inputs() in the transformation InterpolateSequenceToInterpolate for opset1 case. * Replaced resize_name with resize.soft_get('name', resize.id). * Small fixes. * Added two tests for Interpolate-4 infer function. * Fixed the transformation ONNXResize11ToInterpolateV4 for the case when ONNXResize11 operation has 3 inputs. * Added conversion of ONNXResize11 with tf_crop_and_resize_mode to ROIPooling + ONNXResize11. * Fixed bugs in the transformation ONNXResize11ToInterpolateV4 and in the infer function of the operation ONNXResize11. * Small changes. * Renamed transformation that converts ONNXResize11 into ROIPooling + ONNXResize11 and fixed BOM-file. * Fixed tests for the transformation InterpolateSequenceToInterpolate.
* Small change. * Now the transformation InterpolateSequenceToInterpolate preserves output layer name. * Deleted the transformation ONNXResize11ToTFCropAndResize. --- model-optimizer/automation/package_BOM.txt | 2 + .../extensions/back/InterpolateReshape.py | 14 +- .../back/InterpolateReshape_test.py | 5 +- .../extensions/front/interpolate_reshape.py | 9 +- .../front/interpolate_reshape_test.py | 3 +- .../extensions/front/onnx/resize_ext.py | 20 +- .../InterpolateSequenceToInterpolate.py | 196 +++- .../InterpolateSequenceToInterpolate_test.py | 841 +++++++++++++++++- .../middle/ONNXResize11ToInterpolateV4.py | 172 ++++ .../extensions/middle/UpsampleToResample.py | 2 - .../extensions/ops/ONNXResize11.py | 72 ++ model-optimizer/extensions/ops/interpolate.py | 141 ++- .../extensions/ops/interpolate_test.py | 281 ++++++ 13 files changed, 1645 insertions(+), 113 deletions(-) create mode 100644 model-optimizer/extensions/middle/ONNXResize11ToInterpolateV4.py create mode 100644 model-optimizer/extensions/ops/ONNXResize11.py create mode 100644 model-optimizer/extensions/ops/interpolate_test.py diff --git a/model-optimizer/automation/package_BOM.txt b/model-optimizer/automation/package_BOM.txt index df126dd35d99d6..9301c42d5fdd35 100644 --- a/model-optimizer/automation/package_BOM.txt +++ b/model-optimizer/automation/package_BOM.txt @@ -538,6 +538,7 @@ extensions/middle/MulFakeQuantizeFuse.py extensions/middle/MXNetRNNSequenceNormalize.py extensions/middle/MXNetSplitMultiLayers.py extensions/middle/MXTileReplacer.py +extensions/middle/ONNXResize11ToInterpolateV4.py extensions/middle/ONNXRNNSequenceNormalize.py extensions/middle/PartialInfer.py extensions/middle/pass_separator.py @@ -640,6 +641,7 @@ extensions/ops/non_zero.py extensions/ops/normalize.py extensions/ops/normalize_l2.py extensions/ops/one_hot.py +extensions/ops/ONNXResize11.py extensions/ops/pack.py extensions/ops/parameter.py extensions/ops/pnorm.py diff --git 
a/model-optimizer/extensions/back/InterpolateReshape.py b/model-optimizer/extensions/back/InterpolateReshape.py index 2a4daf753dfba4..fa7701eaef654c 100644 --- a/model-optimizer/extensions/back/InterpolateReshape.py +++ b/model-optimizer/extensions/back/InterpolateReshape.py @@ -17,6 +17,7 @@ from extensions.ops.elementwise import Mul from extensions.ops.gather import Gather +from extensions.ops.interpolate import Interpolate from mo.back.replacement import BackReplacementPattern from mo.front.caffe.extractors.utils import get_canonical_axis_index from mo.front.common.partial_infer.utils import int64_array @@ -67,7 +68,7 @@ def make_interpolate_reshapeable(interpolate, concat): output_shape = interpolate.out_port(0).data.get_shape() - interp_axes = [get_canonical_axis_index(output_shape, axis) for axis in interpolate.axes] + interp_axes = [get_canonical_axis_index(output_shape, axis) for axis in Interpolate.get_axes(interpolate)] concat_axis = get_canonical_axis_index(output_shape, concat.axis) if concat_axis in interp_axes: return @@ -82,12 +83,13 @@ def make_interpolate_reshapeable(interpolate, concat): shape = Shape(graph, {'name': src.node.soft_get('name', src.node.id) + '/Shape'}).create_node() shape.in_port(0).connect(src) - gather = create_op_with_const_inputs(graph, Gather, {1: np.array(interpolate.axes, dtype=np.int32), 2: int64_array(0)}, + gather = create_op_with_const_inputs(graph, Gather, + {1: np.array(interp_axes, dtype=np.int32), 2: int64_array(0)}, {'name': shape.name + '/Gathered'}, shape) interpolate.in_port(1).get_connection().set_source(gather.out_port(0)) def find_and_replace_pattern(self, graph: Graph): - for interpolate in graph.get_op_nodes(type='Interpolate'): + for interpolate in graph.get_op_nodes(type='Interpolate', version='opset1'): if interpolate.in_port(1).get_source().node.soft_get('type') != 'Const': continue dsts = interpolate.out_port(0).get_destinations() @@ -132,7 +134,7 @@ def run_after(self): @staticmethod def 
make_interpolate_reshapeable(interpolate): assert interpolate.soft_get('type') == 'Interpolate' - axes = interpolate.axes + axes = Interpolate.get_axes(interpolate) input_shape = interpolate.in_port(0).data.get_shape() output_shape = interpolate.out_port(0).data.get_shape() if not np.all(np.remainder(output_shape, input_shape) == 0) and \ @@ -149,6 +151,6 @@ def make_interpolate_reshapeable(interpolate): interpolate.in_port(1).get_connection().set_source(mul.out_port(0)) def find_and_replace_pattern(self, graph: Graph): - for interpolate in graph.get_op_nodes(type='Interpolate'): + for interpolate in graph.get_op_nodes(type='Interpolate', version='opset1'): if interpolate.in_port(1).get_source().node.soft_get('type') == 'Const': - self.make_interpolate_reshapeable(interpolate) + self.make_interpolate_reshapeable(interpolate) \ No newline at end of file diff --git a/model-optimizer/extensions/back/InterpolateReshape_test.py b/model-optimizer/extensions/back/InterpolateReshape_test.py index 46f8adf3dfb300..c685004f85c779 100644 --- a/model-optimizer/extensions/back/InterpolateReshape_test.py +++ b/model-optimizer/extensions/back/InterpolateReshape_test.py @@ -29,7 +29,7 @@ **valued_const_with_data('out_shape', np.array([60, 160])), **regular_op_with_shaped_data('interpolate', [1, 3, 60, 160], {'type': 'Interpolate', 'axes': [2, 3], - 'op': 'Interpolate'}), + 'op': 'Interpolate', 'version': 'opset1'}), **regular_op_with_shaped_data('shape', [4], {'type': 'ShapeOf', 'op': 'ShapeOf'}), **valued_const_with_data('indices', np.array([2, 3])), @@ -79,6 +79,7 @@ def test_interpolate_concat_reshape_graph_comparison(self): *connect('placeholder_1', '1:concat'), *connect('concat', 'output'), ], nodes_with_edges_only=True) + InterpolateConcat().find_and_replace_pattern(graph) graph.clean_up() graph_ref = build_graph(nodes, [ @@ -93,4 +94,4 @@ def test_interpolate_concat_reshape_graph_comparison(self): *connect('concat', 'output'), ], nodes_with_edges_only=True) (flag, resp) = 
compare_graphs(graph, graph_ref, 'output', check_op_attrs=True) - self.assertTrue(flag, resp) + self.assertTrue(flag, resp) \ No newline at end of file diff --git a/model-optimizer/extensions/front/interpolate_reshape.py b/model-optimizer/extensions/front/interpolate_reshape.py index 8e1e11da1df9ce..1d0201d0dcb7fb 100644 --- a/model-optimizer/extensions/front/interpolate_reshape.py +++ b/model-optimizer/extensions/front/interpolate_reshape.py @@ -16,6 +16,7 @@ import numpy as np from extensions.ops.gather import Gather +from extensions.ops.interpolate import Interpolate from mo.front.common.partial_infer.utils import int64_array from mo.front.common.replacement import FrontReplacementPattern from mo.front.tf.graph_utils import create_op_with_const_inputs @@ -141,9 +142,7 @@ def get_non_interpolate_concat_sources(self, concat: Node): def make_interpolate_reshape_able(self, interpolate: Node, concat: Node): assert interpolate.soft_get('type') == 'Interpolate' assert concat.soft_get('type') == 'Concat' - - interp_axes = interpolate.soft_get('axes', None) - interp_axes = interp_axes if interp_axes is None else int64_array(interp_axes) + interp_axes = Interpolate.get_axes(interpolate) concat_axis = self.get_concat_axis(concat) if concat_axis is None or interp_axes is None \ @@ -163,12 +162,12 @@ def make_interpolate_reshape_able(self, interpolate: Node, concat: Node): shape = Shape(graph, {'name': src.node.soft_get('name', src.node.id) + '/Shape'}).create_node() shape.in_port(0).connect(src) gather = create_op_with_const_inputs(graph, Gather, - {1: np.array(interpolate.axes, dtype=np.int32), 2: int64_array(0)}, + {1: np.array(interp_axes, dtype=np.int32), 2: int64_array(0)}, {'name': shape.name + '/Gathered'}, input_node=shape) interpolate.in_port(1).get_connection().set_source(gather.out_port(0)) def find_and_replace_pattern(self, graph: Graph): - for interpolate in graph.get_op_nodes(type='Interpolate'): + for interpolate in graph.get_op_nodes(type='Interpolate', 
version='opset1'): if interpolate.in_port(1).get_source().node.soft_get('type') != 'Const': continue diff --git a/model-optimizer/extensions/front/interpolate_reshape_test.py b/model-optimizer/extensions/front/interpolate_reshape_test.py index 7fb0b42e7c14ce..cb3dabb438d190 100644 --- a/model-optimizer/extensions/front/interpolate_reshape_test.py +++ b/model-optimizer/extensions/front/interpolate_reshape_test.py @@ -29,7 +29,8 @@ **valued_const_with_data('out_shape', np.array([60, 160])), **regular_op_with_shaped_data('interpolate', [1, 3, 60, 160], - {'type': 'Interpolate', 'axes': int64_array([2, 3]), 'op': 'Interpolate'}), + {'type': 'Interpolate', 'axes': int64_array([2, 3]), 'op': 'Interpolate', + 'version': 'opset1'}), **regular_op_with_shaped_data('identity_00', [1, 3, 60, 160], {'identity': True, 'op': 'Identity'}), **regular_op_with_shaped_data('identity_01', [1, 3, 60, 160], {'identity': True, 'op': 'Identity'}), diff --git a/model-optimizer/extensions/front/onnx/resize_ext.py b/model-optimizer/extensions/front/onnx/resize_ext.py index c8aefdbd6f7a24..37ff807a3f45ad 100644 --- a/model-optimizer/extensions/front/onnx/resize_ext.py +++ b/model-optimizer/extensions/front/onnx/resize_ext.py @@ -15,10 +15,10 @@ """ from extensions.ops.upsample import UpsampleOp +from extensions.ops.ONNXResize11 import ONNXResize11Op from mo.front.extractor import FrontExtractorOp from mo.front.onnx.extractors.utils import onnx_attr, get_onnx_opset_version from mo.graph.graph import Node -from mo.utils.error import Error class ResizeExtractor(FrontExtractorOp): @@ -29,7 +29,19 @@ class ResizeExtractor(FrontExtractorOp): def extract(cls, node: Node): onnx_opset_version = get_onnx_opset_version(node) if onnx_opset_version is not None and onnx_opset_version >= 11: - raise Error("ONNX Resize operation from opset {} is not supported.".format(onnx_opset_version)) - mode = onnx_attr(node, 'mode', 's', default=b'nearest').decode() - UpsampleOp.update_node_stat(node, {'mode': mode}) + 
mode = onnx_attr(node, 'mode', 's', default=b'nearest').decode() + transformation_mode = onnx_attr(node, + 'coordinate_transformation_mode', + 's', + default=b'half_pixel').decode() + nearest_mode = onnx_attr(node, 'nearest_mode', 's', default=b'round_prefer_floor').decode() + cubic_coeff_a = onnx_attr(node, 'cubic_coeff_a', 'f', default=-0.75) + attrs = { + 'mode': mode, 'coordinate_transformation_mode': transformation_mode, + 'nearest_mode': nearest_mode, 'cube_coeff': cubic_coeff_a + } + ONNXResize11Op.update_node_stat(node, attrs) + else: + mode = onnx_attr(node, 'mode', 's', default=b'nearest').decode() + UpsampleOp.update_node_stat(node, {'mode': mode}) return cls.enabled diff --git a/model-optimizer/extensions/middle/InterpolateSequenceToInterpolate.py b/model-optimizer/extensions/middle/InterpolateSequenceToInterpolate.py index ef83275cf69eea..4494ea800ba9bb 100644 --- a/model-optimizer/extensions/middle/InterpolateSequenceToInterpolate.py +++ b/model-optimizer/extensions/middle/InterpolateSequenceToInterpolate.py @@ -15,13 +15,15 @@ """ import logging as log +import numpy as np from typing import List from extensions.ops.interpolate import Interpolate from mo.front.common.partial_infer.utils import int64_array -from mo.graph.graph import Graph, Node +from mo.front.tf.graph_utils import create_op_with_const_inputs +from mo.graph.graph import Graph, Node, rename_nodes from mo.middle.replacement import MiddleReplacementPattern -from mo.ops.const import Const +from mo.utils.error import Error def node_has_one_consumer(node: Node) -> bool: @@ -50,6 +52,76 @@ def __init__(self): # We need to accumulate set of axes of compared nodes, because there can be a sequence of a set of axes # {i}{j}{i} self.accumulated_axes = set() + self.default_values_for_opset4 = { + 'mode': None, + 'shape_calculation_mode': None, + 'coordinate_transformation_mode': 'half_pixel', + 'nearest_mode': 'round_prefer_floor', + 'antialias': 0, + 'cube_coeff': -0.75 + } + self.default_pads = 
int64_array([0]) + + def _compare_attributes_of_interpolate1(self, first: Node, second: Node) -> bool: + """ + This function checks whether attributes of Interpolate-1 nodes first and second are identical + (except attribute 'axes'). + :param first: the first of compared nodes + :param second: the second of compared nodes + :return: True, if attributes of nodes are identical and False otherwise + """ + # If some of attributes 'mode', 'align_corners', 'antialias', 'pads_begin', 'pads_end' are different, + # then attributes of nodes are not identical. + op = Interpolate(graph=first.graph, attrs={}) + for attr in ['mode', 'align_corners', 'antialias', 'pads_begin', 'pads_end']: + if first.soft_get(attr, default=op.attrs[attr]) != second.soft_get(attr, default=op.attrs[attr]): + return False + return True + + def _compare_attributes_of_interpolate4(self, first: Node, second: Node) -> bool: + """ + This function checks whether attributes of Interpolate-4 nodes first and second are identical. + :param first: the first of compared nodes + :param second: the second of compared nodes + :return: True, if attributes of nodes are identical and False otherwise + """ + # If some of attributes 'mode', 'coordinate_transformation_mode', 'nearest_mode', 'antialias', 'cube_coeff' + # are different, then attributes of first and second are not identical. + for attr in self.default_values_for_opset4.keys(): + default_value = self.default_values_for_opset4[attr] + if first.soft_get(attr, default=default_value) != second.soft_get(attr, default=default_value): + return False + + # If attributes 'pads_begin' or 'pads_end' of nodes first and second are different, then attributes + # of first and second are not identical. 
+ for attr in ['pads_begin', 'pads_end']: + if not np.array_equal(first.soft_get(attr, default=self.default_pads), + second.soft_get(attr, default=self.default_pads)): + return False + return True + + def _compare_attributes(self, first: Node, second: Node) -> bool: + """ + This function checks whether attributes of nodes first and second are identical (except attribute 'axes'). + :param first: the first of compared nodes + :param second: the second of compared nodes + :return: True, if attributes of nodes are identical and False otherwise + """ + # If opsets of nodes are different, then nodes have different attributes. + fst_opset = first.get_opset() + snd_opset = second.get_opset() + if fst_opset != snd_opset: + return False + + if fst_opset not in ['opset1', 'opset4']: + fst_name = first.soft_get('name', first.id) + snd_name = second.soft_get('name', second.id) + raise Error('Unsupported opset {} for nodes with names {} and {}'.format(fst_opset, fst_name, snd_name)) + + if fst_opset == 'opset1': + return self._compare_attributes_of_interpolate1(first, second) + else: + return self._compare_attributes_of_interpolate4(first, second) def __call__(self, first: Node, second: Node) -> bool: """ @@ -58,15 +130,11 @@ def __call__(self, first: Node, second: Node) -> bool: :param second: the second of fused nodes :return: True, if nodes can be fused, and False otherwise """ - # If some of attributes 'mode', 'align_corners', 'antialias', 'pads_begin', 'pads_end' are different, - # then nodes cannot be fused, because fused result will be incorrect. 
- op = Interpolate(graph=first.graph, attrs={}) - for attr in ['mode', 'align_corners', 'antialias', 'pads_begin', 'pads_end']: - if first.soft_get(attr, default=op.attrs[attr]) != second.soft_get(attr, default=op.attrs[attr]): - return False + if not self._compare_attributes(first, second): + return False - fst_axes = set([a for a in first.axes]) - snd_axes = set([a for a in second.axes]) + fst_axes = set([a for a in Interpolate.get_axes(first)]) + snd_axes = set([a for a in Interpolate.get_axes(second)]) self.accumulated_axes = self.accumulated_axes | fst_axes @@ -108,6 +176,40 @@ def collect_sequences(xs: List[Node]) -> List[List[Node]]: return result +def get_interpolate_attributes(node: Node) -> dict: + opset_to_default_values = { + 'opset1': { + 'mode': None, + 'align_corners': 0, + 'antialias': 0, + 'pads_begin': 0, + 'pads_end': 0, + 'version': 'opset1' + }, + 'opset4': { + 'mode': None, + 'shape_calculation_mode': None, + 'antialias': 0, + 'pads_begin': int64_array([0]), + 'pads_end': int64_array([0]), + 'coordinate_transformation_mode': 'half_pixel', + 'nearest_mode': 'round_prefer_floor', + 'cube_coeff': -0.75, + 'version': 'opset4' + }, + } + opset = node.get_opset() + result = {} + if opset in opset_to_default_values: + default_values = opset_to_default_values[opset] + for attr in default_values.keys(): + value = node.soft_get(attr, default=default_values[attr]) + result[attr] = value + return result + else: + raise Error('Unsupported opset {} for node with name {}.'.format(opset, node.soft_get('name', node.id))) + + def replace_sequence(seq: List[Node], graph: Graph): """ This function replaces a sequence of consecutive Interpolate layers with one Interpolate layer, @@ -127,37 +229,55 @@ def replace_sequence(seq: List[Node], graph: Graph): dims_and_scales_ = [] # Each element of the list dims_and_scales_ is a pair - # (axis, output size for this axis) - for interp in seq: - dims_and_scales_.extend(zip(interp.axes, 
interp.in_port(1).get_connection().get_source().node.value)) - - axis_to_size = sorted(list(dict(dims_and_scales_).items()), key=lambda x: x[0]) - axes_of_node = int64_array([z[0] for z in axis_to_size]) - sizes = int64_array([z[1] for z in axis_to_size]) + # (axis, output size for this axis) (opset1) + # or + # (axis, output size for this axis, output scales for this axis) (opset4) + if seq[0].get_opset() == 'opset1': + for interp in seq: + dims_and_scales_.extend(zip(Interpolate.get_axes(interp), + interp.in_port(1).get_connection().get_source().data.get_value())) + + axis_to_size = sorted(list(dict(dims_and_scales_).items()), key=lambda x: x[0]) + axes_of_node = int64_array([z[0] for z in axis_to_size]) + sizes = int64_array([z[1] for z in axis_to_size]) + scales = np.ones(len(axis_to_size)) + else: + for interp in seq: + dims_and_scales_.extend(zip(Interpolate.get_axes(interp), + interp.in_port(1).get_connection().get_source().data.get_value(), + interp.in_port(2).get_connection().get_source().data.get_value())) + + axis_to_size = sorted(dims_and_scales_, key=lambda x: x[0]) + axes_of_node = int64_array([z[0] for z in axis_to_size]) + sizes = int64_array([z[1] for z in axis_to_size]) + scales = np.array([z[2] for z in axis_to_size]) fst_interp_node = seq[0] last_interp_node = seq[-1] - fst_interp_node_name = fst_interp_node.name - fst_interp_node_mode = fst_interp_node.mode - fst_interp_node_align_corners = fst_interp_node.soft_get('align_corners', default=0) - fst_interp_node_antialias = fst_interp_node.soft_get('antialias', default=0) - fst_interp_node_pads_begin = fst_interp_node.soft_get('pads_begin', default=0) - fst_interp_node_pads_end = fst_interp_node.soft_get('pads_end', default=0) - interp_node = Interpolate(graph, dict(name=fst_interp_node_name + '/Interpolate_', - axes=axes_of_node, - mode=fst_interp_node_mode, - align_corners=fst_interp_node_align_corners, - antialias=fst_interp_node_antialias, - pads_begin=fst_interp_node_pads_begin, - 
pads_end=fst_interp_node_pads_end)).create_node() - - scales_node = Const(graph, dict(name=fst_interp_node_name + '/scales_', value=sizes)).create_node() - scales_node.out_port(0).connect(interp_node.in_port(1)) - - fst_interp_connection = fst_interp_node.in_port(0).get_connection() - fst_interp_connection.set_destination(interp_node.in_port(0)) - - last_interp_node.out_port(0).get_connection().set_source(interp_node.out_port(0)) + last_interp_node_name = last_interp_node.soft_get('name', last_interp_node.id) + attributes = get_interpolate_attributes(fst_interp_node) + + opset = fst_interp_node.get_opset() + if opset == 'opset1': + attributes['axes'] = axes_of_node + interp_node = create_op_with_const_inputs(graph, Interpolate, {1: sizes}, attributes) + + fst_interp_connection = fst_interp_node.in_port(0).get_connection() + fst_interp_connection.set_destination(interp_node.in_port(0)) + + last_interp_node.out_port(0).get_connection().set_source(interp_node.out_port(0)) + else: + attributes['in_ports_count'] = 4 + interp_node = create_op_with_const_inputs(graph, Interpolate, + {1: sizes, 2: scales, 3: axes_of_node}, + attributes) + + fst_interp_connection = fst_interp_node.in_port(0).get_connection() + fst_interp_connection.set_destination(interp_node.in_port(0)) + + last_interp_node.out_port(0).get_connection().set_source(interp_node.out_port(0)) + + rename_nodes([(last_interp_node, last_interp_node_name + '/delete_'), (interp_node, last_interp_node_name)]) class InterpolateSequenceToInterpolate(MiddleReplacementPattern): diff --git a/model-optimizer/extensions/middle/InterpolateSequenceToInterpolate_test.py b/model-optimizer/extensions/middle/InterpolateSequenceToInterpolate_test.py index 6e6390cfa37298..197d84444cb7d4 100644 --- a/model-optimizer/extensions/middle/InterpolateSequenceToInterpolate_test.py +++ b/model-optimizer/extensions/middle/InterpolateSequenceToInterpolate_test.py @@ -15,6 +15,7 @@ """ +import numpy as np import unittest from 
extensions.middle.InterpolateSequenceToInterpolate import InterpolateSequenceToInterpolate @@ -22,6 +23,206 @@ from mo.utils.ir_engine.compare_graphs import compare_graphs from mo.utils.unittest.graph import build_graph +graph_node_attrs_for_2d_case_1_opset4_case = { + 'placeholder': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'}, + 'placeholder_data': { + 'value': None, + 'shape': int64_array([1, 4, 220, 350]), + 'kind': 'data', + 'data_type': None + }, + 'size_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([660]) + }, + 'size_1_data': {'value': int64_array([660]), 'shape': [1], 'kind': 'data'}, + 'scale_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': np.array([3.0]) + }, + 'scale_1_data': {'value': np.array([3.0]), 'shape': [1], 'kind': 'data'}, + 'axes_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([2]) + }, + 'axes_1_data': {'value': int64_array([2]), 'shape': [1], 'kind': 'data'}, + 'interpolate_1': { + 'type': 'Interpolate', + 'kind': 'op', + 'op': 'Interpolate', + 'mode': 'nearest', + 'shape_calculation_mode': 'scales', + 'version': 'opset4' + }, + 'interpolate_1_data': {'value': None, 'shape': int64_array([1, 4, 660, 350]), 'kind': 'data'}, + 'size_2': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([700]) + }, + 'size_2_data': {'value': int64_array([700]), 'shape': [1], 'kind': 'data'}, + 'scale_2': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': np.array([2.0]) + }, + 'scale_2_data': {'value': np.array([2.0]), 'shape': [1], 'kind': 'data'}, + 'axes_2': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([3]) + }, + 'axes_2_data': {'value': int64_array([3]), 'shape': [1], 'kind': 'data'}, + 'interpolate_2': { + 'type': 'Interpolate', + 'kind': 'op', + 'op': 'Interpolate', + 'mode': 'nearest', + 'shape_calculation_mode': 'scales', + 'version': 'opset4' + }, + 'interpolate_2_data': {'value': None, 'shape': int64_array([1, 4, 
660, 700]), 'kind': 'data'}, + 'size_3': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([1320]) + }, + 'size_3_data': {'value': int64_array([1320]), 'shape': [1], 'kind': 'data'}, + 'scale_3': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': np.array([2.0]) + }, + 'scale_3_data': {'value': np.array([2.0]), 'shape': [1], 'kind': 'data'}, + 'axes_3': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([2]) + }, + 'axes_3_data': {'value': int64_array([2]), 'shape': [1], 'kind': 'data'}, + 'interpolate_3': { + 'type': 'Interpolate', + 'kind': 'op', + 'op': 'Interpolate', + 'mode': 'nearest', + 'shape_calculation_mode': 'scales', + 'version': 'opset4' + }, + 'interpolate_3_data': {'value': None, 'shape': int64_array([1, 4, 1320, 700]), 'kind': 'data'}, + 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, + 'abs_data': {'value': None, 'shape': int64_array([1, 4, 1320, 700]), 'kind': 'data'}, + 'output': {'kind': 'op', 'op': 'Result'}, +} + +edges_for_2d_case_1_opset4_case = [ + ('placeholder', 'placeholder_data'), + + ('placeholder_data', 'interpolate_1', {'in': 0}), + ('size_1', 'size_1_data'), + ('scale_1', 'scale_1_data'), + ('axes_1', 'axes_1_data'), + ('size_1_data', 'interpolate_1', {'in': 1}), + ('scale_1_data', 'interpolate_1', {'in': 2}), + ('axes_1_data', 'interpolate_1', {'in': 3}), + ('interpolate_1', 'interpolate_1_data'), + + ('interpolate_1_data', 'interpolate_2', {'in': 0}), + ('size_2', 'size_2_data'), + ('scale_2', 'scale_2_data'), + ('axes_2', 'axes_2_data'), + ('size_2_data', 'interpolate_2', {'in': 1}), + ('scale_2_data', 'interpolate_2', {'in': 2}), + ('axes_2_data', 'interpolate_2', {'in': 3}), + ('interpolate_2', 'interpolate_2_data'), + + ('interpolate_2_data', 'interpolate_3', {'in': 0}), + ('size_3', 'size_3_data'), + ('scale_3', 'scale_3_data'), + ('axes_3', 'axes_3_data'), + ('size_3_data', 'interpolate_3', {'in': 1}), + ('scale_3_data', 'interpolate_3', {'in': 2}), + ('axes_3_data', 
'interpolate_3', {'in': 3}), + ('interpolate_3', 'interpolate_3_data'), + + ('interpolate_3_data', 'abs'), + ('abs', 'abs_data'), + ('abs_data', 'output'), +] + + +ref_graph_node_attrs_for_2d_case_1_opset4_case = { + 'placeholder': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'}, + 'placeholder_data': { + 'value': None, + 'shape': int64_array([1, 4, 220, 350]), + 'kind': 'data', + 'data_type': None + }, + 'size_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([660, 700]) + }, + 'size_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'scale_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': np.array([3.0, 2.0]) + }, + 'scale_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'axes_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([2, 3]) + }, + 'axes_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'interpolate_1': { + 'type': 'Interpolate', + 'kind': 'op', + 'op': 'Interpolate', + 'mode': 'nearest', + 'shape_calculation_mode': 'scales', + 'antialias': 0, + 'pads_begin': int64_array([0]), + 'pads_end': int64_array([0]), + 'coordinate_transformation_mode': 'half_pixel', + 'nearest_mode': 'round_prefer_floor', + 'cube_coeff': -0.75, + 'version': 'opset4' + }, + 'interpolate_1_data': {'value': None, 'shape': int64_array([1, 4, 660, 700]), 'kind': 'data'}, + 'size_3': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([1320]) + }, + 'size_3_data': {'value': None, 'shape': [1], 'kind': 'data'}, + 'scale_3': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': np.array([2.0]) + }, + 'scale_3_data': {'value': None, 'shape': [1], 'kind': 'data'}, + 'axes_3': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([2]) + }, + 'axes_3_data': {'value': int64_array([2]), 'shape': [1], 'kind': 'data'}, + 'interpolate_3': { + 'type': 'Interpolate', + 'kind': 'op', + 'op': 'Interpolate', + 'mode': 'nearest', + 'shape_calculation_mode': 'scales', 
+ 'version': 'opset4' + }, + 'interpolate_3_data': {'value': None, 'shape': int64_array([1, 4, 1320, 700]), 'kind': 'data'}, + 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, + 'abs_data': {'value': None, 'shape': int64_array([1, 4, 1320, 700]), 'kind': 'data'}, + 'output': {'kind': 'op', 'op': 'Result'}, +} + +ref_edges_for_2d_case_1_opset4_case = [ + ('placeholder', 'placeholder_data'), + + ('placeholder_data', 'interpolate_1', {'in': 0}), + ('size_1', 'size_1_data'), + ('scale_1', 'scale_1_data'), + ('axes_1', 'axes_1_data'), + ('size_1_data', 'interpolate_1', {'in': 1}), + ('scale_1_data', 'interpolate_1', {'in': 2}), + ('axes_1_data', 'interpolate_1', {'in': 3}), + ('interpolate_1', 'interpolate_1_data'), + + ('interpolate_1_data', 'interpolate_3', {'in': 0}), + ('size_3', 'size_3_data'), + ('scale_3', 'scale_3_data'), + ('axes_3', 'axes_3_data'), + ('size_3_data', 'interpolate_3', {'in': 1}), + ('scale_3_data', 'interpolate_3', {'in': 2}), + ('axes_3_data', 'interpolate_3', {'in': 3}), + ('interpolate_3', 'interpolate_3_data'), + + ('interpolate_3_data', 'abs'), + ('abs', 'abs_data'), + ('abs_data', 'output'), +] + + graph_node_attrs_for_2d_case_1 = { 'placeholder': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'}, 'placeholder_data': { @@ -33,37 +234,40 @@ 'scale_1': { 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([660]) }, - 'scale_1_data': {'value': None, 'shape': [1], 'kind': 'data'}, + 'scale_1_data': {'value': int64_array([660]), 'shape': [1], 'kind': 'data'}, 'interpolate_1': { 'type': 'Interpolate', 'kind': 'op', 'op': 'Interpolate', 'axes': int64_array([2]), - 'mode': 'nearest' + 'mode': 'nearest', + 'version': 'opset1' }, 'interpolate_1_data': {'value': None, 'shape': int64_array([1, 4, 660, 350]), 'kind': 'data'}, 'scale_2': { 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([700]) }, - 'scale_2_data': {'value': None, 'shape': [1], 'kind': 'data'}, + 'scale_2_data': {'value': int64_array([700]), 
'shape': [1], 'kind': 'data'}, 'interpolate_2': { 'type': 'Interpolate', 'kind': 'op', 'op': 'Interpolate', 'axes': int64_array([3]), - 'mode': 'nearest' + 'mode': 'nearest', + 'version': 'opset1' }, 'interpolate_2_data': {'value': None, 'shape': int64_array([1, 4, 660, 700]), 'kind': 'data'}, 'scale_3': { 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([1320]) }, - 'scale_3_data': {'value': None, 'shape': [1], 'kind': 'data'}, + 'scale_3_data': {'value': int64_array([1320]), 'shape': [1], 'kind': 'data'}, 'interpolate_3': { 'type': 'Interpolate', 'kind': 'op', 'op': 'Interpolate', 'axes': int64_array([2]), - 'mode': 'nearest' + 'mode': 'nearest', + 'version': 'opset1' }, 'interpolate_3_data': {'value': None, 'shape': int64_array([1, 4, 1320, 700]), 'kind': 'data'}, 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, @@ -112,7 +316,8 @@ 'kind': 'op', 'op': 'Interpolate', 'axes': int64_array([2]), - 'mode': 'nearest' + 'mode': 'nearest', + 'version': 'opset1' }, 'interpolate_1_data': {'value': None, 'shape': int64_array([1, 4, 660, 350]), 'kind': 'data'}, 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, @@ -151,7 +356,8 @@ 'kind': 'op', 'op': 'Interpolate', 'axes': int64_array([2]), - 'mode': 'nearest' + 'mode': 'nearest', + 'version': 'opset1' }, 'interpolate_1_data': {'value': None, 'shape': int64_array([1, 4, 660, 350]), 'kind': 'data'}, 'scale_2': { @@ -163,7 +369,8 @@ 'kind': 'op', 'op': 'Interpolate', 'axes': int64_array([3]), - 'mode': 'linear' + 'mode': 'linear', + 'version': 'opset1' }, 'interpolate_2_data': {'value': None, 'shape': int64_array([1, 4, 660, 700]), 'kind': 'data'}, 'scale_3': { @@ -175,7 +382,8 @@ 'kind': 'op', 'op': 'Interpolate', 'axes': int64_array([2]), - 'mode': 'cubic' + 'mode': 'cubic', + 'version': 'opset1' }, 'interpolate_3_data': {'value': None, 'shape': int64_array([1, 4, 1320, 700]), 'kind': 'data'}, 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, @@ -186,7 +394,153 @@ edges_for_2d_case_3 = 
edges_for_2d_case_1 -graph_node_attrs_for_2d_case_4 = { +new_graph_node_attrs_for_2d_case_4_opset4_case = { + 'placeholder': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'}, + 'placeholder_data': { + 'value': None, + 'shape': int64_array([1, 4, 220, 350]), + 'kind': 'data', + 'data_type': None + }, + 'size_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([2200]) + }, + 'size_1_data': {'value': int64_array([2200]), 'shape': [1], 'kind': 'data'}, + 'scale_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': np.array([10.0]) + }, + 'scale_1_data': {'value': np.array([10.0]), 'shape': [1], 'kind': 'data'}, + 'axes_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([2]) + }, + 'axes_1_data': {'value': int64_array([2]), 'shape': [1], 'kind': 'data'}, + 'interpolate_1': { + 'type': 'Interpolate', + 'kind': 'op', + 'op': 'Interpolate', + 'mode': 'linear', + 'coordinate_transformation_mode': 'asymmetric', + 'nearest_mode': 'simple', + 'cube_coeff': -0.4, + 'antialias': 1, + 'shape_calculation_mode': 'scales', + 'version': 'opset4' + }, + 'interpolate_1_data': {'value': None, 'shape': int64_array([1, 4, 2200, 350]), 'kind': 'data'}, + 'size_2': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([700]) + }, + 'size_2_data': {'value': int64_array([700]), 'shape': [1], 'kind': 'data'}, + 'scale_2': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': np.array([2.0]) + }, + 'scale_2_data': {'value': np.array([2.0]), 'shape': [1], 'kind': 'data'}, + 'axes_2': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([3]) + }, + 'axes_2_data': {'value': int64_array([3]), 'shape': [1], 'kind': 'data'}, + 'interpolate_2': { + 'type': 'Interpolate', + 'kind': 'op', + 'op': 'Interpolate', + 'mode': 'linear', + 'coordinate_transformation_mode': 'asymmetric', + 'nearest_mode': 'simple', + 'cube_coeff': -0.4, + 'antialias': 1, + 'shape_calculation_mode': 'scales', + 'version': 
'opset4' + }, + 'interpolate_2_data': {'value': None, 'shape': int64_array([1, 4, 2200, 700]), 'kind': 'data'}, + 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, + 'abs_data': {'value': None, 'shape': int64_array([1, 4, 2200, 700]), 'kind': 'data'}, + 'output': {'kind': 'op', 'op': 'Result'}, +} + +new_edges_for_2d_case_4_opset4_case = [ + ('placeholder', 'placeholder_data'), + + ('placeholder_data', 'interpolate_1', {'in': 0}), + ('size_1', 'size_1_data'), + ('size_1_data', 'interpolate_1', {'in': 1}), + ('scale_1', 'scale_1_data'), + ('scale_1_data', 'interpolate_1', {'in': 2}), + ('axes_1', 'axes_1_data'), + ('axes_1_data', 'interpolate_1', {'in': 3}), + ('interpolate_1', 'interpolate_1_data'), + + ('interpolate_1_data', 'interpolate_2', {'in': 0}), + ('size_2', 'size_2_data'), + ('size_2_data', 'interpolate_2', {'in': 1}), + ('scale_2', 'scale_2_data'), + ('scale_2_data', 'interpolate_2', {'in': 2}), + ('axes_2', 'axes_2_data'), + ('axes_2_data', 'interpolate_2', {'in': 3}), + ('interpolate_2', 'interpolate_2_data'), + + ('interpolate_2_data', 'abs'), + ('abs', 'abs_data'), + ('abs_data', 'output'), +] + + +new_ref_graph_node_attrs_for_2d_case_4_opset4_case = { + 'placeholder': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'}, + 'placeholder_data': { + 'value': None, + 'shape': int64_array([1, 4, 220, 350]), + 'kind': 'data', + 'data_type': None + }, + 'size_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([2200, 700]) + }, + 'size_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'scale_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': np.array([10.0, 2.0]) + }, + 'scale_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'axes_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([2, 3]) + }, + 'axes_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'interpolate_1': { + 'type': 'Interpolate', + 'kind': 'op', + 'op': 'Interpolate', + 'mode': 'linear', + 
'coordinate_transformation_mode': 'asymmetric', + 'nearest_mode': 'simple', + 'cube_coeff': -0.4, + 'antialias': 1, + 'shape_calculation_mode': 'scales', + 'version': 'opset4' + }, + 'interpolate_1_data': {'value': None, 'shape': int64_array([1, 4, 2200, 700]), 'kind': 'data'}, + 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, + 'abs_data': {'value': None, 'shape': int64_array([1, 4, 2200, 700]), 'kind': 'data'}, + 'output': {'kind': 'op', 'op': 'Result'}, +} + +new_ref_edges_for_2d_case_4_opset4_case = [ + ('placeholder', 'placeholder_data'), + + ('placeholder_data', 'interpolate_1', {'in': 0}), + ('size_1', 'size_1_data'), + ('size_1_data', 'interpolate_1', {'in': 1}), + ('scale_1', 'scale_1_data'), + ('scale_1_data', 'interpolate_1', {'in': 2}), + ('axes_1', 'axes_1_data'), + ('axes_1_data', 'interpolate_1', {'in': 3}), + ('interpolate_1', 'interpolate_1_data'), + + ('interpolate_1_data', 'abs'), + ('abs', 'abs_data'), + ('abs_data', 'output'), +] + + +graph_node_attrs_for_2d_case_4_opset4_case = { 'placeholder': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'}, 'placeholder_data': { 'value': None, @@ -198,6 +552,82 @@ 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([2200]) }, 'scale_1_data': {'value': None, 'shape': [1], 'kind': 'data'}, + 'axes_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([2]) + }, + 'axes_1_data': {'value': int64_array([2]), 'shape': [1], 'kind': 'data'}, + 'interpolate_1': { + 'type': 'Interpolate', + 'kind': 'op', + 'op': 'Interpolate', + 'mode': 'linear', + 'coordinate_transformation_mode': 'asymmetric', + 'nearest_mode': 'simple', + 'cube_coeff': -0.4, + 'antialias': 1, + 'version': 'opset4' + }, + 'interpolate_1_data': {'value': None, 'shape': int64_array([1, 4, 2200, 350]), 'kind': 'data'}, + 'scale_2': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([700]) + }, + 'scale_2_data': {'value': None, 'shape': [1], 'kind': 'data'}, + 'axes_2': { + 'kind': 'op', 
'op': 'Const', 'type': 'Const', 'value': int64_array([3]) + }, + 'axes_2_data': {'value': int64_array([3]), 'shape': [1], 'kind': 'data'}, + 'interpolate_2': { + 'type': 'Interpolate', + 'kind': 'op', + 'op': 'Interpolate', + 'mode': 'linear', + 'coordinate_transformation_mode': 'asymmetric', + 'nearest_mode': 'simple', + 'cube_coeff': -0.4, + 'antialias': 1, + 'version': 'opset4' + }, + 'interpolate_2_data': {'value': None, 'shape': int64_array([1, 4, 2200, 700]), 'kind': 'data'}, + 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, + 'abs_data': {'value': None, 'shape': int64_array([1, 4, 2200, 700]), 'kind': 'data'}, + 'output': {'kind': 'op', 'op': 'Result'}, +} + +edges_for_2d_case_4_opset4_case = [ + ('placeholder', 'placeholder_data'), + + ('placeholder_data', 'interpolate_1', {'in': 0}), + ('scale_1', 'scale_1_data'), + ('scale_1_data', 'interpolate_1', {'in': 1}), + ('axes_1', 'axes_1_data'), + ('axes_1_data', 'interpolate_1', {'in': 2}), + ('interpolate_1', 'interpolate_1_data'), + + ('interpolate_1_data', 'interpolate_2', {'in': 0}), + ('scale_2', 'scale_2_data'), + ('scale_2_data', 'interpolate_2', {'in': 1}), + ('axes_2', 'axes_2_data'), + ('axes_2_data', 'interpolate_2', {'in': 2}), + ('interpolate_2', 'interpolate_2_data'), + + ('interpolate_2_data', 'abs'), + ('abs', 'abs_data'), + ('abs_data', 'output'), +] + + +graph_node_attrs_for_2d_case_4 = { + 'placeholder': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'}, + 'placeholder_data': { + 'value': None, + 'shape': int64_array([1, 4, 220, 350]), + 'kind': 'data', + 'data_type': None + }, + 'scale_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([2200]) + }, + 'scale_1_data': {'value': int64_array([2200]), 'shape': [1], 'kind': 'data'}, 'interpolate_1': { 'type': 'Interpolate', 'kind': 'op', @@ -207,13 +637,14 @@ 'align_corners': 0, 'antialias': 1, 'pads_begin': 5, - 'pads_end': 3 + 'pads_end': 3, + 'version': 'opset1' }, 'interpolate_1_data': {'value': None, 'shape': 
int64_array([1, 4, 2200, 350]), 'kind': 'data'}, 'scale_2': { 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([700]) }, - 'scale_2_data': {'value': None, 'shape': [1], 'kind': 'data'}, + 'scale_2_data': {'value': int64_array([700]), 'shape': [1], 'kind': 'data'}, 'interpolate_2': { 'type': 'Interpolate', 'kind': 'op', @@ -223,7 +654,8 @@ 'align_corners': 0, 'antialias': 1, 'pads_begin': 5, - 'pads_end': 3 + 'pads_end': 3, + 'version': 'opset1' }, 'interpolate_2_data': {'value': None, 'shape': int64_array([1, 4, 2200, 700]), 'kind': 'data'}, 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, @@ -271,7 +703,8 @@ 'align_corners': 0, 'antialias': 1, 'pads_begin': 5, - 'pads_end': 3 + 'pads_end': 3, + 'version': 'opset1' }, 'interpolate_1_data': {'value': None, 'shape': int64_array([1, 4, 220, 350]), 'kind': 'data'}, 'scale_2': { @@ -287,7 +720,8 @@ 'align_corners': 0, 'antialias': 1, 'pads_begin': 5, - 'pads_end': 3 + 'pads_end': 3, + 'version': 'opset1' }, 'interpolate_2_data': {'value': None, 'shape': int64_array([1, 4, 220, 350]), 'kind': 'data'}, 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, @@ -298,6 +732,144 @@ edges_for_2d_case_6 = edges_for_2d_case_4 +new_ref_graph_node_attrs_for_3d_case_1_opset4_case = { + 'placeholder': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'}, + 'placeholder_data': { + 'value': None, + 'shape': int64_array([1, 5, 1024, 256, 800]), + 'kind': 'data', + 'data_type': None + }, + 'size_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([4096, 1280, 2400]) + }, + 'size_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'scale_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': np.array([4.0, 5.0, 3.0]) + }, + 'scale_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'axes_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([2, 3, 4]) + }, + 'axes_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'interpolate_1': { + 'type': 
'Interpolate', + 'kind': 'op', + 'op': 'Interpolate', + 'mode': 'nearest', + 'shape_calculation_mode': 'sizes', + 'version': 'opset4' + }, + 'interpolate_1_data': {'value': None, 'shape': int64_array([1, 5, 4096, 1280, 2400]), 'kind': 'data'}, + 'size_3': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([512]) + }, + 'size_3_data': {'value': None, 'shape': [1], 'kind': 'data'}, + 'scale_3': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': np.array([512.0 / 2400.0]) + }, + 'scale_3_data': {'value': None, 'shape': [1], 'kind': 'data'}, + 'axes_3': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([4]) + }, + 'axes_3_data': {'value': int64_array([4]), 'shape': [1], 'kind': 'data'}, + 'interpolate_3': { + 'type': 'Interpolate', + 'kind': 'op', + 'op': 'Interpolate', + 'mode': 'nearest', + 'shape_calculation_mode': 'sizes', + 'version': 'opset4' + }, + 'interpolate_3_data': {'value': None, 'shape': int64_array([1, 5, 4096, 1280, 512]), 'kind': 'data'}, + 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, + 'abs_data': {'value': None, 'shape': int64_array([1, 5, 4096, 1280, 512]), 'kind': 'data'}, + 'output': {'kind': 'op', 'op': 'Result'}, +} + + +new_ref_edges_for_3d_case_1_opset4_case = ref_edges_for_2d_case_1_opset4_case + + +new_graph_node_attrs_for_3d_case_1_opset4_case = { + 'placeholder': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'}, + 'placeholder_data': { + 'value': None, + 'shape': int64_array([1, 5, 1024, 256, 800]), + 'kind': 'data', + 'data_type': None + }, + 'size_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([4096, 2400]) + }, + 'size_1_data': {'value': int64_array([4096, 2400]), 'shape': [2], 'kind': 'data'}, + 'scale_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': np.array([4.0, 3.0]) + }, + 'scale_1_data': {'value': np.array([4.0, 3.0]), 'shape': [2], 'kind': 'data'}, + 'axes_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': 
int64_array([2, 4]) + }, + 'axes_1_data': {'value': int64_array([2, 4]), 'shape': [2], 'kind': 'data'}, + 'interpolate_1': { + 'type': 'Interpolate', + 'kind': 'op', + 'op': 'Interpolate', + 'mode': 'nearest', + 'shape_calculation_mode': 'sizes', + 'version': 'opset4' + }, + 'interpolate_1_data': {'value': None, 'shape': int64_array([1, 5, 4096, 256, 2400]), 'kind': 'data'}, + 'size_2': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([1280]) + }, + 'size_2_data': {'value': int64_array([1280]), 'shape': [1], 'kind': 'data'}, + 'scale_2': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': np.array([5.0]) + }, + 'scale_2_data': {'value': np.array([5.0]), 'shape': [1], 'kind': 'data'}, + 'axes_2': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([3]) + }, + 'axes_2_data': {'value': int64_array([3]), 'shape': [1], 'kind': 'data'}, + 'interpolate_2': { + 'type': 'Interpolate', + 'kind': 'op', + 'op': 'Interpolate', + 'mode': 'nearest', + 'shape_calculation_mode': 'sizes', + 'version': 'opset4' + }, + 'interpolate_2_data': {'value': None, 'shape': int64_array([1, 5, 4096, 1280, 2400]), 'kind': 'data'}, + 'size_3': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([512]) + }, + 'size_3_data': {'value': int64_array([512]), 'shape': [1], 'kind': 'data'}, + 'scale_3': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': np.array([512.0 / 2400.0]) + }, + 'scale_3_data': {'value': np.array([512.0 / 2400.0]), 'shape': [1], 'kind': 'data'}, + 'axes_3': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([4]) + }, + 'axes_3_data': {'value': int64_array([4]), 'shape': [1], 'kind': 'data'}, + 'interpolate_3': { + 'type': 'Interpolate', + 'kind': 'op', + 'op': 'Interpolate', + 'mode': 'nearest', + 'shape_calculation_mode': 'sizes', + 'version': 'opset4' + }, + 'interpolate_3_data': {'value': None, 'shape': int64_array([1, 5, 4096, 1280, 512]), 'kind': 'data'}, + 'abs': {'type': 'Abs', 
'kind': 'op', 'op': 'Abs'}, + 'abs_data': {'value': None, 'shape': int64_array([1, 5, 4096, 1280, 512]), 'kind': 'data'}, + 'output': {'kind': 'op', 'op': 'Result'}, +} + +new_edges_for_3d_case_1_opset4_case = edges_for_2d_case_1_opset4_case + + graph_node_attrs_for_3d_case_1 = { 'placeholder': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'}, 'placeholder_data': { @@ -309,37 +881,40 @@ 'scale_1': { 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([4096, 2400]) }, - 'scale_1_data': {'value': None, 'shape': [2], 'kind': 'data'}, + 'scale_1_data': {'value': int64_array([4096, 2400]), 'shape': [2], 'kind': 'data'}, 'interpolate_1': { 'type': 'Interpolate', 'kind': 'op', 'op': 'Interpolate', 'axes': int64_array([2, 4]), - 'mode': 'nearest' + 'mode': 'nearest', + 'version': 'opset1' }, 'interpolate_1_data': {'value': None, 'shape': int64_array([1, 5, 4096, 256, 2400]), 'kind': 'data'}, 'scale_2': { 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([1280]) }, - 'scale_2_data': {'value': None, 'shape': [1], 'kind': 'data'}, + 'scale_2_data': {'value': int64_array([1280]), 'shape': [1], 'kind': 'data'}, 'interpolate_2': { 'type': 'Interpolate', 'kind': 'op', 'op': 'Interpolate', 'axes': int64_array([3]), - 'mode': 'nearest' + 'mode': 'nearest', + 'version': 'opset1' }, 'interpolate_2_data': {'value': None, 'shape': int64_array([1, 5, 4096, 1280, 2400]), 'kind': 'data'}, 'scale_3': { 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([512]) }, - 'scale_3_data': {'value': None, 'shape': [1], 'kind': 'data'}, + 'scale_3_data': {'value': int64_array([512]), 'shape': [1], 'kind': 'data'}, 'interpolate_3': { 'type': 'Interpolate', 'kind': 'op', 'op': 'Interpolate', 'axes': int64_array([4]), - 'mode': 'nearest' + 'mode': 'nearest', + 'version': 'opset1' }, 'interpolate_3_data': {'value': None, 'shape': int64_array([1, 5, 4096, 1280, 512]), 'kind': 'data'}, 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, @@ -367,7 +942,8 
@@ 'kind': 'op', 'op': 'Interpolate', 'axes': int64_array([2, 3]), - 'mode': 'nearest' + 'mode': 'nearest', + 'version': 'opset1' }, 'interpolate_1_data': {'value': None, 'shape': int64_array([1, 5, 4096, 1280, 800]), 'kind': 'data'}, 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, @@ -395,7 +971,8 @@ 'kind': 'op', 'op': 'Interpolate', 'axes': int64_array([2]), - 'mode': 'nearest' + 'mode': 'nearest', + 'version': 'opset1' }, 'interpolate_1_data': {'value': None, 'shape': int64_array([16, 44, 256, 87, 790]), 'kind': 'data'}, 'scale_2': { @@ -407,7 +984,8 @@ 'kind': 'op', 'op': 'Interpolate', 'axes': int64_array([4]), - 'mode': 'linear' + 'mode': 'linear', + 'version': 'opset1' }, 'interpolate_2_data': {'value': None, 'shape': int64_array([16, 44, 256, 87, 2370]), 'kind': 'data'}, 'scale_3': { @@ -419,7 +997,8 @@ 'kind': 'op', 'op': 'Interpolate', 'axes': int64_array([3]), - 'mode': 'cubic' + 'mode': 'cubic', + 'version': 'opset1' }, 'interpolate_3_data': {'value': None, 'shape': int64_array([16, 44, 256, 435, 2370]), 'kind': 'data'}, 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, @@ -430,6 +1009,105 @@ edges_for_3d_case_3 = edges_for_2d_case_3 +new_ref_graph_node_attrs_for_3d_case_4_opset4_case = { + 'placeholder': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'}, + 'placeholder_data': { + 'value': None, + 'shape': int64_array([10, 64, 511, 416, 10240]), + 'kind': 'data', + 'data_type': None + }, + 'size_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([4599, 912, 133120]) + }, + 'size_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'scale_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', + 'value': np.array([4599.0 / 511.0, 912.0 / 416.0, 133120.0 / 10240.0]) + }, + 'scale_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'axes_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([2, 3, 4]) + }, + 'axes_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'interpolate_1': { 
+ 'type': 'Interpolate', + 'kind': 'op', + 'op': 'Interpolate', + 'mode': 'linear', + 'antialias': 1, + 'shape_calculation_mode': 'sizes', + 'version': 'opset4' + }, + 'interpolate_1_data': {'value': None, 'shape': int64_array([10, 64, 4599, 912, 133120]), 'kind': 'data'}, + 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, + 'abs_data': {'value': None, 'shape': int64_array([10, 64, 4599, 912, 133120]), 'kind': 'data'}, + 'output': {'kind': 'op', 'op': 'Result'}, +} + +new_ref_edges_for_3d_case_4_opset4_case = new_ref_edges_for_2d_case_4_opset4_case + + +new_graph_node_attrs_for_3d_case_4_opset4_case = { + 'placeholder': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'}, + 'placeholder_data': { + 'value': None, + 'shape': int64_array([10, 64, 511, 416, 10240]), + 'kind': 'data', + 'data_type': None + }, + 'size_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([4599, 133120]) + }, + 'size_1_data': {'value': int64_array([4599, 133120]), 'shape': [2], 'kind': 'data'}, + 'scale_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': np.array([4599.0 / 511.0, 133120.0 / 10240.0]) + }, + 'scale_1_data': {'value': np.array([4599.0 / 511.0, 133120.0 / 10240.0]), 'shape': [2], 'kind': 'data'}, + 'axes_1': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([2, 4]) + }, + 'axes_1_data': {'value': int64_array([2, 4]), 'shape': [2], 'kind': 'data'}, + 'interpolate_1': { + 'type': 'Interpolate', + 'kind': 'op', + 'op': 'Interpolate', + 'mode': 'linear', + 'antialias': 1, + 'shape_calculation_mode': 'sizes', + 'version': 'opset4' + }, + 'interpolate_1_data': {'value': None, 'shape': int64_array([10, 64, 4599, 416, 133120]), 'kind': 'data'}, + 'size_2': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([912]) + }, + 'size_2_data': {'value': int64_array([912]), 'shape': [1], 'kind': 'data'}, + 'scale_2': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': np.array([912.0 / 416.0]) + }, + 
'scale_2_data': {'value': np.array([912.0 / 416.0]), 'shape': [1], 'kind': 'data'}, + 'axes_2': { + 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([3]) + }, + 'axes_2_data': {'value': int64_array([3]), 'shape': [1], 'kind': 'data'}, + 'interpolate_2': { + 'type': 'Interpolate', + 'kind': 'op', + 'op': 'Interpolate', + 'mode': 'linear', + 'antialias': 1, + 'shape_calculation_mode': 'sizes', + 'version': 'opset4' + }, + 'interpolate_2_data': {'value': None, 'shape': int64_array([10, 64, 4599, 912, 133120]), 'kind': 'data'}, + 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, + 'abs_data': {'value': None, 'shape': int64_array([10, 64, 4599, 912, 133120]), 'kind': 'data'}, + 'output': {'kind': 'op', 'op': 'Result'}, +} + +new_edges_for_3d_case_4_opset4_case = new_edges_for_2d_case_4_opset4_case + + graph_node_attrs_for_3d_case_4 = { 'placeholder': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'}, 'placeholder_data': { @@ -441,7 +1119,7 @@ 'scale_1': { 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([4599, 133120]) }, - 'scale_1_data': {'value': None, 'shape': [2], 'kind': 'data'}, + 'scale_1_data': {'value': int64_array([4599, 133120]), 'shape': [2], 'kind': 'data'}, 'interpolate_1': { 'type': 'Interpolate', 'kind': 'op', @@ -451,13 +1129,14 @@ 'align_corners': 0, 'antialias': 1, 'pads_begin': 5, - 'pads_end': 3 + 'pads_end': 3, + 'version': 'opset1' }, 'interpolate_1_data': {'value': None, 'shape': int64_array([10, 64, 4599, 416, 133120]), 'kind': 'data'}, 'scale_2': { 'kind': 'op', 'op': 'Const', 'type': 'Const', 'value': int64_array([912]) }, - 'scale_2_data': {'value': None, 'shape': [1], 'kind': 'data'}, + 'scale_2_data': {'value': int64_array([912]), 'shape': [1], 'kind': 'data'}, 'interpolate_2': { 'type': 'Interpolate', 'kind': 'op', @@ -467,7 +1146,8 @@ 'align_corners': 0, 'antialias': 1, 'pads_begin': 5, - 'pads_end': 3 + 'pads_end': 3, + 'version': 'opset1' }, 'interpolate_2_data': {'value': None, 'shape': 
int64_array([10, 64, 4599, 912, 133120]), 'kind': 'data'}, 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, @@ -503,7 +1183,8 @@ def test_2d_interpolate_sequence_1(self): 'kind': 'op', 'op': 'Interpolate', 'axes': int64_array([2, 3]), - 'mode': 'nearest' + 'mode': 'nearest', + 'version': 'opset1' }, 'interpolate_1_data': {'value': None, 'shape': int64_array([1, 4, 660, 700]), 'kind': 'data'}, 'scale_2': { @@ -515,7 +1196,8 @@ def test_2d_interpolate_sequence_1(self): 'kind': 'op', 'op': 'Interpolate', 'axes': int64_array([2]), - 'mode': 'nearest' + 'mode': 'nearest', + 'version': 'opset1' }, 'interpolate_2_data': {'value': None, 'shape': int64_array([1, 4, 1320, 700]), 'kind': 'data'}, 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, @@ -541,6 +1223,20 @@ def test_2d_interpolate_sequence_1(self): (flag, resp) = compare_graphs(graph, ref_graph, 'output') self.assertTrue(flag, resp) + def test_2d_interpolate_sequence_1_opset4_case(self): + graph = build_graph( + nodes_attrs=graph_node_attrs_for_2d_case_1_opset4_case, + edges=edges_for_2d_case_1_opset4_case + ) + + ref_graph = build_graph( + nodes_attrs=ref_graph_node_attrs_for_2d_case_1_opset4_case, + edges=ref_edges_for_2d_case_1_opset4_case + ) + InterpolateSequenceToInterpolate().find_and_replace_pattern(graph) + (flag, resp) = compare_graphs(graph, ref_graph, 'output') + self.assertTrue(flag, resp) + def test_2d_interpolate_sequence_2(self): graph = build_graph( nodes_attrs=graph_node_attrs_for_2d_case_2, @@ -597,7 +1293,8 @@ def test_2d_interpolate_sequence_4(self): 'align_corners': 0, 'antialias': 1, 'pads_begin': 5, - 'pads_end': 3 + 'pads_end': 3, + 'version': 'opset1' }, 'interpolate_data': {'value': None, 'shape': int64_array([1, 4, 2200, 700]), 'kind': 'data'}, 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, @@ -622,6 +1319,21 @@ def test_2d_interpolate_sequence_4(self): (flag, resp) = compare_graphs(graph, ref_graph, 'output') self.assertTrue(flag, resp) + def 
test_2d_interpolate_sequence_4_opset4_case(self): + graph = build_graph( + nodes_attrs=new_graph_node_attrs_for_2d_case_4_opset4_case, + edges=new_edges_for_2d_case_4_opset4_case + ) + + ref_graph = build_graph( + nodes_attrs=new_ref_graph_node_attrs_for_2d_case_4_opset4_case, + edges=new_ref_edges_for_2d_case_4_opset4_case + ) + + InterpolateSequenceToInterpolate().find_and_replace_pattern(graph) + (flag, resp) = compare_graphs(graph, ref_graph, 'output') + self.assertTrue(flag, resp) + def test_2d_interpolate_sequence_5(self): graph = build_graph( nodes_attrs=graph_node_attrs_for_2d_case_4, @@ -647,6 +1359,31 @@ def test_2d_interpolate_sequence_5(self): (flag, resp) = compare_graphs(graph, ref_graph, 'output') self.assertTrue(flag, resp) + def test_2d_interpolate_sequence_5_opset4_case(self): + graph = build_graph( + nodes_attrs=graph_node_attrs_for_2d_case_4_opset4_case, + edges=edges_for_2d_case_4_opset4_case, + update_attributes={ + 'interpolate_1': { + 'antialias': 0, 'cube_coeff': -0.1 + } + } + ) + + ref_graph = build_graph( + nodes_attrs=graph_node_attrs_for_2d_case_4_opset4_case, + edges=edges_for_2d_case_4_opset4_case, + update_attributes={ + 'interpolate_1': { + 'antialias': 0, 'cube_coeff': -0.1 + } + } + ) + + InterpolateSequenceToInterpolate().find_and_replace_pattern(graph) + (flag, resp) = compare_graphs(graph, ref_graph, 'output') + self.assertTrue(flag, resp) + def test_2d_interpolate_sequence_6(self): graph = build_graph( nodes_attrs=graph_node_attrs_for_2d_case_6, @@ -686,7 +1423,8 @@ def test_3d_interpolate_sequence_1(self): 'kind': 'op', 'op': 'Interpolate', 'axes': int64_array([2, 3, 4]), - 'mode': 'nearest' + 'mode': 'nearest', + 'version': 'opset1' }, 'interpolate_1_data': {'value': None, 'shape': int64_array([1, 5, 4096, 1280, 2400]), 'kind': 'data'}, 'scale_2': { @@ -698,7 +1436,8 @@ def test_3d_interpolate_sequence_1(self): 'kind': 'op', 'op': 'Interpolate', 'axes': int64_array([4]), - 'mode': 'nearest' + 'mode': 'nearest', + 'version': 
'opset1' }, 'interpolate_2_data': {'value': None, 'shape': int64_array([1, 5, 4096, 1280, 512]), 'kind': 'data'}, 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, @@ -724,6 +1463,20 @@ def test_3d_interpolate_sequence_1(self): (flag, resp) = compare_graphs(graph, ref_graph, 'output') self.assertTrue(flag, resp) + def test_3d_interpolate_sequence_1_opset4_case(self): + graph = build_graph( + nodes_attrs=new_graph_node_attrs_for_3d_case_1_opset4_case, + edges=new_edges_for_3d_case_1_opset4_case + ) + + ref_graph = build_graph( + nodes_attrs=new_ref_graph_node_attrs_for_3d_case_1_opset4_case, + edges=new_ref_edges_for_3d_case_1_opset4_case + ) + InterpolateSequenceToInterpolate().find_and_replace_pattern(graph) + (flag, resp) = compare_graphs(graph, ref_graph, 'output') + self.assertTrue(flag, resp) + def test_3d_interpolate_sequence_2(self): graph = build_graph( nodes_attrs=graph_node_attrs_for_3d_case_2, @@ -778,7 +1531,8 @@ def test_3d_interpolate_sequence_4(self): 'align_corners': 0, 'antialias': 1, 'pads_begin': 5, - 'pads_end': 3 + 'pads_end': 3, + 'version': 'opset1' }, 'interpolate_data': {'value': None, 'shape': int64_array([10, 64, 4599, 912, 133120]), 'kind': 'data'}, 'abs': {'type': 'Abs', 'kind': 'op', 'op': 'Abs'}, @@ -803,6 +1557,21 @@ def test_3d_interpolate_sequence_4(self): (flag, resp) = compare_graphs(graph, ref_graph, 'output') self.assertTrue(flag, resp) + def test_3d_interpolate_sequence_4_opset4_case(self): + graph = build_graph( + nodes_attrs=new_graph_node_attrs_for_3d_case_4_opset4_case, + edges=new_edges_for_3d_case_4_opset4_case + ) + + ref_graph = build_graph( + nodes_attrs=new_ref_graph_node_attrs_for_3d_case_4_opset4_case, + edges=new_ref_edges_for_3d_case_4_opset4_case + ) + + InterpolateSequenceToInterpolate().find_and_replace_pattern(graph) + (flag, resp) = compare_graphs(graph, ref_graph, 'output') + self.assertTrue(flag, resp) + def test_3d_interpolate_sequence_5(self): graph = build_graph( 
nodes_attrs=graph_node_attrs_for_3d_case_4, diff --git a/model-optimizer/extensions/middle/ONNXResize11ToInterpolateV4.py b/model-optimizer/extensions/middle/ONNXResize11ToInterpolateV4.py new file mode 100644 index 00000000000000..30e0fd4b3e2486 --- /dev/null +++ b/model-optimizer/extensions/middle/ONNXResize11ToInterpolateV4.py @@ -0,0 +1,172 @@ +""" + Copyright (C) 2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import logging as log +import numpy as np + +from extensions.ops.activation_ops import Floor +from extensions.ops.Cast import Cast +from extensions.ops.elementwise import Add, Div, Mul +from extensions.ops.interpolate import Interpolate +from mo.front.common.layout import get_depth_dim, get_height_dim, get_width_dim +from mo.front.common.partial_infer.utils import int64_array, float_array +from mo.front.tf.graph_utils import create_op_with_const_inputs +from mo.middle.passes.convert_data_type import data_type_str_to_np +from mo.middle.replacement import MiddleReplacementPattern +from mo.graph.graph import Graph, Node, rename_nodes +from mo.ops.const import Const +from mo.ops.shape import Shape +from mo.ops.strided_slice import StridedSlice + + +def convert_mode(onnx_mode: str) -> str: + return {'nearest': 'nearest', 'linear': 'linear_onnx', 'cubic': 'cubic'}[onnx_mode] + + +def replace_resize(graph: Graph, resize: Node): + log.debug("Converting of ONNX Resize-11 to Interpolate-4 " + "is triggered for node 
{}.".format(resize.soft_get('name', resize.id))) + + input_shape = resize.in_port(0).data.get_shape() + input_rank = len(input_shape) + resize_name = resize.soft_get('name', resize.id) + if input_rank not in {4, 5}: + log.warning('The input shape is not 4D or 5D for op with name {}'.format(resize_name)) + return + + num_of_inputs = len([port for port in resize.in_ports().values() if not port.disconnected()]) + assert num_of_inputs in {3, 4}, \ + "Number of inputs of ONNXResize (with name {}) should be equal to 3 or 4".format(resize_name) + + assert resize.soft_get('coordinate_transformation_mode') != 'tf_crop_and_resize', \ + 'Mode tf_crop_and_resize is not supported for op {} with name {}'.format(resize.op, resize_name) + + layout = graph.graph['layout'] + + if input_rank == 4: + begin_dim = get_height_dim(layout, input_rank) + end_dim = get_width_dim(layout, input_rank) + 1 + else: + begin_dim = get_depth_dim(layout, input_rank) + end_dim = get_width_dim(layout, input_rank) + 1 + + sizes_ss = create_op_with_const_inputs(graph, StridedSlice, + {1: int64_array([begin_dim]), + 2: int64_array([end_dim]), + 3: int64_array([1])}, + {'name': resize_name + '/StridedSlice_', + 'begin_mask': int64_array([1]), + 'end_mask': int64_array([1]), + 'new_axis_mask': int64_array([0]), + 'shrink_axis_mask': int64_array([0]), + 'ellipsis_mask': int64_array([0])}) + scales_ss = create_op_with_const_inputs(graph, StridedSlice, + {1: int64_array([begin_dim]), + 2: int64_array([end_dim]), + 3: int64_array([1])}, + {'name': resize_name + '/StridedSlice_', + 'begin_mask': int64_array([1]), + 'end_mask': int64_array([1]), + 'new_axis_mask': int64_array([0]), + 'shrink_axis_mask': int64_array([0]), + 'ellipsis_mask': int64_array([0])}) + axes_node = Const(graph, + {'name': resize_name + '/axis_', + 'value': int64_array(np.arange(begin_dim, end_dim))}).create_node() + + shape_calculation_mode = 'scales' if num_of_inputs == 3 else 'sizes' + + interpolate_node = Interpolate(graph, {'version': 
'opset4', + 'mode': convert_mode(resize.mode), + 'coordinate_transformation_mode': resize.coordinate_transformation_mode, + 'cube_coeff': resize.cube_coeff, + 'nearest_mode': resize.nearest_mode, + 'pads_begin': int64_array([0]), + 'pads_end': int64_array([0]), + 'antialias': 0, + 'shape_calculation_mode': shape_calculation_mode, + 'in_ports_count': 4}).create_node() + + axes_node.out_port(0).connect(interpolate_node.in_port(3)) + shape_of = Shape(graph, {'name': resize_name + '/ShapeOf_'}).create_node() + + add_node = create_op_with_const_inputs(graph, Add, + {1: float_array([1.0e-5])}, + {'name': resize_name + '/Add_'}) + + input_data_type = data_type_str_to_np(graph.graph['cmd_params'].data_type) + + if num_of_inputs == 3: + cast_shape_to_float = Cast(graph, {'dst_type': input_data_type}).create_node() + mul_node = Mul(graph, {'name': resize_name + '/Mul_'}).create_node() + shape_of.out_port(0).connect(cast_shape_to_float.in_port(0)) + cast_shape_to_float.out_port(0).connect(mul_node.in_port(0)) + cast_add_result_to_int = Cast(graph, {'dst_type': np.int64}).create_node() + floor_node = Floor(graph, {'name': resize_name + '/Floor_'}).create_node() + mul_node.out_port(0).connect(add_node.in_port(0)) + add_node.out_port(0).connect(floor_node.in_port(0)) + floor_node.out_port(0).connect(cast_add_result_to_int.in_port(0)) + cast_add_result_to_int.out_port(0).connect(sizes_ss.in_port(0)) + sizes_ss.out_port(0).connect(interpolate_node.in_port(1)) + scales_ss.out_port(0).connect(interpolate_node.in_port(2)) + + connection_of_resize_input = resize.in_port(0).get_connection() + connection_of_resize_input.set_destination(interpolate_node.in_port(0)) + + connection_of_scales = resize.in_port(2).get_connection() + connection_of_scales.set_destination(scales_ss.in_port(0)) + + connection_of_resize_input.get_source().connect(shape_of.in_port(0)) + connection_of_scales.get_source().connect(mul_node.in_port(1)) + else: + cast_shape_to_float = Cast(graph, {'dst_type': 
input_data_type}).create_node() + cast_sizes_to_float = Cast(graph, {'dst_type': input_data_type}).create_node() + div_node = Div(graph, {'name': resize_name + '/Div_'}).create_node() + cast_sizes_to_float.out_port(0).connect(div_node.in_port(0)) + cast_shape_to_float.out_port(0).connect(div_node.in_port(1)) + shape_of.out_port(0).connect(cast_shape_to_float.in_port(0)) + div_node.out_port(0).connect(add_node.in_port(0)) + add_node.out_port(0).connect(scales_ss.in_port(0)) + scales_ss.out_port(0).connect(interpolate_node.in_port(2)) + sizes_ss.out_port(0).connect(interpolate_node.in_port(1)) + + connection_of_resize_input = resize.in_port(0).get_connection() + connection_of_resize_input.set_destination(interpolate_node.in_port(0)) + + connection_of_sizes = resize.in_port(3).get_connection() + connection_of_sizes.set_destination(sizes_ss.in_port(0)) + + connection_of_resize_input.get_source().connect(shape_of.in_port(0)) + connection_of_sizes.get_source().connect(cast_sizes_to_float.in_port(0)) + + rename_nodes([(resize, resize_name + '/delete'), (interpolate_node, resize_name)]) + resize.out_port(0).get_connection().set_source(interpolate_node.out_port(0)) + + +class ONNXResize11ToInterpolate4(MiddleReplacementPattern): + """ + The transformation replaces ONNX Resize 11 with Interpolate-4. 
+ """ + enabled = True + + def run_before(self): + from extensions.middle.InterpolateSequenceToInterpolate import InterpolateSequenceToInterpolate + return [InterpolateSequenceToInterpolate] + + def find_and_replace_pattern(self, graph: Graph): + resize11_ops = graph.get_op_nodes(op='ONNXResize11') + for resize in resize11_ops: + replace_resize(graph, resize) diff --git a/model-optimizer/extensions/middle/UpsampleToResample.py b/model-optimizer/extensions/middle/UpsampleToResample.py index 0e34c1e2d319a2..4a516976bd98c0 100644 --- a/model-optimizer/extensions/middle/UpsampleToResample.py +++ b/model-optimizer/extensions/middle/UpsampleToResample.py @@ -102,8 +102,6 @@ def replace_pattern(self, graph: Graph, match: Dict[str, Node]): begin_value = int64_array([get_depth_dim(layout, input_shape_rank)]) factor_value = np.array([depth_scale, height_scale, width_scale]) - - ss = create_op_with_const_inputs(graph, StridedSlice, {1: begin_value, 2: int64_array([get_width_dim(layout, input_shape_rank) + 1]), diff --git a/model-optimizer/extensions/ops/ONNXResize11.py b/model-optimizer/extensions/ops/ONNXResize11.py new file mode 100644 index 00000000000000..38c34c76250c45 --- /dev/null +++ b/model-optimizer/extensions/ops/ONNXResize11.py @@ -0,0 +1,72 @@ +""" + Copyright (C) 2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +import numpy as np + +from mo.front.common.partial_infer.utils import int64_array +from mo.graph.graph import Node, Graph +from mo.ops.op import Op + + +class ONNXResize11Op(Op): + op = 'ONNXResize11' + + def __init__(self, graph: Graph, attrs: dict): + mandatory_props = { + 'op': self.op, + 'out_ports_count': 1, + 'infer': ONNXResize11Op.onnx_resize_infer + } + super().__init__(graph, mandatory_props, attrs) + + def supported_attrs(self): + return [ + 'coordinate_transformation_mode', + 'cube_coeff', + 'exclude_outside', + 'extrapolation_value', + 'mode', + 'nearest_mode' + ] + + @staticmethod + def onnx_resize_infer(node: Node): + input_shape = node.in_port(0).data.get_shape() + if input_shape is None: + return + + num_of_in_nodes = len(node.in_nodes()) + assert num_of_in_nodes in {3, 4}, \ + "Node {} with op {} number of inputs must be equal to 3 or 4.".format(node.name, node.op) + + assert node.coordinate_transformation_mode != 'tf_crop_and_resize', \ + 'Mode tf_crop_and_resize is not supported for op {} with name {}'.format(node.op, node.name) + + if num_of_in_nodes == 3: + # i.e. input 'sizes' is not given + input2_value = node.in_port(2).data.get_value() + assert input2_value is not None, \ + "Node {} with op {} has no value in input port 2".format(node.soft_get('name', node.id), node.op) + scale = np.array(input2_value) + output_shape = np.floor(input_shape * scale + 1.0e-6).astype(np.int64) + else: + # i.e. 
input 'sizes' is given + sizes = node.in_port(3).data.get_value() + assert sizes is not None, \ + "Node {} with op {} has no value in input port 3".format(node.name, node.op) + output_shape = int64_array(sizes) + + node.out_port(0).data.set_shape(output_shape.copy()) \ No newline at end of file diff --git a/model-optimizer/extensions/ops/interpolate.py b/model-optimizer/extensions/ops/interpolate.py index 0a0e9d27aa942e..95e62d20d5cb50 100644 --- a/model-optimizer/extensions/ops/interpolate.py +++ b/model-optimizer/extensions/ops/interpolate.py @@ -14,15 +14,118 @@ limitations under the License. """ + +import math +import numpy as np + +from mo.front.common.partial_infer.utils import int64_array from mo.graph.graph import Node, Graph from mo.ops.op import Op, PermuteAttrs +def infer_for_opset4(node: Node): + assert len([p for p in node.in_ports().values() if not p.disconnected()]) in [3, 4], \ + "Interpolate-4 node {} must have 3 or 4 inputs".format(node.soft_get(node.name, node.id)) + assert node.has_valid('mode') + assert node.has_valid('shape_calculation_mode') + src_shape = node.in_port(0).data.get_shape() + assert src_shape is not None + + input_rank = len(src_shape) + + pads_begin = correct_pad(node.soft_get('pads_begin', [0]), input_rank) + pads_end = correct_pad(node.soft_get('pads_end', [0]), input_rank) + node['pads_begin'] = pads_begin + node['pads_end'] = pads_end + + if len(node.in_ports()) == 3: + axes = list(range(0, input_rank)) + else: + axes = node.in_port(3).get_source().data.get_value() + assert axes is not None, \ + "Interpolate-4 node with name {} has None as 'axes' input".format(node.soft_get('name', node.id)) + + axes = int64_array(axes) + output_shape = src_shape + pads_begin + pads_end + if node.shape_calculation_mode == 'sizes': + dst_shape = node.in_port(1).data.get_value() + assert dst_shape is not None + correct_scales_using_dst_shape(node, dst_shape, src_shape, axes) + for i, axis in enumerate(axes): + output_shape[axis] = 
dst_shape[i] + else: + scales = node.in_port(2).data.get_value() + assert scales is not None + for i, axis in enumerate(axes): + output_shape[axis] = math.floor(scales[i] * output_shape[axis] + 1.0e-5) + + node.out_port(0).data.set_shape(output_shape) + + +def infer_for_opset1(node: Node): + assert len([p for p in node.in_ports().values() if not p.disconnected()]) == 2 + assert node.has_valid('mode') + assert node.has_valid('axes') + + src_shape = node.in_port(0).data.get_shape() + + assert src_shape is not None + dst_shape = node.in_port(1).data.get_value() + assert dst_shape is not None + + output_shape = src_shape.copy() + for ind, axis in enumerate(node.axes): + output_shape[axis] = dst_shape[ind] + + node.out_port(0).data.set_shape(output_shape) + + PermuteAttrs.create_permute_attrs(node, attrs=[('axes', 'input:0')]) + + +def pad_attribute_to_str(node: Node, attr: str): + return ','.join(map(str, node[attr])) if node.has_valid(attr) else None + + +def correct_pad(pad, rank): + pad_len = len(pad) + if pad_len < rank: + return np.pad(pad, (0, rank - pad_len), 'constant').astype(np.int64) + elif pad_len > rank: + return np.array(pad[: rank]).astype(np.int64) + else: + return np.array(pad, dtype=np.int64) + + +def correct_scales_using_dst_shape(node, dst_shape, src_shape, axes): + scales_value = node.in_port(2).data.get_value() + if scales_value is None or len(scales_value) != len(dst_shape): + corrected_scales = np.zeros(len(dst_shape)) + for i, axis in enumerate(list(axes)): + corrected_scales[i] = math.floor((dst_shape[i] / src_shape[axis]) + 1.0e-5) + + class Interpolate(Op): op = 'Interpolate' enabled = False + infers = { + 'opset1': infer_for_opset1, + 'opset4': infer_for_opset4 + } def __init__(self, graph: Graph, attrs: dict): + self.attributes_for_opsets = { + 'opset1': [ + ('axes', lambda node: ','.join(map(str, node.axes))), + 'mode', 'align_corners', 'antialias', 'pads_begin', 'pads_end', + ], + 'opset4': [ + 'mode', 'antialias', 'nearest_mode', 
'cube_coeff', 'coordinate_transformation_mode', + 'shape_calculation_mode', + ('pads_begin', lambda node: pad_attribute_to_str(node, 'pads_begin')), + ('pads_end', lambda node: pad_attribute_to_str(node, 'pads_end')), + ] + } + mandatory_props = { 'op': self.op, 'type': self.op, @@ -38,34 +141,34 @@ def __init__(self, graph: Graph, attrs: dict): 'infer': self.infer, 'force_precision_in_ports': {1: 'int64'}, - 'in_ports_count': 2, 'out_ports_count': 1, } super().__init__(graph, mandatory_props, attrs) def supported_attrs(self): - return [ - ('axes', lambda node: ','.join(map(str, node.axes))), - 'mode', 'align_corners', 'antialias', 'pads_begin', 'pads_end', - ] + opset = self.get_opset() + key = opset if opset in self.attributes_for_opsets else 'opset1' + return self.attributes_for_opsets[key] + + def infer(self, node: Node): + opset = self.get_opset() + key = opset if opset in self.infers else 'opset1' + self.infers[key](node) @staticmethod - def infer(node: Node): - assert len([p for p in node.in_ports().values() if not p.disconnected()]) == 2 - assert node.has_valid('mode') - assert node.has_valid('axes') + def get_axes(node: Node) -> np.ndarray: + opset = node.get_opset() + if opset == 'opset1': + interp_axes = node.soft_get('axes', None) + return interp_axes if interp_axes is None else int64_array(interp_axes) src_shape = node.in_port(0).data.get_shape() - assert src_shape is not None - dst_shape = node.in_port(1).data.get_value() - assert dst_shape is not None - - output_shape = src_shape.copy() - for ind, axis in enumerate(node.axes): - output_shape[axis] = dst_shape[ind] - - node.out_port(0).data.set_shape(output_shape) + input_rank = len(src_shape) - PermuteAttrs.create_permute_attrs(node, attrs=[('axes', 'input:0')]) + if len(node.in_ports()) == 3: + axes = list(range(0, input_rank)) + else: + axes = node.in_port(3).get_source().data.get_value() + return int64_array(axes) diff --git a/model-optimizer/extensions/ops/interpolate_test.py 
b/model-optimizer/extensions/ops/interpolate_test.py new file mode 100644 index 00000000000000..92b8bf14c3a3e5 --- /dev/null +++ b/model-optimizer/extensions/ops/interpolate_test.py @@ -0,0 +1,281 @@ +""" + Copyright (C) 2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + + +import unittest + +import numpy as np +from generator import generator, generate + +from extensions.ops.interpolate import Interpolate +from mo.front.common.partial_infer.utils import int64_array +from mo.graph.graph import Node +from mo.utils.unittest.graph import build_graph + + +graph_node_attrs_without_axes = { + 'input': {'type': 'Parameter', 'kind': 'op'}, + 'input_data': {'kind': 'data', 'shape': None, 'value': None}, + 'sizes': {'type': 'Const', 'kind': 'op', 'shape': None, 'value': None}, + 'sizes_data': {'kind': 'data', 'shape': None, 'value': None}, + 'scales': {'type': 'Const', 'kind': 'op', 'shape': None, 'value': None}, + 'scales_data': {'kind': 'data', 'shape': None, 'value': None}, + 'interpolate': { + 'type': 'Interpolate', 'kind': 'op', 'mode': 'nearest', 'shape_calculation_mode': 'sizes', + 'coordinate_transformation_mode': 'half_pixel', 'version': 'opset4', + 'nearest_mode': 'round_prefer_floor', 'antialias': 0, + }, + 'interpolate_data': {'kind': 'data', 'value': None, 'shape': None}, + 'op_output': {'kind': 'op', 'op': 'Result'}, +} + +graph_edges_without_axes = [ + ('input', 'input_data'), + ('sizes', 'sizes_data'), + ('scales', 'scales_data'), + ('input_data', 
'interpolate', {'in': 0}), + ('sizes_data', 'interpolate', {'in': 1}), + ('scales_data', 'interpolate', {'in': 2}), + ('interpolate', 'interpolate_data'), + ('interpolate_data', 'op_output'), +] + + +graph_nodes_attrs = { + 'input': {'type': 'Parameter', 'kind': 'op'}, + 'input_data': {'kind': 'data', 'shape': None, 'value': None}, + 'sizes': {'type': 'Const', 'kind': 'op', 'shape': None, 'value': None}, + 'sizes_data': {'kind': 'data', 'shape': None, 'value': None}, + 'scales': {'type': 'Const', 'kind': 'op', 'shape': None, 'value': None}, + 'scales_data': {'kind': 'data', 'shape': None, 'value': None}, + 'axes': {'type': 'Const', 'kind': 'op', 'shape': None, 'value': None}, + 'axes_data': {'kind': 'data', 'shape': None, 'value': None}, + 'interpolate': { + 'type': 'Interpolate', 'kind': 'op', 'mode': 'nearest', 'shape_calculation_mode': 'sizes', + 'coordinate_transformation_mode': 'half_pixel', 'version': 'opset4', + 'nearest_mode': 'round_prefer_floor', 'antialias': 0, + }, + 'interpolate_data': {'kind': 'data', 'value': None, 'shape': None}, + 'op_output': {'kind': 'op', 'op': 'Result'}, +} + +graph_edges = [ + ('input', 'input_data'), + ('sizes', 'sizes_data'), + ('scales', 'scales_data'), + ('axes', 'axes_data'), + ('input_data', 'interpolate', {'in': 0}), + ('sizes_data', 'interpolate', {'in': 1}), + ('scales_data', 'interpolate', {'in': 2}), + ('axes_data', 'interpolate', {'in': 3}), + ('interpolate', 'interpolate_data'), + ('interpolate_data', 'op_output'), +] + + +@generator +class TestInterpolateOp(unittest.TestCase): + @generate(*[([0], [0], [1, 3, 100, 200], [1, 3, 350, 150], [350, 150], [3.5, 150 / 200], [2, 3]), + ([0, 3, 10, 10], [0], [16, 7, 190, 400], [8, 10, 390, 600], + [8, 390, 600], [0.5, 390 / 200, 600 / 410], [0, 2, 3]), + ([10, 5, 0, 10], [0, 4, 16, 18], [4, 33, 1024, 8000], [56, 42, 520, 8028], + [56, 520], [4.0, 0.5], [0, 2]), + ([0], [0], [1, 16, 85, 470, 690], [20, 16, 40, 470, 1380], + [20, 40, 1380], [20.0, 40.0 / 85.0, 1380.0 / 
690.0], [0, 2, 4]), + ([4, 3, 11, 22, 5], [1, 3, 4, 8, 5], [1, 16, 85, 470, 690], [60, 22, 430, 500, 345], + [60, 430, 345], [10.0, 4.3, 345.0 / 700.0], [0, 2, 4]), + ([0], [0], [5, 77, 444, 88, 6050], [100, 308, 4440, 44, 6050], + [100, 308, 4440, 44], [20.0, 4.0, 10.0, 0.5], [0, 1, 2, 3]), + ([0], [0], [1, 100, 200], [1, 350, 150], [350, 150], [3.5, 150 / 200], [1, 2]), + ([0, 3, 10], [0], [16, 7, 190], [8, 10, 390], [8, 390], [0.5, 390 / 200], [0, 2]), + ([10, 0, 10], [0, 16, 18], [4, 1024, 8000], [56, 520, 8028], [56, 520], [4.0, 0.5], [0, 1]), + ([0], [0], [1, 690], [20, 1380], [20, 1380], [20.0, 1380.0 / 690.0], [0, 1]), + ([4, 3, 11, 22, 5, 0], [1, 3, 4, 8, 5, 0], [1, 16, 85, 470, 690, 349], [60, 22, 430, 500, 345, 349], + [60, 430, 345], [10.0, 4.3, 345.0 / 700.0], [0, 2, 4]) + ]) + def test_interpolate4_using_sizes(self, pads_begin, pads_end, input_shape, output_shape, sizes, scales, axes): + graph = build_graph(nodes_attrs=graph_nodes_attrs, + edges=graph_edges, + update_attributes={ + 'input_data': {'shape': input_shape}, + 'sizes': {'shape': int64_array(sizes).shape, 'value': int64_array(sizes)}, + 'sizes_data': {'shape': int64_array(sizes).shape, 'value': int64_array(sizes)}, + 'scales': {'shape': np.array(scales).shape, 'value': np.array(scales)}, + 'scales_data': {'shape': np.array(scales).shape, 'value': np.array(scales)}, + 'axes': {'shape': int64_array(axes).shape, 'value': int64_array(axes)}, + 'axes_data': {'shape': int64_array(axes).shape, 'value': int64_array(axes)}, + 'interpolate': {'pads_begin': int64_array(pads_begin), + 'pads_end': int64_array(pads_end)} + }) + + node = Node(graph, 'interpolate') + tested_class = Interpolate(graph=graph, attrs=node.attrs()) + tested_class.infer(node) + + msg = "Interpolate-4 infer failed for case: sizes={}, scales={}, pads_begin={}, pads_end={}, axes={}," \ + " expected_shape={}, actual_shape={}" + + self.assertTrue(np.array_equal(graph.node['interpolate_data']['shape'], int64_array(output_shape)), + 
msg.format(sizes, scales, pads_begin, pads_end, axes, output_shape, + graph.node['interpolate_data']['shape'])) + + @generate(*[([0], [0], [1, 3, 100, 200], [1, 3, 350, 150], [350, 150], [3.5, 150 / 200], [2, 3]), + ([0, 3, 10, 10], [0], [16, 7, 190, 400], [8, 10, 390, 600], + [8, 390, 600], [0.5, 390 / 200, 600 / 410], [0, 2, 3]), + ([10, 5, 0, 10], [0, 4, 16, 18], [4, 33, 1024, 8000], [56, 42, 520, 8028], + [56, 520], [4.0, 0.5], [0, 2]), + ([0], [0], [1, 16, 85, 470, 690], [20, 16, 40, 470, 1380], + [20, 40, 1380], [20.0, 40.0 / 85.0, 1380.0 / 690.0], [0, 2, 4]), + ([4, 3, 11, 22, 5], [1, 3, 4, 8, 5], [1, 16, 85, 470, 690], [60, 22, 430, 500, 345], + [60, 430, 345], [10.0, 4.3, 345.0 / 700.0], [0, 2, 4]), + ([0], [0], [5, 77, 444, 88, 6050], [100, 308, 4440, 44, 6050], + [100, 308, 4440, 44], [20.0, 4.0, 10.0, 0.5], [0, 1, 2, 3]), + ([0], [0], [1, 100, 200], [1, 350, 150], [350, 150], [3.5, 150 / 200], [1, 2]), + ([0, 3, 10], [0], [16, 7, 190], [8, 10, 390], [8, 390], [0.5, 390 / 200], [0, 2]), + ([10, 0, 10], [0, 16, 18], [4, 1024, 8000], [56, 520, 8028], [56, 520], [4.0, 0.5], [0, 1]), + ([0], [0], [1, 690], [20, 1380], [20, 1380], [20.0, 1380.0 / 690.0], [0, 1]), + ([4, 3, 11, 22, 5, 0], [1, 3, 4, 8, 5, 0], [1, 16, 85, 470, 690, 349], [60, 22, 430, 500, 345, 349], + [60, 430, 345], [10.0, 4.3, 345.0 / 700.0], [0, 2, 4]), + ([4, 3, 11, 22, 5, 0, 0], [1, 3, 4, 8, 5, 0, 0], [1, 16, 85, 470, 690, 349, 3], + [60, 22, 430, 500, 345, 349, 1], + [60, 430, 345, 1], [10.0, 4.3, 345.0 / 700.0, 1 / 3], [0, 2, 4, 6]), + ([4, 3, 11, 22, 5, 0, 0], [1, 3, 4, 8, 5, 0, 0], [1, 16, 85, 470, 690, 349, 3], + [60, 22, 430, 500, 345, 349, 1], + [60, 430, 345, 1], [10.0, 4.3, 345.0 / 700.0, 0.3333333], [0, 2, 4, 6]), + ]) + def test_interpolate4_using_scales(self, pads_begin, pads_end, input_shape, output_shape, sizes, scales, axes): + graph = build_graph(nodes_attrs=graph_nodes_attrs, + edges=graph_edges, + update_attributes={ + 'input_data': {'shape': input_shape}, + 'sizes': 
{'shape': int64_array(sizes).shape, 'value': int64_array(sizes)}, + 'sizes_data': {'shape': int64_array(sizes).shape, 'value': int64_array(sizes)}, + 'scales': {'shape': np.array(scales).shape, 'value': np.array(scales)}, + 'scales_data': {'shape': np.array(scales).shape, 'value': np.array(scales)}, + 'axes': {'shape': int64_array(axes).shape, 'value': int64_array(axes)}, + 'axes_data': {'shape': int64_array(axes).shape, 'value': int64_array(axes)}, + 'interpolate': {'pads_begin': int64_array(pads_begin), + 'pads_end': int64_array(pads_end), + 'shape_calculation_mode': 'scales'} + }) + + node = Node(graph, 'interpolate') + tested_class = Interpolate(graph=graph, attrs=node.attrs()) + tested_class.infer(node) + + msg = "Interpolate-4 infer failed for case: sizes={}, scales={}, pads_begin={}, pads_end={}, axes={}," \ + " expected_shape={}, actual_shape={}" + + self.assertTrue(np.array_equal(graph.node['interpolate_data']['shape'], int64_array(output_shape)), + msg.format(sizes, scales, pads_begin, pads_end, axes, output_shape, + graph.node['interpolate_data']['shape'])) + + @generate(*[([0], [0], [1, 3, 100, 200], [1, 3, 350, 150], [1, 3, 350, 150], [1.0, 1.0, 3.5, 150 / 200]), + ([0, 3, 10, 10], [0], [16, 7, 190, 400], [8, 10, 390, 600], + [8, 10, 390, 600], [0.5, 1.0, 390 / 200, 600 / 410]), + ([10, 5, 0, 10], [0, 4, 16, 18], [4, 33, 1024, 8000], [56, 42, 520, 8028], + [56, 42, 520, 8028], [4.0, 1.0, 0.5, 1.0]), + ([0], [0], [1, 16, 85, 470, 690], [20, 16, 40, 470, 1380], + [20, 16, 40, 470, 1380], [20.0, 1.0, 40.0 / 85.0, 1.0, 1380.0 / 690.0]), + ([4, 3, 11, 22, 5], [1, 3, 4, 8, 5], [1, 16, 85, 470, 690], [60, 22, 430, 500, 345], + [60, 22, 430, 500, 345], [10.0, 1.0, 4.3, 1.0, 345.0 / 700.0]), + ([0], [0], [5, 77, 444, 88, 6050], [100, 308, 4440, 44, 6050], + [100, 308, 4440, 44, 6050], [20.0, 4.0, 10.0, 0.5, 1.0]), + ([0], [0], [1, 100, 200], [1, 350, 150], [1, 350, 150], [1.0, 3.5, 150 / 200]), + ([0, 3, 10], [0], [16, 7, 190], [8, 10, 390], [8, 10, 390], [0.5, 
1.0, 390 / 200]), + ([10, 0, 10], [0, 16, 18], [4, 1024, 8000], [56, 520, 8028], [56, 520, 8028], [4.0, 0.5, 1.0]), + ([0], [0], [1, 690], [20, 1380], [20, 1380], [20.0, 1380.0 / 690.0]), + ([4, 3, 11, 22, 5, 0], [1, 3, 4, 8, 5, 0], [1, 16, 85, 470, 690, 349], [60, 22, 430, 500, 345, 349], + [60, 22, 430, 500, 345, 349], [10.0, 1.0, 4.3, 1.0, 345.0 / 700.0, 1.0]), + ([4, 3, 11, 22, 5, 0, 0], [1, 3, 4, 8, 5, 0, 0], [1, 16, 85, 470, 690, 349, 3], + [60, 22, 430, 500, 345, 349, 1], + [60, 22, 430, 500, 345, 349, 1], [10.0, 1.0, 4.3, 1.0, 345.0 / 700.0, 1.0, 1 / 3]), + ]) + def test_interpolate4_using_sizes_without_axes(self, pads_begin, pads_end, input_shape, output_shape, sizes, + scales): + graph = build_graph(nodes_attrs=graph_node_attrs_without_axes, + edges=graph_edges_without_axes, + update_attributes={ + 'input_data': {'shape': input_shape}, + 'sizes': {'shape': int64_array(sizes).shape, 'value': int64_array(sizes)}, + 'sizes_data': {'shape': int64_array(sizes).shape, 'value': int64_array(sizes)}, + 'scales': {'shape': np.array(scales).shape, 'value': np.array(scales)}, + 'scales_data': {'shape': np.array(scales).shape, 'value': np.array(scales)}, + 'interpolate': {'pads_begin': int64_array(pads_begin), + 'pads_end': int64_array(pads_end), + 'shape_calculation_mode': 'sizes'} + }) + + node = Node(graph, 'interpolate') + tested_class = Interpolate(graph=graph, attrs=node.attrs()) + tested_class.infer(node) + + msg = "Interpolate-4 infer failed for case: sizes={}, scales={}, pads_begin={}, pads_end={}," \ + " expected_shape={}, actual_shape={}" + + self.assertTrue(np.array_equal(graph.node['interpolate_data']['shape'], int64_array(output_shape)), + msg.format(sizes, scales, pads_begin, pads_end, output_shape, + graph.node['interpolate_data']['shape'])) + + @generate(*[([0], [0], [1, 3, 100, 200], [1, 3, 350, 150], [1, 3, 350, 150], [1.0, 1.0, 3.5, 150 / 200]), + ([0, 3, 10, 10], [0], [16, 7, 190, 400], [8, 10, 390, 600], + [8, 10, 390, 600], [0.5, 1.0, 390 / 200, 
600 / 410]), + ([10, 5, 0, 10], [0, 4, 16, 18], [4, 33, 1024, 8000], [56, 42, 520, 8028], + [56, 42, 520, 8028], [4.0, 1.0, 0.5, 1.0]), + ([0], [0], [1, 16, 85, 470, 690], [20, 16, 40, 470, 1380], + [20, 16, 40, 470, 1380], [20.0, 1.0, 40.0 / 85.0, 1.0, 1380.0 / 690.0]), + ([4, 3, 11, 22, 5], [1, 3, 4, 8, 5], [1, 16, 85, 470, 690], [60, 22, 430, 500, 345], + [60, 22, 430, 500, 345], [10.0, 1.0, 4.3, 1.0, 345.0 / 700.0]), + ([0], [0], [5, 77, 444, 88, 6050], [100, 308, 4440, 44, 6050], + [100, 308, 4440, 44, 6050], [20.0, 4.0, 10.0, 0.5, 1.0]), + ([0], [0], [1, 100, 200], [1, 350, 150], [1, 350, 150], [1.0, 3.5, 150 / 200]), + ([0, 3, 10], [0], [16, 7, 190], [8, 10, 390], [8, 10, 390], [0.5, 1.0, 390 / 200]), + ([10, 0, 10], [0, 16, 18], [4, 1024, 8000], [56, 520, 8028], [56, 520, 8028], [4.0, 0.5, 1.0]), + ([0], [0], [1, 690], [20, 1380], [20, 1380], [20.0, 1380.0 / 690.0]), + ([4, 3, 11, 22, 5, 0], [1, 3, 4, 8, 5, 0], [1, 16, 85, 470, 690, 349], [60, 22, 430, 500, 345, 349], + [60, 22, 430, 500, 345, 349], [10.0, 1.0, 4.3, 1.0, 345.0 / 700.0, 1.0]), + ([4, 3, 11, 22, 5, 0, 0], [1, 3, 4, 8, 5, 0, 0], [1, 16, 85, 470, 690, 349, 3], + [60, 22, 430, 500, 345, 349, 1], + [60, 22, 430, 500, 345, 349, 1], [10.0, 1.0, 4.3, 1.0, 345.0 / 700.0, 1.0, 1 / 3]), + ([4, 3, 11, 22, 5, 0, 0], [1, 3, 4, 8, 5, 0, 0], [1, 16, 85, 470, 690, 349, 3], + [60, 22, 430, 500, 345, 349, 1], + [60, 22, 430, 500, 345, 349, 1], [10.0, 1.0, 4.3, 1.0, 345.0 / 700.0, 1.0, 0.3333333]), + ]) + def test_interpolate4_using_scales_without_axes(self, pads_begin, pads_end, input_shape, output_shape, sizes, + scales): + graph = build_graph(nodes_attrs=graph_node_attrs_without_axes, + edges=graph_edges_without_axes, + update_attributes={ + 'input_data': {'shape': input_shape}, + 'sizes': {'shape': int64_array(sizes).shape, 'value': int64_array(sizes)}, + 'sizes_data': {'shape': int64_array(sizes).shape, 'value': int64_array(sizes)}, + 'scales': {'shape': np.array(scales).shape, 'value': np.array(scales)}, 
+ 'scales_data': {'shape': np.array(scales).shape, 'value': np.array(scales)}, + 'interpolate': {'pads_begin': int64_array(pads_begin), + 'pads_end': int64_array(pads_end), + 'shape_calculation_mode': 'scales'} + }) + + node = Node(graph, 'interpolate') + tested_class = Interpolate(graph=graph, attrs=node.attrs()) + tested_class.infer(node) + + msg = "Interpolate-4 infer failed for case: sizes={}, scales={}, pads_begin={}, pads_end={}," \ + " expected_shape={}, actual_shape={}" + + self.assertTrue(np.array_equal(graph.node['interpolate_data']['shape'], int64_array(output_shape)), + msg.format(sizes, scales, pads_begin, pads_end, output_shape, + graph.node['interpolate_data']['shape'])) From ba86f23dd62e4f99aa11d6d64136e50de881cbb8 Mon Sep 17 00:00:00 2001 From: Katarzyna Mitrus Date: Wed, 9 Sep 2020 15:47:53 +0200 Subject: [PATCH 52/66] Relaxed tolerance for fp16_inception_v1 model (#2129) --- ngraph/python/tests/test_onnx/test_additional_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ngraph/python/tests/test_onnx/test_additional_models.py b/ngraph/python/tests/test_onnx/test_additional_models.py index f8665e80696829..316381360dfb16 100644 --- a/ngraph/python/tests/test_onnx/test_additional_models.py +++ b/ngraph/python/tests/test_onnx/test_additional_models.py @@ -34,6 +34,7 @@ def _get_default_additional_models_dir(): tolerance_map = { "arcface_lresnet100e_opset8": {"atol": 0.001, "rtol": 0.001}, + "fp16_inception_v1": {"atol": 0.001, "rtol": 0.001}, "mobilenet_opset7": {"atol": 0.001, "rtol": 0.001}, "resnet50_v2_opset7": {"atol": 0.001, "rtol": 0.001}, "test_mobilenetv2-1.0": {"atol": 0.001, "rtol": 0.001}, From 13dfcb066fb7d5bf0aae8e85313f7560fc1a9455 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 9 Sep 2020 17:14:20 +0300 Subject: [PATCH 53/66] [IE][TOOLS] compile_tool improvements (#2140) * Remove dead code. * Protect device specific config options with device checks. * Add missing space to precision parsing error message. 
* Allow to switch FP32 input precision to U8. --- inference-engine/tools/compile_tool/main.cpp | 46 ++++++++++---------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/inference-engine/tools/compile_tool/main.cpp b/inference-engine/tools/compile_tool/main.cpp index 788ec7c23958b8..7e591fc4016b6c 100644 --- a/inference-engine/tools/compile_tool/main.cpp +++ b/inference-engine/tools/compile_tool/main.cpp @@ -111,10 +111,6 @@ static bool parseCommandLine(int *argc, char ***argv, InferenceEngine::Core& ie) throw std::invalid_argument("Target device name is required"); } - if (std::string::npos != FLAGS_d.find("MYRIAD")) { - std::vector myriadDeviceIds = ie.GetMetric("MYRIAD", METRIC_KEY(AVAILABLE_DEVICES)); - } - if (1 < *argc) { std::stringstream message; message << "Unknown arguments: "; @@ -148,24 +144,28 @@ static std::map parseConfig(const std::string& configN static std::map configure(const std::string &configFile, const std::string &xmlFileName) { auto config = parseConfig(configFile); - IE_SUPPRESS_DEPRECATED_START + if (std::string::npos != FLAGS_d.find("MYRIAD")) { +IE_SUPPRESS_DEPRECATED_START config[VPU_MYRIAD_CONFIG_KEY(PLATFORM)] = "VPU_MYRIAD_2480"; - IE_SUPPRESS_DEPRECATED_END +IE_SUPPRESS_DEPRECATED_END - if (!FLAGS_VPU_NUMBER_OF_SHAVES.empty()) { - config[InferenceEngine::MYRIAD_NUMBER_OF_SHAVES] = FLAGS_VPU_NUMBER_OF_SHAVES; - } + if (!FLAGS_VPU_NUMBER_OF_SHAVES.empty()) { + config[InferenceEngine::MYRIAD_NUMBER_OF_SHAVES] = FLAGS_VPU_NUMBER_OF_SHAVES; + } - if (!FLAGS_VPU_NUMBER_OF_CMX_SLICES.empty()) { - config[InferenceEngine::MYRIAD_NUMBER_OF_CMX_SLICES] = FLAGS_VPU_NUMBER_OF_CMX_SLICES; - } + if (!FLAGS_VPU_NUMBER_OF_CMX_SLICES.empty()) { + config[InferenceEngine::MYRIAD_NUMBER_OF_CMX_SLICES] = FLAGS_VPU_NUMBER_OF_CMX_SLICES; + } - if (!FLAGS_VPU_TILING_CMX_LIMIT_KB.empty()) { - config[InferenceEngine::MYRIAD_TILING_CMX_LIMIT_KB] = FLAGS_VPU_TILING_CMX_LIMIT_KB; + if (!FLAGS_VPU_TILING_CMX_LIMIT_KB.empty()) { + 
config[InferenceEngine::MYRIAD_TILING_CMX_LIMIT_KB] = FLAGS_VPU_TILING_CMX_LIMIT_KB; + } } - if (!FLAGS_DLA_ARCH_NAME.empty()) { - config["DLIA_ARCH_NAME"] = FLAGS_DLA_ARCH_NAME; + if (std::string::npos != FLAGS_d.find("FPGA")) { + if (!FLAGS_DLA_ARCH_NAME.empty()) { + config["DLIA_ARCH_NAME"] = FLAGS_DLA_ARCH_NAME; + } } return config; @@ -228,7 +228,7 @@ static InferenceEngine::Precision getInputPrecision(const std::string &value) { { "FP16", InferenceEngine::Precision::FP16 }, { "U8", InferenceEngine::Precision::U8 } }; - return getPrecision(value, supported_precisions, "for input layer"); + return getPrecision(value, supported_precisions, " for input layer"); } static InferenceEngine::Precision getOutputPrecision(const std::string &value) { @@ -236,7 +236,7 @@ static InferenceEngine::Precision getOutputPrecision(const std::string &value) { { "FP32", InferenceEngine::Precision::FP32 }, { "FP16", InferenceEngine::Precision::FP16 } }; - return getPrecision(value, supported_precisions, "for output layer"); + return getPrecision(value, supported_precisions, " for output layer"); } static InferenceEngine::Layout getLayout(const std::string &value) { @@ -297,7 +297,7 @@ static void setPrecisions(const InferenceEngine::CNNNetwork &network, const std: if (input != inputs.end()) { const auto input_precision = input->second->getPrecision(); if ((isFloat(input_precision) && isFloat(getInputPrecision(user_precision))) || - (isFP16(input_precision) && isU8(getInputPrecision(user_precision)))) { + (isFloat(input_precision) && isU8(getInputPrecision(user_precision)))) { input->second->setPrecision(getInputPrecision(user_precision)); } } else if (output != outputs.end()) { @@ -311,10 +311,8 @@ static void setPrecisions(const InferenceEngine::CNNNetwork &network, const std: } } -static void setDefaultIOPrecisions(InferenceEngine::CNNNetwork &network, const std::string & device) { - bool isMyriad = FLAGS_d.find("MYRIAD") != std::string::npos; - - if (isMyriad) { +static void 
setDefaultIOPrecisions(InferenceEngine::CNNNetwork &network) { + if (std::string::npos != FLAGS_d.find("MYRIAD")) { const InferenceEngine::Precision fp16 = InferenceEngine::Precision::FP16; for (auto &&layer : network.getInputsInfo()) { @@ -410,7 +408,7 @@ int main(int argc, char *argv[]) { auto network = ie.ReadNetwork(FLAGS_m); - setDefaultIOPrecisions(network, FLAGS_d); + setDefaultIOPrecisions(network); processPrecisions(network, FLAGS_ip, FLAGS_op, FLAGS_iop); processLayout(network, FLAGS_il, FLAGS_ol); From 40fd1858a283ade9c65ef0e99f7df97ddb988a25 Mon Sep 17 00:00:00 2001 From: Mikhail Letavin Date: Wed, 9 Sep 2020 17:21:10 +0300 Subject: [PATCH 54/66] [IE CLDNN] Fix problems with loop iterator and parameter check in clDNN (#2141) --- inference-engine/thirdparty/clDNN/src/gpu/device_info.cpp | 2 +- inference-engine/thirdparty/clDNN/src/scatter_update.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/inference-engine/thirdparty/clDNN/src/gpu/device_info.cpp b/inference-engine/thirdparty/clDNN/src/gpu/device_info.cpp index 8383fdf4eeac97..526609f77ae510 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/device_info.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/device_info.cpp @@ -106,7 +106,7 @@ int driver_dev_id() auto id_itr = result.begin(); while (id_itr != result.end()) { if (std::find(unused_ids.begin(), unused_ids.end(), *id_itr) != unused_ids.end()) - result.erase(id_itr); + id_itr = result.erase(id_itr); else id_itr++; } diff --git a/inference-engine/thirdparty/clDNN/src/scatter_update.cpp b/inference-engine/thirdparty/clDNN/src/scatter_update.cpp index 2c4dca945ccd96..8bc7a9609754ae 100644 --- a/inference-engine/thirdparty/clDNN/src/scatter_update.cpp +++ b/inference-engine/thirdparty/clDNN/src/scatter_update.cpp @@ -69,18 +69,18 @@ layout scatter_update_inst::calc_output_layout(scatter_update_node const& node) output_type = node.get_fused_output_layout().data_type; } + if (static_cast(axis) < 0 || 
static_cast(axis) >= input_number_of_dims) + CLDNN_ERROR_MESSAGE(node.id(), "Incorrect axis value for ScatterUpdate: Axis must be positive and less than the input tensor dimension."); + if (indices_size > static_cast(output_shape.sizes()[axis])) { CLDNN_ERROR_MESSAGE(node.id(), "Undefined behavior ScatterUpdate: indices size must not be larger than the output size along the Axis."); } - + if (nonempty_indices_dims + static_cast(axis) > updates_number_of_dims) { CLDNN_ERROR_MESSAGE(node.id(), "Undefined behavior ScatterUpdate: indices dimention must not be larger than the updates[:Axis] dimentional size."); } - - if (static_cast(axis) < 0 || static_cast(axis) >= input_number_of_dims) - CLDNN_ERROR_MESSAGE(node.id(), "Incorrect axis value for ScatterUpdate: Axis must be positive and less than the input tensor dimension."); return layout{output_type, input_format, output_shape}; } From 53c03db3074f61d34f3622103122978fd5a62d36 Mon Sep 17 00:00:00 2001 From: Anna Likholat Date: Wed, 9 Sep 2020 17:49:23 +0300 Subject: [PATCH 55/66] [JAVA] Code style check added (#1984) --- .github/workflows/code_style.yml | 33 ++- .../java/org/intel/openvino/Blob.java | 4 +- .../java/org/intel/openvino/CNNNetwork.java | 10 +- .../java/org/intel/openvino/Data.java | 2 +- .../java/org/intel/openvino/IECore.java | 12 +- .../java/org/intel/openvino/IEWrapper.java | 2 +- .../java/org/intel/openvino/InferRequest.java | 4 +- .../openvino/InferenceEngineProfileInfo.java | 22 +- .../java/org/intel/openvino/InputInfo.java | 8 +- .../java/org/intel/openvino/Parameter.java | 2 +- .../org/intel/openvino/PreProcessInfo.java | 2 +- .../java/org/intel/openvino/Precision.java | 23 +- .../org/intel/openvino/ResizeAlgorithm.java | 4 +- .../java/org/intel/openvino/StatusCode.java | 17 +- .../java/org/intel/openvino/TensorDesc.java | 10 +- .../java/org/intel/openvino/WaitMode.java | 5 +- .../java/samples/ArgumentParser.java | 8 +- .../ie_bridges/java/samples/README.md | 5 +- 
.../samples/benchmark_app/InferReqWrap.java | 29 +-- .../benchmark_app/InferRequestsQueue.java | 24 +- .../java/samples/benchmark_app/Main.java | 194 ++++++++------- .../face_detection_java_sample/Main.java | 47 ++-- .../face_detection_sample_async/Main.java | 222 +++++++++--------- .../ie_bridges/java/tests/BlobTests.java | 5 +- .../java/tests/CNNNetworkTests.java | 9 +- .../ie_bridges/java/tests/IECoreTests.java | 12 +- .../ie_bridges/java/tests/IETest.java | 45 ++-- .../java/tests/InferRequestTests.java | 39 +-- .../ie_bridges/java/tests/InputInfoTests.java | 7 +- .../java/tests/OpenVinoTestRunner.java | 5 +- .../ie_bridges/java/tests/TestsSuite.java | 55 +++-- 31 files changed, 478 insertions(+), 388 deletions(-) diff --git a/.github/workflows/code_style.yml b/.github/workflows/code_style.yml index f3835c269650b9..d05782ec334898 100644 --- a/.github/workflows/code_style.yml +++ b/.github/workflows/code_style.yml @@ -31,10 +31,37 @@ jobs: if: failure() run: | ngraph/maint/apply-code-format.sh - git diff >code_style_diff.patch + git diff >ngraph_code_style_diff.patch - uses: actions/upload-artifact@v2 if: failure() with: - name: code_style_diff - path: code_style_diff.patch + name: ngraph_code_style_diff + path: ngraph_code_style_diff.patch + + Java: + runs-on: ubuntu-18.04 + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-java@v1 + with: + java-version: '11' + + - name: Install dependencies + run: | + wget -nc https://github.com/google/google-java-format/releases/download/google-java-format-1.9/google-java-format-1.9-all-deps.jar + + - name: Check code style + run: | + java -jar google-java-format-1.9-all-deps.jar --set-exit-if-changed -a -i $(find . 
-type f -name "*.java") + + - name: Create code style diff + if: failure() + run: | + git diff >java_code_style_diff.patch + + - uses: actions/upload-artifact@v2 + if: failure() + with: + name: java_code_style_diff + path: java_code_style_diff.patch diff --git a/inference-engine/ie_bridges/java/org/intel/openvino/Blob.java b/inference-engine/ie_bridges/java/org/intel/openvino/Blob.java index 054cfc77335874..4090f7d7de2512 100644 --- a/inference-engine/ie_bridges/java/org/intel/openvino/Blob.java +++ b/inference-engine/ie_bridges/java/org/intel/openvino/Blob.java @@ -11,7 +11,7 @@ public Blob(TensorDesc tensorDesc) { } public Blob(TensorDesc tensorDesc, byte[] data) { - super(BlobByte(tensorDesc.getNativeObjAddr(), data)) ; + super(BlobByte(tensorDesc.getNativeObjAddr(), data)); } public Blob(TensorDesc tensorDesc, float[] data) { @@ -22,7 +22,7 @@ public Blob(TensorDesc tensorDesc, long cArray) { super(BlobCArray(tensorDesc.nativeObj, cArray)); } - public TensorDesc getTensorDesc(){ + public TensorDesc getTensorDesc() { return new TensorDesc(GetTensorDesc(nativeObj)); } diff --git a/inference-engine/ie_bridges/java/org/intel/openvino/CNNNetwork.java b/inference-engine/ie_bridges/java/org/intel/openvino/CNNNetwork.java index 85c6c09db4224f..a879aeea92fc13 100644 --- a/inference-engine/ie_bridges/java/org/intel/openvino/CNNNetwork.java +++ b/inference-engine/ie_bridges/java/org/intel/openvino/CNNNetwork.java @@ -8,11 +8,11 @@ protected CNNNetwork(long addr) { super(addr); } - public String getName(){ + public String getName() { return getName(nativeObj); } - public int getBatchSize(){ + public int getBatchSize() { return getBatchSize(nativeObj); } @@ -20,7 +20,7 @@ public Map getOutputsInfo() { return GetOutputsInfo(nativeObj); } - public Map getInputsInfo(){ + public Map getInputsInfo() { return GetInputsInfo(nativeObj); } @@ -28,7 +28,7 @@ public void reshape(Map inputShapes) { reshape(nativeObj, inputShapes); } - public Map getInputShapes(){ + public Map 
getInputShapes() { return getInputShapes(nativeObj); } @@ -46,7 +46,7 @@ public void addOutput(String layerName) { private static native int getBatchSize(long addr); private static native Map GetInputsInfo(long addr); - + private static native Map GetOutputsInfo(long addr); private static native void reshape(long addr, Map inputShapes); diff --git a/inference-engine/ie_bridges/java/org/intel/openvino/Data.java b/inference-engine/ie_bridges/java/org/intel/openvino/Data.java index 384ccce3c5c7e2..22a382143f402b 100644 --- a/inference-engine/ie_bridges/java/org/intel/openvino/Data.java +++ b/inference-engine/ie_bridges/java/org/intel/openvino/Data.java @@ -1,6 +1,6 @@ package org.intel.openvino; -public class Data extends IEWrapper{ +public class Data extends IEWrapper { protected Data(long addr) { super(addr); diff --git a/inference-engine/ie_bridges/java/org/intel/openvino/IECore.java b/inference-engine/ie_bridges/java/org/intel/openvino/IECore.java index a6c07d0922a498..7530458bb67a9b 100644 --- a/inference-engine/ie_bridges/java/org/intel/openvino/IECore.java +++ b/inference-engine/ie_bridges/java/org/intel/openvino/IECore.java @@ -25,8 +25,10 @@ public ExecutableNetwork LoadNetwork(CNNNetwork net, final String device) { return new ExecutableNetwork(LoadNetwork(nativeObj, net.getNativeObjAddr(), device)); } - public ExecutableNetwork LoadNetwork(CNNNetwork net, final String device, final Map config) { - return new ExecutableNetwork(LoadNetwork1(nativeObj, net.getNativeObjAddr(), device, config)); + public ExecutableNetwork LoadNetwork( + CNNNetwork net, final String device, final Map config) { + long network = LoadNetwork1(nativeObj, net.getNativeObjAddr(), device, config); + return new ExecutableNetwork(network); } public void RegisterPlugin(String pluginName, String deviceName) { @@ -64,11 +66,13 @@ public Parameter GetConfig(String deviceName, String name) { /*----------------------------------- native methods -----------------------------------*/ private 
static native long ReadNetwork(long core, final String modelFileName); - private static native long ReadNetwork1(long core, final String modelPath, final String weightPath); + private static native long ReadNetwork1( + long core, final String modelPath, final String weightPath); private static native long LoadNetwork(long core, long net, final String device); - private static native long LoadNetwork1(long core, long net, final String device, final Map config); + private static native long LoadNetwork1( + long core, long net, final String device, final Map config); private static native void RegisterPlugin(long core, String pluginName, String deviceName); diff --git a/inference-engine/ie_bridges/java/org/intel/openvino/IEWrapper.java b/inference-engine/ie_bridges/java/org/intel/openvino/IEWrapper.java index 33652d1bd2c5ce..0b3f650c8b261f 100644 --- a/inference-engine/ie_bridges/java/org/intel/openvino/IEWrapper.java +++ b/inference-engine/ie_bridges/java/org/intel/openvino/IEWrapper.java @@ -3,7 +3,7 @@ public class IEWrapper { protected final long nativeObj; - protected IEWrapper(long addr){ + protected IEWrapper(long addr) { nativeObj = addr; } diff --git a/inference-engine/ie_bridges/java/org/intel/openvino/InferRequest.java b/inference-engine/ie_bridges/java/org/intel/openvino/InferRequest.java index 46c99d9d3ed04d..050408066f466d 100644 --- a/inference-engine/ie_bridges/java/org/intel/openvino/InferRequest.java +++ b/inference-engine/ie_bridges/java/org/intel/openvino/InferRequest.java @@ -28,13 +28,13 @@ public StatusCode Wait(WaitMode waitMode) { return StatusCode.valueOf(Wait(nativeObj, waitMode.getValue())); } - public void SetCompletionCallback(Runnable runnable){ + public void SetCompletionCallback(Runnable runnable) { SetCompletionCallback(nativeObj, runnable); } public Map GetPerformanceCounts() { return GetPerformanceCounts(nativeObj); - } + } /*----------------------------------- native methods -----------------------------------*/ private static 
native void Infer(long addr); diff --git a/inference-engine/ie_bridges/java/org/intel/openvino/InferenceEngineProfileInfo.java b/inference-engine/ie_bridges/java/org/intel/openvino/InferenceEngineProfileInfo.java index c7bc86ceb4aa41..86053cd360ce14 100644 --- a/inference-engine/ie_bridges/java/org/intel/openvino/InferenceEngineProfileInfo.java +++ b/inference-engine/ie_bridges/java/org/intel/openvino/InferenceEngineProfileInfo.java @@ -5,27 +5,27 @@ public class InferenceEngineProfileInfo { public enum LayerStatus { - NOT_RUN(0), - OPTIMIZED_OUT(1), + NOT_RUN(0), + OPTIMIZED_OUT(1), EXECUTED(2); - + private int value; private static Map map = new HashMap(); - + static { for (LayerStatus layerStatus : LayerStatus.values()) { map.put(layerStatus.value, layerStatus); } } - + LayerStatus(int value) { this.value = value; } - + int getValue() { return value; } - + static LayerStatus valueOf(int value) { return map.get(value); } @@ -38,7 +38,13 @@ static LayerStatus valueOf(int value) { public String layerType; public int executionIndex; - public InferenceEngineProfileInfo(LayerStatus status, long realTimeUSec, long cpuUSec, String execType, String layerType, int executionIndex) { + public InferenceEngineProfileInfo( + LayerStatus status, + long realTimeUSec, + long cpuUSec, + String execType, + String layerType, + int executionIndex) { this.status = status; this.realTimeUSec = realTimeUSec; this.cpuUSec = cpuUSec; diff --git a/inference-engine/ie_bridges/java/org/intel/openvino/InputInfo.java b/inference-engine/ie_bridges/java/org/intel/openvino/InputInfo.java index 457994726db14b..1eb68f43acb551 100644 --- a/inference-engine/ie_bridges/java/org/intel/openvino/InputInfo.java +++ b/inference-engine/ie_bridges/java/org/intel/openvino/InputInfo.java @@ -1,6 +1,6 @@ package org.intel.openvino; -public class InputInfo extends IEWrapper{ +public class InputInfo extends IEWrapper { public InputInfo(long addr) { super(addr); @@ -14,7 +14,7 @@ public void setLayout(Layout 
layout) { SetLayout(nativeObj, layout.getValue()); } - public Layout getLayout(){ + public Layout getLayout() { return Layout.valueOf(getLayout(nativeObj)); } @@ -22,11 +22,11 @@ public void setPrecision(Precision precision) { SetPrecision(nativeObj, precision.getValue()); } - public Precision getPrecision(){ + public Precision getPrecision() { return Precision.valueOf(getPrecision(nativeObj)); } - public TensorDesc getTensorDesc(){ + public TensorDesc getTensorDesc() { return new TensorDesc(GetTensorDesc(nativeObj)); } diff --git a/inference-engine/ie_bridges/java/org/intel/openvino/Parameter.java b/inference-engine/ie_bridges/java/org/intel/openvino/Parameter.java index a0e8cad8ca6457..855a580b3da9cc 100644 --- a/inference-engine/ie_bridges/java/org/intel/openvino/Parameter.java +++ b/inference-engine/ie_bridges/java/org/intel/openvino/Parameter.java @@ -20,4 +20,4 @@ public String asString() { @Override protected native void delete(long nativeObj); -} \ No newline at end of file +} diff --git a/inference-engine/ie_bridges/java/org/intel/openvino/PreProcessInfo.java b/inference-engine/ie_bridges/java/org/intel/openvino/PreProcessInfo.java index a236885bc9162e..ef0fbea3089607 100644 --- a/inference-engine/ie_bridges/java/org/intel/openvino/PreProcessInfo.java +++ b/inference-engine/ie_bridges/java/org/intel/openvino/PreProcessInfo.java @@ -1,6 +1,6 @@ package org.intel.openvino; -public class PreProcessInfo extends IEWrapper{ +public class PreProcessInfo extends IEWrapper { public PreProcessInfo(long addr) { super(addr); diff --git a/inference-engine/ie_bridges/java/org/intel/openvino/Precision.java b/inference-engine/ie_bridges/java/org/intel/openvino/Precision.java index a6eff703c7f1af..523d90f241c606 100644 --- a/inference-engine/ie_bridges/java/org/intel/openvino/Precision.java +++ b/inference-engine/ie_bridges/java/org/intel/openvino/Precision.java @@ -5,17 +5,17 @@ public enum Precision { UNSPECIFIED(255), - MIXED(0), - FP32(10), - FP16(11), - Q78(20), - 
I16(30), - U8(40), - I8(50), - U16(60), - I32(70), - I64(72), - BIN(71), + MIXED(0), + FP32(10), + FP16(11), + Q78(20), + I16(30), + U8(40), + I8(50), + U16(60), + I32(70), + I64(72), + BIN(71), CUSTOM(80); private int value; @@ -39,4 +39,3 @@ static Precision valueOf(int value) { return map.get(value); } } - \ No newline at end of file diff --git a/inference-engine/ie_bridges/java/org/intel/openvino/ResizeAlgorithm.java b/inference-engine/ie_bridges/java/org/intel/openvino/ResizeAlgorithm.java index 3e037ac144da96..c0c4a7ebaa80e1 100644 --- a/inference-engine/ie_bridges/java/org/intel/openvino/ResizeAlgorithm.java +++ b/inference-engine/ie_bridges/java/org/intel/openvino/ResizeAlgorithm.java @@ -1,7 +1,9 @@ package org.intel.openvino; public enum ResizeAlgorithm { - NO_RESIZE(0), RESIZE_BILINEAR(1), RESIZE_AREA(2); + NO_RESIZE(0), + RESIZE_BILINEAR(1), + RESIZE_AREA(2); private int value; diff --git a/inference-engine/ie_bridges/java/org/intel/openvino/StatusCode.java b/inference-engine/ie_bridges/java/org/intel/openvino/StatusCode.java index 5ba8b435789d2c..a7e13a5b858b83 100644 --- a/inference-engine/ie_bridges/java/org/intel/openvino/StatusCode.java +++ b/inference-engine/ie_bridges/java/org/intel/openvino/StatusCode.java @@ -1,12 +1,21 @@ package org.intel.openvino; -import java.util.Map; import java.util.HashMap; +import java.util.Map; public enum StatusCode { - OK(0), GENERAL_ERROR(-1), NOT_IMPLEMENTED(-2), NETWORK_NOT_LOADED(-3), - PARAMETER_MISMATCH(-4), NOT_FOUND(-5), OUT_OF_BOUNDS(-6), UNEXPECTED(-7), - REQUEST_BUSY(-8), RESULT_NOT_READY(-9), NOT_ALLOCATED(-10), INFER_NOT_STARTED(-11), + OK(0), + GENERAL_ERROR(-1), + NOT_IMPLEMENTED(-2), + NETWORK_NOT_LOADED(-3), + PARAMETER_MISMATCH(-4), + NOT_FOUND(-5), + OUT_OF_BOUNDS(-6), + UNEXPECTED(-7), + REQUEST_BUSY(-8), + RESULT_NOT_READY(-9), + NOT_ALLOCATED(-10), + INFER_NOT_STARTED(-11), NETWORK_NOT_READ(-12); private int value; diff --git a/inference-engine/ie_bridges/java/org/intel/openvino/TensorDesc.java 
b/inference-engine/ie_bridges/java/org/intel/openvino/TensorDesc.java index 24da3a4c13f177..4fefeb3dcbb793 100644 --- a/inference-engine/ie_bridges/java/org/intel/openvino/TensorDesc.java +++ b/inference-engine/ie_bridges/java/org/intel/openvino/TensorDesc.java @@ -1,11 +1,9 @@ package org.intel.openvino; -import java.util.concurrent.BlockingDeque; - public class TensorDesc extends IEWrapper { - public TensorDesc(long addr){ - super(addr); + public TensorDesc(long addr) { + super(addr); } public TensorDesc(Precision precision, int[] dims, Layout layout) { @@ -16,11 +14,11 @@ public int[] getDims() { return GetDims(nativeObj); } - public Layout getLayout(){ + public Layout getLayout() { return Layout.valueOf(getLayout(nativeObj)); } - public Precision getPrecision(){ + public Precision getPrecision() { return Precision.valueOf(getPrecision(nativeObj)); } diff --git a/inference-engine/ie_bridges/java/org/intel/openvino/WaitMode.java b/inference-engine/ie_bridges/java/org/intel/openvino/WaitMode.java index 8daf17cc0f93c6..07a94173b06060 100644 --- a/inference-engine/ie_bridges/java/org/intel/openvino/WaitMode.java +++ b/inference-engine/ie_bridges/java/org/intel/openvino/WaitMode.java @@ -1,7 +1,8 @@ package org.intel.openvino; public enum WaitMode { - RESULT_READY(-1), STATUS_ONLY(0); + RESULT_READY(-1), + STATUS_ONLY(0); private int value; @@ -12,4 +13,4 @@ private WaitMode(int value) { public int getValue() { return value; } -} \ No newline at end of file +} diff --git a/inference-engine/ie_bridges/java/samples/ArgumentParser.java b/inference-engine/ie_bridges/java/samples/ArgumentParser.java index 9d48f854a14ce9..d6bfa78782452f 100644 --- a/inference-engine/ie_bridges/java/samples/ArgumentParser.java +++ b/inference-engine/ie_bridges/java/samples/ArgumentParser.java @@ -1,5 +1,5 @@ -import java.util.Map; import java.util.HashMap; +import java.util.Map; public class ArgumentParser { private Map input; @@ -25,8 +25,8 @@ private void printHelp() { } public void 
parseArgs(String[] args) { - try{ - for(int i = 0; i < args.length; i++) { + try { + for (int i = 0; i < args.length; i++) { String arg = args[i]; if (arg.equals("--help") | arg.equals("-h")) { printHelp(); @@ -40,7 +40,7 @@ public void parseArgs(String[] args) { } } } - } catch(ArrayIndexOutOfBoundsException e) { + } catch (ArrayIndexOutOfBoundsException e) { System.out.println("Error: Incorrect number of arguments"); System.exit(0); } diff --git a/inference-engine/ie_bridges/java/samples/README.md b/inference-engine/ie_bridges/java/samples/README.md index d1f276256f6d45..6cdb661c243897 100644 --- a/inference-engine/ie_bridges/java/samples/README.md +++ b/inference-engine/ie_bridges/java/samples/README.md @@ -61,10 +61,7 @@ https://download.01.org/opencv/2019/open_model_zoo/R1/models_bin/face-detection- ## Build and run -Build and run steps are similar to ```benchmark_app```, but you need to add OpenCV path. - -### Build -Add an environment variable with OpenCV installation or build path: +Build and run steps are similar to ```benchmark_app```, but you need to add an environment variable with OpenCV installation or build path before building: ```bash export OpenCV_DIR=/path/to/opencv/ ``` diff --git a/inference-engine/ie_bridges/java/samples/benchmark_app/InferReqWrap.java b/inference-engine/ie_bridges/java/samples/benchmark_app/InferReqWrap.java index 4e73bb0a09b069..194299bfd99044 100644 --- a/inference-engine/ie_bridges/java/samples/benchmark_app/InferReqWrap.java +++ b/inference-engine/ie_bridges/java/samples/benchmark_app/InferReqWrap.java @@ -1,21 +1,22 @@ -import java.util.Map; - import org.intel.openvino.*; +import java.util.Map; + public class InferReqWrap { - public InferReqWrap(ExecutableNetwork net, int id, InferRequestsQueue irQueue) { - request = net.CreateInferRequest(); + public InferReqWrap(ExecutableNetwork net, int id, InferRequestsQueue irQueue) { + request = net.CreateInferRequest(); this.id = id; this.irQueue = irQueue; - 
request.SetCompletionCallback(new Runnable() { - - @Override - public void run() { - endTime = System.nanoTime(); - irQueue.putIdleRequest(id, getExecutionTimeInMilliseconds()); - } - }); + request.SetCompletionCallback( + new Runnable() { + + @Override + public void run() { + endTime = System.nanoTime(); + irQueue.putIdleRequest(id, getExecutionTimeInMilliseconds()); + } + }); } void startAsync() { @@ -43,9 +44,9 @@ Blob getBlob(String name) { } double getExecutionTimeInMilliseconds() { - return (double)(endTime - startTime) * 1e-6; + return (double) (endTime - startTime) * 1e-6; } - + InferRequest request; private InferRequestsQueue irQueue; private long startTime; diff --git a/inference-engine/ie_bridges/java/samples/benchmark_app/InferRequestsQueue.java b/inference-engine/ie_bridges/java/samples/benchmark_app/InferRequestsQueue.java index 5b531e90a64d21..aec27472d3b35b 100644 --- a/inference-engine/ie_bridges/java/samples/benchmark_app/InferRequestsQueue.java +++ b/inference-engine/ie_bridges/java/samples/benchmark_app/InferRequestsQueue.java @@ -1,9 +1,9 @@ +import org.intel.openvino.*; + import java.util.Vector; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; -import org.intel.openvino.*; - public class InferRequestsQueue { public InferRequestsQueue(ExecutableNetwork net, int nireq) { for (int id = 0; id < nireq; id++) { @@ -12,17 +12,17 @@ public InferRequestsQueue(ExecutableNetwork net, int nireq) { } resetTimes(); } - + void resetTimes() { startTime = Long.MAX_VALUE; endTime = Long.MIN_VALUE; latencies.clear(); } - + double getDurationInMilliseconds() { - return (double)(endTime - startTime) * 1e-6; + return (double) (endTime - startTime) * 1e-6; } - + void putIdleRequest(int id, double latency) { latencies.add(latency); idleIds.add(id); @@ -32,7 +32,7 @@ void putIdleRequest(int id, double latency) { foo.notify(); } } - + InferReqWrap getIdleRequest() { try { InferReqWrap request = requests.get(idleIds.take()); 
@@ -43,11 +43,11 @@ InferReqWrap getIdleRequest() { } return null; } - + void waitAll() { synchronized (foo) { try { - while(idleIds.size() != requests.size()) { + while (idleIds.size() != requests.size()) { foo.wait(); } } catch (InterruptedException e) { @@ -55,16 +55,16 @@ void waitAll() { } } } - + Vector getLatencies() { return latencies; } - + Vector requests = new Vector(); private BlockingQueue idleIds = new LinkedBlockingQueue(); private long startTime; private long endTime; - Vector latencies = new Vector(); + Vector latencies = new Vector(); Object foo = new Object(); } diff --git a/inference-engine/ie_bridges/java/samples/benchmark_app/Main.java b/inference-engine/ie_bridges/java/samples/benchmark_app/Main.java index e0eefee90d8730..860b4baba12366 100644 --- a/inference-engine/ie_bridges/java/samples/benchmark_app/Main.java +++ b/inference-engine/ie_bridges/java/samples/benchmark_app/Main.java @@ -1,28 +1,26 @@ -import java.util.Map; -import java.util.Vector; - -import javax.management.RuntimeErrorException; +import org.intel.openvino.*; -import java.util.Random; -import java.util.HashMap; -import java.util.LinkedList; import java.util.ArrayList; - import java.util.Arrays; - -import org.intel.openvino.*; +import java.util.HashMap; +import java.util.Map; +import java.util.Random; +import java.util.Vector; public class Main { - static boolean adjustShapesBatch(Map shapes, int batchSize, Map inputInfo) { + static boolean adjustShapesBatch( + Map shapes, int batchSize, Map inputInfo) { boolean updated = false; for (Map.Entry entry : inputInfo.entrySet()) { Layout layout = entry.getValue().getTensorDesc().getLayout(); int batchIndex = -1; - if ((layout == Layout.NCHW) || (layout == Layout.NCDHW) || - (layout == Layout.NHWC) || (layout == Layout.NDHWC) || - (layout == Layout.NC)) { + if ((layout == Layout.NCHW) + || (layout == Layout.NCDHW) + || (layout == Layout.NHWC) + || (layout == Layout.NDHWC) + || (layout == Layout.NC)) { batchIndex = 0; } else if 
(layout == Layout.CN) { batchIndex = 1; @@ -35,28 +33,42 @@ static boolean adjustShapesBatch(Map shapes, int batchSize, Map device_config, String device, int nstreams, boolean isAsync) { + static String setThroughputStreams( + IECore core, + Map device_config, + String device, + int nstreams, + boolean isAsync) { String key = device + "_THROUGHPUT_STREAMS"; if (nstreams > 0) { device_config.put(key, Integer.toString(nstreams)); } else if (!device_config.containsKey(key) && isAsync) { - System.err.println("[ WARNING ] -nstreams default value is determined automatically for " + device + " device. " + - "Although the automatic selection usually provides a reasonable performance," + - "but it still may be non-optimal for some cases, for more information look at README."); + System.err.println( + "[ WARNING ] -nstreams default value is determined automatically for " + + device + + " device. Although the automatic selection usually provides a" + + " reasonable performance,but it still may be non-optimal for some" + + " cases, for more information look at README."); device_config.put(key, device + "_THROUGHPUT_AUTO"); } return device_config.get(key); - }; + } static void fillBlobs(Vector requests, Map inputsInfo) { for (Map.Entry entry : inputsInfo.entrySet()) { String inputName = entry.getKey(); TensorDesc tDesc = entry.getValue().getTensorDesc(); - System.err.print("[ INFO ] Network input '" + inputName + "' precision " + tDesc.getPrecision() - + ", dimensions (" + tDesc.getLayout() + "): "); - for (int dim : tDesc.getDims()) - System.err.print(dim + " "); + System.err.print( + "[ INFO ] Network input '" + + inputName + + "' precision " + + tDesc.getPrecision() + + ", dimensions (" + + tDesc.getLayout() + + "): "); + + for (int dim : tDesc.getDims()) System.err.print(dim + " "); System.err.println(); } @@ -66,7 +78,7 @@ static void fillBlobs(Vector requests, Map inpu String inputName = entry.getKey(); TensorDesc tDesc = entry.getValue().getTensorDesc(); 
request.SetBlob(inputName, blobRandomByte(tDesc)); - } + } } } @@ -74,7 +86,7 @@ static Blob blobRandomByte(TensorDesc tDesc) { int dims[] = tDesc.getDims(); int size = 1; - for(int i = 0; i < dims.length; i++) { + for (int i = 0; i < dims.length; i++) { size *= dims[i]; } @@ -87,53 +99,59 @@ static Blob blobRandomByte(TensorDesc tDesc) { static double getMedianValue(Vector vec) { Object[] objArr = vec.toArray(); - Double[] arr = Arrays.copyOf(objArr, objArr.length, Double[].class); + Double[] arr = Arrays.copyOf(objArr, objArr.length, Double[].class); Arrays.sort(arr); if (arr.length % 2 == 0) - return ((double)arr[arr.length / 2] + (double)arr[arr.length / 2 - 1]) / 2; - else - return (double)arr[arr.length / 2]; + return ((double) arr[arr.length / 2] + (double) arr[arr.length / 2 - 1]) / 2; + else return (double) arr[arr.length / 2]; } static boolean getApiBoolean(String api) throws RuntimeException { - if(api.equals("sync")) - return false; - else if(api.equals("async")) - return true; + if (api.equals("sync")) return false; + else if (api.equals("async")) return true; else throw new RuntimeException("Incorrect argument: '-api'"); } static int step = 0; - static void nextStep(String stepInfo) { + + static void nextStep(String stepInfo) { step += 1; System.out.println("[Step " + step + "/11] " + stepInfo); } static int deviceDefaultDeviceDurationInSeconds(String device) { - final Map deviceDefaultDurationInSeconds = new HashMap() {{ - put("CPU", 60 ); - put("GPU", 60 ); - put("VPU", 60 ); - put("MYRIAD", 60 ); - put("HDDL", 60 ); - put("FPGA", 120); - put("UNKNOWN", 120); - }}; + final Map deviceDefaultDurationInSeconds = + new HashMap() { + { + put("CPU", 60); + put("GPU", 60); + put("VPU", 60); + put("MYRIAD", 60); + put("HDDL", 60); + put("FPGA", 120); + put("UNKNOWN", 120); + } + }; Integer duration = deviceDefaultDurationInSeconds.get(device); if (duration == null) { duration = deviceDefaultDurationInSeconds.get("UNKNOWN"); - System.err.println("[ WARNING ] 
Default duration " + duration + " seconds for unknown device '" + device + "' is used"); + System.err.println( + "[ WARNING ] Default duration " + + duration + + " seconds for unknown device '" + + device + + "' is used"); } return duration; } static long getTotalMsTime(long startTimeMilliSec) { return (System.currentTimeMillis() - startTimeMilliSec); - }; + } static long getDurationInMilliseconds(int seconds) { return seconds * 1000L; @@ -147,7 +165,7 @@ public static void main(String[] args) { System.exit(1); } - // ----------------- 1. Parsing and validating input arguments --------------------------------------------- + // ----------------- 1. Parsing and validating input arguments ----------------- nextStep("Parsing and validating input arguments"); ArgumentParser parser = new ArgumentParser("This is benchmarking application"); @@ -169,39 +187,38 @@ public static void main(String[] args) { int batchSize = parser.getInteger("-b", 0); int nthreads = parser.getInteger("-nthreads", 0); int nstreams = parser.getInteger("-nstreams", 0); - int timeLimit = parser.getInteger("-t",0); + int timeLimit = parser.getInteger("-t", 0); String api = parser.get("-api", "async"); boolean isAsync; - try{ + try { isAsync = getApiBoolean(api); - } catch(RuntimeException e) { + } catch (RuntimeException e) { System.out.println(e.getMessage()); return; } - if(xmlPath == null) { + if (xmlPath == null) { System.out.println("Error: Missed argument: -m"); return; } - // ----------------- 2. Loading the Inference Engine -------------------------------------------------------- + // ----------------- 2. Loading the Inference Engine -------------------------- nextStep("Loading the Inference Engine"); IECore core = new IECore(); - // ----------------- 3. Setting device configuration -------------------------------------------------------- + // ----------------- 3. 
Setting device configuration -------------------------- nextStep("Setting device configuration"); Map device_config = new HashMap<>(); - if (device.equals("CPU")) { // CPU supports few special performance-oriented keys + if (device.equals("CPU")) { // CPU supports few special performance-oriented keys // limit threading for CPU portion of inference - if (nthreads > 0) - device_config.put("CPU_THREADS_NUM", Integer.toString(nthreads)); + if (nthreads > 0) device_config.put("CPU_THREADS_NUM", Integer.toString(nthreads)); if (!device_config.containsKey("CPU_BIND_THREAD")) { - device_config.put("CPU_BIND_THREAD", "YES"); + device_config.put("CPU_BIND_THREAD", "YES"); } // for CPU execution, more throughput-oriented execution via streams @@ -212,15 +229,14 @@ public static void main(String[] args) { } else if (device.equals("MYRIAD")) { device_config.put("LOG_LEVEL", "LOG_WARNING"); } else if (device.equals("GNA")) { - device_config.put("GNA_PRECISION", "I16"); + device_config.put("GNA_PRECISION", "I16"); - if (nthreads > 0) - device_config.put("GNA_LIB_N_THREADS", Integer.toString(nthreads)); + if (nthreads > 0) device_config.put("GNA_LIB_N_THREADS", Integer.toString(nthreads)); } core.SetConfig(device_config, device); - // ----------------- 4. Reading the Intermediate Representation network ------------------------------------- + // ----------- 4. Reading the Intermediate Representation network ------------- nextStep("Reading the Intermediate Representation network"); long startTime = System.currentTimeMillis(); @@ -233,14 +249,14 @@ public static void main(String[] args) { String inputName = new ArrayList(inputsInfo.keySet()).get(0); InputInfo inputInfo = inputsInfo.get(inputName); - // ----------------- 5. Resizing network to match image sizes and given batch ------------------------------- + // ----- 5. 
Resizing network to match image sizes and given batch -------------- nextStep("Resizing network to match image sizes and given batch"); int inputBatchSize = batchSize; batchSize = net.getBatchSize(); Map shapes = net.getInputShapes(); - + if ((inputBatchSize != 0) && (batchSize != inputBatchSize)) { adjustShapesBatch(shapes, batchSize, inputsInfo); @@ -252,15 +268,19 @@ public static void main(String[] args) { System.err.println("[ INFO ] Reshape network took " + durationMs + " ms"); } - System.err.println((inputBatchSize != 0 ? "[ INFO ] Network batch size was changed to: " : "[ INFO ] Network batch size: ") + batchSize); + System.err.println( + (inputBatchSize != 0 + ? "[ INFO ] Network batch size was changed to: " + : "[ INFO ] Network batch size: ") + + batchSize); - // ----------------- 6. Configuring input ------------------------------------------------------------------- + // ----------------- 6. Configuring input ------------------------------------- nextStep("Configuring input"); inputInfo.getPreProcess().setResizeAlgorithm(ResizeAlgorithm.RESIZE_BILINEAR); inputInfo.setPrecision(Precision.U8); - // ----------------- 7. Loading the model to the device ----------------------------------------------------- + // ----------------- 7. Loading the model to the device ----------------------- nextStep("Loading the model to the device"); startTime = System.currentTimeMillis(); @@ -269,11 +289,12 @@ public static void main(String[] args) { System.err.println("[ INFO ] Load network took " + durationMs + " ms"); - // ----------------- 8. Setting optimal runtime parameters -------------------------------------------------- + // ---------------- 8. 
Setting optimal runtime parameters --------------------- nextStep("Setting optimal runtime parameters"); // Update number of streams - nstreams = Integer.parseInt(core.GetConfig(device, device + "_THROUGHPUT_STREAMS").asString()); + String nStr = core.GetConfig(device, device + "_THROUGHPUT_STREAMS").asString(); + nstreams = Integer.parseInt(nStr); // Number of requests if (nireq == 0) { @@ -289,8 +310,12 @@ public static void main(String[] args) { int temp = niter; niter = ((niter + nireq - 1) / nireq) * nireq; if (temp != niter) { - System.err.println("[ INFO ] Number of iterations was aligned by request number from " + - temp + " to " + niter + " using number of requests " + nireq); + System.err.println( + "[ INFO ] Number of iterations was aligned by request number from " + + " to " + + niter + + " using number of requests " + + nireq); } } @@ -304,14 +329,14 @@ public static void main(String[] args) { durationSeconds = deviceDefaultDeviceDurationInSeconds(device); } durationMs = getDurationInMilliseconds(durationSeconds); - - // ----------------- 9. Creating infer requests and filling input blobs ------------------------------------- + + // ---------- 9. Creating infer requests and filling input blobs --------------- nextStep("Creating infer requests and filling input blobs"); InferRequestsQueue inferRequestsQueue = new InferRequestsQueue(executableNetwork, nireq); fillBlobs(inferRequestsQueue.requests, inputsInfo); - // ----------------- 10. Measuring performance -------------------------------------------------------------- + // ---------- 10. 
Measuring performance ---------------------------------------- String ss = "Start inference " + api + "ronously"; if (isAsync) { if (!ss.isEmpty()) { @@ -352,18 +377,19 @@ public static void main(String[] args) { startTime = System.currentTimeMillis(); long execTime = getTotalMsTime(startTime); - - while ((niter != 0 && iteration < niter) || - (durationMs != 0L && execTime < durationMs) || - (isAsync && iteration % nireq != 0)) { + + while ((niter != 0 && iteration < niter) + || (durationMs != 0L && execTime < durationMs) + || (isAsync && iteration % nireq != 0)) { inferRequest = inferRequestsQueue.getIdleRequest(); - + if (isAsync) { - // As the inference request is currently idle, the wait() adds no additional overhead - //(and should return immediately). + // As the inference request is currently idle, the wait() adds no additional + // overhead (and should return immediately). // The primary reason for calling the method is exception checking/re-throwing. // Callback, that governs the actual execution can handle errors as well, - // but as it uses just error codes it has no details like ‘what()’ method of `std::exception` + // but as it uses just error codes it has no details like ‘what()’ method of + // `std::exception`. // So, rechecking for any exceptions here. inferRequest._wait(); inferRequest.startAsync(); @@ -380,10 +406,12 @@ public static void main(String[] args) { double latency = getMedianValue(inferRequestsQueue.getLatencies()); double totalDuration = inferRequestsQueue.getDurationInMilliseconds(); - double fps = (!isAsync) ? batchSize * 1000.0 / latency : - batchSize * 1000.0 * iteration / totalDuration; + double fps = + (!isAsync) + ? batchSize * 1000.0 / latency + : batchSize * 1000.0 * iteration / totalDuration; - // ----------------- 11. Dumping statistics report ---------------------------------------------------------- + // ------------ 11. 
Dumping statistics report ---------------------------------- nextStep("Dumping statistics report"); System.out.println("Count: " + iteration + " iterations"); diff --git a/inference-engine/ie_bridges/java/samples/face_detection_java_sample/Main.java b/inference-engine/ie_bridges/java/samples/face_detection_java_sample/Main.java index 31bbbf39f04816..2fcc114ebca00f 100644 --- a/inference-engine/ie_bridges/java/samples/face_detection_java_sample/Main.java +++ b/inference-engine/ie_bridges/java/samples/face_detection_java_sample/Main.java @@ -1,20 +1,19 @@ +import org.intel.openvino.*; import org.opencv.core.*; -import org.opencv.imgcodecs.*; import org.opencv.highgui.HighGui; +import org.opencv.imgcodecs.*; import org.opencv.imgproc.Imgproc; -import org.intel.openvino.*; -import java.util.Map; -import java.util.Set; import java.util.ArrayList; +import java.util.Map; /* This is face detection java sample. -Upon the start-up the sample application reads command line parameters and loads a network -and an image to the Inference Engine device. When inference is done, the application will show -the image with detected objects enclosed in rectangles in new window.It also outputs the -confidence value and the coordinates of the rectangle to the standard output stream. +Upon the start-up the sample application reads command line parameters and loads a network +and an image to the Inference Engine device. When inference is done, the application will show +the image with detected objects enclosed in rectangles in new window.It also outputs the +confidence value and the coordinates of the rectangle to the standard output stream. To get the list of command line parameters run the application with `--help` paramether. 
*/ @@ -42,24 +41,25 @@ public static void main(String[] args) { String imgPath = parser.get("-i", null); String xmlPath = parser.get("-m", null); - if(imgPath == null) { + if (imgPath == null) { System.out.println("Error: Missed argument: -i"); return; } - if(xmlPath == null) { + if (xmlPath == null) { System.out.println("Error: Missed argument: -m"); return; } Mat image = Imgcodecs.imread(imgPath); - + int[] dimsArr = {1, image.channels(), image.height(), image.width()}; TensorDesc tDesc = new TensorDesc(Precision.U8, dimsArr, Layout.NHWC); - // The source image is also used at the end of the program to display the detection results, - // therefore the Mat object won't be destroyed by Garbage Collector while the network is running. + // The source image is also used at the end of the program to display the detection results, + // therefore the Mat object won't be destroyed by Garbage Collector while the network is + // running. Blob imgBlob = new Blob(tDesc, image.dataAddr()); - + IECore core = new IECore(); CNNNetwork net = core.ReadNetwork(xmlPath); @@ -77,7 +77,7 @@ public static void main(String[] args) { ExecutableNetwork executableNetwork = core.LoadNetwork(net, "CPU"); InferRequest inferRequest = executableNetwork.CreateInferRequest(); - inferRequest.SetBlob(inputName, imgBlob); + inferRequest.SetBlob(inputName, imgBlob); inferRequest.Infer(); Blob output = inferRequest.GetBlob(outputName); @@ -89,27 +89,28 @@ public static void main(String[] args) { for (int curProposal = 0; curProposal < maxProposalCount; curProposal++) { int image_id = (int) detection[curProposal * 7]; - if (image_id < 0) - break; + if (image_id < 0) break; float confidence = detection[curProposal * 7 + 2]; // Drawing only objects with >70% probability - if (confidence < THRESHOLD) - continue; - + if (confidence < THRESHOLD) continue; + int label = (int) (detection[curProposal * 7 + 1]); int xmin = (int) (detection[curProposal * 7 + 3] * image.cols()); int ymin = (int) 
(detection[curProposal * 7 + 4] * image.rows()); int xmax = (int) (detection[curProposal * 7 + 5] * image.cols()); int ymax = (int) (detection[curProposal * 7 + 6] * image.rows()); - System.out.println("[" + curProposal + "," + label + "] element, prob = " + confidence + " (" + xmin - + "," + ymin + ")-(" + xmax + "," + ymax + ")"); + String result = "[" + curProposal + "," + label + "] element, prob = " + confidence; + result += " (" + xmin + "," + ymin + ")-(" + xmax + "," + ymax + ")"; + + System.out.println(result); System.out.println(" - WILL BE PRINTED!"); // Draw rectangle around detected object. - Imgproc.rectangle(image, new Point(xmin, ymin), new Point(xmax, ymax), new Scalar(0, 255, 0)); + Imgproc.rectangle( + image, new Point(xmin, ymin), new Point(xmax, ymax), new Scalar(0, 255, 0)); } HighGui.namedWindow("Detection", HighGui.WINDOW_AUTOSIZE); diff --git a/inference-engine/ie_bridges/java/samples/face_detection_sample_async/Main.java b/inference-engine/ie_bridges/java/samples/face_detection_sample_async/Main.java index 0786f70d135d91..cfc4fe4a04fc94 100644 --- a/inference-engine/ie_bridges/java/samples/face_detection_sample_async/Main.java +++ b/inference-engine/ie_bridges/java/samples/face_detection_sample_async/Main.java @@ -1,32 +1,30 @@ +import org.intel.openvino.*; import org.opencv.core.*; +import org.opencv.highgui.HighGui; import org.opencv.imgcodecs.*; -import org.opencv.videoio.*; import org.opencv.imgproc.Imgproc; -import org.opencv.highgui.HighGui; +import org.opencv.videoio.*; +import java.util.ArrayList; import java.util.LinkedList; +import java.util.Map; +import java.util.Queue; import java.util.Vector; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; -import java.util.Map; -import java.util.Queue; -import java.util.ArrayList; -import java.util.HashMap; - -import org.intel.openvino.*; /* This is async face detection java sample. 
-Upon the start-up the sample application reads command line parameters and loads a network -and an images to the Inference Engine device. When inference is done, the application +Upon the start-up the sample application reads command line parameters and loads a network +and an images to the Inference Engine device. When inference is done, the application shows the video with detected objects enclosed in rectangles in new window. To get the list of command line parameters run the application with `--help` paramether. */ public class Main { - + public static Blob imageToBlob(Mat image) { int[] dimsArr = {1, image.channels(), image.height(), image.width()}; TensorDesc tDesc = new TensorDesc(Precision.U8, dimsArr, Layout.NHWC); @@ -41,9 +39,8 @@ static void processInferRequets(WaitMode wait) { while (!startedRequestsIds.isEmpty()) { int requestId = startedRequestsIds.peek(); InferRequest inferRequest = inferRequests.get(requestId); - - if (inferRequest.Wait(wait) != StatusCode.OK) - return; + + if (inferRequest.Wait(wait) != StatusCode.OK) return; if (size == 0 && res == null) { size = inferRequest.GetBlob(outputName).size(); @@ -86,11 +83,11 @@ public static void main(String[] args) { String device = parser.get("-d", "CPU"); int inferRequestsSize = parser.getInteger("-nireq", 2); - if(imgsPath == null ) { + if (imgsPath == null) { System.out.println("Error: Missed argument: -i"); return; } - if(xmlPath == null) { + if (xmlPath == null) { System.out.println("Error: Missed argument: -m"); return; } @@ -99,137 +96,145 @@ public static void main(String[] args) { BlockingQueue framesQueue = new LinkedBlockingQueue(); - Thread captureThread = new Thread(new Runnable() { - @Override - public void run() { - VideoCapture cam = new VideoCapture(); - cam.open(imgsPath); - Mat frame = new Mat(); - - while (cam.read(frame)) { - framesCounter++; - framesQueue.add(frame.clone()); - } - } - }); - - Thread inferThread = new Thread(new Runnable() { - - @Override - public void run() { 
- try { - IECore core = new IECore(); - CNNNetwork net = core.ReadNetwork(xmlPath); - - Map inputsInfo = net.getInputsInfo(); - String inputName = new ArrayList(inputsInfo.keySet()).get(0); - InputInfo inputInfo = inputsInfo.get(inputName); - - inputInfo.getPreProcess().setResizeAlgorithm(ResizeAlgorithm.RESIZE_BILINEAR); - inputInfo.setLayout(Layout.NHWC); - inputInfo.setPrecision(Precision.U8); - - outputName = new ArrayList(net.getOutputsInfo().keySet()).get(0); - - ExecutableNetwork executableNetwork = core.LoadNetwork(net, device); - - asyncInferIsFree = new Vector(inferRequestsSize); - - for (int i = 0; i < inferRequestsSize; i++) { - inferRequests.add(executableNetwork.CreateInferRequest()); - asyncInferIsFree.add(true); + Runnable capture = + new Runnable() { + @Override + public void run() { + VideoCapture cam = new VideoCapture(); + cam.open(imgsPath); + Mat frame = new Mat(); + + while (cam.read(frame)) { + framesCounter++; + framesQueue.add(frame.clone()); + } } - - boolean isRunning = true; + }; + Thread captureThread = new Thread(capture); - while (captureThread.isAlive() || !framesQueue.isEmpty()) { - processInferRequets(WaitMode.STATUS_ONLY); + Runnable infer = + new Runnable() { + @Override + public void run() { + try { + IECore core = new IECore(); + CNNNetwork net = core.ReadNetwork(xmlPath); - for (int i = 0; i < inferRequestsSize; i++) { - if (!asyncInferIsFree.get(i)) - continue; + Map inputsInfo = net.getInputsInfo(); + String inputName = new ArrayList(inputsInfo.keySet()).get(0); + InputInfo inputInfo = inputsInfo.get(inputName); - Mat frame = framesQueue.poll(0, TimeUnit.SECONDS); + inputInfo + .getPreProcess() + .setResizeAlgorithm(ResizeAlgorithm.RESIZE_BILINEAR); + inputInfo.setLayout(Layout.NHWC); + inputInfo.setPrecision(Precision.U8); - if (frame == null) - break; + outputName = + new ArrayList(net.getOutputsInfo().keySet()).get(0); - InferRequest request = inferRequests.get(i); - - asyncInferIsFree.setElementAt(false, i); - 
processedFramesQueue.add(frame); // predictionsQueue is used in rendering + ExecutableNetwork execNetwork = core.LoadNetwork(net, device); - // The source frame is kept in processedFramesQueue, - // so the frame will be removed by java Garbage Collector only after completion of inference, - // and we can create Blob object using Mat object data address. - Blob imgBlob = imageToBlob(frame); - request.SetBlob(inputName, imgBlob); + asyncInferIsFree = new Vector(inferRequestsSize); - startedRequestsIds.add(i); - request.StartAsync(); + for (int i = 0; i < inferRequestsSize; i++) { + inferRequests.add(execNetwork.CreateInferRequest()); + asyncInferIsFree.add(true); + } + + boolean isRunning = true; + + while (captureThread.isAlive() || !framesQueue.isEmpty()) { + processInferRequets(WaitMode.STATUS_ONLY); + + for (int i = 0; i < inferRequestsSize; i++) { + if (!asyncInferIsFree.get(i)) continue; + + Mat frame = framesQueue.poll(0, TimeUnit.SECONDS); + + if (frame == null) break; + + InferRequest request = inferRequests.get(i); + + asyncInferIsFree.setElementAt(false, i); + + // processedFramesQueue is used in rendering + processedFramesQueue.add(frame); + + // The source frame is kept in processedFramesQueue, + // so the frame will be removed by java Garbage + // Collector only after completion of inference, + // and we can create Blob object using Mat object data address. 
+ Blob imgBlob = imageToBlob(frame); + request.SetBlob(inputName, imgBlob); + + startedRequestsIds.add(i); + request.StartAsync(); + } + } + processInferRequets(WaitMode.RESULT_READY); + } catch (InterruptedException e) { + e.printStackTrace(); + + for (Thread t : Thread.getAllStackTraces().keySet()) + if (t.getState() == Thread.State.RUNNABLE) t.interrupt(); } } - processInferRequets(WaitMode.RESULT_READY); - } catch (InterruptedException e) { - e.printStackTrace(); - - for (Thread t : Thread.getAllStackTraces().keySet()) - if (t.getState()==Thread.State.RUNNABLE) - t.interrupt(); - } - } - }); + }; + Thread inferThread = new Thread(infer); captureThread.start(); inferThread.start(); - TickMeter tm = new TickMeter(); + TickMeter tm = new TickMeter(); + Scalar color = new Scalar(0, 255, 0); try { while (inferThread.isAlive() || !detectionOutput.isEmpty()) { - float[] detection = detectionOutput.poll(waitingTime, TimeUnit.SECONDS); - if (detection == null) - continue; - - Mat img = processedFramesQueue.poll(waitingTime, TimeUnit.SECONDS); + float[] detection = detectionOutput.poll(waitingTime, TimeUnit.SECONDS); + if (detection == null) continue; + + Mat img = processedFramesQueue.poll(waitingTime, TimeUnit.SECONDS); int maxProposalCount = detection.length / 7; for (int curProposal = 0; curProposal < maxProposalCount; curProposal++) { int imageId = (int) detection[curProposal * 7]; - if (imageId < 0) - break; - + if (imageId < 0) break; + float confidence = detection[curProposal * 7 + 2]; // Drawing only objects with >70% probability - if (confidence < CONFIDENCE_THRESHOLD) - continue; - + if (confidence < CONFIDENCE_THRESHOLD) continue; + int label = (int) (detection[curProposal * 7 + 1]); int xmin = (int) (detection[curProposal * 7 + 3] * img.cols()); int ymin = (int) (detection[curProposal * 7 + 4] * img.rows()); int xmax = (int) (detection[curProposal * 7 + 5] * img.cols()); int ymax = (int) (detection[curProposal * 7 + 6] * img.rows()); - + // Draw rectangle 
around detected object. - Imgproc.rectangle(img, new Point(xmin, ymin), new Point(xmax, ymax), new Scalar(0, 255, 0), 2); + Point lt = new Point(xmin, ymin); + Point br = new Point(xmax, ymax); + Imgproc.rectangle(img, lt, br, color, 2); } - if (resultCounter == warmupNum) { + if (resultCounter == warmupNum) { tm.start(); } else if (resultCounter > warmupNum) { tm.stop(); - double worksFps = ((double)(resultCounter - warmupNum)) / tm.getTimeSec(); - double readFps = ((double)(framesCounter - warmupNum)) / tm.getTimeSec(); + double worksFps = ((double) (resultCounter - warmupNum)) / tm.getTimeSec(); + double readFps = ((double) (framesCounter - warmupNum)) / tm.getTimeSec(); tm.start(); - Imgproc.putText(img, "Reading fps: " + String.format("%.3f", readFps), new Point(10, 50), 0 , 0.7, new Scalar(0, 255, 0), 1); - Imgproc.putText(img, "Inference fps: " + String.format("%.3f", worksFps), new Point(10, 80), 0 , 0.7, new Scalar(0, 255, 0), 1); + String label = "Reading fps: " + String.format("%.3f", readFps); + String label1 = "Inference fps: " + String.format("%.3f", worksFps); + + Imgproc.putText(img, label, new Point(10, 50), 0, 0.7, color, 1); + Imgproc.putText(img, label1, new Point(10, 80), 0, 0.7, color, 1); } - HighGui.imshow("Detection", img); } - + captureThread.join(); inferThread.join(); @@ -239,8 +244,7 @@ public void run() { } catch (InterruptedException e) { e.printStackTrace(); for (Thread t : Thread.getAllStackTraces().keySet()) - if (t.getState()==Thread.State.RUNNABLE) - t.interrupt(); + if (t.getState() == Thread.State.RUNNABLE) t.interrupt(); } } diff --git a/inference-engine/ie_bridges/java/tests/BlobTests.java b/inference-engine/ie_bridges/java/tests/BlobTests.java index db28ae16746992..3d4617409306c0 100644 --- a/inference-engine/ie_bridges/java/tests/BlobTests.java +++ b/inference-engine/ie_bridges/java/tests/BlobTests.java @@ -1,10 +1,9 @@ +import org.intel.openvino.*; import org.junit.Assert; import org.junit.Test; -import 
org.intel.openvino.*; - public class BlobTests extends IETest { - + @Test public void testGetBlob() { int[] dimsArr = {1, 3, 200, 200}; diff --git a/inference-engine/ie_bridges/java/tests/CNNNetworkTests.java b/inference-engine/ie_bridges/java/tests/CNNNetworkTests.java index fa67365a130b76..932594ab9fb123 100644 --- a/inference-engine/ie_bridges/java/tests/CNNNetworkTests.java +++ b/inference-engine/ie_bridges/java/tests/CNNNetworkTests.java @@ -1,12 +1,12 @@ import static org.junit.Assert.*; + +import org.intel.openvino.*; import org.junit.Test; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; -import org.intel.openvino.*; - public class CNNNetworkTests extends IETest { IECore core = new IECore(); @@ -37,13 +37,12 @@ public void testReshape() { public void testAddOutput() { CNNNetwork net = core.ReadNetwork(modelXml); Map output = net.getOutputsInfo(); - + assertEquals("Input size", 1, output.size()); - + net.addOutput("19/WithoutBiases"); output = net.getOutputsInfo(); assertEquals("Input size", 2, output.size()); } - } diff --git a/inference-engine/ie_bridges/java/tests/IECoreTests.java b/inference-engine/ie_bridges/java/tests/IECoreTests.java index 1a31d0720d99a5..be884305d97c79 100644 --- a/inference-engine/ie_bridges/java/tests/IECoreTests.java +++ b/inference-engine/ie_bridges/java/tests/IECoreTests.java @@ -1,14 +1,14 @@ import static org.junit.Assert.*; -import org.junit.Test; import org.intel.openvino.*; +import org.junit.Test; -import java.util.Map; import java.util.HashMap; +import java.util.Map; public class IECoreTests extends IETest { IECore core = new IECore(); - + @Test public void testReadNetwork() { CNNNetwork net = core.ReadNetwork(modelXml, modelBin); @@ -57,7 +57,7 @@ public void testLoadNetworDeviceConfig() { Map testMap = new HashMap(); - //When specifying key values as raw strings, omit the KEY_ prefix + // When specifying key values as raw strings, omit the KEY_ prefix testMap.put("CPU_BIND_THREAD", "YES"); 
testMap.put("CPU_THREADS_NUM", "1"); @@ -75,6 +75,8 @@ public void testLoadNetworkWrongDevice() { } catch (Exception e) { exceptionMessage = e.getMessage(); } - assertTrue(exceptionMessage.contains("Device with \"DEVISE\" name is not registered in the InferenceEngine")); + assertTrue( + exceptionMessage.contains( + "Device with \"DEVISE\" name is not registered in the InferenceEngine")); } } diff --git a/inference-engine/ie_bridges/java/tests/IETest.java b/inference-engine/ie_bridges/java/tests/IETest.java index 021a561d9138e2..135324532dfc0f 100644 --- a/inference-engine/ie_bridges/java/tests/IETest.java +++ b/inference-engine/ie_bridges/java/tests/IETest.java @@ -1,15 +1,11 @@ -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; +import org.intel.openvino.*; import org.junit.Ignore; -import org.junit.runner.Description; import org.junit.Rule; import org.junit.rules.TestWatcher; +import org.junit.runner.Description; import java.nio.file.Paths; -import org.intel.openvino.*; - @Ignore public class IETest { String modelXml; @@ -23,20 +19,33 @@ public IETest() { System.err.println("Failed to load Inference Engine library\n" + e); System.exit(1); } - modelXml = Paths.get(System.getenv("MODELS_PATH"), "models", "test_model", "test_model_fp32.xml").toString(); - modelBin = Paths.get(System.getenv("MODELS_PATH"), "models", "test_model", "test_model_fp32.bin").toString(); + modelXml = + Paths.get( + System.getenv("MODELS_PATH"), + "models", + "test_model", + "test_model_fp32.xml") + .toString(); + modelBin = + Paths.get( + System.getenv("MODELS_PATH"), + "models", + "test_model", + "test_model_fp32.bin") + .toString(); } @Rule - public TestWatcher watchman = new TestWatcher() { - @Override - protected void succeeded(Description description) { - System.out.println(description + " - OK"); - } + public TestWatcher watchman = + new TestWatcher() { + @Override + protected void succeeded(Description description) { + System.out.println(description + " - 
OK"); + } - @Override - protected void failed(Throwable e, Description description) { - System.out.println(description + " - FAILED"); - } - }; + @Override + protected void failed(Throwable e, Description description) { + System.out.println(description + " - FAILED"); + } + }; } diff --git a/inference-engine/ie_bridges/java/tests/InferRequestTests.java b/inference-engine/ie_bridges/java/tests/InferRequestTests.java index 3dc74d6182ef48..ba3e4535e2530e 100644 --- a/inference-engine/ie_bridges/java/tests/InferRequestTests.java +++ b/inference-engine/ie_bridges/java/tests/InferRequestTests.java @@ -1,13 +1,12 @@ import static org.junit.Assert.*; -import org.junit.Test; + +import org.intel.openvino.*; import org.junit.Before; +import org.junit.Test; +import java.util.ArrayList; import java.util.Map; import java.util.Vector; -import java.util.ArrayList; - -import org.intel.openvino.*; -import org.intel.openvino.InferenceEngineProfileInfo.LayerStatus; public class InferRequestTests extends IETest { IECore core; @@ -59,13 +58,14 @@ public void testGetPerformanceCounts() { ArrayList resKeySet = new ArrayList(res.keySet()); for (int i = 0; i < res.size(); i++) { - String key = resKeySet.get(i); + String key = resKeySet.get(i); InferenceEngineProfileInfo resVal = res.get(key); assertEquals(key + " execType", key, layer_name.elementAt(i)); assertEquals(key + " executionIndex", i, resVal.executionIndex); - assertTrue(resVal.status == InferenceEngineProfileInfo.LayerStatus.EXECUTED - || resVal.status == InferenceEngineProfileInfo.LayerStatus.NOT_RUN); + assertTrue( + resVal.status == InferenceEngineProfileInfo.LayerStatus.EXECUTED + || resVal.status == InferenceEngineProfileInfo.LayerStatus.NOT_RUN); } } @@ -79,20 +79,21 @@ public void testStartAsync() { @Test public void testSetCompletionCallback() { - inferRequest.SetCompletionCallback(new Runnable() { + inferRequest.SetCompletionCallback( + new Runnable() { - @Override - public void run() { - completionCallback = true; - } - 
}); + @Override + public void run() { + completionCallback = true; + } + }); - for(int i = 0; i < 5; i++) { - inferRequest.Wait(WaitMode.RESULT_READY); + for (int i = 0; i < 5; i++) { + inferRequest.Wait(WaitMode.RESULT_READY); inferRequest.StartAsync(); - } - - inferRequest.Wait(WaitMode.RESULT_READY); + } + + inferRequest.Wait(WaitMode.RESULT_READY); inferRequest.StartAsync(); StatusCode statusCode = inferRequest.Wait(WaitMode.RESULT_READY); diff --git a/inference-engine/ie_bridges/java/tests/InputInfoTests.java b/inference-engine/ie_bridges/java/tests/InputInfoTests.java index 747470a17125a8..ae4e28c0a63664 100644 --- a/inference-engine/ie_bridges/java/tests/InputInfoTests.java +++ b/inference-engine/ie_bridges/java/tests/InputInfoTests.java @@ -1,11 +1,11 @@ import static org.junit.Assert.*; + +import org.intel.openvino.*; import org.junit.Test; import java.util.ArrayList; import java.util.Map; -import org.intel.openvino.*; - public class InputInfoTests extends IETest { IECore core = new IECore(); @@ -26,12 +26,11 @@ public void testSetLayout() { public void testSetPrecision() { CNNNetwork net = core.ReadNetwork(modelXml); Map inputsInfo = net.getInputsInfo(); - + String inputName = new ArrayList(inputsInfo.keySet()).get(0); InputInfo inputInfo = inputsInfo.get(inputName); inputInfo.setPrecision(Precision.U8); assertEquals("setPrecision", Precision.U8, inputInfo.getPrecision()); } - } diff --git a/inference-engine/ie_bridges/java/tests/OpenVinoTestRunner.java b/inference-engine/ie_bridges/java/tests/OpenVinoTestRunner.java index 91be39fc45350d..f1ee90cda5b5c8 100644 --- a/inference-engine/ie_bridges/java/tests/OpenVinoTestRunner.java +++ b/inference-engine/ie_bridges/java/tests/OpenVinoTestRunner.java @@ -1,3 +1,4 @@ +import org.intel.openvino.*; import org.junit.runner.JUnitCore; import org.junit.runner.Result; import org.junit.runner.notification.Failure; @@ -11,9 +12,9 @@ public static void main(String[] args) { IETest.device = parser.get("-d", "CPU"); 
Result result = JUnitCore.runClasses(TestsSuite.class); - + for (Failure failure : result.getFailures()) { - System.out.println(failure.toString()); + System.out.println(failure.toString()); } } } diff --git a/inference-engine/ie_bridges/java/tests/TestsSuite.java b/inference-engine/ie_bridges/java/tests/TestsSuite.java index c3109d5d8cbdf0..5b9a77ce67e087 100644 --- a/inference-engine/ie_bridges/java/tests/TestsSuite.java +++ b/inference-engine/ie_bridges/java/tests/TestsSuite.java @@ -1,40 +1,38 @@ -import org.junit.runner.RunWith; -import org.junit.runners.AllTests; - import junit.framework.TestSuite; -import java.util.List; -import java.util.ArrayList; -import java.util.zip.*; - -import java.nio.file.FileSystems; -import java.nio.file.Path; -import java.nio.file.Paths; +import org.intel.openvino.*; +import org.junit.runner.RunWith; +import org.junit.runners.AllTests; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; - -import java.lang.Class; import java.net.*; - -import org.intel.openvino.*; +import java.util.ArrayList; +import java.util.List; +import java.util.zip.*; @RunWith(AllTests.class) - -public class TestsSuite extends IETest{ +public class TestsSuite extends IETest { public static TestSuite suite() { TestSuite suite = new TestSuite(); try { - //get openvino_test.jar path - String dir = new File(TestsSuite.class.getProtectionDomain().getCodeSource().getLocation().toURI()).getPath().toString(); - + // get openvino_test.jar path + String dir = + new File( + TestsSuite.class + .getProtectionDomain() + .getCodeSource() + .getLocation() + .toURI()) + .getPath() + .toString(); + List> results = findClasses(dir); for (Class cl : results) { - if (cl.getName() == "ArgumentParser") - continue; + if (cl.getName() == "ArgumentParser") continue; suite.addTest(new junit.framework.JUnit4TestAdapter(cl)); } } catch (ClassNotFoundException e) { @@ -51,14 +49,19 @@ private static List> 
findClasses(String directory) throws ClassNotFound ZipInputStream zip = new ZipInputStream(new FileInputStream(directory)); for (ZipEntry entry = zip.getNextEntry(); entry != null; entry = zip.getNextEntry()) { String name = entry.getName().toString(); - if (name.endsWith(".class") && !name.contains("$") && !name.contains("/") - && !name.equals("TestsSuite.class") && !name.equals("OpenVinoTestRunner.class") && !name.equals("IETest.class")) { - classes.add(Class.forName(name.substring(0, name.length() - ".class".length()))); + if (name.endsWith(".class") + && !name.contains("$") + && !name.contains("/") + && !name.equals("TestsSuite.class") + && !name.equals("OpenVinoTestRunner.class") + && !name.equals("IETest.class")) { + classes.add( + Class.forName(name.substring(0, name.length() - ".class".length()))); } } - } catch(FileNotFoundException e) { + } catch (FileNotFoundException e) { System.out.println("FileNotFoundException: " + e.getMessage()); - } catch(IOException e) { + } catch (IOException e) { System.out.println("IOException: " + e.getMessage()); } return classes; From 3af3e459c3c9cb9ef7a5489df59b2081ad722fa4 Mon Sep 17 00:00:00 2001 From: Evgenya Stepyreva Date: Wed, 9 Sep 2020 18:25:53 +0300 Subject: [PATCH 56/66] [ IE ] ExperimentalDetectron shape infer fix (#2143) --- .../src/shape_infer/built-in/ie_proposal_onnx_shape_infer.hpp | 3 +-- .../src/shape_infer/built-in/ie_topkrois_onnx_shape_infer.hpp | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/inference-engine/src/legacy_api/src/shape_infer/built-in/ie_proposal_onnx_shape_infer.hpp b/inference-engine/src/legacy_api/src/shape_infer/built-in/ie_proposal_onnx_shape_infer.hpp index d2f88363d6f917..15719d0cb61faa 100644 --- a/inference-engine/src/legacy_api/src/shape_infer/built-in/ie_proposal_onnx_shape_infer.hpp +++ b/inference-engine/src/legacy_api/src/shape_infer/built-in/ie_proposal_onnx_shape_infer.hpp @@ -29,8 +29,7 @@ class 
ExperimentalDetectronGenerateProposalsSingleImageShapeProp : public BuiltI cnnLayer.type = _type; validate(&cnnLayer, inBlobs, params, blobs); - size_t post_nms_count = static_cast(cnnLayer.GetParamAsInt("post_nms_count")); - + auto post_nms_count = cnnLayer.GetParamAsUInt("post_nms_count"); outShapes.push_back({post_nms_count, 4}); outShapes.push_back({post_nms_count, }); } diff --git a/inference-engine/src/legacy_api/src/shape_infer/built-in/ie_topkrois_onnx_shape_infer.hpp b/inference-engine/src/legacy_api/src/shape_infer/built-in/ie_topkrois_onnx_shape_infer.hpp index 35891b67006bb0..7b476ee79a8f88 100644 --- a/inference-engine/src/legacy_api/src/shape_infer/built-in/ie_topkrois_onnx_shape_infer.hpp +++ b/inference-engine/src/legacy_api/src/shape_infer/built-in/ie_topkrois_onnx_shape_infer.hpp @@ -32,7 +32,7 @@ class ExperimentalDetectronTopKROIsShapeProp : public BuiltInShapeInferImpl { cnnLayer.type = _type; validate(&cnnLayer, inBlobs, params, blobs); - const bool max_rois = cnnLayer.GetParamAsInt("max_rois"); + const auto max_rois = cnnLayer.GetParamAsUInt("max_rois"); outShapes.push_back({max_rois, 4}); } }; From b3829c58a8cd34f7ab7f854d735033640d7d240e Mon Sep 17 00:00:00 2001 From: Maxim Andronov Date: Wed, 9 Sep 2020 18:57:58 +0300 Subject: [PATCH 57/66] [CPU] add const and precision check quantize ranges (#2074) --- .../src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp index d9b35ecb9f6327..17521f2a01f629 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp @@ -124,6 +124,14 @@ void MKLDNNQuantizeNode::init() { THROW_IE_EXCEPTION << "Unsupported input sizes for Quantize layer with name " << getName(); } + for (size_t i = 1; i < getParentEdges().size(); i++) { + if 
(!getParentEdgesAtPort(i)[0]->getParent()->isConstant()) + THROW_IE_EXCEPTION << "Quantize layer with name " << getName() << " has non const input on " << i << " port"; + auto prec = getCnnLayer()->insData[i].lock()->getPrecision(); + if (prec != Precision::FP32) + THROW_IE_EXCEPTION << "Quantize layer with name " << getName() << " has unsupported precision " << prec << " on " << i << " port"; + } + auto inputLowBlob = dynamic_cast*>(getParentEdgesAtPort(1)[0]->getParent()->getCnnLayer()->blobs["custom"].get()); auto inputLowData = inputLowBlob->buffer().as(); From d78fd196e85bb06fed26b4b854dd5de5355569c5 Mon Sep 17 00:00:00 2001 From: Gorokhov Dmitriy Date: Wed, 9 Sep 2020 18:59:37 +0300 Subject: [PATCH 58/66] [CPU] Fixed quantization post op memory leak (#2101) --- .../nodes/mkldnn_quantize_node.cpp | 12 +++++++++- .../nodes/mkldnn_quantize_node.h | 22 ++++++++++++++----- inference-engine/thirdparty/mkl-dnn | 2 +- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp index 17521f2a01f629..e2dfe0dd32c098 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp @@ -525,7 +525,17 @@ void MKLDNNQuantizeNode::execute(mkldnn::stream strm) { } void MKLDNNQuantizeNode::appendPostOps(mkldnn::post_ops& ops) { - ops.append_quantization(quantizeAlgorithm , cropLow, cropHigh, inputScale, inputShift, outputScale, outputShift); + if (!isPostOpDataInitialized) { + isPostOpDataInitialized = true; + cropLowData.set(cropLow.size(), 1 << 1, &cropLow[0]); + cropHighData.set(cropHigh.size(), 1 << 1, &cropHigh[0]); + inputScaleData.set(inputScale.size(), 1 << 1, &inputScale[0]); + inputShiftData.set(inputShift.size(), 1 << 1, &inputShift[0]); + outputScaleData.set(outputScale.size(), 1 << 1, &outputScale[0]); + outputShiftData.set(outputShift.size(), 1 
<< 1, &outputShift[0]); + } + + ops.append_quantization(quantizeAlgorithm, &cropLowData, &cropHighData, &inputScaleData, &inputShiftData, &outputScaleData, &outputShiftData); } bool MKLDNNQuantizeNode::created() const { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.h index 13f671339b9f1d..af68cfdd08a8b5 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.h @@ -10,6 +10,7 @@ #include #include #include +#include namespace MKLDNNPlugin { @@ -41,12 +42,12 @@ class MKLDNNQuantizeNode : public MKLDNNNode { const std::vector& getOutputScale() const { return outputScale; } const std::vector& getOutputShift() const { return outputShift; } - void setCropLow(std::vector newCropLow) { cropLow = std::move(newCropLow); } - void setCropHigh(std::vector newCropHigh) { cropHigh = std::move(newCropHigh); } - void setInputScale(std::vector newInputScale) { inputScale = std::move(newInputScale); } - void setInputShift(std::vector newInputShift) { inputShift = std::move(newInputShift); } - void setOutputScale(std::vector newOutputScale) { outputScale = std::move(newOutputScale); } - void setOutputShift(std::vector newOutputShift) { outputShift = std::move(newOutputShift); } + void setCropLow(std::vector newCropLow) { cropLow = std::move(newCropLow); isPostOpDataInitialized = false; } + void setCropHigh(std::vector newCropHigh) { cropHigh = std::move(newCropHigh); isPostOpDataInitialized = false; } + void setInputScale(std::vector newInputScale) { inputScale = std::move(newInputScale); isPostOpDataInitialized = false; } + void setInputShift(std::vector newInputShift) { inputShift = std::move(newInputShift); isPostOpDataInitialized = false; } + void setOutputScale(std::vector newOutputScale) { outputScale = std::move(newOutputScale); isPostOpDataInitialized = false;} + void setOutputShift(std::vector 
newOutputShift) { outputShift = std::move(newOutputShift); isPostOpDataInitialized = false; } bool isInputLowBroadcast() const { return isInputLowBroadcasted; } bool isInputHighBroadcast() const { return isInputHighBroadcasted; } @@ -74,6 +75,15 @@ class MKLDNNQuantizeNode : public MKLDNNNode { std::vector outputScale; std::vector outputShift; + // mkldnn style post ops data representation + bool isPostOpDataInitialized = false; + mkldnn::impl::shifts_t cropLowData; + mkldnn::impl::shifts_t cropHighData; + mkldnn::impl::scales_t inputScaleData; + mkldnn::impl::shifts_t inputShiftData; + mkldnn::impl::scales_t outputScaleData; + mkldnn::impl::shifts_t outputShiftData; + bool isInputLowBroadcasted = false; bool isInputHighBroadcasted = false; bool isOutputLowBroadcasted = false; diff --git a/inference-engine/thirdparty/mkl-dnn b/inference-engine/thirdparty/mkl-dnn index 683bea673b4e51..6547f0b6aac272 160000 --- a/inference-engine/thirdparty/mkl-dnn +++ b/inference-engine/thirdparty/mkl-dnn @@ -1 +1 @@ -Subproject commit 683bea673b4e510eb150e4b338aeeeb366ba17f6 +Subproject commit 6547f0b6aac2725bd4e36197e19fb1a6f2ee2f51 From 27ca6be72842d4c84bb2047c8fec26dd79d0c9de Mon Sep 17 00:00:00 2001 From: Ilya Churaev Date: Thu, 10 Sep 2020 06:14:11 +0300 Subject: [PATCH 59/66] Fixed addOutput behavior for experimental ops (#2138) --- .../src/inference_engine/generic_ie.cpp | 9 ++- .../cnn_network/cnn_ngraph_impl_tests.cpp | 75 +++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/inference-engine/src/inference_engine/generic_ie.cpp b/inference-engine/src/inference_engine/generic_ie.cpp index 543eac56a3ca77..313bee4519283b 100644 --- a/inference-engine/src/inference_engine/generic_ie.cpp +++ b/inference-engine/src/inference_engine/generic_ie.cpp @@ -130,7 +130,7 @@ void ngraph::op::GenericIE::validate_and_infer_types() { } // WA: shape infer has to know number of outputs - if ((type == "Proposal" || type == "ExperimentalDetectronROIFeatureExtractor" 
|| type == "ExperimentalDetectronDetectionOutput") + if ((type == "ExperimentalDetectronROIFeatureExtractor" || type == "ExperimentalDetectronDetectionOutput") && parameters.find("num_outputs") == parameters.end()) { parameters["num_outputs"] = std::to_string(outputs.size()); } @@ -149,6 +149,13 @@ void ngraph::op::GenericIE::validate_and_infer_types() { // Extensions are not loaded when we create nGraph function // First call: create node if (initialized < 1) { + if ((type == "ExperimentalDetectronROIFeatureExtractor" || type == "ExperimentalDetectronDetectionOutput") + && outputs.size() < 2) { + // Add fake port + PortIE port; + port.precision = InferenceEngine::Precision::FP32; + outputs.emplace_back(port); + } if (outputs.size()) set_output_size(outputs.size()); for (size_t output_index = 0; output_index < outputs.size(); output_index++) { diff --git a/inference-engine/tests/functional/inference_engine/cnn_network/cnn_ngraph_impl_tests.cpp b/inference-engine/tests/functional/inference_engine/cnn_network/cnn_ngraph_impl_tests.cpp index b5565b9b8cf1de..370c6a10c267b2 100644 --- a/inference-engine/tests/functional/inference_engine/cnn_network/cnn_ngraph_impl_tests.cpp +++ b/inference-engine/tests/functional/inference_engine/cnn_network/cnn_ngraph_impl_tests.cpp @@ -880,4 +880,79 @@ TEST(CNNNGraphImplTests, addOutputForParameter) { } } +TEST(CNNNGraphImplTests, AddOutputToExperimentalOp) { + std::string model = R"V0G0N( + + + + + + + 1 + 3 + 22 + 22 + + + + + + + 1 + 3 + 22 + 22 + + + + + 1 + 3 + 22 + 22 + + + + + + + 1 + 3 + 22 + 22 + + + + + 1 + 3 + 22 + 22 + + + + + + + 1 + 3 + 22 + 22 + + + + + + + + + + +)V0G0N"; + InferenceEngine::Core core; + CNNNetwork network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()); + network.addOutput("exp"); + auto outputs = network.getOutputsInfo(); + ASSERT_NE(outputs.find("exp.0"), outputs.end()); +} IE_SUPPRESS_DEPRECATED_END From 0e34b392eeda262c971c0fea7bd910f9f2a7017d Mon Sep 17 00:00:00 2001 From: Anton Voronov 
Date: Thu, 10 Sep 2020 08:33:38 +0300 Subject: [PATCH 60/66] [CPU] Supported depthwise 6d, 7d, ..., added test (#971) --- .../nodes/mkldnn_depthwise_node.cpp | 157 +++++++++++++++++- .../nodes/mkldnn_depthwise_node.h | 5 + .../subgraph_tests/multiply_add.cpp | 32 ++++ .../include/subgraph_tests/multiply_add.hpp | 32 ++++ .../src/subgraph_tests/multiply_add.cpp | 58 +++++++ 5 files changed, 283 insertions(+), 1 deletion(-) create mode 100644 inference-engine/tests/functional/plugin/cpu/shared_tests_instances/subgraph_tests/multiply_add.cpp create mode 100644 inference-engine/tests/functional/plugin/shared/include/subgraph_tests/multiply_add.hpp create mode 100644 inference-engine/tests/functional/plugin/shared/src/subgraph_tests/multiply_add.cpp diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp index 5442f0375ba324..486cc963c64ca6 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp @@ -66,6 +66,58 @@ void MKLDNNDepthwiseNode::getSupportedDescriptors() { } } +void MKLDNNDepthwiseNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + auto parentOutDims = getParentEdgeAt(0)->getDims(); + if (parentOutDims.ndims() <= 5) { + MKLDNNNode::initSupportedPrimitiveDescriptors(); + } else { + createSpecificDescriptor5D(); + if (specificDesc5DPtr == nullptr) + THROW_IE_EXCEPTION << "Cannot create specific MKLDNNDescriptor for depthwise node " << getName(); + const auto& desc = *specificDesc5DPtr; + auto itpd = desc.createPrimitiveDescriptorIterator(getEngine()); + while (itpd.is_not_end()) { + InferenceEngine::LayerConfig config; + config.dynBatchSupport = true; + for (size_t i = 0; i < descInputNumbers(desc); i++) { + InferenceEngine::DataConfig dataConfig; + dataConfig.inPlace = -1; + dataConfig.constant = false; + dataConfig.desc = 
MKLDNNMemoryDesc(InferenceEngine::TensorDesc(Precision::FP32, parentOutDims.ToSizeVector(), Layout::ANY)); + config.inConfs.push_back(dataConfig); + } + + std::vector outFormats; + for (size_t i = 0; i < descOutputNumbers(desc); i++) { + InferenceEngine::DataConfig dataConfig; + dataConfig.inPlace = canBeInPlace() ? 0 : -1; + dataConfig.constant = false; + dataConfig.desc = MKLDNNMemoryDesc(InferenceEngine::TensorDesc(Precision::FP32, parentOutDims.ToSizeVector(), Layout::ANY)); + config.outConfs.push_back(dataConfig); + + auto primDesc = itpd.fetch(); + auto dstPrimDesc = mkldnn_primitive_desc_query_pd(primDesc.get(), mkldnn::convert_to_c(dst_pd), 0); + if (dstPrimDesc) { + outFormats.emplace_back(static_cast(itpd.dst_primitive_desc().desc().data.format)); + } else { + // This path is needed to correctly handle Deconvolution node + auto diffSrcPrimDesc = mkldnn_primitive_desc_query_pd(primDesc.get(), mkldnn::convert_to_c(diff_src_pd), 0); + if (diffSrcPrimDesc) { + outFormats.emplace_back(static_cast(itpd.diff_src_primitive_desc().desc().data.format)); + } + } + } + impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str()); + + supportedPrimitiveDescriptors.emplace_back(config, impl_type, outFormats); + itpd++; + } + } +} + void MKLDNNDepthwiseNode::createPrimitive() { if (prim) return; @@ -79,7 +131,30 @@ void MKLDNNDepthwiseNode::createPrimitive() { if (getSelectedPrimitiveDescriptor() == nullptr) THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set."; - auto prim_desc = createPrimitiveDescriptor(); + auto createRightPrimitiveDescriptor = [&]() -> depthwise_forward::primitive_desc { + auto parentOutDims = getParentEdgeAt(0)->getDims(); + if (parentOutDims.ndims() <= 5) { + return createPrimitiveDescriptor(); + } else { + const PrimitiveDescInfo *selected_pd = getSelectedPrimitiveDescriptor(); + auto& desc = *specificDesc5DPtr; + auto itpd = desc.createPrimitiveDescriptorIterator(getEngine(), mkldnn::primitive_attr()); + + while 
(itpd.is_not_end()) { + impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str()); + if (impl_type == getSelectedPrimitiveDescriptor()->getImplementationType()) { + specificPrepareMemory5D(itpd); + std::shared_ptr selected_desc_ptr = desc; + depthwise_forward::primitive_desc prim_desc = depthwise_forward::primitive_desc(*selected_desc_ptr, getEngine()); + return prim_desc; + } + itpd++; + } + THROW_IE_EXCEPTION << "Cannot create specific primitive descriptor for depthwise node " << getName() << "."; + } + }; + + auto prim_desc = createRightPrimitiveDescriptor(); if (isBroadcast()) { float broadcastValue = static_cast(internalBlobMemory[0]->GetData())[0]; @@ -185,6 +260,9 @@ void MKLDNNDepthwiseNode::initOptimalPrimitiveDescriptor() { !isUninitTensorDesc(config.outConfs[0].desc) && config.inConfs[0].desc != config.outConfs[0].desc)) THROW_IE_EXCEPTION << "Layer " << getName() << " has incorrect selected config!"; + if (getParentEdgeAt(0)->getDims().ndims() > 5) + return; + if (!isUninitTensorDesc(config.inConfs[0].desc)) { config.outConfs[0].desc = config.inConfs[0].desc; } else if (!isUninitTensorDesc(config.outConfs[0].desc)) { @@ -195,4 +273,81 @@ void MKLDNNDepthwiseNode::initOptimalPrimitiveDescriptor() { initDescriptor(config); } + +void MKLDNNDepthwiseNode::createSpecificDescriptor5D() { + auto parentOutDims = getParentEdgeAt(0)->getDims(); + MKLDNNDims newDims; + for (int i = 0; i < 4; i++) + newDims.push_back(parentOutDims[i]); + int lastDim = 1; + for (int i = 4; i < parentOutDims.ndims(); i++) { + lastDim *= parentOutDims[i]; + } + newDims.push_back(lastDim); + + MKLDNNMemoryDesc in_candidate{newDims, MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32), mkldnn::memory::ncdhw}; + MKLDNNMemoryDesc out_candidate(in_candidate); + MKLDNNDims weightDims({in_candidate.getDims()[1]}); + + MKLDNNMemoryDesc wgh_candidate{weightDims, in_candidate.getDataType(), memory::x}; + + if (isWithBiases()) { + MKLDNNMemoryDesc 
bias_candidate{weightDims, in_candidate.getDataType(), memory::x}; + MKLDNNDescriptor desc(std::shared_ptr( + new depthwise_forward::desc(prop_kind::forward_scoring, getAlgorithm(), in_candidate, out_candidate, wgh_candidate, bias_candidate))); + specificDesc5DPtr = std::make_shared(desc); + } else { + MKLDNNDescriptor desc(std::shared_ptr( + new depthwise_forward::desc(prop_kind::forward_scoring, getAlgorithm(), in_candidate, out_candidate, wgh_candidate))); + specificDesc5DPtr = std::make_shared(desc); + } +} + +void MKLDNNDepthwiseNode::specificPrepareMemory5D(mkldnn::primitive_desc_iterator& itpd) { + std::vector intDescs; + for (auto &it : internalBlobDesc) + intDescs.push_back(it(itpd, 0)); + + internalBlobMemory.clear(); + for (size_t i = 0; i < internalBlobs.size(); i++) { + const auto &internalBlob = internalBlobs[i]; + + auto create = [&] () { + auto newDesc = MKLDNNMemoryDesc(internalBlob->getTensorDesc()); + auto newFormat = newDesc.getFormat(); + if (newFormat == mkldnn::memory::ncdhw) { + newFormat = mkldnn::memory::goihw; + } + if (newFormat == mkldnn::memory::nchw) { + newFormat = mkldnn::memory::oihw; + } + + MKLDNNMemory memory{ getEngine() }; + memory.Create(MKLDNNMemoryDesc(newDesc.getDims(), newDesc.getDataType(), newFormat), internalBlob->buffer()); + + MKLDNNMemoryPtr _ptr = MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())); + _ptr->Create(intDescs[i]); + _ptr->SetData(memory); + + return _ptr; + }; + + MKLDNNMemoryPtr ptr; + if (weightCache != nullptr) { + const uint64_t data_hash = weightCache->GetHashFunc().hash( + internalBlob->buffer(), internalBlob->byteSize()); + + const std::string string_hash = getName() + "_" + std::to_string(i) + + "_" + std::to_string(internalBlob->byteSize()) + + "_" + std::to_string(data_hash); + + ptr = weightCache->findOrCreate(string_hash, create); + } else { + ptr = create(); + } + + internalBlobMemory.push_back(ptr); + } +} + REG_MKLDNN_PRIM_FOR(MKLDNNDepthwiseNode, Depthwise); diff --git 
a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h index ce56e5e8e38a79..01f96484b510fe 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h @@ -21,6 +21,7 @@ class MKLDNNDepthwiseNode : public MKLDNNNode { const std::vector& outputDesc) override; void initOptimalPrimitiveDescriptor() override; void getSupportedDescriptors() override; + void initSupportedPrimitiveDescriptors() override; void createPrimitive() override; bool created() const override; @@ -36,6 +37,10 @@ class MKLDNNDepthwiseNode : public MKLDNNNode { size_t realBiasSize = 0; bool withBiases = false; bool broadcast = false; + + std::shared_ptr specificDesc5DPtr; + void createSpecificDescriptor5D(); + void specificPrepareMemory5D(mkldnn::primitive_desc_iterator& itpd); }; } // namespace MKLDNNPlugin diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/subgraph_tests/multiply_add.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/subgraph_tests/multiply_add.cpp new file mode 100644 index 00000000000000..9e5720ce153382 --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/subgraph_tests/multiply_add.cpp @@ -0,0 +1,32 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "subgraph_tests/multiply_add.hpp" + +using namespace LayerTestsDefinitions; + +namespace { + +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32 +}; + +const std::vector> inputShapes = { + {1, 3, 2, 2, 4, 5}, + {1, 3, 2, 2, 2, 4, 5}, + {1, 3, 2, 2, 2, 2, 4, 5}, + {1, 3, 2, 2, 2, 2, 2, 4, 5}, + {1, 3, 2, 2, 2, 2, 2, 2, 4, 5}, +}; + +INSTANTIATE_TEST_CASE_P(MultipleAdd_Nd, MultiplyAddLayerTest, + ::testing::Combine( + ::testing::ValuesIn(inputShapes), + ::testing::ValuesIn(netPrecisions), + 
::testing::Values(CommonTestUtils::DEVICE_CPU)), + MultiplyAddLayerTest::getTestCaseName); + +} // namespace diff --git a/inference-engine/tests/functional/plugin/shared/include/subgraph_tests/multiply_add.hpp b/inference-engine/tests/functional/plugin/shared/include/subgraph_tests/multiply_add.hpp new file mode 100644 index 00000000000000..4b65929c2c94a0 --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/include/subgraph_tests/multiply_add.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once + +#include +#include +#include +#include +#include "functional_test_utils/layer_test_utils.hpp" +#include "ngraph_functions/builders.hpp" +#include "ngraph_functions/utils/ngraph_helpers.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace LayerTestsDefinitions { + +using MultiplyAddParamsTuple = typename std::tuple< + std::vector, //input shapes + InferenceEngine::Precision, //Network precision + std::string>; //Device name + +class MultiplyAddLayerTest: + public testing::WithParamInterface, + public LayerTestsUtils::LayerTestsCommon{ +public: + std::shared_ptr fn; + static std::string getTestCaseName(const testing::TestParamInfo &obj); +protected: + void SetUp() override; +}; + +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/shared/src/subgraph_tests/multiply_add.cpp b/inference-engine/tests/functional/plugin/shared/src/subgraph_tests/multiply_add.cpp new file mode 100644 index 00000000000000..1be404d6396839 --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/src/subgraph_tests/multiply_add.cpp @@ -0,0 +1,58 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include +#include +#include "ie_core.hpp" + +#include "subgraph_tests/multiply_add.hpp" + +#include "common_test_utils/common_utils.hpp" +#include 
"functional_test_utils/blob_utils.hpp" +#include "functional_test_utils/precision_utils.hpp" +#include "functional_test_utils/plugin_cache.hpp" +#include "functional_test_utils/skip_tests_config.hpp" + +namespace LayerTestsDefinitions { +std::string MultiplyAddLayerTest::getTestCaseName(const testing::TestParamInfo &obj) { + std::vector inputShapes; + InferenceEngine::Precision netPrecision; + std::string targetName; + std::tie(inputShapes, netPrecision, targetName) = obj.param; + std::ostringstream results; + + results << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_"; + results << "netPRC=" << netPrecision.name() << "_"; + results << "targetDevice=" << targetName << "_"; + return results.str(); +} + +void MultiplyAddLayerTest::SetUp() { + std::vector inputShape; + auto netPrecision = InferenceEngine::Precision::UNSPECIFIED; + std::tie(inputShape, netPrecision, targetDevice) = this->GetParam(); + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + auto params = ngraph::builder::makeParams(ngPrc, {inputShape}); + auto paramOuts = ngraph::helpers::convert2OutputVector( + ngraph::helpers::castOps2Nodes(params)); + + std::vector constShape(inputShape.size(), 1); + constShape[1] = inputShape[1]; + + auto const_mul = ngraph::builder::makeConstant(ngPrc, constShape, {}, true); + auto mul = std::make_shared(paramOuts[0], const_mul); + auto const_add = ngraph::builder::makeConstant(ngPrc, constShape, {}, true); + auto add = std::make_shared(mul, const_add); + ngraph::ResultVector results{std::make_shared(add)}; + function = std::make_shared(results, params, "multiplyAdd"); +} + +TEST_P(MultiplyAddLayerTest, CompareWithRefs) { + Run(); +}; +} // namespace LayerTestsDefinitions From 5403003d026f0f8fb3e29c99ce6f0dd43f0fb963 Mon Sep 17 00:00:00 2001 From: Nikolay Shchegolev Date: Thu, 10 Sep 2020 08:35:32 +0300 Subject: [PATCH 61/66] [CPU] statically analyzed issues. 
(#2139) --- .../mkldnn_plugin/nodes/mkldnn_eltwise_node.h | 1 - .../mkldnn_plugin/nodes/mkldnn_input_node.cpp | 4 ++-- .../nodes/mkldnn_normalize_node.cpp | 9 +++++++++ .../nodes/mkldnn_scatter_update_node.cpp | 20 +++++++++++-------- 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h index 56227a9fd4d884..15b13c16a95799 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h @@ -71,7 +71,6 @@ class MKLDNNEltwiseNode : public MKLDNNNode { std::vector sum_scales; bool broadcast = false; int batch_dim = 5; - std::vector PostOpsIntBlobMemory; mkldnn::primitive_attr attr; std::shared_ptr eltiwse_fq_kernel; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp index 9571b4e51984d1..8c1ef5d1c8f768 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp @@ -125,8 +125,8 @@ namespace { auto const &lhsBlockingDesc = lhs.getBlockingDesc(); auto const &rhsBlockingDesc = rhs.getBlockingDesc(); - bool lhsDefaultStrides, rhsDefaultStrides; - size_t lhsSize, rhsSize; + bool lhsDefaultStrides = false, rhsDefaultStrides = false; + size_t lhsSize = 0lu, rhsSize = 0lu; std::tie(lhsDefaultStrides, lhsSize) = isDefaultStrides(lhsBlockingDesc.getStrides(), lhs.getDims()); std::tie(rhsDefaultStrides, rhsSize) = isDefaultStrides(rhsBlockingDesc.getStrides(), rhs.getDims()); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp index 7f412e677fae60..fd59bb98c32a7e 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp +++ 
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp @@ -633,9 +633,15 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji for (int i = 0; i < p.len_; i++) { auto& post_op = p.entry_[i]; if (post_op.is_eltwise()) { + if (eltwise_injectors.size() <= eltwise_inj_idx + || eltwise_injectors[eltwise_inj_idx] == nullptr) + assert(!"Invalid eltwise injectors."); eltwise_injectors[eltwise_inj_idx]->compute_vector_range(vmm_val.getIdx(), vmm_val.getIdx() + 1); eltwise_inj_idx++; } else if (post_op.is_depthwise()) { + if (depthwise_injectors.size() <= depthwise_inj_idx + || depthwise_injectors[depthwise_inj_idx] == nullptr) + assert(!"Invalid depthwise injectors."); mov(reg_d_weights, reinterpret_cast(post_op.depthwise.weights_data)); mov(reg_d_bias, reinterpret_cast(post_op.depthwise.biases_data)); add(reg_d_weights, reg_oc_off); @@ -644,6 +650,9 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji depthwise_injectors[depthwise_inj_idx]->compute_vector_range(vmm_val.getIdx(), vmm_val.getIdx() + 1, reg_d_weights, reg_d_bias, is_broadcast); depthwise_inj_idx++; } else if (post_op.is_quantization()) { + if (quantization_injectors.size() <= quantization_inj_idx + || quantization_injectors[quantization_inj_idx] == nullptr) + assert(!"Invalid quantization injectors."); bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize; bool do_rounding = do_dequantization || dst_dt == memory::f32 || i != p.len_ - 1; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_scatter_update_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_scatter_update_node.cpp index 2d74c876773e40..a7401112833266 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_scatter_update_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_scatter_update_node.cpp @@ -289,6 +289,7 @@ void MKLDNNScatterUpdateNode::execute(mkldnn::stream strm) { SizeVector indicesDim = 
getParentEdgeAt(INDICES_ID)->getDesc().getDims(); size_t srcRank = srcDataDim.size(); int axis = 0; + std::string errorPrefix = std::string("'") + getTypeStr() + "'" + " layer with name '" + getName() + "'"; if (axisRelaxed) { auto &axisMemPtr = getParentEdgeAt(AXIS_ID)->getMemoryPtr(); uint8_t *axisPtr = reinterpret_cast(axisMemPtr->GetData()) + @@ -302,8 +303,8 @@ void MKLDNNScatterUpdateNode::execute(mkldnn::stream strm) { } if (axis >= static_cast(srcRank) || axis < (static_cast(srcRank) * - 1)) { - THROW_IE_EXCEPTION << "'" << getType() << "'" << " layer with name '" << getName() - << "' should have axis value in range [-r, r - 1], where r is the rank of input data"; + THROW_IE_EXCEPTION << errorPrefix + << " should have axis value in range [-r, r - 1], where r is the rank of input data"; } axis = axis < 0 ? (axis + srcRank) : axis; @@ -315,8 +316,8 @@ void MKLDNNScatterUpdateNode::execute(mkldnn::stream strm) { for (int i = start; i < end; i++) { int64_t idxValue = getIndicesValue(indicesPtr, i); if (idxValue >= static_cast(srcDimAxis) || idxValue < 0) { - THROW_IE_EXCEPTION << "'" << getType() << "'" << " layer with name '" << getName() - << "' have indices value that points to non-existing output tensor element"; + THROW_IE_EXCEPTION << errorPrefix + << " have indices value that points to non-existing output tensor element"; } } }); @@ -336,10 +337,13 @@ void MKLDNNScatterUpdateNode::execute(mkldnn::stream strm) { } } } + if (updateRank > expectUpdateShape.size()) + THROW_IE_EXCEPTION << errorPrefix << " cannot update shape. 
New rank: " + << updateRank << ", expected: " << expectUpdateShape.size(); for (size_t ru = 0; ru < updateRank; ru++) { if (updateDim[ru] != expectUpdateShape[ru]) { - THROW_IE_EXCEPTION << "'" << getType() << "'" << " layer with name '" << getName() - << "' do not have matched tensor shape relationship for input, indices and update"; + THROW_IE_EXCEPTION << errorPrefix + << " do not have matched tensor shape relationship for input, indices and update"; } } } @@ -370,8 +374,8 @@ void MKLDNNScatterUpdateNode::execute(mkldnn::stream strm) { break; } default: { - THROW_IE_EXCEPTION << "'" << getType() << "'" << " layer with name '" << getName() - << "' is not supported"; + THROW_IE_EXCEPTION << errorPrefix + << " is not supported"; } } } From 3797a28e65dcaab60888d57d8e62fcc0463d83c3 Mon Sep 17 00:00:00 2001 From: Ilya Znamenskiy Date: Thu, 10 Sep 2020 08:56:04 +0300 Subject: [PATCH 62/66] [IE CLDNN] Fully connected MMAD kernel optimizations (#2115) --- .../fully_connected_kernel_mmad.cpp | 62 ++++++++--- .../fully_connected_kernel_mmad.h | 7 ++ .../cl_kernels/fully_connected_gpu_MMAD.cl | 105 ++++++++++++++---- 3 files changed, 140 insertions(+), 34 deletions(-) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp index ceb6dc1e5ec9b9..b560f6e0503c1e 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp @@ -17,10 +17,6 @@ namespace kernel_selector { -namespace { - static const size_t sub_group_size = 8; -} // namespace - ParamsKey FullyConnectedKernelMMAD::GetSupportedKey() const { ParamsKey k; k.EnableInputDataType(Datatype::INT8); @@ -65,14 +61,32 @@ bool 
FullyConnectedKernelMMAD::Validate(const Params& params, const optional_par return true; } +FullyConnectedKernelMMAD::FullyConnectedTuningData FullyConnectedKernelMMAD::SetTuningParams(const fully_connected_params& params) const { + FullyConnectedTuningData tuning_data; + + const auto& input = params.inputs[0]; + + size_t feature_blocks_count = input.GetLayout() == DataLayout::bfyx && input.Feature().v % 32 != 0 ? + input.Feature().v / 32 : CeilDiv(input.Feature().v, 32); + + if (feature_blocks_count) + while (feature_blocks_count % (tuning_data.slm_div_factor * 2) == 0 && + (tuning_data.slm_div_factor * 2 <= params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size)) + tuning_data.slm_div_factor *= 2; + + tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size; + + return tuning_data; +} + FullyConnectedKernelMMAD::DispatchData FullyConnectedKernelMMAD::SetDefault(const fully_connected_params& params, int) const { + FullyConnectedTuningData tuning_data = SetTuningParams(params); auto runInfo = Parent::SetDefault(params); + const auto& output = params.output; - const auto& out = params.output; - - std::vector global = { Align(out.Feature().v, sub_group_size), out.Batch().v, 1 }; - auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo); + std::vector global = { Align(output.Feature().v, tuning_data.sub_group_size) * tuning_data.slm_div_factor, output.Batch().v, 1 }; + std::vector local = { tuning_data.work_group_size, 1, 1 }; runInfo.gws0 = global[0]; runInfo.gws1 = global[1]; @@ -87,12 +101,14 @@ FullyConnectedKernelMMAD::DispatchData FullyConnectedKernelMMAD::SetDefault(cons JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_params& params, const DispatchData& runInfo) const { + FullyConnectedTuningData tuning_data = SetTuningParams(params); + auto jit = Parent::GetJitConstants(params, runInfo); auto& input = params.inputs[0]; auto& weights = params.weights; - 
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size)); + jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size)); if (input.GetDims().size() == 5) { jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_INDEX(FILTER, f, 0, 0, 0)")); } else { @@ -137,13 +153,33 @@ JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_par jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", input.Feature().pitch * 32)); } + jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor)); + + size_t feature_blocks_count; + size_t temp_unroll_factor = 9, unroll_factor, full_unroll_factor; + if (input.GetLayout() == DataLayout::bfyx && input.Feature().v % 32 != 0) { + feature_blocks_count = input.Feature().v / 32; jit.AddConstant(MakeJitConstant("HAS_FEATURE_LEFTOVERS", true)); - jit.AddConstant(MakeJitConstant("FEATURE_BLOCKS_COUNT", input.Feature().v / 32)); } else { - jit.AddConstant(MakeJitConstant("FEATURE_BLOCKS_COUNT", CeilDiv(input.Feature().v, 32))); + feature_blocks_count = CeilDiv(input.Feature().v, 32); + } + + full_unroll_factor = feature_blocks_count / tuning_data.slm_div_factor; + + if (full_unroll_factor > 9) { + while (full_unroll_factor % temp_unroll_factor) + temp_unroll_factor--; + unroll_factor = temp_unroll_factor; + } else { + unroll_factor = full_unroll_factor; } + jit.AddConstant(MakeJitConstant("FEATURE_BLOCKS_COUNT", feature_blocks_count)); + jit.AddConstant(MakeJitConstant("UNROLL_FACTOR", unroll_factor)); + jit.AddConstant(MakeJitConstant("FULL_UNROLL_FACTOR", full_unroll_factor)); + jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size)); + jit.AddConstant(MakeJitConstant("MMAD_INPUT_SPATIAL_PITCH", input_x_pitch)); jit.AddConstant(MakeJitConstant("MMAD_INPUT_X_PITCH", input_x_pitch)); jit.AddConstant(MakeJitConstant("MMAD_INPUT_Y_PITCH", input_y_pitch)); @@ -158,7 +194,7 @@ JitConstants 
FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_par if (!params.fused_ops.empty()) { auto input_dt = GetActivationType(params); - FusedOpsConfiguration conf = { "", {"b", "f", "0", "0"}, "dequantized", input_dt, 1 }; + FusedOpsConfiguration conf = { "", {"batch", "feature", "0", "0"}, "dequantized", input_dt, 1 }; jit.Merge(MakeFusedOpsJitConstants(params, { conf })); } @@ -180,7 +216,7 @@ KernelsData FullyConnectedKernelMMAD::GetKernelsData(const Params& params, const options, input.GetLayout(), w_layout, - FORCE_PRIORITY_9, + FORCE_PRIORITY_7, static_cast(i)); if (!kd.empty()) { res.emplace_back(kd[0]); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.h index 8f906a0770e1e7..704b29173006a5 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.h @@ -29,6 +29,12 @@ class FullyConnectedKernelMMAD : public FullyConnectedKernelBase { KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; ParamsKey GetSupportedKey() const override; + struct FullyConnectedTuningData { + const size_t sub_group_size = 8; + size_t slm_div_factor = 1; + size_t work_group_size = 1; + }; + protected: JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override; DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const override; @@ -38,5 +44,6 @@ class FullyConnectedKernelMMAD : public FullyConnectedKernelBase { FusedOpType::ACTIVATION }; } bool Validate(const Params& params, const optional_params& options) const override; + FullyConnectedTuningData SetTuningParams(const 
fully_connected_params& params) const; }; } // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_MMAD.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_MMAD.cl index 43789cedaaf99b..95fc65da6805d8 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_MMAD.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_MMAD.cl @@ -37,25 +37,35 @@ KERNEL(fully_connected_gpu_MMAD)( #endif ) { -#if OUTPUT_BATCH_NUM == 1 - const uint f = (uint)get_global_id(0); - const uint b = 0; -#else - const uint f = (uint)get_global_id(0); - const uint b = (uint)get_global_id(1); -#endif + const uint lid0 = (uint)get_local_id(0); + const uint feature_per_wg = (uint)get_local_size(0) / SLM_DIV_FACTOR; + const uint feature = (uint)get_group_id(0) * feature_per_wg + (uint)get_global_id(0) % feature_per_wg; + const uint feature_block = lid0 / feature_per_wg; + const uint batch = (uint)get_global_id(1); int dotProd = 0; - const uint filter_offset = FILTER_GET_OFFSET(f); + const uint filter_offset = FILTER_GET_OFFSET(feature); #if INPUT0_DIMS == 5 - const uint input_offset = INPUT0_GET_INDEX(b, 0, 0, 0, 0); + const uint input_offset = INPUT0_GET_INDEX(batch, 0, 0, 0, 0); #else - const uint input_offset = INPUT0_GET_INDEX(b, 0, 0, 0); + const uint input_offset = INPUT0_GET_INDEX(batch, 0, 0, 0); +#endif + +#if SLM_DIV_FACTOR > 1 + __local int partial_summ[WORK_GROUP_SIZE]; #endif #if SPATIAL_MAJOR - for (uint k = 0; k < FEATURE_BLOCKS_COUNT; ++k) { + +#if FULL_UNROLL_FACTOR < 2 + for (uint k = feature_block * FULL_UNROLL_FACTOR; k < (feature_block + 1) * FULL_UNROLL_FACTOR; ++k) +#elif UNROLL_FACTOR == FULL_UNROLL_FACTOR + uint k = feature_block * FULL_UNROLL_FACTOR; +#else + for (uint k = feature_block * FULL_UNROLL_FACTOR; k + UNROLL_FACTOR <= (feature_block + 1) * FULL_UNROLL_FACTOR; k += 
UNROLL_FACTOR) +#endif + { # if !SPLIT_SPATIAL for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) { # else @@ -73,7 +83,15 @@ KERNEL(fully_connected_gpu_MMAD)( for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) { const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y; # endif - for (uint k = 0; k < FEATURE_BLOCKS_COUNT; ++k) { + +#if FULL_UNROLL_FACTOR < 2 + for (uint k = feature_block * FULL_UNROLL_FACTOR; k < (feature_block + 1) * FULL_UNROLL_FACTOR; ++k) +#elif UNROLL_FACTOR == FULL_UNROLL_FACTOR + uint k = feature_block * FULL_UNROLL_FACTOR; +#else + for (uint k = feature_block * FULL_UNROLL_FACTOR; k + UNROLL_FACTOR <= (feature_block + 1) * FULL_UNROLL_FACTOR; k += UNROLL_FACTOR) +#endif + { #endif #if !SPLIT_SPATIAL uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + k * MMAD_INPUT_FBLOCK_PITCH; @@ -82,10 +100,12 @@ KERNEL(fully_connected_gpu_MMAD)( #endif uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + k * MMAD_FILTER_FBLOCK_PITCH; +#if UNROLL_FACTOR < 2 uint input_data_u = intel_sub_group_block_read((const __global uint*)(input + input_idx)); INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u); - INPUT_PACKED_TYPE_8 activations; //activations of all lanes + INPUT_PACKED_TYPE_8 activations; + activations.s0 = sub_group_broadcast(input_data, 0); activations.s1 = sub_group_broadcast(input_data, 1); activations.s2 = sub_group_broadcast(input_data, 2); @@ -99,11 +119,50 @@ KERNEL(fully_connected_gpu_MMAD)( FILTER_PACKED_TYPE_8 weights_data = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u); dotProd = MMAD_8(activations, weights_data, dotProd); +#else + INPUT_PACKED_TYPE input_data[UNROLL_FACTOR]; + FILTER_PACKED_TYPE_8 weights_data[UNROLL_FACTOR]; + + __attribute__((opencl_unroll_hint)) + for (uint kb = 0; kb < UNROLL_FACTOR; kb++) { + input_data[kb] = AS_TYPE(INPUT_PACKED_TYPE, intel_sub_group_block_read((const __global uint*)(input + + input_idx + kb * 
MMAD_INPUT_FBLOCK_PITCH))); + + uint8 weights_data_u0 = intel_sub_group_block_read8((const __global uint*)(weights + filter_idx + kb * MMAD_FILTER_FBLOCK_PITCH)); + weights_data[kb] = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u0); + } + + __attribute__((opencl_unroll_hint)) + for (uint kb = 0; kb < UNROLL_FACTOR; kb++) { + INPUT_PACKED_TYPE_8 in; + + in.s0 = sub_group_broadcast(input_data[kb], 0); + in.s1 = sub_group_broadcast(input_data[kb], 1); + in.s2 = sub_group_broadcast(input_data[kb], 2); + in.s3 = sub_group_broadcast(input_data[kb], 3); + in.s4 = sub_group_broadcast(input_data[kb], 4); + in.s5 = sub_group_broadcast(input_data[kb], 5); + in.s6 = sub_group_broadcast(input_data[kb], 6); + in.s7 = sub_group_broadcast(input_data[kb], 7); + + dotProd = MMAD_8(in, weights_data[kb], dotProd); + } +#endif // UNROLL_FACTOR < 2 } } +#if SLM_DIV_FACTOR > 1 + partial_summ[lid0] = dotProd; + barrier(CLK_LOCAL_MEM_FENCE); + + if (feature_block == 0) { + __attribute__((opencl_unroll_hint)) + for (uint i = 1; i < SLM_DIV_FACTOR; i++) + dotProd += partial_summ[lid0 % feature_per_wg + i * feature_per_wg]; +#endif // SLM_DIV_FACTOR > 1 + #if HAS_FEATURE_LEFTOVERS - const uint lid = get_sub_group_local_id(); + const uint sglid = get_sub_group_local_id(); #if SPATIAL_MAJOR #if !SPLIT_SPATIAL for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) { @@ -128,14 +187,14 @@ KERNEL(fully_connected_gpu_MMAD)( #if !SPLIT_SPATIAL uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH; #else // !SPLIT_SPATIAL - uint input_idx = input_offset + FEATURE_BLOCK_COUNT * INPUT0_FEATURE_PITCH + zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH; + uint input_idx = input_offset + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH + zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH; #endif // !SPLIT_SPATIAL uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + 
FEATURE_BLOCKS_COUNT * MMAD_FILTER_FBLOCK_PITCH; MAKE_VECTOR_TYPE(INPUT0_TYPE, 4) input_data_u = (0, 0, 0, 0); for (uint i = 0; i < 4; i++) { - if (FEATURE_BLOCKS_COUNT*32 + lid*4 + i < INPUT0_FEATURE_NUM) { - input_data_u[i] = input[input_idx + (lid*4 + i)*INPUT0_FEATURE_PITCH]; + if (FEATURE_BLOCKS_COUNT * 32 + sglid * 4 + i < INPUT0_FEATURE_NUM) { + input_data_u[i] = input[input_idx + (sglid * 4 + i) * INPUT0_FEATURE_PITCH]; } } INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u); @@ -157,14 +216,14 @@ KERNEL(fully_connected_gpu_MMAD)( } #endif // HAS_FEATURE_LEFTOVERS - if (OUTPUT_FEATURE_NUM % SUB_GROUP_SIZE != 0 && f >= OUTPUT_FEATURE_NUM) + if (OUTPUT_FEATURE_NUM % SUB_GROUP_SIZE != 0 && feature >= OUTPUT_FEATURE_NUM) return; #if BIAS_TERM #if BIAS_PER_OUTPUT - const uint bias_index = GET_DATA_INDEX(BIAS, b, f, 0, 0); + const uint bias_index = GET_DATA_INDEX(BIAS, batch, feature, 0, 0); #elif BIAS_PER_OFM - const uint bias_index = f; + const uint bias_index = feature; #endif float dequantized = (float)dotProd + biases[bias_index]; @@ -172,7 +231,7 @@ KERNEL(fully_connected_gpu_MMAD)( float dequantized = (float)dotProd; #endif - const uint out_idx = OUTPUT_GET_INDEX(b, f, 0, 0); + const uint out_idx = OUTPUT_GET_INDEX(batch, feature, 0, 0); #if HAS_FUSED_OPS FUSED_OPS; @@ -182,6 +241,10 @@ KERNEL(fully_connected_gpu_MMAD)( #else output[out_idx] = TO_OUTPUT_TYPE(dequantized); #endif + +#if SLM_DIV_FACTOR > 1 + } +#endif } #undef INPUT_PACKED_TYPE_8 From 362080b5be23ab0191e50f1d624912cdf8597bc0 Mon Sep 17 00:00:00 2001 From: Maxim Kurin Date: Thu, 10 Sep 2020 11:55:32 +0300 Subject: [PATCH 63/66] [IE][VPU][Custom CL]: Fix binary convolution3x3 3d transaction (#2144) * Fix binary_convolution3x3.cl kernel & test --- .../src/vpu/custom_kernels/binary_convolution3x3.cl | 2 +- .../vpu/common/layers/myriad_layers_custom_test.hpp | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git 
a/inference-engine/src/vpu/custom_kernels/binary_convolution3x3.cl b/inference-engine/src/vpu/custom_kernels/binary_convolution3x3.cl index 7c4958663dcfea..a81bb4dafbe883 100644 --- a/inference-engine/src/vpu/custom_kernels/binary_convolution3x3.cl +++ b/inference-engine/src/vpu/custom_kernels/binary_convolution3x3.cl @@ -53,7 +53,7 @@ __kernel void binary_convolution( DH * IW - IW, // src_line_stride 0, // dst_line_stride IC / GC, // num planes - IH * IW - 3 * IW, // src plane stride + IH * IW - 3 * DH * IW, // src plane stride 0, // dst plane stride 0); wait_group_events(1, &e); diff --git a/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_custom_test.hpp b/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_custom_test.hpp index 20c18a2496028a..fb592d8adcde56 100644 --- a/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_custom_test.hpp +++ b/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_custom_test.hpp @@ -1057,10 +1057,6 @@ TEST_P(myriadLayersTestsBinaryConvolution_smoke, BinaryConvolution) { } _config[InferenceEngine::MYRIAD_CUSTOM_LAYERS] = customConfig; - if (kernel.x == 3 && kernel.y == 3 && dilations == 2) { - GTEST_SKIP() << "Computing wrong after hoisting"; - } - SetInputTensor(dims); auto dimsOutput = dims; dimsOutput.h = (dims.h) / strides; From 926be8356887567e1f58a7e55bc34a01cc660ed1 Mon Sep 17 00:00:00 2001 From: Mateusz Tabaka Date: Thu, 10 Sep 2020 11:27:12 +0200 Subject: [PATCH 64/66] =?UTF-8?q?Add=20support=20for=20custom=20onnx=20ope?= =?UTF-8?q?rators:=20DetectionOutput,=20Normalize=20and=E2=80=A6=20(#2064)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../include/onnx_import/core/model.hpp | 2 + .../org.openvinotoolkit/detection_output.hpp | 38 +++++ .../fake_quantize.hpp | 0 .../op/org.openvinotoolkit/normalize.hpp | 37 +++++ .../op/org.openvinotoolkit/prior_box.hpp | 38 +++++ 
.../include/onnx_import/ops_bridge.hpp | 2 + .../frontend/onnx_import/src/core/graph.cpp | 7 +- .../frontend/onnx_import/src/core/model.cpp | 12 +- ngraph/frontend/onnx_import/src/core/node.cpp | 8 +- ngraph/frontend/onnx_import/src/onnx.cpp | 26 ++++ .../org.openvinotoolkit/detection_output.cpp | 104 +++++++++++++ .../fake_quantize.cpp | 2 +- .../src/op/org.openvinotoolkit/normalize.cpp | 96 ++++++++++++ .../src/op/org.openvinotoolkit/prior_box.cpp | 92 +++++++++++ .../frontend/onnx_import/src/ops_bridge.cpp | 18 ++- .../onnx_import/src/utils/convpool.cpp | 4 +- ...onv2d_dilation_assym_pads_strides.prototxt | 4 - ngraph/test/models/onnx/conv3d_bias.prototxt | 4 - .../models/onnx/detection_output.prototxt | 140 +++++++++++++++++ ngraph/test/models/onnx/normalize.prototxt | 83 ++++++++++ ngraph/test/models/onnx/prior_box.prototxt | 147 ++++++++++++++++++ .../fake_quantize_const_inputs.prototxt | 1 + .../fake_quantize_nonconst_inputs.prototxt | 1 + ngraph/test/onnx/onnx_import.in.cpp | 82 ++++++++++ ngraph/test/op_is.cpp | 9 ++ .../runtime/interpreter/int_executable.cpp | 3 +- .../runtime/interpreter/int_executable.hpp | 11 ++ ngraph/test/runtime/opset0_tbl.hpp | 1 + 28 files changed, 945 insertions(+), 27 deletions(-) create mode 100644 ngraph/frontend/onnx_import/include/onnx_import/op/org.openvinotoolkit/detection_output.hpp rename ngraph/frontend/onnx_import/include/onnx_import/op/{ => org.openvinotoolkit}/fake_quantize.hpp (100%) create mode 100644 ngraph/frontend/onnx_import/include/onnx_import/op/org.openvinotoolkit/normalize.hpp create mode 100644 ngraph/frontend/onnx_import/include/onnx_import/op/org.openvinotoolkit/prior_box.hpp create mode 100644 ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/detection_output.cpp rename ngraph/frontend/onnx_import/src/op/{ => org.openvinotoolkit}/fake_quantize.cpp (96%) create mode 100644 ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/normalize.cpp create mode 100644 
ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/prior_box.cpp create mode 100644 ngraph/test/models/onnx/detection_output.prototxt create mode 100644 ngraph/test/models/onnx/normalize.prototxt create mode 100644 ngraph/test/models/onnx/prior_box.prototxt diff --git a/ngraph/frontend/onnx_import/include/onnx_import/core/model.hpp b/ngraph/frontend/onnx_import/include/onnx_import/core/model.hpp index 64c602852cbfb4..a62bfe82af901c 100644 --- a/ngraph/frontend/onnx_import/include/onnx_import/core/model.hpp +++ b/ngraph/frontend/onnx_import/include/onnx_import/core/model.hpp @@ -27,6 +27,8 @@ namespace ngraph { namespace onnx_import { + std::string get_node_domain(const ONNX_NAMESPACE::NodeProto& node_proto); + class Model { public: diff --git a/ngraph/frontend/onnx_import/include/onnx_import/op/org.openvinotoolkit/detection_output.hpp b/ngraph/frontend/onnx_import/include/onnx_import/op/org.openvinotoolkit/detection_output.hpp new file mode 100644 index 00000000000000..6365972774f9ef --- /dev/null +++ b/ngraph/frontend/onnx_import/include/onnx_import/op/org.openvinotoolkit/detection_output.hpp @@ -0,0 +1,38 @@ +//***************************************************************************** +// Copyright 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** + +#pragma once + +#include "ngraph/node.hpp" +#include "onnx_import/core/node.hpp" + +namespace ngraph +{ + namespace onnx_import + { + namespace op + { + namespace set_1 + { + OutputVector detection_output(const Node& node); + + } // namespace set_1 + + } // namespace op + + } // namespace onnx_import + +} // namespace ngraph diff --git a/ngraph/frontend/onnx_import/include/onnx_import/op/fake_quantize.hpp b/ngraph/frontend/onnx_import/include/onnx_import/op/org.openvinotoolkit/fake_quantize.hpp similarity index 100% rename from ngraph/frontend/onnx_import/include/onnx_import/op/fake_quantize.hpp rename to ngraph/frontend/onnx_import/include/onnx_import/op/org.openvinotoolkit/fake_quantize.hpp diff --git a/ngraph/frontend/onnx_import/include/onnx_import/op/org.openvinotoolkit/normalize.hpp b/ngraph/frontend/onnx_import/include/onnx_import/op/org.openvinotoolkit/normalize.hpp new file mode 100644 index 00000000000000..7bef385140f2cd --- /dev/null +++ b/ngraph/frontend/onnx_import/include/onnx_import/op/org.openvinotoolkit/normalize.hpp @@ -0,0 +1,37 @@ +//***************************************************************************** +// Copyright 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** + +#pragma once + +#include "ngraph/node.hpp" +#include "onnx_import/core/node.hpp" + +namespace ngraph +{ + namespace onnx_import + { + namespace op + { + namespace set_1 + { + OutputVector normalize(const Node& node); + + } // namespace set_1 + } // namespace op + + } // namespace onnx_import + +} // namespace ngraph diff --git a/ngraph/frontend/onnx_import/include/onnx_import/op/org.openvinotoolkit/prior_box.hpp b/ngraph/frontend/onnx_import/include/onnx_import/op/org.openvinotoolkit/prior_box.hpp new file mode 100644 index 00000000000000..d5e5696a54c8d4 --- /dev/null +++ b/ngraph/frontend/onnx_import/include/onnx_import/op/org.openvinotoolkit/prior_box.hpp @@ -0,0 +1,38 @@ +//***************************************************************************** +// Copyright 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** + +#pragma once + +#include "ngraph/node.hpp" +#include "onnx_import/core/node.hpp" + +namespace ngraph +{ + namespace onnx_import + { + namespace op + { + namespace set_1 + { + OutputVector prior_box(const Node& node); + + } // namespace set_1 + + } // namespace op + + } // namespace onnx_import + +} // namespace ngraph diff --git a/ngraph/frontend/onnx_import/include/onnx_import/ops_bridge.hpp b/ngraph/frontend/onnx_import/include/onnx_import/ops_bridge.hpp index 3744b720420e0b..df6407f67233ae 100644 --- a/ngraph/frontend/onnx_import/include/onnx_import/ops_bridge.hpp +++ b/ngraph/frontend/onnx_import/include/onnx_import/ops_bridge.hpp @@ -129,6 +129,8 @@ namespace ngraph const std::string& domain); }; + const std::string OPENVINO_ONNX_DOMAIN = "org.openvinotoolkit"; + } // namespace onnx_import } // namespace ngraph diff --git a/ngraph/frontend/onnx_import/src/core/graph.cpp b/ngraph/frontend/onnx_import/src/core/graph.cpp index 6ae9a52a271c73..4897d52f2d58cb 100644 --- a/ngraph/frontend/onnx_import/src/core/graph.cpp +++ b/ngraph/frontend/onnx_import/src/core/graph.cpp @@ -46,11 +46,6 @@ namespace ngraph return result; } - static std::string get_node_domain(const ONNX_NAMESPACE::NodeProto& node_proto) - { - return (node_proto.domain().empty() ? "" : node_proto.domain()); - } - /// \brief Gets the operator represented by provided node unique identificator. /// /// \param[in] node_proto The node protobuf representation object. 
@@ -142,7 +137,7 @@ namespace ngraph node_proto); // If a node from an unregistered domain is detected, try registering that // domain - m_model->enable_opset_domain(detail::get_node_domain(node_proto)); + m_model->enable_opset_domain(get_node_domain(node_proto)); } } diff --git a/ngraph/frontend/onnx_import/src/core/model.cpp b/ngraph/frontend/onnx_import/src/core/model.cpp index 2dfce97c00cf78..f079f7ee22aa0a 100644 --- a/ngraph/frontend/onnx_import/src/core/model.cpp +++ b/ngraph/frontend/onnx_import/src/core/model.cpp @@ -24,6 +24,11 @@ namespace ngraph { namespace onnx_import { + std::string get_node_domain(const ONNX_NAMESPACE::NodeProto& node_proto) + { + return node_proto.has_domain() ? node_proto.domain() : ""; + } + Model::Model(const ONNX_NAMESPACE::ModelProto& model_proto) : m_model_proto{&model_proto} { @@ -32,9 +37,8 @@ namespace ngraph // unknown or invalid. for (const auto& id : m_model_proto->opset_import()) { - m_opset.emplace(id.domain(), - OperatorsBridge::get_operator_set( - (id.domain() == "ai.onnx" ? "" : id.domain()), id.version())); + auto domain = id.has_domain() ? id.domain() : ""; + m_opset.emplace(domain, OperatorsBridge::get_operator_set(domain, id.version())); } // onnx.proto(.3): the empty string ("") for domain or absence of opset_import field // implies the operator set that is defined as part of the ONNX specification. 
@@ -63,7 +67,7 @@ namespace ngraph bool Model::is_operator_available(const ONNX_NAMESPACE::NodeProto& node_proto) const { - const auto dm = m_opset.find(node_proto.domain()); + const auto dm = m_opset.find(get_node_domain(node_proto)); if (dm == std::end(m_opset)) { return false; diff --git a/ngraph/frontend/onnx_import/src/core/node.cpp b/ngraph/frontend/onnx_import/src/core/node.cpp index 64ef02856375eb..5e2a65082e2709 100644 --- a/ngraph/frontend/onnx_import/src/core/node.cpp +++ b/ngraph/frontend/onnx_import/src/core/node.cpp @@ -33,6 +33,8 @@ namespace ngraph Impl(const ONNX_NAMESPACE::NodeProto& node_proto, const Graph& graph) : m_node_proto{&node_proto} + , m_name{node_proto.has_name() ? node_proto.name() : ""} + , m_domain{get_node_domain(node_proto)} , m_graph{&graph} , m_attributes{std::begin(node_proto.attribute()), std::end(node_proto.attribute())} , m_output_names{std::begin(node_proto.output()), std::end(node_proto.output())} @@ -65,6 +67,8 @@ namespace ngraph private: const ONNX_NAMESPACE::NodeProto* m_node_proto; + std::string m_name; + std::string m_domain; const Graph* m_graph; std::vector m_attributes; std::vector> m_output_names; @@ -74,9 +78,9 @@ namespace ngraph const ONNX_NAMESPACE::NodeProto& Node::Impl::node_proto() const { return *m_node_proto; } const Graph& Node::Impl::graph() const { return *m_graph; } const std::vector& Node::Impl::attributes() const { return m_attributes; } - const std::string& Node::Impl::domain() const { return m_node_proto->domain(); } + const std::string& Node::Impl::domain() const { return m_domain; } const std::string& Node::Impl::op_type() const { return m_node_proto->op_type(); } - const std::string& Node::Impl::name() const { return m_node_proto->name(); } + const std::string& Node::Impl::name() const { return m_name; } const std::vector>& Node::Impl::get_output_names() const { diff --git a/ngraph/frontend/onnx_import/src/onnx.cpp b/ngraph/frontend/onnx_import/src/onnx.cpp index 4705e83699f4a0..c398e49a729e01 
100644 --- a/ngraph/frontend/onnx_import/src/onnx.cpp +++ b/ngraph/frontend/onnx_import/src/onnx.cpp @@ -73,6 +73,29 @@ namespace ngraph } // namespace error + static const std::vector legacy_ops_to_fixup = { + "FakeQuantize", "DetectionOutput", "Normalize", "PriorBox"}; + + // There are some models with custom OPs (list above) that have the default domain set. + // So in order to load the models, we need to overwrite the OPs' domain with the one they're + // registered under + void fixup_legacy_operators(ONNX_NAMESPACE::GraphProto* graph_proto) + { + for (auto& node : *graph_proto->mutable_node()) + { + auto it = std::find( + legacy_ops_to_fixup.begin(), legacy_ops_to_fixup.end(), node.op_type()); + if (it != legacy_ops_to_fixup.end()) + { + if (!node.has_domain() || node.domain().empty() || + node.domain() == "ai.onnx") + { + node.set_domain(OPENVINO_ONNX_DOMAIN); + } + } + } + } + std::shared_ptr convert_to_ng_function(const ONNX_NAMESPACE::ModelProto& model_proto) { @@ -119,6 +142,9 @@ namespace ngraph } #endif } + + detail::fixup_legacy_operators(model_proto.mutable_graph()); + return detail::convert_to_ng_function(model_proto); } diff --git a/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/detection_output.cpp b/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/detection_output.cpp new file mode 100644 index 00000000000000..336b498641c1d2 --- /dev/null +++ b/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/detection_output.cpp @@ -0,0 +1,104 @@ +//***************************************************************************** +// Copyright 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +#include "ngraph/op/detection_output.hpp" +#include "ngraph/node.hpp" +#include "onnx_import/core/node.hpp" +#include "onnx_import/default_opset.hpp" +#include "onnx_import/op/org.openvinotoolkit/detection_output.hpp" + +namespace ngraph +{ + namespace onnx_import + { + namespace op + { + namespace set_1 + { + OutputVector detection_output(const Node& node) + { + auto inputs = node.get_ng_inputs(); + + auto box_logits = inputs[0]; + auto class_preds = inputs[1]; + auto proposals = inputs[2]; + + ngraph::op::DetectionOutputAttrs attrs; + attrs.num_classes = node.get_attribute_value("num_classes"); + attrs.background_label_id = + node.get_attribute_value("background_label_id", 0); + attrs.top_k = node.get_attribute_value("top_k", -1); + attrs.variance_encoded_in_target = + node.get_attribute_value("variance_encoded_in_target", 0); + // spec says keep_top_k is an array of ints, but some models use a single int + // also mkldnn expects single integer + attrs.keep_top_k = { + static_cast(node.get_attribute_value("keep_top_k", 1))}; + + auto code_type = node.get_attribute_value( + "code_type", std::string{"caffe.PriorBoxParameter.CORNER"}); + // possible values are "caffe.PriorBoxParameter.CENTER_SIZE", + // "caffe.PriorBoxParameter.CORNER" + // but we can just have "CENTER_SIZE" or "CORNER" strings here, so we need to + // handle that case + if (code_type.find("caffe.PriorBoxParameter.") == std::string::npos) + { + code_type = "caffe.PriorBoxParameter." 
+ code_type; + } + attrs.code_type = code_type; + attrs.share_location = node.get_attribute_value("share_location", 1); + attrs.nms_threshold = node.get_attribute_value("nms_threshold"); + attrs.confidence_threshold = + node.get_attribute_value("confidence_threshold", 0); + attrs.clip_after_nms = node.get_attribute_value("clip_after_nms", 0); + attrs.clip_before_nms = node.get_attribute_value("clip_before_nms", 0); + attrs.decrease_label_id = + node.get_attribute_value("decrease_label_id", 0); + // TODO: per spec, normalized by default should be 0, but in MO it's 1. + attrs.normalized = node.get_attribute_value("normalized", 1); + attrs.input_width = node.get_attribute_value("input_width", 1); + attrs.input_height = node.get_attribute_value("input_height", 1); + attrs.objectness_score = node.get_attribute_value("objectness_score", 0); + + if (inputs.size() == 3) + { + return {std::make_shared( + box_logits, class_preds, proposals, attrs)}; + } + else if (inputs.size() == 5) + { + auto aux_class_preds = inputs[3]; + auto aux_box_preds = inputs[4]; + return {std::make_shared(box_logits, + class_preds, + proposals, + aux_class_preds, + aux_box_preds, + attrs)}; + } + else + { + NGRAPH_CHECK(false, "Invalid number of inputs"); + } + } + + } // namespace set_1 + + } // namespace op + + } // namespace onnx_import + +} // namespace ngraph diff --git a/ngraph/frontend/onnx_import/src/op/fake_quantize.cpp b/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/fake_quantize.cpp similarity index 96% rename from ngraph/frontend/onnx_import/src/op/fake_quantize.cpp rename to ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/fake_quantize.cpp index 4e4444d34d5173..044f095d4be008 100644 --- a/ngraph/frontend/onnx_import/src/op/fake_quantize.cpp +++ b/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/fake_quantize.cpp @@ -16,8 +16,8 @@ #include -#include "fake_quantize.hpp" #include "onnx_import/default_opset.hpp" +#include 
"onnx_import/op/org.openvinotoolkit/fake_quantize.hpp" namespace ngraph { diff --git a/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/normalize.cpp b/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/normalize.cpp new file mode 100644 index 00000000000000..226658d7f55e82 --- /dev/null +++ b/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/normalize.cpp @@ -0,0 +1,96 @@ +//***************************************************************************** +// Copyright 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** + +#include "onnx_import/op/org.openvinotoolkit/normalize.hpp" +#include "ngraph/op/normalize_l2.hpp" +#include "onnx_import/default_opset.hpp" +#include "onnx_import/utils/common.hpp" + +namespace ngraph +{ + namespace onnx_import + { + namespace op + { + namespace set_1 + { + OutputVector normalize(const Node& node) + { + auto inputs = node.get_ng_inputs(); + NGRAPH_CHECK(inputs.size() == 2, "Invalid number of inputs"); + + auto data = inputs[0]; + float eps = node.get_attribute_value("eps", 0); + int64_t across_spatial = node.get_attribute_value("across_spatial", 0); + int64_t channel_shared = node.get_attribute_value("channel_shared", 0); + + std::shared_ptr weights; + if (channel_shared) + { + NGRAPH_CHECK( + ngraph::op::is_constant(inputs[1].get_node()), + "Weights input must be a constant if channel_shared is set to 1"); + const auto& shape = inputs[1].get_partial_shape(); + NGRAPH_CHECK( + shape.is_static() && shape.rank().get_length() == 1, + "Weights rank must be equal to 1 if channel_shared is set to 1"); + weights = inputs[1].get_node_shared_ptr(); + } + else + { + std::vector weights_shape{1}; + const auto& data_shape = inputs[0].get_partial_shape(); + if (data_shape[1].is_static()) + { + weights_shape.push_back(data_shape[1].get_length()); + } + else + { + weights_shape.push_back(0); + } + for (size_t i = 2; i < data_shape.rank().get_length(); ++i) + { + weights_shape.push_back(1); + } + auto new_shape = std::make_shared( + element::i64, Shape{weights_shape.size()}, weights_shape); + weights = + std::make_shared(inputs[1], new_shape, true); + } + + std::shared_ptr axes; + if (!across_spatial) + { + axes = std::make_shared( + element::i64, Shape{1}, std::vector{1}); + } + else + { + axes = common::get_monotonic_range_along_node_rank(data, 1); + } + + return {std::make_shared( + std::make_shared( + data, axes, eps, ngraph::op::EpsMode::ADD), + weights)}; + } + + } // 
namespace set_1 + } // namespace op + + } // namespace onnx_import + +} // namespace ngraph diff --git a/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/prior_box.cpp b/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/prior_box.cpp new file mode 100644 index 00000000000000..33ae3dc25a4b97 --- /dev/null +++ b/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/prior_box.cpp @@ -0,0 +1,92 @@ +//***************************************************************************** +// Copyright 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** + +#include "ngraph/op/prior_box.hpp" +#include "ngraph/node.hpp" +#include "onnx_import/core/node.hpp" +#include "onnx_import/default_opset.hpp" +#include "onnx_import/op/org.openvinotoolkit/prior_box.hpp" + +namespace ngraph +{ + namespace onnx_import + { + namespace op + { + namespace detail + { + namespace + { + std::shared_ptr + make_slice(std::shared_ptr node, int64_t start, int64_t end) + { + return std::make_shared( + node, + default_opset::Constant::create( + element::i64, Shape{1}, std::vector{start}), + default_opset::Constant::create( + element::i64, Shape{1}, std::vector{end}), + std::vector{0}, // begin mask + std::vector{0}); // end mask + } + } + } // detail + + namespace set_1 + { + OutputVector prior_box(const Node& node) + { + auto inputs = node.get_ng_inputs(); + NGRAPH_CHECK(inputs.size() == 2, "Invalid number of inputs"); + + auto output_shape = std::make_shared(inputs[0]); + auto image_shape = std::make_shared(inputs[1]); + auto output_shape_slice = detail::make_slice(output_shape, 2, 4); + auto image_shape_slice = detail::make_slice(image_shape, 2, 4); + + ngraph::op::PriorBoxAttrs attrs; + attrs.min_size = node.get_attribute_value>("min_size", {}); + attrs.max_size = node.get_attribute_value>("max_size", {}); + attrs.aspect_ratio = + node.get_attribute_value>("aspect_ratio", {}); + attrs.flip = node.get_attribute_value("flip", 0); + attrs.clip = node.get_attribute_value("clip", 0); + attrs.step = node.get_attribute_value("step", 0); + attrs.offset = node.get_attribute_value("offset", 0); + attrs.variance = node.get_attribute_value>("variance", {}); + attrs.scale_all_sizes = node.get_attribute_value("scale_all_sizes", 1); + attrs.fixed_ratio = + node.get_attribute_value>("fixed_ratio", {}); + attrs.fixed_size = + node.get_attribute_value>("fixed_size", {}); + attrs.density = node.get_attribute_value>("density", {}); + + auto axes = 
default_opset::Constant::create( + element::i64, Shape{1}, std::vector{0}); + + return {std::make_shared( + std::make_shared( + output_shape_slice, image_shape_slice, attrs), + axes)}; + } + + } // namespace set_1 + + } // namespace op + + } // namespace onnx_import + +} // namespace ngraph diff --git a/ngraph/frontend/onnx_import/src/ops_bridge.cpp b/ngraph/frontend/onnx_import/src/ops_bridge.cpp index 97b62622c06745..111443ad3c49b2 100644 --- a/ngraph/frontend/onnx_import/src/ops_bridge.cpp +++ b/ngraph/frontend/onnx_import/src/ops_bridge.cpp @@ -57,7 +57,6 @@ #include "onnx_import/op/exp.hpp" #include "onnx_import/op/expand.hpp" #include "onnx_import/op/eye_like.hpp" -#include "onnx_import/op/fake_quantize.hpp" #include "onnx_import/op/flatten.hpp" #include "onnx_import/op/floor.hpp" #include "onnx_import/op/gather.hpp" @@ -144,6 +143,11 @@ #include "onnx_import/op/xor.hpp" #include "onnx_import/ops_bridge.hpp" +#include "onnx_import/op/org.openvinotoolkit/detection_output.hpp" +#include "onnx_import/op/org.openvinotoolkit/fake_quantize.hpp" +#include "onnx_import/op/org.openvinotoolkit/normalize.hpp" +#include "onnx_import/op/org.openvinotoolkit/prior_box.hpp" + namespace ngraph { namespace onnx_import @@ -249,6 +253,9 @@ namespace ngraph #define REGISTER_OPERATOR(name_, ver_, fn_) \ m_map[""][name_].emplace(ver_, std::bind(op::set_##ver_::fn_, std::placeholders::_1)) +#define REGISTER_OPERATOR_WITH_DOMAIN(domain_, name_, ver_, fn_) \ + m_map[domain_][name_].emplace(ver_, std::bind(op::set_##ver_::fn_, std::placeholders::_1)) + OperatorsBridge::OperatorsBridge() { REGISTER_OPERATOR("Abs", 1, abs); @@ -399,11 +406,16 @@ namespace ngraph REGISTER_OPERATOR("Where", 1, where); REGISTER_OPERATOR("Xor", 1, logical_xor); - // TODO Change the domain - m_map[""]["FakeQuantize"].emplace(1, op::set_1::fake_quantize); + // custom OPs + REGISTER_OPERATOR_WITH_DOMAIN(OPENVINO_ONNX_DOMAIN, "FakeQuantize", 1, fake_quantize); + REGISTER_OPERATOR_WITH_DOMAIN( + 
OPENVINO_ONNX_DOMAIN, "DetectionOutput", 1, detection_output); + REGISTER_OPERATOR_WITH_DOMAIN(OPENVINO_ONNX_DOMAIN, "PriorBox", 1, prior_box); + REGISTER_OPERATOR_WITH_DOMAIN(OPENVINO_ONNX_DOMAIN, "Normalize", 1, normalize); } #undef REGISTER_OPERATOR +#undef REGISTER_OPERATOR_WITH_DOMAIN } // namespace onnx_import } // namespace ngraph diff --git a/ngraph/frontend/onnx_import/src/utils/convpool.cpp b/ngraph/frontend/onnx_import/src/utils/convpool.cpp index 5282838a228414..b503c45613fa76 100644 --- a/ngraph/frontend/onnx_import/src/utils/convpool.cpp +++ b/ngraph/frontend/onnx_import/src/utils/convpool.cpp @@ -111,10 +111,10 @@ namespace ngraph {"SAME_UPPER", ngraph::op::PadType::SAME_UPPER}, {"SAME_LOWER", ngraph::op::PadType::SAME_LOWER}, {"NOTSET", ngraph::op::PadType::NOTSET}, - {"", ngraph::op::PadType::NOTSET}, }; - const std::string& pad_str{node.get_attribute_value("auto_pad")}; + const std::string& pad_str{ + node.get_attribute_value("auto_pad", "NOTSET")}; const auto pad_val_it = auto_pad_values.find(pad_str); CHECK_VALID_NODE(node, pad_val_it != auto_pad_values.end(), diff --git a/ngraph/test/models/onnx/conv2d_dilation_assym_pads_strides.prototxt b/ngraph/test/models/onnx/conv2d_dilation_assym_pads_strides.prototxt index ff4e954c37d635..078fa6b2ef9b70 100644 --- a/ngraph/test/models/onnx/conv2d_dilation_assym_pads_strides.prototxt +++ b/ngraph/test/models/onnx/conv2d_dilation_assym_pads_strides.prototxt @@ -6,10 +6,6 @@ graph { input: "B" output: "C" op_type: "Conv" - attribute { - name: "auto_pad" - type: STRING - } attribute { name: "dilations" ints: 1 diff --git a/ngraph/test/models/onnx/conv3d_bias.prototxt b/ngraph/test/models/onnx/conv3d_bias.prototxt index f2ddb2c4b2be63..464608d935d47e 100644 --- a/ngraph/test/models/onnx/conv3d_bias.prototxt +++ b/ngraph/test/models/onnx/conv3d_bias.prototxt @@ -7,10 +7,6 @@ graph { input: "C" output: "D" op_type: "Conv" - attribute { - name: "auto_pad" - type: STRING - } attribute { name: "dilations" ints: 2 
diff --git a/ngraph/test/models/onnx/detection_output.prototxt b/ngraph/test/models/onnx/detection_output.prototxt new file mode 100644 index 00000000000000..04f00de63bf764 --- /dev/null +++ b/ngraph/test/models/onnx/detection_output.prototxt @@ -0,0 +1,140 @@ +ir_version: 6 +producer_name: "nGraph ONNX Importer" +graph { + name: "test_graph" + node { + domain: "org.openvinotoolkit" + input: "box_logits" + input: "class_preds" + input: "proposals" + output: "out" + name: "DetectionOutput_500" + op_type: "DetectionOutput" + attribute { + name: "background_label_id" + i: 0 + type: INT + } + attribute { + name: "code_type" + s: "CENTER_SIZE" + type: STRING + } + attribute { + name: "confidence_threshold" + f: 0.0099999997764825821 + type: FLOAT + } + attribute { + name: "eta" + f: 1 + type: FLOAT + } + attribute { + name: "keep_top_k" + i: 5 + type: INT + } + attribute { + name: "nms_threshold" + f: 0.44999998807907104 + type: FLOAT + } + attribute { + name: "num_classes" + i: 3 + type: INT + } + attribute { + name: "share_location" + i: 1 + type: INT + } + attribute { + name: "top_k" + i: 5 + type: INT + } + attribute { + name: "variance_encoded_in_target" + i: 0 + type: INT + } + } + input { + name: "box_logits" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 12 + } + } + } + } + } + input { + name: "class_preds" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 9 + } + } + } + } + } + input { + name: "proposals" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 2 + } + dim { + dim_value: 15 + } + } + } + } + } + output { + name: "out" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 1 + } + dim { + dim_value: 5 + } + dim { + dim_value: 7 + } + } + } + } + } +} +opset_import { + version: 10 +} diff --git a/ngraph/test/models/onnx/normalize.prototxt 
b/ngraph/test/models/onnx/normalize.prototxt new file mode 100644 index 00000000000000..c86713ea6a4879 --- /dev/null +++ b/ngraph/test/models/onnx/normalize.prototxt @@ -0,0 +1,83 @@ +ir_version: 6 +producer_name: "nGraph ONNX Importer" +graph { + name: "test" + node { + domain: "org.openvinotoolkit" + input: "data" + input: "weight" + output: "out" + name: "Normalize_177" + op_type: "Normalize" + attribute { + name: "across_spatial" + i: 0 + type: INT + } + attribute { + name: "channel_shared" + i: 0 + type: INT + } + attribute { + name: "eps" + f: 1.000000013351432e-10 + type: FLOAT + } + } + initializer { + dims: 3 + data_type: 1 + name: "weight" + float_data: 2 + float_data: 3 + float_data: 4 + } + input { + name: "data" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 3 + } + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } + output { + name: "out" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 3 + } + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } +} +opset_import { + version: 10 +} diff --git a/ngraph/test/models/onnx/prior_box.prototxt b/ngraph/test/models/onnx/prior_box.prototxt new file mode 100644 index 00000000000000..9788d2aa82ee0c --- /dev/null +++ b/ngraph/test/models/onnx/prior_box.prototxt @@ -0,0 +1,147 @@ +ir_version: 6 +producer_name: "nGraph ONNX Importer" +graph { + name: "test" + node { + domain: "org.openvinotoolkit" + input: "A" + input: "B" + output: "out" + name: "PriorBox_306" + op_type: "PriorBox" + attribute { + name: "aspect_ratio" + floats: 1 + type: FLOATS + } + attribute { + name: "clip" + i: 0 + type: INT + } + attribute { + name: "flip" + i: 1 + type: INT + } + attribute { + name: "img_h" + i: 0 + type: INT + } + attribute { + name: "img_size" + i: 0 + type: INT + } + attribute { + name: "img_w" + i: 0 + type: INT + } + attribute { + name: "max_size" + floats: 76.800003051757812 + type: 
FLOATS + } + attribute { + name: "min_size" + floats: 35.840000152587891 + type: FLOATS + } + attribute { + name: "offset" + f: 0.5 + type: FLOAT + } + attribute { + name: "step" + f: 8 + type: FLOAT + } + attribute { + name: "step_h" + f: 0 + type: FLOAT + } + attribute { + name: "step_w" + f: 0 + type: FLOAT + } + attribute { + name: "variance" + floats: 0.10000000149011612 + floats: 0.10000000149011612 + floats: 0.20000000298023224 + floats: 0.20000000298023224 + type: FLOATS + } + } + input { + name: "A" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 3 + } + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } + input { + name: "B" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 3 + } + dim { + dim_value: 6 + } + dim { + dim_value: 6 + } + } + } + } + } + output { + name: "out" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 2 + } + dim { + dim_value: 32 + } + } + } + } + } +} +opset_import { + version: 10 +} diff --git a/ngraph/test/models/onnx/quantization/fake_quantize_const_inputs.prototxt b/ngraph/test/models/onnx/quantization/fake_quantize_const_inputs.prototxt index 9a0704b3c5bf87..8b465857707e53 100644 --- a/ngraph/test/models/onnx/quantization/fake_quantize_const_inputs.prototxt +++ b/ngraph/test/models/onnx/quantization/fake_quantize_const_inputs.prototxt @@ -70,6 +70,7 @@ graph { } } node { + domain: "org.openvinotoolkit" input: "X" input: "input_low" input: "input_high" diff --git a/ngraph/test/models/onnx/quantization/fake_quantize_nonconst_inputs.prototxt b/ngraph/test/models/onnx/quantization/fake_quantize_nonconst_inputs.prototxt index affb0c8c99af7c..ce9de0f3292f11 100644 --- a/ngraph/test/models/onnx/quantization/fake_quantize_nonconst_inputs.prototxt +++ b/ngraph/test/models/onnx/quantization/fake_quantize_nonconst_inputs.prototxt @@ -2,6 +2,7 @@ ir_version: 7 producer_name: 
"onnx-importer-test" graph { node { + domain: "org.openvinotoolkit" input: "X" input: "input_low" input: "input_high" diff --git a/ngraph/test/onnx/onnx_import.in.cpp b/ngraph/test/onnx/onnx_import.in.cpp index 860e8940c9e618..f6336db4f5b407 100644 --- a/ngraph/test/onnx/onnx_import.in.cpp +++ b/ngraph/test/onnx/onnx_import.in.cpp @@ -2470,3 +2470,85 @@ NGRAPH_TEST(${BACKEND_NAME}, quant_dequant_pattern_axis) test_case.add_input({1}); test_case.run(); } + +NGRAPH_TEST(${BACKEND_NAME}, onnx_detection_output) +{ + const auto function = onnx_import::import_onnx_model( + file_util::path_join(SERIALIZED_ZOO, "onnx/detection_output.prototxt")); + auto test_case = test::TestCase(function); + + auto gen_vector = [](size_t size, float min, float max) -> std::vector { + float step = (max - min) / size; + float next = min - step; + + std::vector out(size); + std::generate(out.begin(), out.end(), [&next, &step] { return next += step; }); + return out; + }; + + std::vector logits = gen_vector(12, -2, 2); + std::vector class_preds = gen_vector(9, 0, 1); + std::vector proposals = gen_vector(15 * 2, 0, 1); + std::vector output = {0, 1, 0.777778, 0.241012, 0.260378, 0.418248, 0.499622, + 0, 1, 0.444444, 0.10963, 0.146239, 0.176296, 0.228576, + 0, 2, 0.888889, 0.241012, 0.260378, 0.418248, 0.499622, + 0, 2, 0.555556, 0.10963, 0.146239, 0.176296, 0.228576, + 0, 2, 0.222222, -0.0378917, -0.00169918, -0.00210832, 0.0387362}; + test_case.add_input(logits); + test_case.add_input(class_preds); + test_case.add_input(proposals); + test_case.add_expected_output(Shape{1, 1, 5, 7}, output); + int tolerance_bits = 6; + test_case.run(tolerance_bits); +} + +NGRAPH_TEST(${BACKEND_NAME}, onnx_prior_box) +{ + const auto function = onnx_import::import_onnx_model( + file_util::path_join(SERIALIZED_ZOO, "onnx/prior_box.prototxt")); + auto test_case = test::TestCase(function); + std::vector A(3 * 2 * 2); + std::vector B(3 * 6 * 6); + std::vector output = { + -2.3200002, -2.3200002, 3.6533334, 3.6533334, 
-3.7053659, -3.7053659, 5.0386992, + 5.0386992, -0.98666668, -2.3200002, 4.9866667, 3.6533334, -2.3720326, -3.7053659, + 6.3720322, 5.0386992, -2.3200002, -0.98666668, 3.6533334, 4.9866667, -3.7053659, + -2.3720326, 5.0386992, 6.3720322, -0.98666668, -0.98666668, 4.9866667, 4.9866667, + -2.3720326, -2.3720326, 6.3720322, 6.3720322, 0.1, 0.1, 0.2, + 0.2, 0.1, 0.1, 0.2, 0.2, 0.1, 0.1, + 0.2, 0.2, 0.1, 0.1, 0.2, 0.2, 0.1, + 0.1, 0.2, 0.2, 0.1, 0.1, 0.2, 0.2, + 0.1, 0.1, 0.2, 0.2, 0.1, 0.1, 0.2, + 0.2, + }; + test_case.add_input(A); + test_case.add_input(B); + test_case.add_expected_output(Shape{1, 2, 32}, output); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, onnx_normalize) +{ + const auto function = onnx_import::import_onnx_model( + file_util::path_join(SERIALIZED_ZOO, "onnx/normalize.prototxt")); + auto test_case = test::TestCase(function); + std::vector data(12); + std::iota(data.begin(), data.end(), 1); + std::vector output = { + 0.19334731, + 0.33806169, + 0.44846106, + 0.53452247, + 1.4501048, + 1.5212777, + 1.5696137, + 1.6035674, + 3.4802516, + 3.3806169, + 3.2887144, + 3.2071347, + }; + test_case.add_input(data); + test_case.add_expected_output(Shape{1, 3, 2, 2}, output); + test_case.run(); +} diff --git a/ngraph/test/op_is.cpp b/ngraph/test/op_is.cpp index 059f1de2ee61cc..dc1ea86289dc5d 100644 --- a/ngraph/test/op_is.cpp +++ b/ngraph/test/op_is.cpp @@ -632,6 +632,15 @@ namespace EXPECT_FALSE(op::is_binary_elementwise_logical(&node)); } + void op_is_PriorBox() + { + op::PriorBox node; + EXPECT_FALSE(op::is_unary_elementwise_arithmetic(&node)); + EXPECT_FALSE(op::is_binary_elementwise_arithmetic(&node)); + EXPECT_FALSE(op::is_binary_elementwise_comparison(&node)); + EXPECT_FALSE(op::is_binary_elementwise_logical(&node)); + } + void op_is_Product() { op::Product node; diff --git a/ngraph/test/runtime/interpreter/int_executable.cpp b/ngraph/test/runtime/interpreter/int_executable.cpp index 3400e70572b392..d23f6c2bf235a3 100644 --- 
a/ngraph/test/runtime/interpreter/int_executable.cpp +++ b/ngraph/test/runtime/interpreter/int_executable.cpp @@ -176,7 +176,8 @@ bool runtime::interpreter::INTExecutable::call(const vector(op) || is_type(op) || is_type(op)) + if (is_type(op) || is_type(op) || is_type(op) || + is_type(op)) { type = op->get_input_element_type(0); } diff --git a/ngraph/test/runtime/interpreter/int_executable.hpp b/ngraph/test/runtime/interpreter/int_executable.hpp index 74c1c0138cd8df..fd9e28c45b92bb 100644 --- a/ngraph/test/runtime/interpreter/int_executable.hpp +++ b/ngraph/test/runtime/interpreter/int_executable.hpp @@ -73,6 +73,7 @@ #include "ngraph/runtime/reference/not.hpp" #include "ngraph/runtime/reference/one_hot.hpp" #include "ngraph/runtime/reference/pad.hpp" +#include "ngraph/runtime/reference/prior_box.hpp" #include "ngraph/runtime/reference/product.hpp" #include "ngraph/runtime/reference/quantize.hpp" #include "ngraph/runtime/reference/relu.hpp" @@ -881,6 +882,16 @@ class INTERPRETER_BACKEND_API ngraph::runtime::interpreter::INTExecutable : publ break; } case OP_TYPEID::Parameter: break; + case OP_TYPEID::PriorBox: + { + const op::PriorBox* pbox = static_cast(&node); + runtime::reference::prior_box(args[0]->get_data_ptr(), + args[1]->get_data_ptr(), + out[0]->get_data_ptr(), + out[0]->get_shape(), + pbox->get_attrs()); + break; + } case OP_TYPEID::Quantize: { const op::Quantize* quantize = static_cast(&node); diff --git a/ngraph/test/runtime/opset0_tbl.hpp b/ngraph/test/runtime/opset0_tbl.hpp index ec14923d7830ae..a0eac8c3e6599f 100644 --- a/ngraph/test/runtime/opset0_tbl.hpp +++ b/ngraph/test/runtime/opset0_tbl.hpp @@ -111,6 +111,7 @@ NGRAPH_OP(Or, ngraph::op) NGRAPH_OP(Parameter, ngraph::op) NGRAPH_OP(Power, ngraph::op) NGRAPH_OP(PRelu, ngraph::op) +NGRAPH_OP(PriorBox, ngraph::op) NGRAPH_OP(Product, ngraph::op) NGRAPH_OP(Quantize, ngraph::op) NGRAPH_OP(QuantizedConvolution, ngraph::op) From d4d460101d6b87e0b56c49b17ecc7f0c6cc67a2a Mon Sep 17 00:00:00 2001 From: Kate 
Generalova Date: Thu, 10 Sep 2020 12:44:13 +0300 Subject: [PATCH 65/66] fix #38545 remove CPU network layer check for py samples (#2145) --- .../classification_sample/classification_sample.py | 12 ------------ .../classification_sample_async.py | 13 +------------ .../object_detection_sample_ssd.py | 10 ---------- .../style_transfer_sample/style_transfer_sample.py | 12 ------------ 4 files changed, 1 insertion(+), 46 deletions(-) diff --git a/inference-engine/ie_bridges/python/sample/classification_sample/classification_sample.py b/inference-engine/ie_bridges/python/sample/classification_sample/classification_sample.py index 80fb4bd1233ced..3a132ae82b19e8 100644 --- a/inference-engine/ie_bridges/python/sample/classification_sample/classification_sample.py +++ b/inference-engine/ie_bridges/python/sample/classification_sample/classification_sample.py @@ -63,18 +63,6 @@ def main(): # Read IR log.info("Loading network files:\n\t{}\n\t{}".format(model_xml, model_bin)) net = ie.read_network(model=model_xml, weights=model_bin) - func = ng.function_from_cnn(net) - - if "CPU" in args.device: - supported_layers = ie.query_network(net, "CPU") - ops = func.get_ordered_ops() - not_supported_layers = [op.friendly_name for op in ops if op.friendly_name not in supported_layers] - if len(not_supported_layers) != 0: - log.error("Following layers are not supported by the plugin for specified device {}:\n {}". 
- format(args.device, ', '.join(not_supported_layers))) - log.error("Please try to specify cpu extensions library path in sample's command line parameters using -l " - "or --cpu_extension command line argument") - sys.exit(1) assert len(net.input_info.keys()) == 1, "Sample supports only single input topologies" assert len(net.outputs) == 1, "Sample supports only single output topologies" diff --git a/inference-engine/ie_bridges/python/sample/classification_sample_async/classification_sample_async.py b/inference-engine/ie_bridges/python/sample/classification_sample_async/classification_sample_async.py index 9fbf95ac84d789..da2a5fef0d623d 100644 --- a/inference-engine/ie_bridges/python/sample/classification_sample_async/classification_sample_async.py +++ b/inference-engine/ie_bridges/python/sample/classification_sample_async/classification_sample_async.py @@ -108,18 +108,7 @@ def main(): # Read IR log.info("Loading network files:\n\t{}\n\t{}".format(model_xml, model_bin)) net = ie.read_network(model=model_xml, weights=model_bin) - func = ng.function_from_cnn(net) - - if "CPU" in args.device: - supported_layers = ie.query_network(net, "CPU") - ops = func.get_ordered_ops() - not_supported_layers = [op.friendly_name for op in ops if op.friendly_name not in supported_layers] - if len(not_supported_layers) != 0: - log.error("Following layers are not supported by the plugin for specified device {}:\n {}". 
- format(args.device, ', '.join(not_supported_layers))) - log.error("Please try to specify cpu extensions library path in sample's command line parameters using -l " - "or --cpu_extension command line argument") - sys.exit(1) + assert len(net.input_info.keys()) == 1, "Sample supports only single input topologies" assert len(net.outputs) == 1, "Sample supports only single output topologies" diff --git a/inference-engine/ie_bridges/python/sample/object_detection_sample_ssd/object_detection_sample_ssd.py b/inference-engine/ie_bridges/python/sample/object_detection_sample_ssd/object_detection_sample_ssd.py index 862abe450e4a65..64ef7aaaa8088b 100644 --- a/inference-engine/ie_bridges/python/sample/object_detection_sample_ssd/object_detection_sample_ssd.py +++ b/inference-engine/ie_bridges/python/sample/object_detection_sample_ssd/object_detection_sample_ssd.py @@ -73,16 +73,6 @@ def main(): if args.cpu_extension and "CPU" in args.device: ie.add_extension(args.cpu_extension, "CPU") log.info("CPU extension loaded: {}".format(args.cpu_extension)) - - if "CPU" in args.device: - supported_layers = ie.query_network(net, "CPU") - not_supported_layers = [op.friendly_name for op in ops if op.friendly_name not in supported_layers] - if len(not_supported_layers) != 0: - log.error("Following layers are not supported by the plugin for specified device {}:\n {}". - format(args.device, ', '.join(not_supported_layers))) - log.error("Please try to specify cpu extensions library path in sample's command line parameters using -l " - "or --cpu_extension command line argument") - sys.exit(1) # ----------------------------------------------------------------------------------------------------- # --------------------------- 3. 
Read and preprocess input -------------------------------------------- diff --git a/inference-engine/ie_bridges/python/sample/style_transfer_sample/style_transfer_sample.py b/inference-engine/ie_bridges/python/sample/style_transfer_sample/style_transfer_sample.py index f6c8bd08f783c4..9531abaa177e56 100644 --- a/inference-engine/ie_bridges/python/sample/style_transfer_sample/style_transfer_sample.py +++ b/inference-engine/ie_bridges/python/sample/style_transfer_sample/style_transfer_sample.py @@ -67,18 +67,6 @@ def main(): # Read IR log.info("Loading network files:\n\t{}\n\t{}".format(model_xml, model_bin)) net = ie.read_network(model=model_xml, weights=model_bin) - func = ng.function_from_cnn(net) - - if "CPU" in args.device: - supported_layers = ie.query_network(net, "CPU") - ops = func.get_ordered_ops() - not_supported_layers = [op.friendly_name for op in ops if op.friendly_name not in supported_layers] - if len(not_supported_layers) != 0: - log.error("Following layers are not supported by the plugin for specified device {}:\n {}". - format(args.device, ', '.join(not_supported_layers))) - log.error("Please try to specify cpu extensions library path in sample's command line parameters using -l " - "or --cpu_extension command line argument") - sys.exit(1) assert len(net.input_info.keys()) == 1, "Sample supports only single input topologies" assert len(net.outputs) == 1, "Sample supports only single output topologies" From ef2581d5c623f78de5913f96fac3de8fd3a5b91f Mon Sep 17 00:00:00 2001 From: Nikita Kudriavtsev Date: Thu, 10 Sep 2020 12:56:21 +0300 Subject: [PATCH 66/66] [IE Myriad][IE Tests] Activation layer's constants parametrization. 
(#2071) CI passed: https://gitlab-icv.inn.intel.com/inference-engine/product-configs/merge_requests/870 --- .../single_layer_tests/activation.cpp | 70 +++++++++---------- .../single_layer_tests/comparison.cpp | 2 +- .../single_layer_tests/squeeze_unsqueeze.cpp | 2 +- .../single_layer_tests/activation.cpp | 20 +++--- .../single_layer_tests/activation.cpp | 62 ++++++++-------- .../single_layer_tests/comparison.cpp | 2 +- .../single_layer_tests/activation.cpp | 24 +++---- .../include/single_layer_tests/activation.hpp | 10 ++- .../src/single_layer_tests/activation.cpp | 40 +++++++---- .../common_test_utils/common_utils.hpp | 14 ++-- .../include/ngraph_functions/builders.hpp | 3 +- .../tests/ngraph_functions/src/activation.cpp | 21 +++--- 12 files changed, 144 insertions(+), 126 deletions(-) diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/activation.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/activation.cpp index 0088bb90e79999..8450a3ed328187 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/activation.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/activation.cpp @@ -22,39 +22,39 @@ const std::vector netPrecisions = { InferenceEngine::Precision::FP16 }; -const std::vector activationTypes = { - Sigmoid, - Tanh, - Relu, - Exp, - Log, - Sign, - Abs, - Clamp, - Negative, - Acos, - Asin, - Atan, - Cos, - Cosh, - Floor, - Sin, - Sinh, - Sqrt, - Tan, - Elu, - Erf, - HardSigmoid, - Selu, - Ceiling, - Mish, - HSwish, - SoftPlus +const std::map>> activationTypes = { + {Sigmoid, {}}, + {Tanh, {}}, + {Relu, {}}, + {Exp, {}}, + {Log, {}}, + {Sign, {}}, + {Abs, {}}, + {Clamp, {{-2.0f, 2.0f}}}, + {Negative, {}}, + {Acos, {}}, + {Asin, {}}, + {Atan, {}}, + {Cos, {}}, + {Cosh, {}}, + {Floor, {}}, + {Sin, {}}, + {Sinh, {}}, + {Sqrt, {}}, + {Tan, {}}, + {Elu, {{0.1f}}}, + {Erf, {}}, + 
{HardSigmoid, {{0.2f, 0.5f}}}, + {Selu, {{1.6732f, 1.0507f}}}, + {Ceiling, {}}, + {Mish, {}}, + {HSwish, {}}, + {SoftPlus, {}} }; -const std::vector activationParamTypes = { - PReLu, - LeakyRelu, +const std::map>> activationParamTypes = { + {PReLu, {{-0.01f}}}, + {LeakyRelu, {{0.01f}}} }; std::map, std::vector>> basic = { @@ -68,16 +68,16 @@ std::map, std::vector>> preluBasic = { }; const auto basicCases = ::testing::Combine( - ::testing::ValuesIn(activationTypes), + ::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes)), ::testing::ValuesIn(netPrecisions), - ::testing::ValuesIn(CommonTestUtils::combineShapes(basic)), + ::testing::ValuesIn(CommonTestUtils::combineParams(basic)), ::testing::Values(CommonTestUtils::DEVICE_CPU) ); const auto basicPreluCases = ::testing::Combine( - ::testing::ValuesIn(activationParamTypes), + ::testing::ValuesIn(CommonTestUtils::combineParams(activationParamTypes)), ::testing::ValuesIn(netPrecisions), - ::testing::ValuesIn(CommonTestUtils::combineShapes(preluBasic)), + ::testing::ValuesIn(CommonTestUtils::combineParams(preluBasic)), ::testing::Values(CommonTestUtils::DEVICE_CPU) ); diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/comparison.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/comparison.cpp index cc23a910c8f509..5caf2dc2c2d62f 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/comparison.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/comparison.cpp @@ -45,7 +45,7 @@ std::vector netPrecisions = { std::map additional_config = {}; const auto ComparisonTestParams = ::testing::Combine( - ::testing::ValuesIn(CommonTestUtils::combineShapes(inputShapes)), + ::testing::ValuesIn(CommonTestUtils::combineParams(inputShapes)), ::testing::ValuesIn(inputsPrecisions), ::testing::ValuesIn(comparisonOpTypes), 
::testing::ValuesIn(secondInputTypes), diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/squeeze_unsqueeze.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/squeeze_unsqueeze.cpp index fae71fa4741f70..5562d6588254f5 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/squeeze_unsqueeze.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/squeeze_unsqueeze.cpp @@ -31,7 +31,7 @@ const std::vector opTypes = { INSTANTIATE_TEST_CASE_P(Basic, SqueezeUnsqueezeLayerTest, ::testing::Combine( - ::testing::ValuesIn(CommonTestUtils::combineShapes(axesVectors)), + ::testing::ValuesIn(CommonTestUtils::combineParams(axesVectors)), ::testing::ValuesIn(opTypes), ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_CPU)), diff --git a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/activation.cpp b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/activation.cpp index bedf839e4ed2e7..c73ca47b9152ae 100644 --- a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/activation.cpp +++ b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/activation.cpp @@ -25,14 +25,14 @@ const std::vector netPrecisions = { InferenceEngine::Precision::U8 }; -const std::vector activationTypes = { - Sigmoid, - Tanh, - Relu, - Exp, - Log, - Sign, - Abs +const std::map>> activationTypes = { + {Sigmoid, {}}, + {Tanh, {}}, + {Relu, {}}, + {Exp, {}}, + {Log, {}}, + {Sign, {}}, + {Abs, {}} }; std::map, std::vector>> basic = { @@ -42,9 +42,9 @@ std::map, std::vector>> basic = { }; const auto basicCases = ::testing::Combine( - ::testing::ValuesIn(activationTypes), + ::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes)), 
::testing::ValuesIn(netPrecisions), - ::testing::ValuesIn(CommonTestUtils::combineShapes(basic)), + ::testing::ValuesIn(CommonTestUtils::combineParams(basic)), ::testing::Values(CommonTestUtils::DEVICE_GNA) ); diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/activation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/activation.cpp index 703ac25fae6935..2b4644ce91a0b9 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/activation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/activation.cpp @@ -16,35 +16,35 @@ const std::vector netPrecisions = { InferenceEngine::Precision::FP16 }; -const std::vector activationTypes = { - Sigmoid, - Tanh, - Relu, - Exp, - Log, - Sign, - Abs, - Gelu, - Clamp, - Negative, - Acos, - Asin, - Atan, - Cos, - Cosh, - Floor, - Sin, - Sinh, - Sqrt, - Tan, - Elu, - Erf, - HardSigmoid, - Selu, - Ceiling, - Mish, - HSwish, - SoftPlus +const std::map>> activationTypes = { + {Sigmoid, {}}, + {Tanh, {}}, + {Relu, {}}, + {Exp, {}}, + {Log, {}}, + {Sign, {}}, + {Abs, {}}, + {Gelu, {}}, + {Clamp, {{-2.0f, 2.0f}}}, + {Negative, {}}, + {Acos, {}}, + {Asin, {}}, + {Atan, {}}, + {Cos, {}}, + {Cosh, {}}, + {Floor, {}}, + {Sin, {}}, + {Sinh, {}}, + {Sqrt, {}}, + {Tan, {}}, + {Elu, {{0.1f}}}, + {Erf, {}}, + {HardSigmoid, {{0.2f, 0.5f}}}, + {Selu, {{1.6732f, 1.0507f}}}, + {Ceiling, {}}, + {Mish, {}}, + {HSwish, {}}, + {SoftPlus, {}} }; std::map, std::vector>> basic = { @@ -53,9 +53,9 @@ std::map, std::vector>> basic = { }; const auto basicCases = ::testing::Combine( - ::testing::ValuesIn(activationTypes), + ::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes)), ::testing::ValuesIn(netPrecisions), - ::testing::ValuesIn(CommonTestUtils::combineShapes(basic)), + ::testing::ValuesIn(CommonTestUtils::combineParams(basic)), 
::testing::Values(CommonTestUtils::DEVICE_GPU) ); diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/comparison.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/comparison.cpp index 2dbfcb1e5ff619..7c907bfa1b07e0 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/comparison.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/comparison.cpp @@ -45,7 +45,7 @@ std::vector netPrecisions = { std::map additional_config = {}; const auto ComparisonTestParams = ::testing::Combine( - ::testing::ValuesIn(CommonTestUtils::combineShapes(inputShapes)), + ::testing::ValuesIn(CommonTestUtils::combineParams(inputShapes)), ::testing::ValuesIn(inputsPrecisions), ::testing::ValuesIn(comparisonOpTypes), ::testing::ValuesIn(secondInputTypes), diff --git a/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/single_layer_tests/activation.cpp b/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/single_layer_tests/activation.cpp index 714e50931c14d0..2b52438e2614ec 100644 --- a/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/single_layer_tests/activation.cpp +++ b/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/single_layer_tests/activation.cpp @@ -15,16 +15,16 @@ const std::vector netPrecisions = { InferenceEngine::Precision::FP16 }; -const std::vector activationTypes = { - Sigmoid, - Tanh, - Relu, - Exp, - Log, - Gelu, - Mish, - SoftPlus, - Swish +const std::map>> activationTypes = { + {Sigmoid, {}}, + {Tanh, {}}, + {Relu, {}}, + {Exp, {}}, + {Log, {}}, + {Gelu, {}}, + {Mish, {}}, + {SoftPlus, {}}, + {Swish, {{0.05f}, {0.8f}, {1.0f}, {15.0f}}} }; std::map, std::vector>> basic = { @@ -33,9 +33,9 @@ std::map, std::vector>> basic = { }; const auto basicCases = ::testing::Combine( - ::testing::ValuesIn(activationTypes), + 
::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes)), ::testing::ValuesIn(netPrecisions), - ::testing::ValuesIn(CommonTestUtils::combineShapes(basic)), + ::testing::ValuesIn(CommonTestUtils::combineParams(basic)), ::testing::Values(CommonTestUtils::DEVICE_MYRIAD) ); diff --git a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/activation.hpp b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/activation.hpp index caa17fcbfddcb2..d4adc10568ec08 100644 --- a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/activation.hpp +++ b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/activation.hpp @@ -74,7 +74,7 @@ static std::map activationNames = }; typedef std::tuple< - ngraph::helpers::ActivationTypes, + std::pair>, // Activation type and constant value InferenceEngine::Precision, std::pair, std::vector>, std::string> activationParams; @@ -98,8 +98,12 @@ class ActivationParamLayerTest : public ActivationLayerTest { void SetUp() override; private: - void generateActivationBlob(); - ngraph::ParameterVector createActivationParams(ngraph::element::Type ngPrc, std::vector inShape = {}); + void generateActivationBlob(std::vector constantsValue); + ngraph::ParameterVector createActivationParams( + ngraph::element::Type ngPrc, std::vector inShape = {}); + +private: + std::vector constantsValue; }; } // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/activation.cpp b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/activation.cpp index 2801b1711e91c3..fc3f575d090902 100644 --- a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/activation.cpp +++ b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/activation.cpp @@ -23,14 +23,15 @@ std::string ActivationLayerTest::getTestCaseName(const testing::TestParamInfo, std::vector> shapes; 
std::string targetDevice; - ngraph::helpers::ActivationTypes activationType; - std::tie(activationType, netPrecision, shapes, targetDevice) = obj.param; + std::pair> activationDecl; + std::tie(activationDecl, netPrecision, shapes, targetDevice) = obj.param; std::ostringstream result; const char separator = '_'; - result << activationNames[activationType] << separator; - result << "IS=" << CommonTestUtils::vec2str(shapes.first) << separator;; - result << "AS=" << CommonTestUtils::vec2str(shapes.second) << separator;; + result << activationNames[activationDecl.first] << separator; + result << "IS=" << CommonTestUtils::vec2str(shapes.first) << separator; + result << "AS=" << CommonTestUtils::vec2str(shapes.second) << separator; + result << "ConstantsValue=" << CommonTestUtils::vec2str(activationDecl.second) << separator; result << "netPRC=" << netPrecision.name() << separator; result << "targetDevice=" << targetDevice; return result.str(); @@ -39,10 +40,15 @@ std::string ActivationLayerTest::getTestCaseName(const testing::TestParamInfo, std::vector> shapes; - std::tie(activationType, netPrecision, shapes, targetDevice) = GetParam(); + std::pair> activationDecl; + std::tie(activationDecl, netPrecision, shapes, targetDevice) = GetParam(); + + activationType = activationDecl.first; + auto constantsValue = activationDecl.second; auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); auto params = ngraph::builder::makeParams(ngPrc, {shapes.first}); - auto activation = ngraph::builder::makeActivation(params[0], ngPrc, activationType, shapes.second); + auto activation = ngraph::builder::makeActivation(params[0], ngPrc, activationType, shapes.second, constantsValue); + function = std::make_shared(ngraph::NodeVector{activation}, params); } @@ -127,29 +133,29 @@ ngraph::ParameterVector ActivationParamLayerTest::createActivationParams(ngraph: } } -void ActivationParamLayerTest::generateActivationBlob() { +void 
ActivationParamLayerTest::generateActivationBlob(std::vector constantsValue) { switch (activationType) { case ngraph::helpers::ActivationTypes::PReLu: { auto blobNegativeSlope = inferRequest.GetBlob("negativeSlope"); - float negativeSlope = -0.01f; + float negativeSlope = constantsValue[0]; blobNegativeSlope = FuncTestUtils::createAndFillBlobWithFloatArray(blobNegativeSlope->getTensorDesc(), &negativeSlope, 1); } case ngraph::helpers::ActivationTypes::LeakyRelu: { auto blobLeakySlope = inferRequest.GetBlob("leakySlope"); - float leakySlope = 0.01f; + float leakySlope = constantsValue[0]; blobLeakySlope = FuncTestUtils::createAndFillBlobWithFloatArray(blobLeakySlope->getTensorDesc(), &leakySlope, 1); } case ngraph::helpers::ActivationTypes::HardSigmoid: { auto blobHardSigmoidAlpha = inferRequest.GetBlob("alpha"); auto blobHardSigmoidBeta = inferRequest.GetBlob("beta"); - float alpha = 0.2f, beta = 0.5f; + float alpha = constantsValue[0], beta = constantsValue[1]; blobHardSigmoidAlpha = FuncTestUtils::createAndFillBlobWithFloatArray(blobHardSigmoidAlpha->getTensorDesc(), &alpha, 1); blobHardSigmoidBeta = FuncTestUtils::createAndFillBlobWithFloatArray(blobHardSigmoidBeta->getTensorDesc(), &beta, 1); } case ngraph::helpers::ActivationTypes::Selu: { auto blobHardSigmoidAlpha = inferRequest.GetBlob("alpha"); auto blobHardSigmoidLambda = inferRequest.GetBlob("lambda"); - float alpha = 1.6732f, lambda = 1.0507f; + float alpha = constantsValue[0], lambda = constantsValue[1]; blobHardSigmoidAlpha = FuncTestUtils::createAndFillBlobWithFloatArray(blobHardSigmoidAlpha->getTensorDesc(), &alpha, 1); blobHardSigmoidLambda = FuncTestUtils::createAndFillBlobWithFloatArray(blobHardSigmoidLambda->getTensorDesc(), &lambda, 1); } @@ -164,7 +170,7 @@ void ActivationParamLayerTest::Infer() { auto blobInput = inferRequest.GetBlob("Input"); blobInput = FuncTestUtils::createAndFillBlobFloat(blobInput->getTensorDesc()); - generateActivationBlob(); + generateActivationBlob(constantsValue); 
inferRequest.Infer(); } @@ -173,12 +179,18 @@ void ActivationParamLayerTest::Infer() { void ActivationParamLayerTest::SetUp() { InferenceEngine::Precision netPrecision; std::pair, std::vector> shapes; - std::tie(activationType, netPrecision, shapes, targetDevice) = GetParam(); + std::pair> activationDecl; + std::tie(activationDecl, netPrecision, shapes, targetDevice) = GetParam(); + + activationType = activationDecl.first; + constantsValue = activationDecl.second; auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); auto params = ngraph::builder::makeParams(ngPrc, {shapes.first}); auto activationParams = createActivationParams(ngPrc); + params[0]->set_friendly_name("Input"); params.insert(params.end(), activationParams.begin(), activationParams.end()); + auto activation = ngraph::builder::makeActivation(params, ngPrc, activationType); function = std::make_shared(ngraph::NodeVector{activation}, params); } diff --git a/inference-engine/tests/ie_test_utils/common_test_utils/common_utils.hpp b/inference-engine/tests/ie_test_utils/common_test_utils/common_utils.hpp index 5627c617ebbf42..f5b2dcdfaf09b4 100644 --- a/inference-engine/tests/ie_test_utils/common_test_utils/common_utils.hpp +++ b/inference-engine/tests/ie_test_utils/common_test_utils/common_utils.hpp @@ -70,13 +70,13 @@ inline InferenceEngine::CNNLayerPtr getLayerByName(const InferenceEngine::CNNNet return getLayerByName(&icnnnetwork, layerName); } -template -std::vector, std::vector>> - combineShapes(const std::map, std::vector>>& inputShapes) { - std::vector, std::vector>> resVec; - for (auto& inputShape : inputShapes) { - for (auto& item : inputShape.second) { - resVec.push_back({inputShape.first, item}); +template +std::vector> combineParams( + const std::map>& keyValueSets) { + std::vector> resVec; + for (auto& keyValues : keyValueSets) { + for (auto& item : keyValues.second) { + resVec.push_back({keyValues.first, item}); } } return resVec; diff --git 
a/inference-engine/tests/ngraph_functions/include/ngraph_functions/builders.hpp b/inference-engine/tests/ngraph_functions/include/ngraph_functions/builders.hpp index bcddb14d90512b..22cf1da0077d11 100644 --- a/inference-engine/tests/ngraph_functions/include/ngraph_functions/builders.hpp +++ b/inference-engine/tests/ngraph_functions/include/ngraph_functions/builders.hpp @@ -191,7 +191,8 @@ std::shared_ptr makeVariadicSplit(const ngraph::Output &in, std::shared_ptr makeActivation(const ngraph::Output &in, const element::Type &type, ngraph::helpers::ActivationTypes activationType, - std::vector inShape = {}); + std::vector inShape = {}, + std::vector constantsValue = {}); std::shared_ptr makeActivation(const ngraph::ParameterVector ¶meters, const element::Type &type, diff --git a/inference-engine/tests/ngraph_functions/src/activation.cpp b/inference-engine/tests/ngraph_functions/src/activation.cpp index 34e3a7c864a8f0..c09df345184631 100644 --- a/inference-engine/tests/ngraph_functions/src/activation.cpp +++ b/inference-engine/tests/ngraph_functions/src/activation.cpp @@ -15,7 +15,8 @@ namespace builder { std::shared_ptr makeActivation(const ngraph::Output &in, const element::Type &type, ngraph::helpers::ActivationTypes activationType, - std::vector inShape) { + std::vector inShape, + std::vector constantsValue) { switch (activationType) { case ngraph::helpers::ActivationTypes::Sigmoid: return std::make_shared(in); @@ -27,7 +28,7 @@ std::shared_ptr makeActivation(const ngraph::Output &in, auto leaky_slope = std::make_shared( ngraph::element::f32, inShape, - std::vector{0.01f}); + constantsValue); return std::make_shared(in, leaky_slope); } case ngraph::helpers::ActivationTypes::Exp: @@ -41,7 +42,7 @@ std::shared_ptr makeActivation(const ngraph::Output &in, case ngraph::helpers::ActivationTypes::Gelu: return std::make_shared(in); case ngraph::helpers::ActivationTypes::Clamp: - return std::make_shared(in, -2.0, 2.0); + return std::make_shared(in, constantsValue[0], 
constantsValue[1]); case ngraph::helpers::ActivationTypes::Negative: return std::make_shared(in); case ngraph::helpers::ActivationTypes::Acos: @@ -65,21 +66,21 @@ std::shared_ptr makeActivation(const ngraph::Output &in, case ngraph::helpers::ActivationTypes::Tan: return std::make_shared(in); case ngraph::helpers::ActivationTypes::Elu: - return std::make_shared(in, 0.1); + return std::make_shared(in, constantsValue[0]); case ngraph::helpers::ActivationTypes::Erf: return std::make_shared(in); case ngraph::helpers::ActivationTypes::HardSigmoid: { auto hard_sigmoid_alpha = std::make_shared( - type, inShape, 0.2f); + type, inShape, constantsValue[0]); auto hard_sigmoid_beta = std::make_shared( - type, inShape, 0.5f); + type, inShape, constantsValue[1]); return std::make_shared(in, hard_sigmoid_alpha, hard_sigmoid_beta); } case ngraph::helpers::ActivationTypes::Selu: { auto selu_alpha = std::make_shared( - type, inShape, 1.6732f); + type, inShape, constantsValue[0]); auto selu_lambda = std::make_shared( - type, inShape, 1.0507f); + type, inShape, constantsValue[1]); return std::make_shared(in, selu_alpha, selu_lambda); } case ngraph::helpers::ActivationTypes::Ceiling: @@ -88,7 +89,7 @@ std::shared_ptr makeActivation(const ngraph::Output &in, auto negative_slope = std::make_shared( ngraph::element::f32, inShape, - std::vector{-0.01f}); + constantsValue); return std::make_shared(in, negative_slope); } case ngraph::helpers::ActivationTypes::Mish: @@ -98,7 +99,7 @@ std::shared_ptr makeActivation(const ngraph::Output &in, case ngraph::helpers::ActivationTypes::SoftPlus: return std::make_shared(in); case ngraph::helpers::ActivationTypes::Swish: { - auto beta = std::make_shared(type, inShape, 1.0f); + auto beta = std::make_shared(type, inShape, constantsValue[0]); return std::make_shared(in, beta); } default: