diff --git a/.github/workflows/linux_arm64.yml b/.github/workflows/linux_arm64.yml
index 12bc0f8f84106e..32a2147a2b2433 100644
--- a/.github/workflows/linux_arm64.yml
+++ b/.github/workflows/linux_arm64.yml
@@ -172,6 +172,7 @@ jobs:
           -DCMAKE_COMPILE_WARNING_AS_ERROR=ON \
           -DCMAKE_CXX_COMPILER_LAUNCHER=${{ env.CMAKE_CXX_COMPILER_LAUNCHER }} \
           -DCMAKE_C_COMPILER_LAUNCHER=${{ env.CMAKE_C_COMPILER_LAUNCHER }} \
+          -DOV_CPU_AARCH64_USE_MULTI_ISA=OFF \
           -S ${OPENVINO_REPO} \
           -B ${BUILD_DIR}
diff --git a/.gitignore b/.gitignore
index 9bc1e79b3e53b1..9dd22697d3780a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -61,4 +61,5 @@ __pycache__
 /tools/mo/*.svg
 /src/plugins/intel_cpu/tools/commit_slider/*.json
 /src/plugins/intel_cpu/tools/commit_slider/slider_cache/*
+/src/plugins/intel_cpu/thirdparty/ComputeLibrary/build/*
 .github/GITHUB_OUTPUT
diff --git a/src/bindings/python/src/pyopenvino/graph/dict_attribute_visitor.cpp b/src/bindings/python/src/pyopenvino/graph/dict_attribute_visitor.cpp
index 710ad70d0b6cfa..578523dd309f93 100644
--- a/src/bindings/python/src/pyopenvino/graph/dict_attribute_visitor.cpp
+++ b/src/bindings/python/src/pyopenvino/graph/dict_attribute_visitor.cpp
@@ -274,6 +274,17 @@ void util::DictAttributeSerializer::on_adapter(const std::string& name, ov::Valu
     if (m_attributes.contains(name)) {
         OPENVINO_THROW("No AttributeVisitor support for accessing attribute named: ", name);
     }
+
+    if (auto _adapter = dynamic_cast<ov::AttributeAdapter<std::shared_ptr<ov::op::util::Variable>>*>(&adapter)) {
+        m_attributes[name.c_str()] = _adapter->get()->get_info().variable_id;
+    } else if (auto _adapter = dynamic_cast<ov::AttributeAdapter<ov::PartialShape>*>(&adapter)) {
+        auto partial_shape = _adapter->get();
+        std::vector<int64_t> shape;
+        for (const auto& dim : partial_shape) {
+            shape.push_back(dim.is_dynamic() ? -1 : dim.get_length());
+        }
+        m_attributes[name.c_str()] = shape;
+    }
 }
 
 void util::DictAttributeSerializer::on_adapter(const std::string& name, ov::ValueAccessor<bool>& adapter) {
     m_attributes[name.c_str()] = adapter.get();
diff --git a/src/bindings/python/tests/test_graph/test_create_op.py b/src/bindings/python/tests/test_graph/test_create_op.py
index 355de9ab151383..3567732bb6faeb 100644
--- a/src/bindings/python/tests/test_graph/test_create_op.py
+++ b/src/bindings/python/tests/test_graph/test_create_op.py
@@ -1183,11 +1183,15 @@ def test_read_value():
     init_value = ov.parameter([2, 2], name="init_value", dtype=np.int32)
 
     node = ov.read_value(init_value, "var_id_667", np.int32, [2, 2])
+    read_value_attributes = node.get_attributes()
 
     assert node.get_type_name() == "ReadValue"
     assert node.get_output_size() == 1
     assert list(node.get_output_shape(0)) == [2, 2]
     assert node.get_output_element_type(0) == Type.i32
+    assert read_value_attributes["variable_type"] == "i32"
+    assert read_value_attributes["variable_id"] == "var_id_667"
+    assert read_value_attributes["variable_shape"] == [2, 2]
 
 
 def test_read_value_dyn_variable_pshape():
@@ -1205,11 +1209,13 @@ def test_assign():
     input_data = ov.parameter([5, 7], name="input_data", dtype=np.int32)
     rv = ov.read_value(input_data, "var_id_667", np.int32, [5, 7])
     node = ov.assign(rv, "var_id_667")
+    assign_attributes = node.get_attributes()
 
     assert node.get_type_name() == "Assign"
     assert node.get_output_size() == 1
     assert list(node.get_output_shape(0)) == [5, 7]
     assert node.get_output_element_type(0) == Type.i32
+    assert assign_attributes["variable_id"] == "var_id_667"
 
 
 def test_extract_image_patches():
@@ -2353,3 +2359,10 @@ def test_topk_opset11():
     assert node.get_output_size() == 2
     assert list(node.get_output_shape(0)) == [1, 3, 3]
     assert list(node.get_output_shape(1)) == [1, 3, 3]
+
+
+def test_parameter_get_attributes():
+    parameter = ov.parameter([2, 2], dtype=np.float32, name="InputData")
+    parameter_attributes = parameter.get_attributes()
+    assert parameter_attributes["element_type"] == "f32"
+    assert parameter_attributes["shape"] == [2, 2]
diff --git a/src/common/transformations/src/transformations/common_optimizations/common_optimizations.cpp b/src/common/transformations/src/transformations/common_optimizations/common_optimizations.cpp
index 8e7fea6e5526ed..11a03374b829d6 100644
--- a/src/common/transformations/src/transformations/common_optimizations/common_optimizations.cpp
+++ b/src/common/transformations/src/transformations/common_optimizations/common_optimizations.cpp
@@ -69,7 +69,6 @@ #include "transformations/op_conversions/convert_bitwise_to_logical_bool.hpp"
 #include "transformations/op_conversions/convert_broadcast_to_tiles.hpp"
 #include "transformations/op_conversions/convert_convertlike.hpp"
-#include "transformations/op_conversions/convert_convertpromotetypes.hpp"
 #include "transformations/op_conversions/convert_deformable_conv_v8_to_v1.hpp"
 #include "transformations/op_conversions/convert_depth_to_space.hpp"
 #include "transformations/op_conversions/convert_divide.hpp"
@@ -174,7 +173,6 @@ bool ov::pass::CommonOptimizations::run_on_model(const std::shared_ptr
     ADD_MATCHER(decomp, ConvertDivideWithConstant)
     ADD_MATCHER(decomp, ConvertSubtractWithConstant)
     ADD_MATCHER(decomp, ConvertNegative)
-
+    ADD_MATCHER(decomp, ConvertConvertPromoteTypes)
     manager.register_pass();
 
     auto multiply_fusions = manager.register_pass();
diff --git a/src/frontends/onnx/tests/tests_python/utils/model_importer.py b/src/frontends/onnx/tests/tests_python/utils/model_importer.py
index b1321e27b1ad32..6df72a9e8b445d 100644
--- a/src/frontends/onnx/tests/tests_python/utils/model_importer.py
+++ b/src/frontends/onnx/tests/tests_python/utils/model_importer.py
@@ -141,6 +141,7 @@ def _execute_pb_data(
                 executed_tests = executed_tests + 1
         return executed_tests
+
 
     def _add_model_import_test(self, model_test: ExtOnnxTestCase) -> None:
         # model is loaded at runtime, note sometimes it could even
         # never loaded if the test skipped
@@ -148,7 +149,7 @@ def _add_model_import_test(self, model_test: ExtOnnxTestCase) -> None:
         def run_import(test_self: Any, device: Text) -> None:
             model = ModelImportRunner._load_onnx_model(model_test.model_dir, model_test.model)
-            model_marker[0] = model
+            model_marker[0] = model_test.model_dir / model_test.model
             assert import_onnx_model(model)
 
         self._add_test("ModelImport", model_test.name, run_import, model_marker)
@@ -160,7 +161,7 @@ def _add_model_execution_test(self, model_test: ExtOnnxTestCase) -> None:
         def run_execution(test_self: Any, device: Text) -> None:
             model = ModelImportRunner._load_onnx_model(model_test.model_dir, model_test.model)
-            model_marker[0] = model
+            model_marker[0] = model_test.model_dir / model_test.model
             prepared_model = self.backend.prepare(model, device)
             assert prepared_model is not None
             executed_tests = ModelImportRunner._execute_npz_data(
diff --git a/src/frontends/pytorch/src/op/bucketize.cpp b/src/frontends/pytorch/src/op/bucketize.cpp
new file mode 100644
index 00000000000000..07ac70458824cb
--- /dev/null
+++ b/src/frontends/pytorch/src/op/bucketize.cpp
@@ -0,0 +1,50 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "openvino/op/bucketize.hpp"
+
+#include "openvino/frontend/pytorch/node_context.hpp"
+#include "openvino/op/add.hpp"
+#include "openvino/op/concat.hpp"
+#include "openvino/op/convert_like.hpp" +#include "openvino/op/logical_or.hpp" +#include "openvino/op/multiply.hpp" +#include "utils.hpp" + +namespace ov { +namespace frontend { +namespace pytorch { +namespace op { + +using namespace ov::op; + +OutputVector translate_bucketize(const NodeContext& context) { + num_inputs_check(context, 2, 5); + auto input = context.get_input(0); + auto boundaries = context.get_input(1); + + element::Type output_type = ov::element::i64; + if (!context.input_is_none(2) && context.const_input(2)) { + output_type = ov::element::i32; + } + + bool with_right_bound = true; + if (!context.input_is_none(3)) { + with_right_bound = !context.const_input(3); + } + + auto bucketize = + context.mark_node(std::make_shared(input, boundaries, output_type, with_right_bound)); + + if (!context.input_is_none(4)) { + context.mutate_input(4, bucketize); + } + + return {bucketize}; +}; + +} // namespace op +} // namespace pytorch +} // namespace frontend +} // namespace ov \ No newline at end of file diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp index 55d218df430e43..ea2ff9cf6c5a59 100644 --- a/src/frontends/pytorch/src/op_table.cpp +++ b/src/frontends/pytorch/src/op_table.cpp @@ -47,6 +47,7 @@ OP_CONVERTER(translate_bitwise_and); OP_CONVERTER(translate_bitwise_not); OP_CONVERTER(translate_bitwise_or); OP_CONVERTER(translate_bitwise_xor); +OP_CONVERTER(translate_bucketize); OP_CONVERTER(translate_cat); OP_CONVERTER(translate_cdist); OP_CONVERTER(translate_celu); @@ -374,6 +375,7 @@ const std::map get_supported_ops_ts() { {"aten::Bool", op::translate_bool}, // aten::broadcast_tensors - Supported in limited set of patterns {"aten::broadcast_to", op::translate_expand}, + {"aten::bucketize", op::translate_bucketize}, {"aten::cat", op::translate_cat}, {"aten::cdist", op::translate_cdist}, {"aten::ceil", op::optional_out, 1>}, @@ -522,6 +524,7 @@ const std::map get_supported_ops_ts() { {"aten::masked_scatter_", op::inplace_op}, {"aten::matmul", op::translate_1to1_match_2_inputs}, {"aten::max", op::translate_max}, + {"aten::mv", op::translate_1to1_match_2_inputs}, {"aten::maximum", op::translate_maximum}, {"aten::max_pool1d", op::quantizable_op}, {"aten::max_pool1d_with_indices", op::quantizable_op}, diff --git a/src/frontends/tensorflow/docs/supported_ops.md b/src/frontends/tensorflow/docs/supported_ops.md index f4b06fafa06283..367cf6cb2408a2 100644 --- a/src/frontends/tensorflow/docs/supported_ops.md +++ b/src/frontends/tensorflow/docs/supported_ops.md @@ -26,7 +26,7 @@ A "supported operation" is one that TensorFlow Frontend can convert to the OpenV | All | YES | | | AllCandidateSampler | NO | | | AllToAll | NO | | -| Angle | NO | | +| Angle | YES | | | AnonymousHashTable | NO | | | AnonymousIterator | NO | | | AnonymousIteratorV2 | NO | | diff --git a/src/frontends/tensorflow/src/op_table.cpp b/src/frontends/tensorflow/src/op_table.cpp index fb1597c926e6c8..81d016ec67c68a 100644 --- a/src/frontends/tensorflow/src/op_table.cpp +++ b/src/frontends/tensorflow/src/op_table.cpp @@ -205,6 +205,7 @@ const std::map get_supported_ops() { // Separate translators: {"AddN", CreatorFunction(translate_add_n_op)}, {"AdjustContrastv2", CreatorFunction(translate_adjust_contrast_op)}, + {"Angle", CreatorFunction(translate_angle_op)}, {"ArgMax", CreatorFunction(translate_arg_max_op)}, {"ArgMin", CreatorFunction(translate_arg_min_op)}, {"Assert", CreatorFunction(translate_no_op)}, diff --git a/src/frontends/tensorflow_common/include/common_op_table.hpp 
b/src/frontends/tensorflow_common/include/common_op_table.hpp index 6d4e4a971c2f98..af59b862c89234 100644 --- a/src/frontends/tensorflow_common/include/common_op_table.hpp +++ b/src/frontends/tensorflow_common/include/common_op_table.hpp @@ -35,6 +35,7 @@ OP_CONVERTER(translate_addv2_op); OP_CONVERTER(translate_add_n_op); OP_CONVERTER(translate_approximate_equal_op); OP_CONVERTER(translate_adjust_contrast_op); +OP_CONVERTER(translate_angle_op); OP_CONVERTER(translate_arg_max_op); OP_CONVERTER(translate_arg_min_op); OP_CONVERTER(translate_atan2_op); diff --git a/src/frontends/tensorflow_common/src/op/angle.cpp b/src/frontends/tensorflow_common/src/op/angle.cpp new file mode 100644 index 00000000000000..a60363216b7d55 --- /dev/null +++ b/src/frontends/tensorflow_common/src/op/angle.cpp @@ -0,0 +1,90 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "common_op_table.hpp" +#include "helper_ops/complex_type_mark.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/atan.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/divide.hpp" +#include "openvino/op/equal.hpp" +#include "openvino/op/gather.hpp" +#include "openvino/op/greater.hpp" +#include "openvino/op/greater_eq.hpp" +#include "openvino/op/less.hpp" +#include "openvino/op/logical_and.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/select.hpp" +#include "openvino/op/subtract.hpp" + +using namespace std; +using namespace ov::op; + +namespace ov { +namespace frontend { +namespace tensorflow { +namespace op { + +OutputVector translate_angle_op(const NodeContext& node) { + default_op_checks(node, 1, {"Angle"}, true); + auto complex = node.get_input(0); + auto result_type = node.get_attribute("Tout"); + + auto complex_type_mark = as_type_ptr(complex.get_node_shared_ptr()); + + TENSORFLOW_OP_VALIDATION( + node, + complex_type_mark, + "[TensorFlow Frontend] inconsistent model: Angle operation expects complex type tensor on input"); + + complex = complex_type_mark->input_value(0); + auto real_index = make_shared(element::i32, Shape{}, 0); + auto imag_index = make_shared(element::i32, Shape{}, 1); + auto gather_axis = make_shared(element::i32, Shape{1}, -1); + + auto x = make_shared(complex, real_index, gather_axis)->output(0); + auto y = make_shared(complex, imag_index, gather_axis)->output(0); + + // handle the first condition : x>0 + auto div_y_x = make_shared(y, x); + auto atan = make_shared(div_y_x); + auto const_zero = create_same_type_const_scalar(x, 0); + auto result = atan->output(0); + + // handle the second condition : x<0 && y>=0 + auto const_pi = create_same_type_const_scalar(x, std::atan(1.0) * 4); + auto is_x_negative = make_shared(x, const_zero); + auto y_non_negative = make_shared(y, const_zero); + auto cond1 = make_shared(is_x_negative, y_non_negative); + auto atan_y_x_plus_pi = make_shared(atan, const_pi); + result = make_shared(cond1, atan_y_x_plus_pi, result); + + // handle the third condition : x<0 && y<0 + auto is_y_negative = make_shared(y, const_zero); + auto cond2 = make_shared(is_x_negative, is_y_negative); + auto atan_y_x_minus_pi = make_shared(atan, const_pi); + result = make_shared(cond2, atan_y_x_minus_pi, result); + + // handle the fourth condition : x=0 && y>0 + auto is_x_zero = make_shared(x, const_zero); + auto is_y_positive = make_shared(y, const_zero); + auto cond3 = make_shared(is_x_zero, is_y_positive); + auto const_two = create_same_type_const_scalar(x, 2); + auto pi_div_two = 
make_shared(const_pi, const_two); + result = make_shared(cond3, pi_div_two, result); + + // handle the fifth condition : x=0 && y<0 + auto cond4 = make_shared(is_x_zero, is_y_negative); + auto const_minus_two = create_same_type_const_scalar(x, -2); + auto pi_div_minus_two = make_shared(const_pi, const_minus_two); + result = make_shared(cond4, pi_div_two, result); + auto result_changed_type = make_shared(result, result_type)->output(0); + + set_node_name(node.get_name(), result_changed_type.get_node_shared_ptr()); + return {result_changed_type}; +} +} // namespace op +} // namespace tensorflow +} // namespace frontend +} // namespace ov diff --git a/src/plugins/intel_cpu/CMakeLists.txt b/src/plugins/intel_cpu/CMakeLists.txt index 70da87819f03e5..c65bceae2a1d0b 100644 --- a/src/plugins/intel_cpu/CMakeLists.txt +++ b/src/plugins/intel_cpu/CMakeLists.txt @@ -30,6 +30,16 @@ elseif(OV_COMPILER_IS_CLANG) endif() endif() +if (AARCH64 AND NOT APPLE AND CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 10.2) + # according to https://github.com/ARM-software/ComputeLibrary/issues/1053#issuecomment-1846903707 comment + # the 'multi_isa=1' below enables FP32, FP16 and SVE / SVE2 kernels + # But: arm_sve.h header is not available on gcc older 10.2 (let's test it), so we have to check it + set(OV_CPU_AARCH64_USE_MULTI_ISA_DEFAULT ON) +else() + set(OV_CPU_AARCH64_USE_MULTI_ISA_DEFAULT OFF) +endif() +set(OV_CPU_AARCH64_USE_MULTI_ISA ${OV_CPU_AARCH64_USE_MULTI_ISA_DEFAULT} CACHE BOOL "Build multi-ISA ACL") + set(OV_CPU_ARM_TARGET_GENERIC_ARCHS armv8a armv8.2-a armv8.6-a armv8.6-a-sve armv8.6-a-sve2 armv8.6-a-sve2-sme2 @@ -41,7 +51,18 @@ if(ARM) # requires estate=32 ${OV_CPU_ARM_TARGET_GENERIC_ARCHS}) elseif(AARCH64) - set(OV_CPU_ARM_TARGET_ARCH_DEFAULT arm64-v8.2-a) + if(APPLE) + set(OV_CPU_ARM_TARGET_ARCH_DEFAULT arm64-v8.2-a) + else() + if(OV_CPU_AARCH64_USE_MULTI_ISA) + # set v8a even we want fp16 kernels, because + # we use multi_isa=1 in ACLConfig.cmake to enable both fp16 and fp32 kernels + # actual kernel is selected in runtime based on runtime capabilities + set(OV_CPU_ARM_TARGET_ARCH_DEFAULT arm64-v8a) + else() + set(OV_CPU_ARM_TARGET_ARCH_DEFAULT arm64-v8.2-a) + endif() + endif() set(OV_CPU_ARM_TARGET_ARCHS arm64-v8a arm64-v8.2-a arm64-v8.2-a-sve arm64-v8.2-a-sve2 # used with estate=64 @@ -49,9 +70,6 @@ elseif(AARCH64) endif() set(OV_CPU_ARM_TARGET_ARCH ${OV_CPU_ARM_TARGET_ARCH_DEFAULT} CACHE STRING "Architecture for ARM ComputeLibrary") set_property(CACHE OV_CPU_ARM_TARGET_ARCH PROPERTY STRINGS ${OV_CPU_ARM_TARGET_ARCHS}) -if(OV_CPU_ARM_TARGET_ARCH MATCHES "(armv|arm64-v)[8-9]\\.") - add_definitions(-DOV_CPU_ARM_ENABLE_FP16) -endif() if(X86 OR X86_64 OR AARCH64) # disable mlas with webassembly diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp index 8567914415e459..4d94abf72ebfc0 100644 --- a/src/plugins/intel_cpu/src/config.cpp +++ b/src/plugins/intel_cpu/src/config.cpp @@ -284,14 +284,9 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { inferencePrecision = ov::element::bf16; } } else if (prec == ov::element::f16) { -#if defined(OPENVINO_ARCH_X86_64) if (hasHardwareSupport(ov::element::f16)) { inferencePrecision = ov::element::f16; } -#elif defined(OV_CPU_ARM_ENABLE_FP16) - // TODO: add runtime FP16 feature support check for ARM - inferencePrecision = ov::element::f16; -#endif } else if (prec == ov::element::f32) { inferencePrecision = ov::element::f32; } else { @@ -382,12 +377,13 @@ void 
Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { if (!inferencePrecisionSetExplicitly) { if (executionMode == ov::hint::ExecutionMode::PERFORMANCE) { inferencePrecision = ov::element::f32; -#if defined(OV_CPU_ARM_ENABLE_FP16) - inferencePrecision = ov::element::f16; -#else +#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) + if (hasHardwareSupport(ov::element::f16)) { + inferencePrecision = ov::element::f16; + } +#endif if (mayiuse(avx512_core_bf16)) inferencePrecision = ov::element::bf16; -#endif } else { inferencePrecision = ov::element::f32; } diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index 3ffff01c6da6e7..ec9f4012ce53e0 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -37,11 +37,10 @@ #include "utils/ngraph_utils.hpp" #include "utils/node_dumper.h" #include "utils/verbose.h" +#include "utils/precision_support.h" #include -#if defined(OV_CPU_ARM_ENABLE_FP16) #include "common/primitive_desc_iface.hpp" -#endif #include "openvino/runtime/memory_solver.hpp" @@ -425,10 +424,12 @@ static bool isReorderAvailable(const MemoryDescPtr& parentDesc, const MemoryDesc dnnl_primitive_desc_t result = nullptr; auto status = dnnl_reorder_primitive_desc_create(&result, srcMemDesc.get(), eng.get(), dstMemDesc.get(), eng.get(), attr.get()); -#if defined(OV_CPU_ARM_ENABLE_FP16) +#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) // temporary WA for slow FP32->FP16 conversion reorder in oneDNN on ARM // pretend the reorder is not available to use Convert node instead - if (result && parse_impl_name(result->impl()->name()) == ref_any) { + if (hasHardwareSupport(ov::element::f16) && + result && + parse_impl_name(result->impl()->name()) == ref_any) { dnnl_primitive_desc_destroy(result); return false; } @@ -1607,7 +1608,7 @@ void Graph::EnforceInferencePrecision() { if (inferPrec == ov::element::f32) return; // nothing to do, only precision reduction is currently allowed -#if defined(OV_CPU_ARM_ENABLE_FP16) +#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) if (inferPrec == ov::element::f16) return; // precision of configured by ov::pass::ConvertPrecision #endif diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp index cdc038fbf9155d..ae091deed57121 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp @@ -4,6 +4,7 @@ #include "acl_eltwise.hpp" #include "acl_utils.hpp" +#include "utils/debug_capabilities.h" namespace ov { namespace intel_cpu { @@ -31,6 +32,17 @@ inline VectorDims reshape_sizes(VectorDims dims) { return result_dims; } +inline void log_unsupported_prec(const std::vector& srcDescs, + const std::vector& dstDescs, + const Algorithm eltwiseAlgorithm) { + std::string srcPrec; + for (size_t i = 0; i < srcDescs.size(); i++) { + srcPrec += srcDescs[i]->getPrecision().to_string() + " "; + } + DEBUG_LOG(algToString(eltwiseAlgorithm), ": provided combination of src precisions: [", srcPrec, + "] and dst precision: ", dstDescs[0]->getPrecision().to_string(), " is not supported"); +} + bool AclEltwiseExecutor::isEltwiseAlgorithmSupported(Algorithm algorithm) { if (one_of(algorithm, Algorithm::EltwiseSqrt, Algorithm::EltwiseDivide, @@ -94,6 +106,7 @@ bool AclEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs, case Algorithm::EltwiseHswish: if 
(!(checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { + log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); return false; } break; @@ -103,6 +116,7 @@ bool AclEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs, if (!(checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) || checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { + log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); return false; } break; @@ -113,6 +127,7 @@ bool AclEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs, checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) || checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { + log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); return false; } break; @@ -123,6 +138,7 @@ bool AclEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs, checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) || checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { + log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); return false; } break; @@ -134,6 +150,7 @@ bool AclEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs, checkPrecision({ov::element::i16, ov::element::i16}, ov::element::i16) || checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { + log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); return false; } break; @@ -149,20 +166,26 @@ bool AclEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs, checkPrecision({ov::element::i32, ov::element::i32}, ov::element::u8) || checkPrecision({ov::element::f16, ov::element::f16}, ov::element::u8) || checkPrecision({ov::element::f32, ov::element::f32}, ov::element::u8))) { + log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); return false; } break; default: + DEBUG_LOG("Eltwise algorithm ", algToString(eltwiseAttrs.algorithm), " is not supported"); return false; } for (const auto & srcDesc : srcDescs) { - if (getAclDataLayoutByMemoryDesc(srcDesc) == arm_compute::DataLayout::UNKNOWN) + if (getAclDataLayoutByMemoryDesc(srcDesc) == arm_compute::DataLayout::UNKNOWN) { + DEBUG_LOG("src descriptor layout is unsupported by ACL: ", srcDesc->serializeFormat()); return false; + } } for (const auto & dstDesc : dstDescs) { - if (getAclDataLayoutByMemoryDesc(dstDesc) == arm_compute::DataLayout::UNKNOWN) + if (getAclDataLayoutByMemoryDesc(dstDesc) == arm_compute::DataLayout::UNKNOWN) { + DEBUG_LOG("dst descriptor layout is unsupported by ACL: ", dstDesc->serializeFormat()); return false; + } } return true; diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp index 0c66c37394ab52..eea989656e49b6 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp @@ -21,6 +21,7 @@ #include "nodes/executors/mlas/mlas_gemm.hpp" #include 
"nodes/executors/precision_matcher.hpp" #include "nodes/executors/precision_translation.hpp" +#include "nodes/executors/type_mask.hpp" #include "openvino/core/type/element_type.hpp" #include "ov_optional.hpp" #include "utils/cpp/maybe_unused.hpp" @@ -39,21 +40,23 @@ static const LayoutConfig dnnlFCLayoutConfig{LayoutType::ncsp, LayoutType::ncsp, // clang-format off static const TypeMapping dnnlFCTypeMapping { - // {src, wei, bia, dst} pt - {{_bf16, _bf16 | _f32, _any, _bf16 | _f32}, pt(bypass(), bypass(), use<3>(), use<3>())}, - {{_f16, _f16, _any, _f16 | _f32}, pt(bypass(), bypass(), use<3>(), use<3>())}, + // {src, wei, bia, dst} pt + {{_bf16, _bf16 | _f32, _any, _bf16 | _f32}, pt(bypass(), bypass(), use<3>(), bypass())}, + {{_f16, _f16, _any, _f16 | _f32}, pt(bypass(), bypass(), use<3>(), bypass())}, // integer precision outputs are not supported for float precision inputs - {{_f32 | _bf16 | _f16, _any, _any, _i8 | _u8}, pt(bypass(), bypass(), use<0>(), use<0>())}, + {{_f32 | _bf16 | _f16, _any, _any, _i8 | _u8}, pt(bypass(), bypass(), use<0>(), use<0>())}, // compresses float weights which do not match input data precision - {{_f32, _half_float, _any, _any | _any}, pt(bypass(), bypass(), use<0>(), use<0>())}, - {{_bf16, _f16, _any, _any | _any}, pt(bypass(), bypass(), use<0>(), use<0>())}, - {{_f16, _bf16, _any, _any | _any}, pt(bypass(), bypass(), use<0>(), use<0>())}, - // quantization configuration (@todo more strict requrements for output precision?) - {{_u8 | _i8, _i8, _any, _any}, pt(bypass(), bypass(), bypass(), use<3>())}, + {{_f32, _half_float, _any, _any | _any}, pt(bypass(), bypass(), use<0>(), use<0>())}, + {{_bf16, _f16, _any, _any | _any}, pt(bypass(), bypass(), use<0>(), use<0>())}, + {{_f16, _bf16, _any, _any | _any}, pt(bypass(), bypass(), use<0>(), use<0>())}, + // quantization configuration + // int8 inner_product does not support f16 output and bias + {{_u8 | _i8, _i8, _any, _f16}, pt(bypass(), bypass(), just(), just())}, + {{_u8 | _i8, _i8, _any, _u8 | _i8 | _i32 | _bf16 | _f32}, pt(bypass(), bypass(), use<3>(), bypass())}, // compresses int weights (@todo more strict requrements for output precision?) - {{_f32 | _bf16, _u8 | _nf4 | _u4 | _i4, _any, _any}, pt(bypass(), bypass(), use<0>(), use<0>())}, + {{_f32 | _bf16, _u8 | _nf4 | _u4 | _i4, _any, _any}, pt(bypass(), bypass(), use<0>(), use<0>())}, // @todo should we fallback to FPXX instead of _f32? 
- {{_any, _any, _any, _any}, pt(just(), just(), just(), just())}, + {{_any, _any, _any, _any}, pt(just(), just(), just(), just())}, // @todo explicitly cover configuration limitations for oneDNN on ARM }; @@ -63,8 +66,8 @@ static const MappingNotation dnnlConvolutionMappingNotation { static const TypeMapping dnnlConvolutionTypeMapping { // {src, wei, bia, dst} pt - {{_bf16, _bf16 | _f32, _any, _bf16 | _f32}, pt(bypass(), bypass(), use<3>(), use<3>())}, - {{_f16, _f16, _any, _f16 | _f32}, pt(bypass(), bypass(), use<3>(), use<3>())}, + {{_bf16, _bf16 | _f32, _any, _bf16 | _f32}, pt(bypass(), bypass(), use<3>(), bypass())}, + {{_f16, _f16, _any, _f16 | _f32}, pt(bypass(), bypass(), use<3>(), bypass())}, // integer precision outputs are not supported for float precision inputs {{_f32 | _bf16 | _f16, _any, _any, _i8 | _u8}, pt(bypass(), bypass(), use<0>(), use<0>())}, // compresses float weights which do not match input data precision @@ -72,7 +75,7 @@ static const TypeMapping dnnlConvolutionTypeMapping { {{_bf16, _f16, _any, _any | _any}, pt(bypass(), bypass(), use<0>(), use<0>())}, {{_f16, _bf16, _any, _any | _any}, pt(bypass(), bypass(), use<0>(), use<0>())}, // quantization configuration - {{_u8 | _i8, _i8, _any, _any}, pt(bypass(), bypass(), use<3>(), use<3>())}, + {{_u8 | _i8, _i8, _any, _any}, pt(bypass(), bypass(), use<3>(), bypass())}, // @todo should we fallback to _fxx instead of _f32 (currenly legacy logic is replicated) {{_any, _any, _any, _any}, pt(just(), just(), just(), just())}, }; diff --git a/src/plugins/intel_cpu/src/nodes/reorder.cpp b/src/plugins/intel_cpu/src/nodes/reorder.cpp index a0d3f101f573a3..e1db1540fdd4b5 100644 --- a/src/plugins/intel_cpu/src/nodes/reorder.cpp +++ b/src/plugins/intel_cpu/src/nodes/reorder.cpp @@ -24,11 +24,9 @@ #include "nodes/common/reorder_prim.h" #include "openvino/core/parallel.hpp" #include "shape_inference/shape_inference_pass_through.hpp" - -#if defined(OV_CPU_ARM_ENABLE_FP16) +#include "utils/precision_support.h" #include "nodes/executors/executor.hpp" #include "nodes/executors/transpose_list.hpp" -#endif namespace ov { namespace intel_cpu { @@ -128,7 +126,6 @@ void Reorder::executeDynamicImpl(dnnl::stream strm) { execute(strm); } -#if defined(OV_CPU_ARM_ENABLE_FP16) void Reorder::prepareReorderAsTranspose(MemoryDescPtr parentDesc, MemoryDescPtr childDesc) { auto getOrderAndBlockedDims = [](const MemoryDesc& lhs, const MemoryDesc& rhs) -> std::pair, std::vector> { const auto& in = lhs.as()->getBlockDims(); @@ -180,7 +177,6 @@ void Reorder::prepareReorderAsTranspose(MemoryDescPtr parentDesc, MemoryDescPtr getSelectedPrimitiveDescriptor()->setImplementationType(transposeExecutor->implType()); return; } -#endif // OV_CPU_ARM_ENABLE_FP16 void Reorder::prepareParams() { if (isOptimized) @@ -211,7 +207,7 @@ void Reorder::prepareParams() { const auto& parentDesc = srcMemPtr->getDescPtr(); const auto& childDesc = dstMemPtr->getDescPtr(); -#if defined(OV_CPU_ARM_ENABLE_FP16) +#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) // @todo current oneDNN v3.2 lacks optimized jit implementation for fp16 reorders. // Use transpose executor as a temporary WA. 
if (everyone_is(ov::element::f16, parentDesc->getPrecision(), childDesc->getPrecision()) && @@ -405,7 +401,7 @@ void Reorder::optimizedNspc2Ncsp() { } void Reorder::execute(dnnl::stream strm) { -#if defined(OV_CPU_ARM_ENABLE_FP16) +#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) if (transposeExecutor) { auto dstMemPtr = getDstMemoryAtPort(0); auto srcMemPtr = getSrcMemoryAtPort(0); diff --git a/src/plugins/intel_cpu/src/nodes/reorder.h b/src/plugins/intel_cpu/src/nodes/reorder.h index 07a7b7b53230be..cb99caa07bdfa6 100644 --- a/src/plugins/intel_cpu/src/nodes/reorder.h +++ b/src/plugins/intel_cpu/src/nodes/reorder.h @@ -6,9 +6,7 @@ #include -#if defined(OV_CPU_ARM_ENABLE_FP16) #include "nodes/executors/transpose.hpp" -#endif namespace ov { namespace intel_cpu { @@ -76,10 +74,9 @@ class Reorder : public Node { void optimizedNspc2Ncsp(); void optimizedNcsp2Nspc(); void createReorderPrimitive(const dnnl::memory::desc &srcDesc, void* srcPtr, const dnnl::memory::desc &dstDesc, void* dstPtr); -#if defined(OV_CPU_ARM_ENABLE_FP16) + void prepareReorderAsTranspose(MemoryDescPtr parentDesc, MemoryDescPtr childDesc); TransposeExecutorPtr transposeExecutor; -#endif }; } // namespace node diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 8dbdd42cee0726..cdea46e202b1cd 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -318,9 +318,10 @@ void Transformations::PreLpt(const std::vector& defaultPrecis // @todo should we always convert to f32 regardless of hardware support, as it is done for f16? if (!hasHardwareSupport(ov::element::bf16)) map.insert({ov::element::bf16, ov::element::f32}); -#if defined(OV_CPU_ARM_ENABLE_FP16) - if (inferencePrecision != ov::element::f16) - map.insert({ov::element::f16, ov::element::f32}); +#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) + if (inferencePrecision != ov::element::f16) { + map.insert({ov::element::f16, ov::element::f32}); + } #else map.insert({ov::element::f16, ov::element::f32}); #endif @@ -329,11 +330,12 @@ void Transformations::PreLpt(const std::vector& defaultPrecis type_to_fuse_map type_to_fuse = {{ov::opset10::Convert::get_type_info_static(), fuse_type_to_convert}}; -#if defined(OV_CPU_ARM_ENABLE_FP16) +#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) // It cannot be static data, because it may be difference for different inferencePrecision const auto precisions = get_convert_precisions(); if (inferencePrecision == ov::element::f16) { precisions_map fp_convert_precision_map = {{ov::element::f32, ov::element::f16}}; + //keep fq nodes in f32 prec to avoid performance degradation type_to_fuse_map f16_fuse_map = {{ov::opset1::FakeQuantize::get_type_info_static(), fuse_type_to_fq}}; const bool keep_precision_sensitive_in_fp32 = true; CPU_REGISTER_PASS_COMMON(manager, diff --git a/src/plugins/intel_cpu/src/utils/precision_support.cpp b/src/plugins/intel_cpu/src/utils/precision_support.cpp index 4a89002e63da48..e2e55a4d0f6cca 100644 --- a/src/plugins/intel_cpu/src/utils/precision_support.cpp +++ b/src/plugins/intel_cpu/src/utils/precision_support.cpp @@ -4,10 +4,16 @@ #include "precision_support.h" +#if defined(OPENVINO_ARCH_X86_64) #include "cpu/x64/cpu_isa_traits.hpp" +#endif #include "openvino/core/type/element_type.hpp" #include "openvino/core/visibility.hpp" +#if defined(OV_CPU_WITH_ACL) +#include 
"arm_compute/core/CPP/CPPTypes.h" +#endif + namespace ov { namespace intel_cpu { @@ -17,8 +23,10 @@ static bool hasFP16HardwareSupport(const ov::element::Type& precision) { dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2)) return true; return false; -#elif defined(OV_CPU_ARM_ENABLE_FP16) - return true; // @todo add runtime check for arm as well +#elif defined(OPENVINO_ARCH_ARM64) && defined(OV_CPU_WITH_ACL) + //has_fp16() works correctly on aarch64 only + //TODO: remove else branch as soon as ACL issue #1096 is fixed + return arm_compute::CPUInfo::get().has_fp16(); #else return false; #endif diff --git a/src/plugins/intel_cpu/tests/functional/CMakeLists.txt b/src/plugins/intel_cpu/tests/functional/CMakeLists.txt index 3a58130da3463e..8e32bc3ec059b6 100644 --- a/src/plugins/intel_cpu/tests/functional/CMakeLists.txt +++ b/src/plugins/intel_cpu/tests/functional/CMakeLists.txt @@ -4,14 +4,32 @@ set(TARGET_NAME ov_cpu_func_tests) -add_library(cpuSpecificRtInfo STATIC +if(SUGGEST_OVERRIDE_SUPPORTED) + # xbyak compilation fails + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-suggest-override") +endif() + +add_library(cpuUtils STATIC $/src/utils/rt_info/memory_formats_attribute.hpp - $/src/utils/rt_info/memory_formats_attribute.cpp) -target_link_libraries(cpuSpecificRtInfo PRIVATE openvino::runtime) + $/src/utils/rt_info/memory_formats_attribute.cpp + $/src/utils/precision_support.h + $/src/utils/precision_support.cpp) +set(CPU_UTILS_LINK_LIBRARIES openvino::runtime) +set(CPU_UTILS_INCLUDE_PATHS) +if(OV_CPU_WITH_ACL) + list(APPEND CPU_UTILS_LINK_LIBRARIES arm_compute::arm_compute) + list(APPEND CPU_UTILS_INCLUDE_PATHS $) +endif() +if(OV_CPU_WITH_DNNL) + list(APPEND CPU_UTILS_LINK_LIBRARIES dnnl) + list(APPEND CPU_UTILS_INCLUDE_PATHS $/thirdparty/onednn/src) +endif() +target_link_libraries(cpuUtils PRIVATE ${CPU_UTILS_LINK_LIBRARIES}) +target_include_directories(cpuUtils PUBLIC ${CPU_UTILS_INCLUDE_PATHS}) set(INCLUDES ${CMAKE_CURRENT_SOURCE_DIR} $/src) set(DEPENDENCIES openvino_intel_cpu_plugin openvino_template_extension) -set(LINK_LIBRARIES funcSharedTests cpuSpecificRtInfo openvino::snippets ov_snippets_models) +set(LINK_LIBRARIES funcSharedTests cpuUtils openvino::snippets ov_snippets_models) if(ENABLE_OV_ONNX_FRONTEND) list(APPEND DEFINES TEST_MODELS="${TEST_MODEL_ZOO}") diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp index 1b29347d6c0605..27c80bce3fc1a0 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp @@ -5,6 +5,7 @@ #include #include +#include "utils/precision_support.h" #include "utils/properties_test.hpp" #include "common_test_utils/test_assertions.hpp" #include "openvino/runtime/properties.hpp" @@ -208,8 +209,8 @@ TEST_F(OVClassConfigTestCPU, smoke_PluginSetConfigAffinityCore) { ASSERT_EQ(false, value); } -#if defined(OV_CPU_ARM_ENABLE_FP16) - const auto expected_precision_for_performance_mode = ov::element::f16; +#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) + const auto expected_precision_for_performance_mode = ov::intel_cpu::hasHardwareSupport(ov::element::f16) ? ov::element::f16 : ov::element::f32; #else const auto expected_precision_for_performance_mode = ov::with_cpu_x86_bfloat16() ? 
ov::element::bf16 : ov::element::f32; #endif diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/denormal_check.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/denormal_check.cpp index 306d61d26ee8d3..b98d4c61a1fb43 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/denormal_check.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/denormal_check.cpp @@ -65,10 +65,10 @@ TEST_F(DenormalNullifyCheck, smoke_CPU_Denormal_Check) { constexpr unsigned denormalsCount = 15u; constexpr uint32_t denormalsRange = (0xffffffffu >> 9u) - 1; testing::internal::Random random(seed); - auto randomRange = NGraphFunctions::Utils::generateVector(elemsCount, 10, -10); + auto randomRange = ov::test::utils::generateVector(elemsCount, 10, -10); for (auto& interval : intervals) { - auto randomIndices = NGraphFunctions::Utils::generateVector(denormalsCount, interval.second, interval.first); + auto randomIndices = ov::test::utils::generateVector(denormalsCount, interval.second, interval.first); std::unordered_set randomIndexSet(randomIndices.begin(), randomIndices.end()); for (size_t i = 0; i < elemsCount; ++i) { if (randomIndexSet.count(i)) { diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/core_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/core_config.cpp index 9dbdd255263b35..d2edd5a14eceb4 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/core_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/core_config.cpp @@ -8,12 +8,11 @@ namespace ov { namespace test { void core_configuration(ov::test::SubgraphBaseTest* test) { - #if defined(OV_CPU_ARM_ENABLE_FP16) || defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) //force fp32 inference precision if it is not configured specially if (!test->configuration.count(ov::hint::inference_precision.name())) { test->configuration.insert({ov::hint::inference_precision.name(), ov::element::f32.to_string()}); } - #endif + // todo: issue: 123320 test->convert_precisions.insert({ov::element::bf16, ov::element::f32}); test->convert_precisions.insert({ov::element::f16, ov::element::f32}); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 6ad35a8105d7b1..485c9cb5bd615a 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -5,6 +5,7 @@ #include "openvino/core/visibility.hpp" #include "functional_test_utils/skip_tests_config.hpp" #include "openvino/runtime/system_conf.hpp" +#include "utils/precision_support.h" #include #include @@ -337,7 +338,6 @@ std::vector disabledTestPatterns() { // int8 specific retVector.emplace_back(R"(smoke_Quantized.*)"); -# if defined(OV_CPU_ARM_ENABLE_FP16) // Issue: 123019 retVector.emplace_back(R"(smoke_staticShapes4D.*INFERENCE_PRECISION_HINT=f16.*)"); retVector.emplace_back(R"(smoke_dynamicShapes4D.*INFERENCE_PRECISION_HINT=f16.*)"); @@ -351,7 +351,6 @@ std::vector disabledTestPatterns() { // Issue: 124395 retVector.emplace_back(R"(smoke_VariableStateBasic/InferRequestVariableStateTest.*)"); retVector.emplace_back(R"(smoke_VariableState/OVInferRequestVariableStateTest.*)"); -# endif #endif @@ -416,14 +415,14 @@ std::vector disabledTestPatterns() { 
retVector.emplace_back(R"(.*INFERENCE_PRECISION_HINT=(F|f)16.*)"); } #elif defined(OPENVINO_ARCH_ARM64) || defined(OPENVINO_ARCH_ARM) -# if !defined(OV_CPU_ARM_ENABLE_FP16) - // Skip fp16 tests for paltforms that don't support fp16 precision - retVector.emplace_back(R"(.*INFERENCE_PRECISION_HINT=(F|f)16.*)"); -# else - // Issue 117407 - retVector.emplace_back( - R"(.*EltwiseLayerCPUTest.*IS=\(\[1\.\.10\.2\.5\.6\]_\).*eltwiseOpType=SqDiff.*_configItem=INFERENCE_PRECISION_HINT=f16.*)"); -# endif // OV_CPU_ARM_ENABLE_FP16 + if (!ov::intel_cpu::hasHardwareSupport(ov::element::f16)) { + // Skip fp16 tests for paltforms that don't support fp16 precision + retVector.emplace_back(R"(.*INFERENCE_PRECISION_HINT=(F|f)16.*)"); + } else { + // Issue 117407 + retVector.emplace_back( + R"(.*EltwiseLayerCPUTest.*IS=\(\[1\.\.10\.2\.5\.6\]_\).*eltwiseOpType=SqDiff.*_configItem=INFERENCE_PRECISION_HINT=f16.*)"); + } #endif if (!ov::with_cpu_x86_avx512_core_vnni() && !ov::with_cpu_x86_avx512_core_amx_int8()) { // MatMul in Snippets uses BRGEMM that supports i8 only on platforms with VNNI or AMX instructions diff --git a/src/plugins/intel_cpu/tests/functional/utils/fusing_test_utils.hpp b/src/plugins/intel_cpu/tests/functional/utils/fusing_test_utils.hpp index 860106b81d97a8..9b240df9232b9c 100644 --- a/src/plugins/intel_cpu/tests/functional/utils/fusing_test_utils.hpp +++ b/src/plugins/intel_cpu/tests/functional/utils/fusing_test_utils.hpp @@ -183,14 +183,14 @@ const auto fusingSqrt = fusingSpecificParams{std::make_shared(std: const auto fusingPReluPerChannel = fusingSpecificParams{std::make_shared(std::vector{ {[](postNodeConfig& cfg){ ov::Shape newShape = generatePerChannelShape(cfg.target); - auto data = NGraphFunctions::Utils::generateVector(ov::shape_size(newShape)); + auto data = ov::test::utils::generateVector(ov::shape_size(newShape)); return utils::make_activation(cfg.input, cfg.type, utils::LeakyRelu, newShape, data); }, "PRelu(PerChannel)"}}), {"PRelu"}}; const auto fusingPReluPerTensor = fusingSpecificParams{std::make_shared(std::vector{ {[](postNodeConfig& cfg){ ov::Shape shape(1, 1); - auto data = NGraphFunctions::Utils::generateVector(ov::shape_size(shape)); + auto data = ov::test::utils::generateVector(ov::shape_size(shape)); return utils::make_activation(cfg.input, cfg.type, utils::LeakyRelu, shape, data); }, "PRelu(PerTensor)"}}), {"PRelu"}}; @@ -465,7 +465,7 @@ const auto fusingPRelu1D = fusingSpecificParams{std::make_shared(s {[](postNodeConfig& cfg){ auto shape = cfg.input->get_output_partial_shape(0); ov::Shape newShape({static_cast(shape[1].get_length())}); - auto data = NGraphFunctions::Utils::generateVector(ov::shape_size(newShape)); + auto data = ov::test::utils::generateVector(ov::shape_size(newShape)); return utils::make_activation(cfg.input, cfg.type, utils::LeakyRelu, newShape, data); }, "PRelu1D"}}), {"PRelu"}}; @@ -473,7 +473,7 @@ const auto fusingPRelu1DScaleShift = fusingSpecificParams{std::make_sharedget_output_partial_shape(0); ov::Shape newShape({static_cast(shape[1].get_length())}); - auto data = NGraphFunctions::Utils::generateVector(ov::shape_size(newShape)); + auto data = ov::test::utils::generateVector(ov::shape_size(newShape)); return utils::make_activation(cfg.input, cfg.type, utils::LeakyRelu, newShape, data); }, "PRelu1D"}, {[](postNodeConfig& cfg) { diff --git a/src/plugins/intel_cpu/thirdparty/ACLConfig.cmake b/src/plugins/intel_cpu/thirdparty/ACLConfig.cmake index 3afbae622af835..09774aa4bec493 100644 --- a/src/plugins/intel_cpu/thirdparty/ACLConfig.cmake +++ 
b/src/plugins/intel_cpu/thirdparty/ACLConfig.cmake @@ -98,7 +98,6 @@ elseif(NOT TARGET arm_compute::arm_compute) # set(ARM_COMPUTE_SOURCE_DIR "${intel_cpu_thirdparty_SOURCE_DIR}/ComputeLibrary") - set(ARM_COMPUTE_BINARY_DIR "${intel_cpu_thirdparty_BINARY_DIR}/ComputeLibrary") message(STATUS "Configure to build ${ARM_COMPUTE_SOURCE_DIR}") @@ -149,17 +148,16 @@ elseif(NOT TARGET arm_compute::arm_compute) list(APPEND ARM_COMPUTE_OPTIONS estate=32) else() list(APPEND ARM_COMPUTE_OPTIONS estate=64) - if(NOT APPLE AND CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 10.2) - # arm_sve.h header is not available on gcc older 10.2 - # TODO: validate it on machines with FP16 / SVE support and enabled back - # list(APPEND ARM_COMPUTE_OPTIONS multi_isa=1) + if(OV_CPU_AARCH64_USE_MULTI_ISA) + list(APPEND ARM_COMPUTE_OPTIONS multi_isa=1) + # let's additionally enable SME as well + set(extra_cxx_flags "${extra_cxx_flags} -DENABLE_SME -DARM_COMPUTE_ENABLE_SME -DARM_COMPUTE_ENABLE_SME2") endif() endif() if(NOT MSVC64) list(APPEND ARM_COMPUTE_OPTIONS - build_dir=${ARM_COMPUTE_BINARY_DIR} - install_dir=${ARM_COMPUTE_BINARY_DIR}/install) + install_dir=install) endif() if(ARM_COMPUTE_SCONS_JOBS) @@ -329,11 +327,10 @@ elseif(NOT TARGET arm_compute::arm_compute) if(MSVC64) set(arm_compute build/arm_compute-static.lib) - set(arm_compute_full_path "${ARM_COMPUTE_SOURCE_DIR}/${arm_compute}") else() - set(arm_compute ${ARM_COMPUTE_BINARY_DIR}/libarm_compute-static.a) - set(arm_compute_full_path "${arm_compute}") + set(arm_compute build/libarm_compute-static.a) endif() + set(arm_compute_full_path "${ARM_COMPUTE_SOURCE_DIR}/${arm_compute}") list(APPEND ARM_COMPUTE_OPTIONS fixed_format_kernels=True) diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/gemm.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/gemm.hpp index 18380badcfe196..0f1e690483119e 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/gemm.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/gemm.hpp @@ -26,15 +26,30 @@ class Gemm : public ov::op::v0::MatMul { const std::vector& order_c, const ov::element::Type output_type = ov::element::undefined); + Gemm(const ov::Output& A, + const ov::Output& B, + const std::vector& target_shape_a, + const std::vector& target_shape_b, + const std::vector& output_pattern_a, + const std::vector& output_pattern_b, + const std::vector& order_a, + const std::vector& order_b, + const std::vector& order_c, + const ov::element::Type output_type = ov::element::undefined); + bool visit_attributes(ov::AttributeVisitor &visitor) override; void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; - std::vector get_input0_order() const { return m_order_a; } - std::vector get_input1_order() const { return m_order_b; } - std::vector get_output_order() const { return m_order_c; } + std::vector get_input0_broadcast_target_shape() const { return m_target_shape_a; } + std::vector get_input1_broadcast_target_shape() const { return m_target_shape_b; } + std::vector get_input0_reshape_pattern() const { return m_output_pattern_a; } + std::vector get_input1_reshape_pattern() const { return m_output_pattern_b; } + std::vector get_input0_transpose_order() const { return m_order_a; } + std::vector get_input1_transpose_order() const { return m_order_b; } + std::vector get_output_transpose_order() const { return m_order_c; } ov::element::Type get_output_type() const { return m_output_type; } static std::vector default_order(size_t rank) { 
@@ -44,6 +59,10 @@ class Gemm : public ov::op::v0::MatMul { } protected: + std::vector m_target_shape_a; + std::vector m_target_shape_b; + std::vector m_output_pattern_a; + std::vector m_output_pattern_b; std::vector m_order_a; std::vector m_order_b; std::vector m_order_c; @@ -52,6 +71,10 @@ class Gemm : public ov::op::v0::MatMul { std::vector shape_infer(const Gemm* op, std::vector input_shapes, + const std::vector& target_shape_a, + const std::vector& target_shape_b, + const std::vector& output_pattern_a, + const std::vector& output_pattern_b, const std::vector& order_a, const std::vector& order_b, const std::vector& order_c); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/gemm.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/gemm.hpp index 41b08e16a0b466..15dd92cd23f6d9 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/gemm.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/gemm.hpp @@ -54,6 +54,10 @@ struct gemm : public primitive_base { : primitive_base(id, inputs, {output_padding}, {optional_data_type{ data_type }}), transpose_input0(transpose_input0 ? 1 : 0), transpose_input1(transpose_input1 ? 1 : 0), + input0_broadcast_target_shape({}), + input1_broadcast_target_shape({}), + input0_reshape_pattern({}), + input1_reshape_pattern({}), alpha(alpha), beta(beta), input_rank(input_rank), @@ -70,9 +74,9 @@ struct gemm : public primitive_base { return order; }; - input0_order = get_transposed_order(input_rank, transpose_input0); - input1_order = get_transposed_order(weight_rank, transpose_input1); - output_order = {}; + input0_transpose_order = get_transposed_order(input_rank, transpose_input0); + input1_transpose_order = get_transposed_order(weight_rank, transpose_input1); + output_transpose_order = {}; } /// @brief Constructs gemm layer. 
@@ -86,48 +90,60 @@ struct gemm : public primitive_base { gemm(const primitive_id& id, const std::vector& inputs, const data_types data_type, - const std::vector& input0_order = {0, 1, 2, 3}, - const std::vector& input1_order = {0, 1, 2, 3}, - const std::vector& output_order = {}, + const std::vector& input0_broadcast_target_shape = {}, + const std::vector& input1_broadcast_target_shape = {}, + const std::vector& input0_reshape_pattern = {}, + const std::vector& input1_reshape_pattern = {}, + const std::vector& input0_transpose_order = {0, 1, 2, 3}, + const std::vector& input1_transpose_order = {0, 1, 2, 3}, + const std::vector& output_transpose_order = {}, const float alpha = 1.0f, const float beta = 0.0f, const padding& output_padding = padding()) : primitive_base(id, inputs, {output_padding}, {optional_data_type{ data_type }}), - input0_order(input0_order), - input1_order(input1_order), - output_order(output_order), + input0_broadcast_target_shape(input0_broadcast_target_shape), + input1_broadcast_target_shape(input1_broadcast_target_shape), + input0_reshape_pattern(input0_reshape_pattern), + input1_reshape_pattern(input1_reshape_pattern), + input0_transpose_order(input0_transpose_order), + input1_transpose_order(input1_transpose_order), + output_transpose_order(output_transpose_order), alpha(alpha), beta(beta), - input_rank(input0_order.size()), - weight_rank(input1_order.size()) { + input_rank(input0_transpose_order.size()), + weight_rank(input1_transpose_order.size()) { if (inputs.size() != 2 && inputs.size() != 3) { throw std::invalid_argument("Invalid inputs count - gemm expects either two or three inputs"); } - transpose_input0 = get_transpose_mode(input0_order); - transpose_input1 = get_transpose_mode(input1_order); + transpose_input0 = get_transpose_mode(input0_transpose_order); + transpose_input1 = get_transpose_mode(input1_transpose_order); } gemm(const primitive_id& id, const std::vector& inputs, const input_info& beam_table, const data_types data_type, - const std::vector& input0_order, - const std::vector& input1_order, - const std::vector& output_order, + const std::vector& input0_transpose_order, + const std::vector& input1_transpose_order, + const std::vector& output_transpose_order, bool indirect_a, bool indirect_b, const float alpha = 1.0f, const float beta = 0.0f, const padding& output_padding = padding()) : primitive_base(id, inputs, {output_padding}, {optional_data_type{ data_type }}), - input0_order(input0_order), - input1_order(input1_order), - output_order(output_order), + input0_broadcast_target_shape({}), + input1_broadcast_target_shape({}), + input0_reshape_pattern({}), + input1_reshape_pattern({}), + input0_transpose_order(input0_transpose_order), + input1_transpose_order(input1_transpose_order), + output_transpose_order(output_transpose_order), alpha(alpha), beta(beta), - input_rank(input0_order.size()), - weight_rank(input1_order.size()), + input_rank(input0_transpose_order.size()), + weight_rank(input1_transpose_order.size()), beam_table(beam_table), indirect_a(indirect_a), indirect_b(indirect_b) { @@ -135,20 +151,28 @@ struct gemm : public primitive_base { throw std::invalid_argument("Invalid inputs count - gemm expects either two or three inputs"); } - transpose_input0 = get_transpose_mode(input0_order); - transpose_input1 = get_transpose_mode(input1_order); + transpose_input0 = get_transpose_mode(input0_transpose_order); + transpose_input1 = get_transpose_mode(input1_transpose_order); } /// @brief Flag for transposing first input matrix uint32_t 
transpose_input0 = 0; /// @brief Flag for transposing second input matrix uint32_t transpose_input1 = 0; + /// @brief broadcasted target shape of input 0 + std::vector input0_broadcast_target_shape; + /// @brief broadcasted target shape of input 1 + std::vector input1_broadcast_target_shape; + /// @brief reshaped output pattern of input 0 + std::vector input0_reshape_pattern; + /// @brief reshaped output pattern of input 1 + std::vector input1_reshape_pattern; /// @brief order of input 0 - std::vector input0_order; + std::vector input0_transpose_order; /// @brief order of input 1 - std::vector input1_order; + std::vector input1_transpose_order; /// @brief order of output - std::vector output_order; + std::vector output_transpose_order; /// @brief Variable containing ALPHA parameter float alpha = 1.0f; /// @brief Variable containing BETA parameter @@ -169,12 +193,13 @@ struct gemm : public primitive_base { seed = hash_combine(seed, transpose_input1); seed = hash_combine(seed, indirect_a); seed = hash_combine(seed, indirect_b); - for (auto order : input0_order) - seed = hash_combine(seed, order); - for (auto order : input1_order) - seed = hash_combine(seed, order); - for (auto order : output_order) - seed = hash_combine(seed, order); + seed = hash_range(seed, input0_broadcast_target_shape.begin(), input0_broadcast_target_shape.end()); + seed = hash_range(seed, input1_broadcast_target_shape.begin(), input1_broadcast_target_shape.end()); + seed = hash_range(seed, input0_reshape_pattern.begin(), input0_reshape_pattern.end()); + seed = hash_range(seed, input1_reshape_pattern.begin(), input1_reshape_pattern.end()); + seed = hash_range(seed, input0_transpose_order.begin(), input0_transpose_order.end()); + seed = hash_range(seed, input1_transpose_order.begin(), input1_transpose_order.end()); + seed = hash_range(seed, output_transpose_order.begin(), output_transpose_order.end()); seed = hash_combine(seed, alpha); seed = hash_combine(seed, beta); return seed; @@ -200,9 +225,13 @@ struct gemm : public primitive_base { primitive_base::save(ob); ob << transpose_input0; ob << transpose_input1; - ob << input0_order; - ob << input1_order; - ob << output_order; + ob << input0_broadcast_target_shape; + ob << input1_broadcast_target_shape; + ob << input0_reshape_pattern; + ob << input1_reshape_pattern; + ob << input0_transpose_order; + ob << input1_transpose_order; + ob << output_transpose_order; ob << alpha; ob << beta; ob << input_rank; @@ -217,9 +246,13 @@ struct gemm : public primitive_base { primitive_base::load(ib); ib >> transpose_input0; ib >> transpose_input1; - ib >> input0_order; - ib >> input1_order; - ib >> output_order; + ib >> input0_broadcast_target_shape; + ib >> input1_broadcast_target_shape; + ib >> input0_reshape_pattern; + ib >> input1_reshape_pattern; + ib >> input0_transpose_order; + ib >> input1_transpose_order; + ib >> output_transpose_order; ib >> alpha; ib >> beta; ib >> input_rank; diff --git a/src/plugins/intel_gpu/src/graph/gemm.cpp b/src/plugins/intel_gpu/src/graph/gemm.cpp index a587b514fc0207..49f0fefd0f1ced 100644 --- a/src/plugins/intel_gpu/src/graph/gemm.cpp +++ b/src/plugins/intel_gpu/src/graph/gemm.cpp @@ -10,6 +10,18 @@ #include "intel_gpu/op/gemm.hpp" +namespace { +template ::value>::type> +int find_index_from_vec(const std::vector& vec, const DT value) { + int idx = 0; + for (auto v : vec) { + if (v != static_cast(value)) + break; + idx += 1; + } + return idx; +} +} // namespace namespace cldnn { GPU_DEFINE_PRIMITIVE_TYPE_ID(gemm) @@ -22,8 +34,8 @@ layout 
gemm_inst::calc_output_layout(gemm_node const& node, kernel_impl_params c auto input0_shape = input0_layout.get_shape(); auto input1_shape = input1_layout.get_shape(); - auto input0_order = prim->input0_order; - auto input1_order = prim->input1_order; + auto input0_transpose_order = prim->input0_transpose_order; + auto input1_transpose_order = prim->input1_transpose_order; bool reordered = prim->input_rank > 4 || prim->weight_rank > 4; size_t output_rank = std::max(prim->input_rank, prim->weight_rank); @@ -60,13 +72,13 @@ layout gemm_inst::calc_output_layout(gemm_node const& node, kernel_impl_params c return shape_transposed; }; - auto input0_shape_update = update_input_shape(input0_shape, input_rank, input0_order, true); - auto input1_shape_update = update_input_shape(input1_shape, weight_rank, input1_order, false); + auto input0_shape_update = update_input_shape(input0_shape, input_rank, input0_transpose_order, true); + auto input1_shape_update = update_input_shape(input1_shape, weight_rank, input1_transpose_order, false); ov::Shape bias_shape(output_rank); if (prim->input_size() == 3) { bias_shape = impl_param.get_input_layout(2).get_shape(); - bias_shape = update_input_shape(bias_shape, weight_rank, input1_order, false); + bias_shape = update_input_shape(bias_shape, weight_rank, input1_transpose_order, false); } auto output_shape = input0_shape_update; @@ -83,8 +95,8 @@ layout gemm_inst::calc_output_layout(gemm_node const& node, kernel_impl_params c size_t ones_to_add = 4 - std::min(output_shape.size(), static_cast(4)); output_shape.insert(output_shape.begin(), ones_to_add, 1); - if (prim->output_order.size() > 0) - output_shape = transpose_shape(output_shape, prim->output_order); + if (prim->output_transpose_order.size() > 0) + output_shape = transpose_shape(output_shape, prim->output_transpose_order); auto output_type = input0_layout.data_type; if ((output_type == data_types::u8 || output_type == data_types::i8) && prim->output_data_types[0]) @@ -125,8 +137,15 @@ std::vector gemm_inst::calc_output_layouts(gemm_node const& node, const input1_layout.get() }; - std::vector output_shapes = ov::intel_gpu::op::shape_infer(&op, input_shapes, - prim->input0_order, prim->input1_order, prim->output_order); + std::vector output_shapes = ov::intel_gpu::op::shape_infer(&op, + input_shapes, + prim->input0_broadcast_target_shape, + prim->input1_broadcast_target_shape, + prim->input0_reshape_pattern, + prim->input1_reshape_pattern, + prim->input0_transpose_order, + prim->input1_transpose_order, + prim->output_transpose_order); cldnn::format output_format = input0_layout.format; if (node.get_preferred_output_fmt() != format::any) @@ -139,58 +158,90 @@ template std::vector gemm_inst::calc_output_layouts(ge std::vector gemm_inst::transform_input_layouts(const std::shared_ptr primitive, const std::vector& input_layouts) { - auto get_updated_input_shape = [&](const ov::PartialShape& input_pshape, size_t input_rank, size_t output_rank, bool transpose, bool first_input) { - ov::PartialShape updated_input_pshape; + auto get_reshaped_input_shape = [&](const ov::PartialShape& input_pshape, + const std::vector& broadcast_target_shape, + const std::vector& reshape_pattern) { + ov::PartialShape reshaped_input_pshape; + + if (broadcast_target_shape.size() > 0 && reshape_pattern.size() > 0) { + std::vector dims(input_pshape); + int idx_recalc = find_index_from_vec(broadcast_target_shape, 1); + int idx_target = find_index_from_vec(reshape_pattern, 0); + if (dims[idx_recalc].is_static() && 
dims[idx_target].is_static()) { + dims[idx_recalc] *= dims[idx_target]; + } else { + dims[idx_recalc] = ov::Dimension::dynamic(); + } + dims.erase(dims.begin() + idx_target); + reshaped_input_pshape = ov::PartialShape(dims); + } else { + reshaped_input_pshape = input_pshape; + } + return reshaped_input_pshape; + }; + + auto get_transposed_input_shape = [&](const ov::PartialShape& input_pshape, size_t input_rank, size_t output_rank, bool transpose, bool first_input) { + ov::PartialShape transposed_input_pshape; if (input_rank == 1) { if (input_pshape.is_static()) { auto input_shape = input_pshape.to_shape(); - updated_input_pshape = ov::PartialShape{ static_cast(*std::max_element(input_shape.begin(), input_shape.end())) }; + transposed_input_pshape = ov::PartialShape{ static_cast(*std::max_element(input_shape.begin(), input_shape.end())) }; } else { - updated_input_pshape = ov::PartialShape::dynamic(input_rank); + transposed_input_pshape = ov::PartialShape::dynamic(input_rank); } } else { if (input_pshape.is_static()) { OPENVINO_ASSERT(input_pshape.size() >= input_rank, "[GPU] Requested input rank in gemm primitive is greater than actual shape"); std::vector dims(input_pshape.begin(), input_pshape.begin() + input_rank); - updated_input_pshape = ov::PartialShape(dims); + transposed_input_pshape = ov::PartialShape(dims); } else { - updated_input_pshape = input_pshape; + transposed_input_pshape = input_pshape; } } - if (updated_input_pshape.size() == 1) { - first_input ? updated_input_pshape.insert(updated_input_pshape.begin(), 1) - : updated_input_pshape.insert(updated_input_pshape.end(), 1); + if (transposed_input_pshape.size() == 1) { + first_input ? transposed_input_pshape.insert(transposed_input_pshape.begin(), 1) + : transposed_input_pshape.insert(transposed_input_pshape.end(), 1); if (transpose) { - std::swap(updated_input_pshape[0], updated_input_pshape[1]); + std::swap(transposed_input_pshape[0], transposed_input_pshape[1]); } } - size_t ones_to_add = std::max(output_rank, static_cast(4)) - updated_input_pshape.size(); - updated_input_pshape.insert(updated_input_pshape.begin(), ones_to_add, 1ul); + size_t ones_to_add = std::max(output_rank, static_cast(4)) - transposed_input_pshape.size(); + transposed_input_pshape.insert(transposed_input_pshape.begin(), ones_to_add, 1ul); - return updated_input_pshape; + return transposed_input_pshape; }; - auto input0_pshape = input_layouts[0].get_partial_shape(); - auto input1_pshape = input_layouts[1].get_partial_shape(); + auto reshaped_input0_pshape = get_reshaped_input_shape(input_layouts[0].get_partial_shape(), + primitive->input0_broadcast_target_shape, + primitive->input0_reshape_pattern); + auto reshaped_input1_pshape = get_reshaped_input_shape(input_layouts[1].get_partial_shape(), + primitive->input1_broadcast_target_shape, + primitive->input1_reshape_pattern); bool reordered = primitive->input_rank > 4 || primitive->weight_rank > 4; size_t output_rank = std::max(primitive->input_rank, primitive->weight_rank); size_t input_rank = reordered ? output_rank : primitive->input_rank; size_t weight_rank = reordered ? 
output_rank : primitive->weight_rank; - auto updated_input0_pshape = get_updated_input_shape(input0_pshape, input_rank, output_rank, primitive->transpose_input0, true); - auto updated_input1_pshape = get_updated_input_shape(input1_pshape, weight_rank, output_rank, primitive->transpose_input1, false); + auto transposed_input0_pshape = get_transposed_input_shape(reshaped_input0_pshape, input_rank, output_rank, primitive->transpose_input0, true); + auto transposed_input1_pshape = get_transposed_input_shape(reshaped_input1_pshape, weight_rank, output_rank, primitive->transpose_input1, false); std::vector layouts = input_layouts; - layouts[0].set_partial_shape(updated_input0_pshape); - layouts[1].set_partial_shape(updated_input1_pshape); + layouts[0].set_partial_shape(transposed_input0_pshape); + if (primitive->input0_broadcast_target_shape.size() > input_rank) { + layouts[0].format = format::adjust_to_rank(layouts[0].format, input_rank); + } + layouts[1].set_partial_shape(transposed_input1_pshape); + if (primitive->input1_broadcast_target_shape.size() > weight_rank) { + layouts[1].format = format::adjust_to_rank(layouts[1].format, weight_rank); + } if (primitive->input_size() == 3) { auto bias_pshape = input_layouts[2].get_partial_shape(); - auto updated_bias_pshape = get_updated_input_shape(bias_pshape, weight_rank, output_rank, primitive->transpose_input1, false); + auto updated_bias_pshape = get_transposed_input_shape(bias_pshape, weight_rank, output_rank, primitive->transpose_input1, false); layouts[2].set_partial_shape(updated_bias_pshape); } @@ -213,8 +264,8 @@ layout gemm_inst::transform_output_layout(const std::shared_ptr prim auto updated_output_layout = output_layout; auto output_rank = output_layout.get_partial_shape().size(); if (output_rank < 4) { - ov::PartialShape transposed_input0_pshape = transpose_pshape(input_layouts[0].get_partial_shape(), primitive->input0_order); - ov::PartialShape transposed_input1_pshape = transpose_pshape(input_layouts[1].get_partial_shape(), primitive->input1_order); + ov::PartialShape transposed_input0_pshape = transpose_pshape(input_layouts[0].get_partial_shape(), primitive->input0_transpose_order); + ov::PartialShape transposed_input1_pshape = transpose_pshape(input_layouts[1].get_partial_shape(), primitive->input1_transpose_order); auto M = (transposed_input0_pshape.size() > 1) ? 
transposed_input0_pshape[transposed_input0_pshape.size() - 2] : transposed_input0_pshape[0]; @@ -238,8 +289,8 @@ layout gemm_inst::transform_output_layout(const std::shared_ptr prim output_pshape[get_spatial_idx(updated_output_layout.format, 0)] = std::move(N); output_pshape[get_spatial_idx(updated_output_layout.format, 1)] = std::move(M); - if (primitive->output_order.size() > 0) { - output_pshape = transpose_pshape(output_pshape, primitive->output_order); + if (primitive->output_transpose_order.size() > 0) { + output_pshape = transpose_pshape(output_pshape, primitive->output_transpose_order); } updated_output_layout.set_partial_shape(output_pshape); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index 5e44c78cbb4724..eb38677d03bf95 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -514,8 +514,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { } auto gemm_prim = node.get_primitive(); - for (size_t idx = 0; idx < gemm_prim->output_order.size(); ++idx) { - size_t output_order_idx = static_cast(gemm_prim->output_order[idx]); + for (size_t idx = 0; idx < gemm_prim->output_transpose_order.size(); ++idx) { + size_t output_order_idx = static_cast(gemm_prim->output_transpose_order[idx]); if (idx != output_order_idx) { does_support_fusings = false; break; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp index 2ed48b659d3c38..03124262072955 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp @@ -173,9 +173,13 @@ struct gemm_impl : multi_stage_primitive { params.beta = primitive->beta; params.transpose_input0 = primitive->transpose_input0; params.transpose_input1 = primitive->transpose_input1; - params.input0_order = primitive->input0_order; - params.input1_order = primitive->input1_order; - params.output_order = primitive->output_order; + params.input0_target_shape = primitive->input0_broadcast_target_shape; + params.input1_target_shape = primitive->input1_broadcast_target_shape; + params.input0_output_pattern = primitive->input0_reshape_pattern; + params.input1_output_pattern = primitive->input1_reshape_pattern; + params.input0_order = primitive->input0_transpose_order; + params.input1_order = primitive->input1_transpose_order; + params.output_order = primitive->output_transpose_order; params.indirect_input0 = primitive->indirect_a && indirect; params.indirect_input1 = primitive->indirect_b && indirect; diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index 0ce52b7e1a3d36..8f2021d39a2b7c 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -937,21 +937,21 @@ static bool is_node_for_onednn(gemm_node const& node) { auto gemm_prim = node.get_primitive(); - for (size_t idx = 0; idx < gemm_prim->output_order.size(); idx++) { - if (idx != static_cast(gemm_prim->output_order[idx])) + for (size_t idx = 0; idx < gemm_prim->output_transpose_order.size(); idx++) { + if (idx != static_cast(gemm_prim->output_transpose_order[idx])) return false; } if (gemm_prim->transpose_input0 > 1 || gemm_prim->transpose_input0 > 1) return false; - for (size_t idx = 0; idx <
(gemm_prim->input0_order.size() - 2); idx++) { - if (idx != static_cast(gemm_prim->input0_order[idx])) + for (size_t idx = 0; idx < (gemm_prim->input0_transpose_order.size() - 2); idx++) { + if (idx != static_cast(gemm_prim->input0_transpose_order[idx])) return false; } - for (size_t idx = 0; idx < (gemm_prim->input1_order.size() - 2); idx++) { - if (idx != static_cast(gemm_prim->input1_order[idx])) + for (size_t idx = 0; idx < (gemm_prim->input1_transpose_order.size() - 2); idx++) { + if (idx != static_cast(gemm_prim->input1_transpose_order[idx])) return false; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_ref.cl index cfb9a7b4749cb1..e90841d56fd33d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_ref.cl @@ -10,6 +10,9 @@ // ACCUMULATOR_TYPE [DataType] - type used for intermediate results accumulation. inline uint FUNC(get_input0_index_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#if BROADCAST_INPUT0 + DO_BROADCAST_INPUT0 +#endif #if INPUT0_SIMPLE return GET_DATA_INDEX_6D_SAFE(INPUT0, b, f, w, z, y, x); #else @@ -30,6 +33,9 @@ inline uint FUNC(get_input0_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint } inline uint FUNC(get_input1_index_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#if BROADCAST_INPUT1 + DO_BROADCAST_INPUT1 +#endif #if INPUT1_SIMPLE return GET_DATA_INDEX_6D_SAFE(INPUT1, b, f, w, z, y, x); #else diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl index e9079c6fb395f3..13dab0314ddf23 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl @@ -29,6 +29,9 @@ #endif // TILE_N > SIMD_WIDTH inline uint FUNC(get_input0_index_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#if BROADCAST_INPUT0 + DO_BROADCAST_INPUT0 +#endif #if INPUT0_SIMPLE return GET_DATA_INDEX_6D_SAFE(INPUT0, b, f, w, z, y, x); #else @@ -41,6 +44,9 @@ inline uint FUNC(get_input0_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint } inline uint FUNC(get_input1_index_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#if BROADCAST_INPUT1 + DO_BROADCAST_INPUT1 +#endif #if INPUT1_SIMPLE return GET_DATA_INDEX_6D_SAFE(INPUT1, b, f, w, z, y, x); #else diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.cpp index 87a3d3bb9d03e6..cb59cfee015e96 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.cpp @@ -211,6 +211,44 @@ JitConstants GemmKernelBase::GetJitConstants(const gemm_params& params) const { jit.AddConstant(MakeJitConstant("BIAS_TERM", 1)); } + auto get_broadcast_input_str = [](const std::vector& target_shape) { + const size_t target_rank = target_shape.size(); + std::vector dims; + if (target_rank == 1) { + dims = {"x"}; + } else if (target_rank == 2) { + dims = {"y", "x"}; + } else if (target_rank == 3) { + dims = {"f", "y", "x"}; + } else if (target_rank == 4) { + dims = {"b", "f", "y", "x"}; + } else if (target_rank == 5) { + dims = {"b", "f", "z", "y", "x"}; + } else if 
(target_rank == 6) { + dims = {"b", "f", "w", "z", "y", "x"}; + } + int pos = 0; + for (auto ts : target_shape) { + if (ts != 1) + break; + pos += 1; + } + std::string str = dims[pos] + " /= " + std::to_string(target_shape[pos]) + ";"; + return str; + }; + if (params.input0_target_shape.size() > 1) { + jit.AddConstants({ + MakeJitConstant("BROADCAST_INPUT0", true), + MakeJitConstant("DO_BROADCAST_INPUT0", get_broadcast_input_str(params.input0_target_shape)), + }); + } + if (params.input1_target_shape.size() > 1) { + jit.AddConstants({ + MakeJitConstant("BROADCAST_INPUT1", true), + MakeJitConstant("DO_BROADCAST_INPUT1", get_broadcast_input_str(params.input1_target_shape)), + }); + } + jit.AddConstants({ MakeJitConstant("TRANSPOSE_X_LAST", 0), MakeJitConstant("TRANSPOSE_Y_LAST", 1), diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.h index afb02169226eb4..633c8171c99ec8 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.h @@ -19,6 +19,10 @@ struct gemm_params : public base_params { float beta; uint32_t transpose_input0; uint32_t transpose_input1; + std::vector input0_target_shape; + std::vector input1_target_shape; + std::vector input0_output_pattern; + std::vector input1_output_pattern; std::vector input0_order; std::vector input1_order; std::vector output_order; diff --git a/src/plugins/intel_gpu/src/plugin/ops/matmul.cpp b/src/plugins/intel_gpu/src/plugin/ops/matmul.cpp index 6398635fdb2147..d455c1fa839b89 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/matmul.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/matmul.cpp @@ -156,20 +156,26 @@ static void CreateGemmOp(ProgramBuilder& p, const std::shared_ptrget_input_partial_shape(1); auto out_shape = op->get_output_partial_shape(0); - size_t rank_a = shape_a.rank().get_length(); - size_t rank_b = shape_b.rank().get_length(); + size_t rank_a = op->get_input0_reshape_pattern().size() > 0 ? op->get_input0_reshape_pattern().size() + : shape_a.rank().get_length(); + size_t rank_b = op->get_input1_reshape_pattern().size() > 0 ? 
op->get_input1_reshape_pattern().size() + :shape_b.rank().get_length(); size_t output_rank = out_shape.rank().get_length(); - OPENVINO_ASSERT(rank_a == op->get_input0_order().size(), "[GPU] Length of input0_order is not same as rank of input0"); - OPENVINO_ASSERT(rank_b == op->get_input1_order().size(), "[GPU] Length of input1_order is not same as rank of input1"); - OPENVINO_ASSERT(output_rank == op->get_output_order().size(), "[GPU] Length of output_order is not same as rank of output"); + OPENVINO_ASSERT(rank_a == op->get_input0_transpose_order().size(), "[GPU] Length of input0_order is not same as rank of input0"); + OPENVINO_ASSERT(rank_b == op->get_input1_transpose_order().size(), "[GPU] Length of input1_order is not same as rank of input1"); + OPENVINO_ASSERT(output_rank == op->get_output_transpose_order().size(), "[GPU] Length of output_order is not same as rank of output"); auto gemmPrim = cldnn::gemm(layerName, inputs, cldnn::element_type_to_data_type(op->get_output_element_type(0)), - op->get_input0_order(), - op->get_input1_order(), - op->get_output_order(), + op->get_input0_broadcast_target_shape(), + op->get_input1_broadcast_target_shape(), + op->get_input0_reshape_pattern(), + op->get_input1_reshape_pattern(), + op->get_input0_transpose_order(), + op->get_input1_transpose_order(), + op->get_output_transpose_order(), alpha, beta); @@ -200,9 +206,9 @@ static void CreateIndirectGemmOp(ProgramBuilder& p, const std::shared_ptr{ inputs[0], inputs[1] }, inputs[2], cldnn::element_type_to_data_type(op->get_output_element_type(0)), - op->get_input0_order(), - op->get_input1_order(), - op->get_output_order(), + op->get_input0_transpose_order(), + op->get_input1_transpose_order(), + op->get_output_transpose_order(), op->get_indirect_a(), op->get_indirect_b(), alpha, diff --git a/src/plugins/intel_gpu/src/plugin/transformations/broadcast_reshape_matmul_fusion.cpp b/src/plugins/intel_gpu/src/plugin/transformations/broadcast_reshape_matmul_fusion.cpp new file mode 100644 index 00000000000000..17df3d3d1a7294 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/broadcast_reshape_matmul_fusion.cpp @@ -0,0 +1,145 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "broadcast_reshape_matmul_fusion.hpp" + +#include "intel_gpu/op/gemm.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/op/broadcast.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "transformations/utils/utils.hpp" + +namespace ov { +namespace intel_gpu { + +BroadcastReshapeMatmulFusion::BroadcastReshapeMatmulFusion() { + using namespace ov::pass::pattern; + + auto not_reshape = [](const ov::Output& output) -> bool { + return std::dynamic_pointer_cast(output.get_node_shared_ptr()) == nullptr; + }; + + auto broadcast_rank_equals_and_has_static_dims = [](const ov::Output& output) -> bool { + return rank_equals(5)(output) && has_static_dims({2, 3}) && consumers_count(1); + }; + + auto reshape_rank_equals_and_has_static_dim = [](const ov::Output& output) -> bool { + return rank_equals(4)(output) && has_static_dim(2) && consumers_count(1); + }; + + auto input_a_m = any_input(not_reshape); + auto input_b_m = any_input(not_reshape); + + auto broadcast_a_target_shape_m = wrap_type(); + auto broadcast_a_m = wrap_type({input_a_m, broadcast_a_target_shape_m}, broadcast_rank_equals_and_has_static_dims); + auto broadcast_b_target_shape_m 
= wrap_type(); + auto broadcast_b_m = wrap_type({input_b_m, broadcast_b_target_shape_m}, broadcast_rank_equals_and_has_static_dims); + + auto reshape_a_pattern_m = wrap_type(); + auto reshape_a_m = wrap_type({broadcast_a_m, reshape_a_pattern_m}, reshape_rank_equals_and_has_static_dim); + auto reshape_b_pattern_m = wrap_type(); + auto reshape_b_m = wrap_type({broadcast_b_m, reshape_b_pattern_m}, reshape_rank_equals_and_has_static_dim); + + auto matmul_in_a = std::make_shared(OutputVector{input_a_m, reshape_a_m}); + auto matmul_in_b = std::make_shared(OutputVector{input_b_m, reshape_b_m}); + + auto matmul_m = wrap_type({matmul_in_a, matmul_in_b}); + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + auto matmul = std::dynamic_pointer_cast(pattern_map.at(matmul_m).get_node_shared_ptr()); + if (!matmul || transformation_callback(m.get_match_root())) { + return false; + } + + auto target_shape_a = std::vector(); + auto target_shape_b = std::vector(); + size_t input_a_output_idx = matmul->get_input_source_output(0).get_index(); + size_t input_b_output_idx = matmul->get_input_source_output(1).get_index(); + auto order_a = matmul->get_input0_transpose_order(); + auto order_b = matmul->get_input1_transpose_order(); + + auto valid_transpose_order = [](const std::vector& order) { + return order.size() == 4 && order[1] == 2; + }; + + auto valid_broadcast_target_shape = [](const std::vector& target_shape) { + return std::count_if(target_shape.begin(), target_shape.end(), [](int32_t s) { return s != 1; }) == 1; + }; + + if (pattern_map.count(broadcast_a_m) > 0) { + if (!valid_transpose_order(order_a)) + return false; + auto broadcast_a = std::dynamic_pointer_cast(pattern_map.at(broadcast_a_m).get_node_shared_ptr()); + if (!broadcast_a || broadcast_a->get_broadcast_spec().m_type != ov::op::BroadcastType::BIDIRECTIONAL) + return false; + auto broadcast_a_target_shape = std::dynamic_pointer_cast(pattern_map.at(broadcast_a_target_shape_m).get_node_shared_ptr()); + target_shape_a = broadcast_a_target_shape->cast_vector(); + if (!valid_broadcast_target_shape(target_shape_a)) + return false; + input_a_output_idx = broadcast_a->get_input_source_output(0).get_index(); + } + if (pattern_map.count(broadcast_b_m) > 0) { + if (!valid_transpose_order(order_b)) + return false; + auto broadcast_b = std::dynamic_pointer_cast(pattern_map.at(broadcast_b_m).get_node_shared_ptr()); + if (!broadcast_b || broadcast_b->get_broadcast_spec().m_type != ov::op::BroadcastType::BIDIRECTIONAL) + return false; + auto broadcast_b_target_shape = std::dynamic_pointer_cast(pattern_map.at(broadcast_b_target_shape_m).get_node_shared_ptr()); + target_shape_b = broadcast_b_target_shape->cast_vector(); + if (!valid_broadcast_target_shape(target_shape_b)) + return false; + input_b_output_idx = broadcast_b->get_input_source_output(0).get_index(); + } + + auto pattern_a = std::vector(); + auto pattern_b = std::vector(); + + auto valid_reshape_pattern = [](const std::vector& pattern) { + return std::count_if(pattern.begin(), pattern.end(), [](int64_t p) { return p == -1; }) == 0; + }; + + if (pattern_map.count(reshape_a_m) > 0) { + auto reshape_a_pattern = std::dynamic_pointer_cast(pattern_map.at(reshape_a_pattern_m).get_node_shared_ptr()); + pattern_a = reshape_a_pattern->cast_vector(); + if (!valid_reshape_pattern(pattern_a)) + return false; + } + if (pattern_map.count(reshape_b_m) > 0) { + auto reshape_b_pattern = 
std::dynamic_pointer_cast(pattern_map.at(reshape_b_pattern_m).get_node_shared_ptr()); + pattern_b = reshape_b_pattern->cast_vector(); + if (!valid_reshape_pattern(pattern_b)) + return false; + } + + auto input_a = ov::Output(pattern_map.at(input_a_m).get_node_shared_ptr(), input_a_output_idx); + auto input_b = ov::Output(pattern_map.at(input_b_m).get_node_shared_ptr(), input_b_output_idx); + auto order_c = matmul->get_output_transpose_order(); + + auto gemm = std::make_shared(input_a, + input_b, + target_shape_a, + target_shape_b, + pattern_a, + pattern_b, + order_a, + order_b, + order_c); + gemm->set_friendly_name(matmul->get_friendly_name()); + ov::copy_runtime_info(m.get_matched_nodes(), gemm); + ov::replace_node(matmul, gemm); + + return true; + }; + + auto m = std::make_shared(matmul_m, "BroadcastReshapeMatmulFusion"); + this->register_matcher(m, callback); +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/broadcast_reshape_matmul_fusion.hpp b/src/plugins/intel_gpu/src/plugin/transformations/broadcast_reshape_matmul_fusion.hpp new file mode 100644 index 00000000000000..e3ad540e0a4692 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/broadcast_reshape_matmul_fusion.hpp @@ -0,0 +1,19 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + +class BroadcastReshapeMatmulFusion : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("BroadcastReshapeMatmulFusion", "0"); + BroadcastReshapeMatmulFusion(); +}; + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/clamp_fp16_output.cpp b/src/plugins/intel_gpu/src/plugin/transformations/clamp_fp16_output.cpp index 941b5c51ec3a67..575604038c0deb 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/clamp_fp16_output.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/clamp_fp16_output.cpp @@ -10,6 +10,10 @@ #include "openvino/op/matmul.hpp" #include "openvino/op/softmax.hpp" #include "openvino/op/reshape.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/subtract.hpp" +#include "openvino/op/divide.hpp" #include "openvino/pass/pattern/op/pattern.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" #include "openvino/pass/pattern/op/or.hpp" @@ -28,7 +32,10 @@ ClampFP16Output::ClampFP16Output() { auto in1 = any_input(as_value_predicate(class_other_than())); auto matmul_m = wrap_type({in0, in1}, all_of({type_matches(ov::element::f16), consumers_count(1)})); auto reshape_m = wrap_type({matmul_m, any_input()}, all_of({type_matches(ov::element::f16), consumers_count(1)})); - auto softmax_input_m = std::make_shared(ov::OutputVector{reshape_m, matmul_m}); + auto add_m = wrap_type({matmul_m, any_input()}, all_of({type_matches(ov::element::f16), consumers_count(1)})); + auto eltwise_m = wrap_type({matmul_m, any_input()}, + all_of({type_matches(ov::element::f16), consumers_count(1)})); + auto softmax_input_m = std::make_shared(ov::OutputVector{eltwise_m, reshape_m, matmul_m}); auto softmax_m = wrap_type({softmax_input_m}, type_matches(ov::element::f16)); ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { diff --git a/src/plugins/intel_gpu/src/plugin/transformations/indirect_kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/transformations/indirect_kv_cache.cpp index 2a0bac302956c2..14b58e642a8116 100644 
--- a/src/plugins/intel_gpu/src/plugin/transformations/indirect_kv_cache.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/indirect_kv_cache.cpp @@ -85,9 +85,9 @@ IndirectKVCache::IndirectKVCache() { auto matmul_kv_cache_index = kv_cache_users.begin()->get_index(); auto gemm_node = std::dynamic_pointer_cast(m.get_match_root()); - auto order_in0 = gemm_node->get_input0_order(); - auto order_in1 = gemm_node->get_input1_order(); - auto order_out = gemm_node->get_output_order(); + auto order_in0 = gemm_node->get_input0_transpose_order(); + auto order_in1 = gemm_node->get_input1_transpose_order(); + auto order_out = gemm_node->get_output_transpose_order(); auto indirect_gemm = std::make_shared(gemm_node->get_input_node_shared_ptr(0), gemm_node->get_input_node_shared_ptr(1), diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/gemm.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/gemm.cpp index 92f23f9b6b6663..45b200baba4ce9 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/gemm.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/gemm.cpp @@ -4,8 +4,12 @@ #include "intel_gpu/op/gemm.hpp" #include "matmul_shape_inference.hpp" +#include "broadcast_shape_inference.hpp" +#include "reshape_shape_inference.hpp" #include "openvino/core/partial_shape.hpp" #include "openvino/op/matmul.hpp" +#include "openvino/op/broadcast.hpp" +#include "openvino/op/reshape.hpp" namespace ov { namespace intel_gpu { @@ -18,6 +22,35 @@ Gemm::Gemm(const ov::Output& A, const std::vector& order_c, const ov::element::Type output_type) : ov::op::v0::MatMul() + , m_target_shape_a({}) + , m_target_shape_b({}) + , m_output_pattern_a({}) + , m_output_pattern_b({}) + , m_order_a(order_a) + , m_order_b(order_b) + , m_order_c(order_c) + , m_output_type(output_type) { + set_arguments({A, B}); + set_transpose_a(false); + set_transpose_b(false); + validate_and_infer_types(); +} + +Gemm::Gemm(const ov::Output& A, + const ov::Output& B, + const std::vector& target_shape_a, + const std::vector& target_shape_b, + const std::vector& output_pattern_a, + const std::vector& output_pattern_b, + const std::vector& order_a, + const std::vector& order_b, + const std::vector& order_c, + const ov::element::Type output_type) + : ov::op::v0::MatMul() + , m_target_shape_a(target_shape_a) + , m_target_shape_b(target_shape_b) + , m_output_pattern_a(output_pattern_a) + , m_output_pattern_b(output_pattern_b) , m_order_a(order_a) , m_order_b(order_b) , m_order_c(order_c) @@ -31,7 +64,16 @@ Gemm::Gemm(const ov::Output& A, std::shared_ptr Gemm::clone_with_new_inputs(const ov::OutputVector& new_args) const { check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), new_args.at(1), m_order_a, m_order_b, m_order_c, m_output_type); + return std::make_shared(new_args.at(0), + new_args.at(1), + m_target_shape_a, + m_target_shape_b, + m_output_pattern_a, + m_output_pattern_b, + m_order_a, + m_order_b, + m_order_c, + m_output_type); } void Gemm::validate_and_infer_types() { @@ -42,7 +84,15 @@ void Gemm::validate_and_infer_types() { input_size, ", expected 2."); - auto out_shapes = shape_infer(this, std::vector{get_input_partial_shape(0), get_input_partial_shape(1)}, m_order_a, m_order_b, m_order_c); + auto out_shapes = shape_infer(this, + std::vector{get_input_partial_shape(0), get_input_partial_shape(1)}, + m_target_shape_a, + m_target_shape_b, + m_output_pattern_a, + m_output_pattern_b, + m_order_a, + m_order_b, + m_order_c); auto output_type = m_output_type == ov::element::undefined ? 
get_input_element_type(0) : m_output_type; set_output_type(0, output_type, out_shapes[0]); @@ -58,9 +108,45 @@ bool Gemm::visit_attributes(ov::AttributeVisitor &visitor) { std::vector shape_infer(const Gemm* op, std::vector input_shapes, + const std::vector& target_shape_a, + const std::vector& target_shape_b, + const std::vector& output_pattern_a, + const std::vector& output_pattern_b, const std::vector& order_a, const std::vector& order_b, const std::vector& order_c) { + auto shape_a = input_shapes[0]; + auto shape_b = input_shapes[1]; + + // broadcasted shapes + auto broadcast_shape = [](const ov::PartialShape shape, const std::vector& target_shape) { + ov::op::v3::Broadcast broadcast; + auto tshape = target_shape; + broadcast.set_broadcast_spec(ov::op::BroadcastType::BIDIRECTIONAL); + std::unordered_map const_data; + const_data.emplace(1, ov::Tensor(ov::element::i32, ov::Shape{tshape.size()}, static_cast(tshape.data()))); + return ov::op::v3::shape_infer(&broadcast, + std::vector{shape, ov::PartialShape(ov::Shape{tshape.size()})}, + ov::make_tensor_accessor(const_data)); + }; + auto shape_a_b = (target_shape_a.size() > 1) ? broadcast_shape(shape_a, target_shape_a)[0] : shape_a; + auto shape_b_b = (target_shape_b.size() > 1) ? broadcast_shape(shape_b, target_shape_b)[0] : shape_b; + + // reshaped shapes + auto reshape_shape = [](const ov::PartialShape shape, const std::vector& output_pattern) { + ov::op::v1::Reshape reshape; + auto opattern = output_pattern; + reshape.set_special_zero(true); + std::unordered_map const_data; + const_data.emplace(1, ov::Tensor(ov::element::i64, ov::Shape{opattern.size()}, static_cast(opattern.data()))); + return ov::op::v1::shape_infer(&reshape, + std::vector{shape, ov::PartialShape(ov::Shape{opattern.size()})}, + ov::make_tensor_accessor(const_data)); + }; + auto shape_a_r = (output_pattern_a.size() > 1) ? reshape_shape(shape_a_b, output_pattern_a)[0] : shape_a_b; + auto shape_b_r = (output_pattern_b.size() > 1) ? reshape_shape(shape_b_b, output_pattern_b)[0] : shape_b_b; + + // transposed shapes auto transpose_shape = [](const ov::PartialShape shape, const std::vector& order) { auto shape_transposed = ov::PartialShape::dynamic(shape.rank()); for (size_t i = 0; i < order.size(); i++) { @@ -69,11 +155,8 @@ std::vector shape_infer(const Gemm* op, return shape_transposed; }; - auto shape_a = input_shapes[0]; - auto shape_b = input_shapes[1]; - - auto shape_a_t = (order_a.size() > 1) ? transpose_shape(shape_a, order_a) : shape_a; - auto shape_b_t = (order_b.size() > 1) ? transpose_shape(shape_b, order_b) : shape_b; + auto shape_a_t = (order_a.size() > 1) ? transpose_shape(shape_a_r, order_a) : shape_a_r; + auto shape_b_t = (order_b.size() > 1) ? 
transpose_shape(shape_b_r, order_b) : shape_b_r; auto out_shapes = ov::op::v0::shape_infer(dynamic_cast(op), std::vector{shape_a_t, shape_b_t}); if (order_c.size() > 0) { diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_gemm.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_gemm.cpp index 5e35d5cd1fc177..bd557f811e6951 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_gemm.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_gemm.cpp @@ -48,7 +48,15 @@ void IndirectGemm::validate_and_infer_types() { input_size, ", expected 3."); - auto out_shapes = shape_infer(this, std::vector{get_input_partial_shape(0), get_input_partial_shape(1)}, m_order_a, m_order_b, m_order_c); + auto out_shapes = shape_infer(this, + std::vector{get_input_partial_shape(0), get_input_partial_shape(1)}, + m_target_shape_a, + m_target_shape_b, + m_output_pattern_a, + m_output_pattern_b, + m_order_a, + m_order_b, + m_order_c); auto output_type = m_output_type == ov::element::undefined ? get_input_element_type(0) : m_output_type; set_output_type(0, output_type, out_shapes[0]); diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 03c5118a7a8861..dfd01834eb710b 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -61,6 +61,7 @@ #include "plugin/transformations/transpose_matmul_fusion.hpp" #include "plugin/transformations/indirect_kv_cache.hpp" #include "plugin/transformations/convert_convolution.hpp" +#include "plugin/transformations/broadcast_reshape_matmul_fusion.hpp" #include "transformations/common_optimizations/broadcast_elementwise_fusion.hpp" #include "transformations/common_optimizations/broadcast_transition.hpp" #include "transformations/common_optimizations/common_optimizations.hpp" @@ -723,6 +724,8 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); manager.register_pass(); + if (!device_info.supports_immad) + manager.register_pass(); const size_t zp_pad_size = 32; manager.register_pass(zp_pad_size); diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp index dc7ee7e120b586..a5b524e507f40e 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp @@ -10,6 +10,7 @@ #include #include "openvino/reference/matmul.hpp" #include "openvino/reference/transpose.hpp" +#include "openvino/reference/reshape.hpp" #include "intel_gpu/runtime/compilation_context.hpp" #include "gemm_inst.h" @@ -837,6 +838,285 @@ class gemm_gpu_tests: public ::testing::Test { } } + void test_broadcast_transpose_matmul(bool is_caching_test) { + tests::random_generator rg; + rg.set_seed(GET_SUITE_NAME); + + const unsigned long BATCH_SIZE = 1; + const unsigned long M_SIZE = 1; + const unsigned long K_SIZE = 32; + const unsigned long N_SIZE = 21; + + auto fill_mem = [&](cldnn::memory_ptr mem, std::vector& data) { + cldnn::mem_lock mem_ptr(mem, get_test_stream()); + auto&& l = mem->get_layout(); + auto data_idx = 0; + for (cldnn::tensor::value_type b = 0; b < l.batch(); ++b) { + for (cldnn::tensor::value_type f = 0; f < l.feature(); ++f) { + for (cldnn::tensor::value_type y = 0; y < l.spatial(1); ++y) { + for (cldnn::tensor::value_type x = 0; x < l.spatial(0); ++x) { + auto tensor_coord 
= cldnn::tensor{{b, f, x, y}, 0}; + auto buffer_idx = l.get_linear_offset(tensor_coord); + mem_ptr[buffer_idx] = data[data_idx++]; + } + } + } + } + }; + + auto& engine = get_test_engine(); + ov::Shape input0_shape; + ov::Shape input1_shape; + std::vector input1_target_shape; + std::vector input0_order; + std::vector input1_order; + ov::Shape beam_table_shape; + cldnn::layout input0_layout; + cldnn::layout input1_layout; + + input0_shape = { BATCH_SIZE, 16, M_SIZE, K_SIZE }; + input1_shape = { N_SIZE, BATCH_SIZE, 1, K_SIZE }; + input1_target_shape = { 1, 1, 16, 1 }; + input0_order = { 0, 1, 2, 3 }; + input1_order = { 1, 2, 3, 0 }; + + input0_layout = layout{ov::PartialShape::dynamic(input0_shape.size()), data_types::f32, format::bfyx}; + input1_layout = layout{ov::PartialShape::dynamic(input1_shape.size()), data_types::f32, format::bfyx}; + + auto input0_mem = engine.allocate_memory(layout{ov::PartialShape(input0_shape), data_types::f32, format::bfyx}); + auto input1_mem = engine.allocate_memory(layout{ov::PartialShape(input1_shape), data_types::f32, format::bfyx}); + + auto input_0_data = rg.generate_random_1d(ov::shape_size(input0_shape), -2, 2); + auto input_1_data = rg.generate_random_1d(ov::shape_size(input1_shape), -2, 2); + + fill_mem(input0_mem, input_0_data); + fill_mem(input1_mem, input_1_data); + + topology topology; + topology.add(input_layout("input0", input0_layout), + input_layout("input1", input1_layout), + gemm("gemm", { input_info("input0"), input_info("input1") }, data_types::f32, {}, input1_target_shape, {}, {}, input0_order, input1_order) + ); + + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::optimize_data(true)); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); + network->set_input_data("input0", input0_mem); + network->set_input_data("input1", input1_mem); + + auto inst = network->get_primitive("gemm"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != nullptr); + + auto outputs = network->execute(); + + auto output_mem = outputs.at("gemm").get_memory(); + cldnn::mem_lock output_ptr(output_mem, get_test_stream()); + + ov::Shape ref_input0_shape; + ov::Shape ref_input1_broadcasted_shape; + ov::Shape ref_input1_shape; + ov::Shape ref_output_shape; + + ref_input0_shape = { BATCH_SIZE, 16, M_SIZE, K_SIZE }; + ref_input1_broadcasted_shape = { N_SIZE, BATCH_SIZE, 16, K_SIZE }; + ref_input1_shape = { BATCH_SIZE, 16, K_SIZE, N_SIZE }; + ref_output_shape = { BATCH_SIZE, 16, M_SIZE, N_SIZE }; + + std::vector ref_out_data; + ref_out_data.resize(ov::shape_size(ref_output_shape)); + + std::vector ref_input_0_data(input_0_data.size()); + std::vector ref_input_1_broadcasted_data(ov::shape_size(ref_input1_broadcasted_shape)); + std::vector ref_input_1_data(ref_input_1_broadcasted_data.size()); + + ov::reference::transpose((const char *)(input_0_data.data()), + (char *)(ref_input_0_data.data()), + input0_shape, + sizeof(float), + input0_order, + ref_input0_shape); + + ov::reference::broadcast(reinterpret_cast(input_1_data.data()), + reinterpret_cast(ref_input_1_broadcasted_data.data()), + input1_shape, + ref_input1_broadcasted_shape, + ov::AxisSet({}), + sizeof(float)); + + ov::reference::transpose((const char *)(ref_input_1_broadcasted_data.data()), + (char *)(ref_input_1_data.data()), + ref_input1_broadcasted_shape, + sizeof(float), + input1_order, + ref_input1_shape); + + 
ov::reference::matmul(ref_input_0_data.data(), + ref_input_1_data.data(), + ref_out_data.data(), + ref_input0_shape, + ref_input1_shape, + ref_output_shape, + false, + false); + + ASSERT_EQ(output_ptr.size(), ref_out_data.size()); + + const auto abs_error = 0.0001; + for (uint32_t i = 0; i < ref_out_data.size(); ++i) { + ASSERT_NEAR(output_ptr[i], ref_out_data[i], abs_error) << "at " << i; + } + } + + void test_broadcast_reshape_transpose_matmul(bool is_caching_test) { + tests::random_generator rg; + rg.set_seed(GET_SUITE_NAME); + + const unsigned long BATCH_SIZE = 1; + const unsigned long M_SIZE = 1; + const unsigned long K_SIZE = 32; + const unsigned long N_SIZE = 21; + + auto fill_mem = [&](cldnn::memory_ptr mem, std::vector& data) { + cldnn::mem_lock mem_ptr(mem, get_test_stream()); + auto&& l = mem->get_layout(); + auto data_idx = 0; + for (cldnn::tensor::value_type b = 0; b < l.batch(); ++b) { + for (cldnn::tensor::value_type f = 0; f < l.feature(); ++f) { + for (cldnn::tensor::value_type z = 0; z < l.spatial(2); ++z) { + for (cldnn::tensor::value_type y = 0; y < l.spatial(1); ++y) { + for (cldnn::tensor::value_type x = 0; x < l.spatial(0); ++x) { + auto tensor_coord = cldnn::tensor{{b, f, x, y, z}, 0}; + auto buffer_idx = l.get_linear_offset(tensor_coord); + mem_ptr[buffer_idx] = data[data_idx++]; + } + } + } + } + } + }; + + auto& engine = get_test_engine(); + ov::Shape input0_shape; + ov::Shape input1_shape; + std::vector input1_target_shape; + std::vector input1_output_pattern; + std::vector input0_order; + std::vector input1_order; + ov::Shape beam_table_shape; + cldnn::layout input0_layout; + cldnn::layout input1_layout; + + input0_shape = { BATCH_SIZE, 32, M_SIZE, K_SIZE }; + input1_shape = { N_SIZE, BATCH_SIZE, 2, 1, K_SIZE }; + input1_target_shape = { 1, 1, 1, 16, 1 }; + input1_output_pattern = { 0, 0, 32, K_SIZE }; + input0_order = { 0, 1, 2, 3 }; + input1_order = { 1, 2, 3, 0 }; + + input0_layout = layout{ov::PartialShape::dynamic(input0_shape.size()), data_types::f32, format::bfyx}; + input1_layout = layout{ov::PartialShape::dynamic(input1_shape.size()), data_types::f32, format::bfzyx}; + + auto input0_mem = engine.allocate_memory(layout{ov::PartialShape(input0_shape), data_types::f32, format::bfyx}); + auto input1_mem = engine.allocate_memory(layout{ov::PartialShape(input1_shape), data_types::f32, format::bfzyx}); + + auto input_0_data = rg.generate_random_1d(ov::shape_size(input0_shape), -2, 2); + auto input_1_data = rg.generate_random_1d(ov::shape_size(input1_shape), -2, 2); + + fill_mem(input0_mem, input_0_data); + fill_mem(input1_mem, input_1_data); + + topology topology; + topology.add(input_layout("input0", input0_layout), + input_layout("input1", input1_layout), + gemm("gemm", { input_info("input0"), input_info("input1") }, data_types::f32, {}, input1_target_shape, {}, input1_output_pattern, input0_order, input1_order) + ); + + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::optimize_data(true)); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); + network->set_input_data("input0", input0_mem); + network->set_input_data("input1", input1_mem); + + auto inst = network->get_primitive("gemm"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != nullptr); + + auto outputs = network->execute(); + + auto output_mem = outputs.at("gemm").get_memory(); + cldnn::mem_lock output_ptr(output_mem, 
get_test_stream()); + + ov::Shape ref_input0_shape; + ov::Shape ref_input1_broadcasted_shape; + ov::Shape ref_input1_reshaped_shape; + ov::Shape ref_input1_shape; + ov::Shape ref_output_shape; + + ref_input0_shape = { BATCH_SIZE, 32, M_SIZE, K_SIZE }; + ref_input1_broadcasted_shape = { N_SIZE, BATCH_SIZE, 2, 16, K_SIZE }; + ref_input1_reshaped_shape = { N_SIZE, BATCH_SIZE, 32, K_SIZE }; + ref_input1_shape = { BATCH_SIZE, 32, K_SIZE, N_SIZE }; + ref_output_shape = { BATCH_SIZE, 32, M_SIZE, N_SIZE }; + + std::vector ref_out_data; + ref_out_data.resize(ov::shape_size(ref_output_shape)); + + std::vector ref_input_0_data(input_0_data.size()); + std::vector ref_input_1_broadcasted_data(ov::shape_size(ref_input1_broadcasted_shape)); + std::vector ref_input_1_reshaped_data(ov::shape_size(ref_input1_reshaped_shape)); + std::vector ref_input_1_data(ref_input_1_broadcasted_data.size()); + + ov::reference::transpose((const char *)(input_0_data.data()), + (char *)(ref_input_0_data.data()), + input0_shape, + sizeof(float), + input0_order, + ref_input0_shape); + + ov::reference::broadcast(reinterpret_cast(input_1_data.data()), + reinterpret_cast(ref_input_1_broadcasted_data.data()), + input1_shape, + ref_input1_broadcasted_shape, + ov::AxisSet({}), + sizeof(float)); + + std::vector axes_order(ov::shape_size(ref_input1_broadcasted_shape)); + std::iota(axes_order.begin(), axes_order.end(), 0); + + ov::reference::reshape(reinterpret_cast(ref_input_1_broadcasted_data.data()), + reinterpret_cast(ref_input_1_reshaped_data.data()), + ref_input1_broadcasted_shape, + axes_order, + ref_input1_reshaped_shape, + sizeof(float)); + + ov::reference::transpose((const char *)(ref_input_1_reshaped_data.data()), + (char *)(ref_input_1_data.data()), + ref_input1_reshaped_shape, + sizeof(float), + input1_order, + ref_input1_shape); + + ov::reference::matmul(ref_input_0_data.data(), + ref_input_1_data.data(), + ref_out_data.data(), + ref_input0_shape, + ref_input1_shape, + ref_output_shape, + false, + false); + + ASSERT_EQ(output_ptr.size(), ref_out_data.size()); + + const auto abs_error = 0.0001; + for (uint32_t i = 0; i < ref_out_data.size(); ++i) { + ASSERT_NEAR(output_ptr[i], ref_out_data[i], abs_error) << "at " << i; + } + } + void test_transpose_matmul(size_t num_dims, bool is_input_dynamic, bool is_caching_test) { tests::random_generator rg; rg.set_seed(GET_SUITE_NAME); @@ -914,7 +1194,7 @@ class gemm_gpu_tests: public ::testing::Test { topology topology; topology.add(input_layout("input0", input0_layout), input_layout("input1", input1_layout), - gemm("gemm", { input_info("input0"), input_info("input1") }, data_types::f32, input0_order, input1_order) + gemm("gemm", { input_info("input0"), input_info("input1") }, data_types::f32, {}, {}, {}, {}, input0_order, input1_order) ); ExecutionConfig config = get_test_default_config(engine); @@ -1072,7 +1352,7 @@ class gemm_gpu_tests: public ::testing::Test { topology topology; topology.add(input_layout("input0", input0_layout), input_layout("input1", input1_layout), - gemm("gemm", { input_info("input0"), input_info("input1") }, data_types::f16, input0_order, input1_order, output_order) + gemm("gemm", { input_info("input0"), input_info("input1") }, data_types::f16, {}, {}, {}, {}, input0_order, input1_order, output_order) ); ExecutionConfig config = get_test_default_config(engine); @@ -1225,6 +1505,14 @@ TEST_F(gemm_gpu_tests, transpose_matmul_in1_indirect) { this->test_transpose_indirect(false, false, true); } +TEST_F(gemm_gpu_tests, broadcast_transpose_matmul) { + 
this->test_broadcast_transpose_matmul(false); +} + +TEST_F(gemm_gpu_tests, broadcast_reshape_transpose_matmul) { + this->test_broadcast_reshape_transpose_matmul(false); +} + TEST_F(gemm_gpu_tests, transpose_matmul_transpose_dynamic_1d) { this->test_transpose_matmul_transpose(1, true, false); } diff --git a/src/plugins/intel_gpu/tests/unit/transformations/broadcast_reshape_matmul_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/broadcast_reshape_matmul_fusion_test.cpp new file mode 100644 index 00000000000000..8c25f415967dd5 --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/transformations/broadcast_reshape_matmul_fusion_test.cpp @@ -0,0 +1,99 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/ov_test_utils.hpp" + +#include "openvino/core/model.hpp" +#include "openvino/pass/manager.hpp" +#include "openvino/op/broadcast.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/reshape.hpp" +#include "intel_gpu/op/gemm.hpp" + +#include "plugin/transformations/broadcast_reshape_matmul_fusion.hpp" + +#include + +using namespace testing; +using namespace ov::intel_gpu; + +namespace ov { +namespace test { +namespace intel_gpu { + +TEST_F(TransformationTestsF, BroadReshapeMatmulFusion1) { + std::vector order_a = {0, 1, 2, 3}; + std::vector order_b = {1, 2, 3, 0}; + std::vector order_c = {0, 1, 2, 3}; + std::vector target_shape_b = {1, 1, 1, 16, 1}; + std::vector pattern_b = {0, 0, 32, 32}; + { + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape{-1, -1, 2, 1, 32}); + auto broadcast_b_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{5}, target_shape_b); + auto broadcast_b = std::make_shared(input_b, broadcast_b_const, ov::op::BroadcastType::BIDIRECTIONAL); + auto reshape_b_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, pattern_b); + auto reshape_b = std::make_shared(broadcast_b, reshape_b_const, true); + auto gemm = std::make_shared(input_a, reshape_b, order_a, order_b, order_c, ov::element::undefined); + + model = std::make_shared(ov::NodeVector{ gemm }, ov::ParameterVector{ input_a, input_b }); + manager.register_pass(); + } + { + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape{-1, -1, 2, 1, 32}); + auto gemm = std::make_shared(input_a, input_b, std::vector{}, target_shape_b, std::vector{}, pattern_b, order_a, order_b, order_c, ov::element::undefined); + + model_ref = std::make_shared(ov::NodeVector{ gemm }, ov::ParameterVector{ input_a, input_b }); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +TEST_F(TransformationTestsF, BroadReshapeMatmulFusion2) { + std::vector order_a = {0, 1, 2, 3}; + std::vector order_b = {1, 2, 3, 0}; + std::vector order_c = {0, 1, 2, 3}; + std::vector target_shape_b = {1, 1, 1, 16, 1}; + std::vector pattern_b = {0, 0, -1, 32}; + { + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape{-1, -1, -1, 1, 32}); + auto broadcast_b_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{5}, target_shape_b); + auto broadcast_b = std::make_shared(input_b, broadcast_b_const, ov::op::BroadcastType::BIDIRECTIONAL); + auto reshape_b_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, pattern_b); + auto reshape_b = 
std::make_shared(broadcast_b, reshape_b_const, true); + auto gemm = std::make_shared(input_a, reshape_b, order_a, order_b, order_c, ov::element::undefined); + + model = std::make_shared(ov::NodeVector{ gemm }, ov::ParameterVector{ input_a, input_b }); + manager.register_pass(); + } + { + model_ref = model->clone(); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +TEST_F(TransformationTestsF, BroadReshapeMatmulFusion3) { + std::vector order_a = {0, 1, 2, 3}; + std::vector order_b = {0, 1, 2, 3}; + std::vector order_c = {0, 1, 2, 3}; + std::vector target_shape_b = {1, 1, 16, 1, 1}; + std::vector pattern_b = {0, 32, 32, 0}; + { + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape{-1, 2, 1, 32, -1}); + auto broadcast_b_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{5}, target_shape_b); + auto broadcast_b = std::make_shared(input_b, broadcast_b_const, ov::op::BroadcastType::BIDIRECTIONAL); + auto reshape_b_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, pattern_b); + auto reshape_b = std::make_shared(broadcast_b, reshape_b_const, true); + auto gemm = std::make_shared(input_a, reshape_b, order_a, order_b, order_c, ov::element::undefined); + + model = std::make_shared(ov::NodeVector{ gemm }, ov::ParameterVector{ input_a, input_b }); + manager.register_pass(); + } +} + +} // namespace intel_gpu +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_gpu/tests/unit/transformations/clamp_fp16_output_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/clamp_fp16_output_test.cpp index 3973b7701108f5..5bdb492ea04e59 100644 --- a/src/plugins/intel_gpu/tests/unit/transformations/clamp_fp16_output_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/transformations/clamp_fp16_output_test.cpp @@ -15,6 +15,8 @@ #include #include "openvino/op/clamp.hpp" #include "openvino/op/reshape.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/maximum.hpp" #include #include #include @@ -92,7 +94,6 @@ TEST_F(TransformationTestsF, ClampFp16OutputTest3) { comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); } - TEST_F(TransformationTestsF, ClampFp16OutputTest4) { { auto input1 = std::make_shared(ov::element::f16, ov::Shape{ 3, 2, 2 }); @@ -108,3 +109,49 @@ TEST_F(TransformationTestsF, ClampFp16OutputTest4) { } comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); } + +TEST_F(TransformationTestsF, ClampFp16OutputTest5) { + { + auto input1 = std::make_shared(ov::element::f16, ov::Shape{ 3, 2, 2 }); + auto input2 = std::make_shared(ov::element::f16, ov::Shape{ 1, 2, 2 }); + auto matmul = std::make_shared(input1, input2, true, false); + auto data = std::make_shared(ov::element::f16, ov::Shape{ 3, 2, 2 }); + auto add = std::make_shared(matmul, data); + auto softmax = std::make_shared(add, 1); + + model = std::make_shared(ov::NodeVector{ softmax }, ov::ParameterVector{ input1, input2, data }); + manager.register_pass(); + } + { + auto input1 = std::make_shared(ov::element::f16, ov::Shape{ 3, 2, 2 }); + auto input2 = std::make_shared(ov::element::f16, ov::Shape{ 1, 2, 2 }); + auto matmul = std::make_shared(input1, input2, true, false); + auto min = static_cast(std::numeric_limits::lowest()); + auto max = static_cast(std::numeric_limits::max()); + auto clamp = std::make_shared(matmul, min, max); + auto data = std::make_shared(ov::element::f16, ov::Shape{ 3, 2, 2 }); + auto add = std::make_shared(clamp, data); + auto softmax = 
std::make_shared(add, 1); + + model_ref = std::make_shared(ov::NodeVector{ softmax }, ov::ParameterVector{ input1, input2, data }); + } + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, ClampFp16OutputTest6) { + { + auto input1 = std::make_shared(ov::element::f16, ov::Shape{ 3, 2, 2 }); + auto input2 = std::make_shared(ov::element::f16, ov::Shape{ 1, 2, 2 }); + auto matmul = std::make_shared(input1, input2, true, false); + auto data = std::make_shared(ov::element::f16, ov::Shape{ 3, 2, 2 }); + auto maximum = std::make_shared(matmul, data); + auto softmax = std::make_shared(maximum, 1); + + model = std::make_shared(ov::NodeVector{ softmax }, ov::ParameterVector{ input1, input2, data }); + manager.register_pass(); + } + { + model_ref = model->clone(); // Not changed due to types for eltwise not supporting fusion to gemm + } + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} diff --git a/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp b/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp index 6a2f35e6f566d7..7cbb616336c747 100644 --- a/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp +++ b/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp @@ -264,15 +264,15 @@ ov::Tensor generate(const std::shared_ptr& node, int seed = 1; size_t constDataSize = ov::shape_size(targetShape); std::vector inputLowData, inputHighData, outputLowData, outputHighData; - inputLowData = NGraphFunctions::Utils::generateVector(constDataSize, 10, 1, seed); + inputLowData = ov::test::utils::generateVector(constDataSize, 10, 1, seed); if (node->get_levels() != 2) { - inputHighData = NGraphFunctions::Utils::generateVector(constDataSize, 10, 1, seed); - outputLowData = NGraphFunctions::Utils::generateVector(constDataSize, 10, 1, seed); - outputHighData = NGraphFunctions::Utils::generateVector(constDataSize, 10, 1, seed); + inputHighData = ov::test::utils::generateVector(constDataSize, 10, 1, seed); + outputLowData = ov::test::utils::generateVector(constDataSize, 10, 1, seed); + outputHighData = ov::test::utils::generateVector(constDataSize, 10, 1, seed); } else { inputHighData = inputLowData; - outputLowData = NGraphFunctions::Utils::generateVector(constDataSize, 10, 1, seed); - outputHighData = NGraphFunctions::Utils::generateVector(constDataSize, 10, 1, seed); + outputLowData = ov::test::utils::generateVector(constDataSize, 10, 1, seed); + outputHighData = ov::test::utils::generateVector(constDataSize, 10, 1, seed); for (int i = 0; i < constDataSize; i++) { if (outputLowData[i] > outputHighData[i]) { diff --git a/src/tests/test_utils/common_test_utils/include/common_test_utils/data_utils.hpp b/src/tests/test_utils/common_test_utils/include/common_test_utils/data_utils.hpp index 09c66cc15776ca..c8eaf631d8e1af 100644 --- a/src/tests/test_utils/common_test_utils/include/common_test_utils/data_utils.hpp +++ b/src/tests/test_utils/common_test_utils/include/common_test_utils/data_utils.hpp @@ -15,8 +15,9 @@ #include "openvino/core/type/element_type_traits.hpp" #include "openvino/runtime/tensor.hpp" -namespace NGraphFunctions { -namespace Utils { +namespace ov { +namespace test { +namespace utils { template std::vector::value_type> inline generateVector( @@ -105,13 +106,6 @@ std::vector castVector(const std::vector& vec) { return resVec; } -} // namespace Utils -} // namespace NGraphFunctions - -namespace ov { -namespace test { -namespace utils { - inline void 
fill_data(float* data, size_t size, size_t duty_ratio = 10) { for (size_t i = 0; i < size; i++) { if ((i / duty_ratio) % 2 == 1) { diff --git a/src/tests/test_utils/common_test_utils/include/common_test_utils/node_builders/constant.hpp b/src/tests/test_utils/common_test_utils/include/common_test_utils/node_builders/constant.hpp index 423aaa6915b866..9e2605f0e1ff7d 100644 --- a/src/tests/test_utils/common_test_utils/include/common_test_utils/node_builders/constant.hpp +++ b/src/tests/test_utils/common_test_utils/include/common_test_utils/node_builders/constant.hpp @@ -23,28 +23,28 @@ std::shared_ptr make_constant(const ov::element::Type& type, T up_to = 10, T start_from = 1, const int seed = 1) { -#define makeNode(TYPE) \ - case TYPE: \ - if (random) { \ - return std::make_shared( \ - type, \ - shape, \ - NGraphFunctions::Utils::generateVector(ov::shape_size(shape), \ - ov::element_type_traits::value_type(up_to), \ - ov::element_type_traits::value_type(start_from), \ - seed)); \ - } else { \ - if (std::is_same>::value) { \ - return std::make_shared(type, shape, data); \ - } else { \ - /* Convert std::vector data to required type */ \ - std::vector> converted_data(data.size()); \ - std::transform(data.cbegin(), data.cend(), converted_data.begin(), [](T e) { \ - return static_cast>(e); \ - }); \ - return std::make_shared(type, shape, converted_data); \ - } \ - } \ +#define makeNode(TYPE) \ + case TYPE: \ + if (random) { \ + return std::make_shared( \ + type, \ + shape, \ + generateVector(ov::shape_size(shape), \ + ov::element_type_traits::value_type(up_to), \ + ov::element_type_traits::value_type(start_from), \ + seed)); \ + } else { \ + if (std::is_same>::value) { \ + return std::make_shared(type, shape, data); \ + } else { \ + /* Convert std::vector data to required type */ \ + std::vector> converted_data(data.size()); \ + std::transform(data.cbegin(), data.cend(), converted_data.begin(), [](T e) { \ + return static_cast>(e); \ + }); \ + return std::make_shared(type, shape, converted_data); \ + } \ + } \ break; switch (type) { makeNode(ov::element::bf16); diff --git a/src/tests/test_utils/common_test_utils/src/node_builders/fake_quantize.cpp b/src/tests/test_utils/common_test_utils/src/node_builders/fake_quantize.cpp index 05d82ff015b85e..8c728a58fb3e87 100644 --- a/src/tests/test_utils/common_test_utils/src/node_builders/fake_quantize.cpp +++ b/src/tests/test_utils/common_test_utils/src/node_builders/fake_quantize.cpp @@ -41,15 +41,15 @@ std::shared_ptr make_fake_quantize(const ov::Output& in, const int32_t seed) { size_t constDataSize = ov::shape_size(constShapes); std::vector inputLowData, inputHighData, outputLowData, outputHighData; - inputLowData = NGraphFunctions::Utils::generateVector(constDataSize, 10, 1, seed); + inputLowData = ov::test::utils::generateVector(constDataSize, 10, 1, seed); if (levels != 2) { - inputHighData = NGraphFunctions::Utils::generateVector(constDataSize, 10, 1, seed); - outputLowData = NGraphFunctions::Utils::generateVector(constDataSize, 10, 1, seed); - outputHighData = NGraphFunctions::Utils::generateVector(constDataSize, 10, 1, seed); + inputHighData = ov::test::utils::generateVector(constDataSize, 10, 1, seed); + outputLowData = ov::test::utils::generateVector(constDataSize, 10, 1, seed); + outputHighData = ov::test::utils::generateVector(constDataSize, 10, 1, seed); } else { inputHighData = inputLowData; - outputLowData = NGraphFunctions::Utils::generateVector(constDataSize, 10, 1, seed); - outputHighData = 
NGraphFunctions::Utils::generateVector(constDataSize, 10, 1, seed); + outputLowData = ov::test::utils::generateVector(constDataSize, 10, 1, seed); + outputHighData = ov::test::utils::generateVector(constDataSize, 10, 1, seed); for (int i = 0; i < constDataSize; i++) { if (outputLowData[i] > outputHighData[i]) { diff --git a/tests/layer_tests/common/utils/common_utils.py b/tests/layer_tests/common/utils/common_utils.py index ef282443eae996..7bf2d33a0d657d 100644 --- a/tests/layer_tests/common/utils/common_utils.py +++ b/tests/layer_tests/common/utils/common_utils.py @@ -137,7 +137,12 @@ def allclose(cur_array, ref_array, atol, rtol): # so we have to align formats of both string tensors, for example, to unicode if cur_array.dtype.type != ref_array.dtype.type: cur_array = cur_array.astype('U') - ref_array = ref_array.astype('U') + try: + ref_array = ref_array.astype('U') + except: + # ref_array of object type and each element must be utf-8 decoded + utf8_decoded_elems = [elem.decode('UTF-8') for elem in ref_array.flatten()] + ref_array = np.array(utf8_decoded_elems, dtype=str).reshape(ref_array.shape) return np.array_equal(cur_array, ref_array) elif cur_array.dtype == bool: abs_diff = np.absolute(cur_array ^ ref_array) diff --git a/tests/layer_tests/pytorch_tests/test_bucketize.py b/tests/layer_tests/pytorch_tests/test_bucketize.py new file mode 100644 index 00000000000000..29fb550708e464 --- /dev/null +++ b/tests/layer_tests/pytorch_tests/test_bucketize.py @@ -0,0 +1,53 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np +import pytest +import torch + +from pytorch_layer_test_class import PytorchLayerTest + + +class TestBucketize(PytorchLayerTest): + + def _prepare_input(self, input_shape, boundaries_range, input_dtype, boundaries_dtype): + return ( + np.random.randn(*input_shape).astype(input_dtype), + np.arange(*boundaries_range).astype(boundaries_dtype)) + + def create_model(self, out_int32, right, is_out): + class aten_bucketize(torch.nn.Module): + + def __init__(self, out_int32, right, is_out) -> None: + super().__init__() + self.out_int32 = out_int32 + self.right = right + self.is_out = is_out + + def forward(self, input, boundaries): + if self.is_out: + output_dtype = torch.int32 if self.out_int32 else torch.int64 + output = torch.zeros_like(input, dtype=output_dtype) + torch.bucketize(input, boundaries, out_int32=self.out_int32, right=self.right, out=output) + return output + else: + return torch.bucketize(input, boundaries, out_int32=self.out_int32, right=self.right) + + ref_net = None + + return aten_bucketize(out_int32, right, is_out), ref_net, "aten::bucketize" + + @pytest.mark.nightly + @pytest.mark.precommit + @pytest.mark.parametrize("out_int32", [True, False]) + @pytest.mark.parametrize("right", [True, False]) + @pytest.mark.parametrize("is_out", [True, False]) + @pytest.mark.parametrize("input_shape", [[1, ], [2, 1], [2, 2, 1]]) + @pytest.mark.parametrize("input_dtype", ["float32", "int32"]) + @pytest.mark.parametrize("boundaries_range", [[1, 10], (100, 200)]) + @pytest.mark.parametrize("boundaries_dtype", ["float32", "int32"]) + def test_bucketize(self, input_shape, boundaries_range, input_dtype, boundaries_dtype, out_int32, right, is_out, ie_device, precision, ir_version): + self._test(*self.create_model(out_int32, right, is_out), ie_device, precision, ir_version, kwargs_to_prepare_input={ + "input_shape": input_shape, "input_dtype": input_dtype, + "boundaries_range": boundaries_range, "boundaries_dtype": 
boundaries_dtype, + }) diff --git a/tests/layer_tests/pytorch_tests/test_matmul.py b/tests/layer_tests/pytorch_tests/test_matmul.py new file mode 100644 index 00000000000000..4613ac3e761cca --- /dev/null +++ b/tests/layer_tests/pytorch_tests/test_matmul.py @@ -0,0 +1,42 @@ +import numpy as np +import pytest +import torch +from pytorch_layer_test_class import PytorchLayerTest + +class TestMatMulOperation(PytorchLayerTest): + def _prepare_input(self, matrix, vector): + matrix_input = np.array(matrix, dtype=np.float32) + vector_input = np.array(vector, dtype=np.float32) + return matrix_input, vector_input + + def create_model(self, matrix, vector): + class CustomMatMulOperation(torch.nn.Module): + def forward(self, matrix, vector): + return torch.mv(matrix, vector) + + model_class = CustomMatMulOperation() + ref_net = None + return model_class, ref_net, "aten::mv" + + @pytest.mark.nightly + @pytest.mark.precommit + @pytest.mark.parametrize("matrix, vector, dtype", [ + (np.array([[1, 2], [3, 4]]), np.array([5, 6]), torch.float64), + (np.array([[0, 0], [0, 0]]), np.array([1, 2]), torch.float32), + (np.array([[1, 2, 3], [4, 5, 6]]), np.array([0, 1, 0]), torch.float64), + (np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), np.array([2, 3, 4]), torch.float32), + ]) + def test_matmul_operation(self, matrix, vector, dtype, ie_device, precision, ir_version): + matrix_input = torch.tensor(matrix, dtype=torch.float32) + vector_input = torch.tensor(vector, dtype=torch.float32) + + matrix_input = matrix_input.to(dtype=dtype) + vector_input = vector_input.to(dtype=dtype) + + self._test( + *self.create_model(matrix_input, vector_input), + ie_device, + precision, + ir_version, + kwargs_to_prepare_input={"matrix": matrix_input, "vector": vector_input} + ) diff --git a/tests/layer_tests/tensorflow2_keras_tests/test_tf2_keras_text_vectorization.py b/tests/layer_tests/tensorflow2_keras_tests/test_tf2_keras_text_vectorization.py index 2b709045cbd8e9..3c6f15c176d334 100644 --- a/tests/layer_tests/tensorflow2_keras_tests/test_tf2_keras_text_vectorization.py +++ b/tests/layer_tests/tensorflow2_keras_tests/test_tf2_keras_text_vectorization.py @@ -17,13 +17,13 @@ def _prepare_input(self, inputs_info): assert 'text_input' in inputs_info input_shape = inputs_info['text_input'] inputs_data = {} - strings_dictionary = ['hi OpenVINO here ', ' hello OpenVINO there', 'hello PyTorch here ', - ' hi TensorFlow here', ' hi JAX here \t'] - inputs_data['text_input'] = rng.choice(strings_dictionary, input_shape) + inputs_data['text_input'] = rng.choice(self.strings_dictionary, input_shape) return inputs_data - def create_text_vectorization_net(self, input_shapes, vocabulary, output_mode, output_sequence_length): + def create_text_vectorization_net(self, input_shapes, vocabulary, output_mode, output_sequence_length, + strings_dictionary): assert len(input_shapes) > 0 + self.strings_dictionary = strings_dictionary tf.keras.backend.clear_session() text_input = tf.keras.Input(shape=input_shapes[0][1:], name='text_input', dtype=tf.string) @@ -36,13 +36,22 @@ def create_text_vectorization_net(self, input_shapes, vocabulary, output_mode, o return tf2_net, None @pytest.mark.parametrize('input_shapes', [[[1, 1]], [[3, 1]]]) - @pytest.mark.parametrize('vocabulary', [['hello', 'there', 'OpenVINO', 'check']]) + @pytest.mark.parametrize('strings_dictionary', + [['hi OpenVINO here ', ' hello OpenVINO there', 'hello PyTorch here ', + ' hi TensorFlow here', ' hi JAX here \t'], + ['привет ОПЕНВИНО здесь ', ' привет ОпенВИНО там', 'привет Пайторч здесь 
', + ' привет ТензорФлоу здесь', ' привет ДЖАКС там \t'], + ['這裡你好 OpenVINO ', '你好 OpenVINO 那裡', '你好這裡 PyTorch ', + ' 這裡是 TensorFlow', ' 這裡是 JAX \t'] + ]) + @pytest.mark.parametrize('vocabulary', [['hello', 'there', 'OpenVINO', 'check', 'привет', + 'ОПЕНВИНО', 'здесь', 'там', '你好', '那裡', '檢查']]) @pytest.mark.parametrize('output_mode', ['int']) @pytest.mark.parametrize('output_sequence_length', [32, 64]) @pytest.mark.precommit_tf_fe @pytest.mark.nightly - def test_text_vectorization(self, input_shapes, vocabulary, output_mode, output_sequence_length, ie_device, - precision, ir_version, temp_dir, use_legacy_frontend): + def test_text_vectorization(self, input_shapes, vocabulary, output_mode, output_sequence_length, strings_dictionary, + ie_device, precision, ir_version, temp_dir, use_legacy_frontend): if platform.system() in ('Darwin') or platform.machine() in ['arm', 'armv7l', 'aarch64', 'arm64', @@ -53,5 +62,6 @@ def test_text_vectorization(self, input_shapes, vocabulary, output_mode, output_ params['vocabulary'] = vocabulary params['output_mode'] = output_mode params['output_sequence_length'] = output_sequence_length + params['strings_dictionary'] = strings_dictionary self._test(*self.create_text_vectorization_net(**params), ie_device, precision, temp_dir=temp_dir, ir_version=ir_version, use_legacy_frontend=use_legacy_frontend, **params) diff --git a/tests/layer_tests/tensorflow_tests/test_tf_StringLower.py b/tests/layer_tests/tensorflow_tests/test_tf_StringLower.py index 25581612784820..9bcee9c86a9524 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_StringLower.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_StringLower.py @@ -16,14 +16,13 @@ def _prepare_input(self, inputs_info): assert 'input:0' in inputs_info input_shape = inputs_info['input:0'] inputs_data = {} - # TODO: add non ASCII symbols, fix comparator for output string tensors - strings_dictionary = ['UPPER CASE SENTENCE', 'lower case sentence', ' UppEr LoweR CAse SENtence', ' '] - sample_data = rng.choice(strings_dictionary, input_shape) + sample_data = rng.choice(self.strings_dictionary, input_shape) inputs_data['input:0'] = sample_data return inputs_data - def create_string_lower_net(self, input_shape, encoding): + def create_string_lower_net(self, input_shape, encoding, strings_dictionary): self.encoding = encoding + self.strings_dictionary = strings_dictionary tf.compat.v1.reset_default_graph() with tf.compat.v1.Session() as sess: @@ -39,14 +38,19 @@ def create_string_lower_net(self, input_shape, encoding): @pytest.mark.parametrize("encoding", [None, '', 'utf-8']) @pytest.mark.parametrize("input_shape", [[], [2], [3, 4], [1, 3, 2]]) + @pytest.mark.parametrize("strings_dictionary", + [['UPPER CASE SENTENCE', 'lower case sentence', ' UppEr LoweR CAse SENtence', ' '], + ['Первое Предложение', 'второе предложение', ' ', ' ТРЕТЬЕ ПРЕДЛОЖЕНИЕ '], + ['第一句話在這裡', '第二句話在這裡', '第三句話在這裡']]) @pytest.mark.precommit_tf_fe @pytest.mark.nightly @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l', 'aarch64', 'arm64', 'ARM64'], reason='Ticket - 126314, 132699') - def test_string_lower(self, input_shape, encoding, ie_device, precision, ir_version, temp_dir, + def test_string_lower(self, input_shape, encoding, strings_dictionary, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): - self._test(*self.create_string_lower_net(input_shape=input_shape, encoding=encoding), + self._test(*self.create_string_lower_net(input_shape=input_shape, encoding=encoding, + 
strings_dictionary=strings_dictionary), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) diff --git a/tests/layer_tests/tensorflow_tests/test_tf_angle.py b/tests/layer_tests/tensorflow_tests/test_tf_angle.py new file mode 100644 index 00000000000000..7c7d35a3e99c9c --- /dev/null +++ b/tests/layer_tests/tensorflow_tests/test_tf_angle.py @@ -0,0 +1,47 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np +import pytest +import tensorflow as tf +from common.tf_layer_test_class import CommonTFLayerTest + + +class TestAngle(CommonTFLayerTest): + def _prepare_input(self, inputs_info): + assert 'y:0' in inputs_info + assert 'x:0' in inputs_info + y_shape = inputs_info['y:0'] + x_shape = inputs_info['x:0'] + inputs_data = {} + inputs_data['y:0'] = np.random.rand(*y_shape).astype(self.input_type) - np.random.rand(*y_shape).astype(self.input_type) + inputs_data['x:0'] = np.random.rand(*x_shape).astype(self.input_type) - np.random.rand(*x_shape).astype(self.input_type) + return inputs_data + + def create_angle_net(self, input_shape, input_type): + self.input_type = input_type + tf.compat.v1.reset_default_graph() + # Create the graph and model + with tf.compat.v1.Session() as sess: + y = tf.compat.v1.placeholder(input_type, input_shape, 'y') + x = tf.compat.v1.placeholder(input_type, input_shape, 'x') + complex = tf.raw_ops.Complex(real=x, imag=y) + tf.raw_ops.Angle(input=complex) + tf.compat.v1.global_variables_initializer() + tf_net = sess.graph_def + + return tf_net, None + + test_data_basic = [ + dict(input_shape=[1, 2], input_type=np.float32), + dict(input_shape=[2, 3, 4], input_type=np.float32), + ] + + @pytest.mark.parametrize("params", test_data_basic) + @pytest.mark.precommit_tf_fe + @pytest.mark.nightly + def test_angle(self, params, ie_device, precision, ir_version, temp_dir, + use_legacy_frontend): + self._test(*self.create_angle_net(**params), + ie_device, precision, ir_version, temp_dir=temp_dir, + use_legacy_frontend=use_legacy_frontend)
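For reference, the new test_tf_angle.py case builds tf.raw_ops.Complex(real=x, imag=y) and then applies tf.raw_ops.Angle, which numerically reduces to an elementwise atan2(y, x). A minimal standalone sketch of that reference relationship, using only numpy (the helper name, shapes, and random data below are illustrative assumptions and not part of the patch):

import numpy as np

def angle_reference(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    # The angle of the complex number x + i*y is arctan2(y, x), elementwise.
    return np.arctan2(y, x)

# Inputs spanning positive and negative values, mirroring how the test generates data.
x = np.random.rand(2, 3, 4).astype(np.float32) - np.random.rand(2, 3, 4).astype(np.float32)
y = np.random.rand(2, 3, 4).astype(np.float32) - np.random.rand(2, 3, 4).astype(np.float32)

# np.angle on the assembled complex tensor should match the atan2 formulation.
assert np.allclose(angle_reference(x, y), np.angle(x + 1j * y))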