diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp
index 43d19a5158cb39..88b3292b38bb97 100644
--- a/src/plugins/intel_cpu/src/graph_optimizer.cpp
+++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp
@@ -5,6 +5,7 @@
 #include "graph_optimizer.h"
 
 #include "dnnl_extension_utils.h"
+#include "low_precision/rt_info/bias_attribute.hpp"
 #include "nodes/bin_conv.h"
 #include "nodes/common/cpu_convert.h"
 #include "nodes/conv.h"
@@ -278,6 +279,11 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) {
         auto scales = mul->getParentEdgeAt(1)->getParent();
         if (!scaleDimsCheck(node, scales)) continue;
 
+        // TODO: debug only: how to check if attribute exists for CPU node (ov::marked_as_bias(mul))
+        if ((node->getType() == Type::FullyConnected) || (node->getType() == Type::MatMul)) {
+            continue;
+        }
+
         if (initializeDeQuantizedScales(node, scales)) {
             DEBUG_LOG("GraphOptimizer##FusingDQ: Node ##", mul->getName(), " optimized as DQ scales of Node ##", node->getName());
             node->addOriginalLayer(mul->getOriginalLayers());
diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp
index f6771ba5bae18c..6326e0617ad1f0 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp
@@ -79,7 +79,10 @@ bool ACLLowpFullyConnectedExecutor::supports(const FCConfig &config) {
     //VERIFY(postOpsNumbers(config) == 0, UNSUPPORTED_NUMBER_OF_POSTOPS);
     VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK);
     VERIFY(one_of(weiRank(config), 2U, 3U, 4U), UNSUPPORTED_WEI_RANK);
-    VERIFY(static_cast(config.attrs).dequantizationScales.size() <= 1, UNSUPPORTED_PER_CHANNEL_QUANTIZATION);
+
+    const auto attrs = static_cast(config.attrs);
+    VERIFY(attrs.dequantizationScales.size() <= 1, UNSUPPORTED_PER_CHANNEL_QUANTIZATION);
+
     return true;
 }
 
@@ -106,7 +109,7 @@ arm_compute::Status ACLLowpFullyConnectedExecutor::validateTensorsInfo(const ACL
     const auto matMulValid = arm_compute::NEGEMMLowpMatrixMultiplyCore::validate(
         aclMemoryInfos[ACLArgs::ACL_SRC_0].get(),
         aclMemoryInfos[ACLArgs::ACL_WEI].get(),
-        nullptr, //aclMemoryInfos[ACLArgs::ACL_BIAS].get(),
+        aclMemoryInfos[ACLArgs::ACL_BIAS].get(),
         aclMemoryInfos[ACLArgs::ACL_DST].get(),
         gemmInfo);
     return matMulValid;
diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp
index 0fdc2caee755bb..6b650f938eba2e 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp
@@ -89,9 +89,10 @@ static const TypeMapping aclFCTypeMapping {
 static const TypeMapping aclLowpFCTypeMapping {
     // {src, wei, bia, dst}                       pt
     {{_i8, _i8, _any, _f32}, pt(just(), just(), just(), just())},
+    {{_i8, _i8, _any, _f32}, pt(just(), just(), just(), just())},
     //{{_i8, _i8, _any, _i32}, pt(just(), just(), just(), just())},
     //{{_u8, _u8, _any, _i32}, pt(just(), just(), bypass(), just())},
-    {{_any, _any, _any, _any}, pt(just(), just(), just(), just())}
+    //{{_any, _any, _any, _any}, pt(just(), just(), just(), just())}
 };
 
 static const MappingNotation dnnlConvolutionMappingNotation {
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index bd581fcb4eab99..b8bcf6652b830e 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -385,11 +385,7 @@ void Transformations::PreLpt(const std::vector& defaultPrecis
     const auto precisions = get_convert_precisions();
     if (inferencePrecision == ov::element::f16) {
         precisions_map fp_convert_precision_map = {{ov::element::f32, ov::element::f16}};
-#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
-        type_to_fuse_map fuse_map = {{ov::opset1::FakeQuantize::get_type_info_static(), fuse_type_to_fq}};
-#else
         type_to_fuse_map fuse_map = {};
-#endif
         const bool keep_precision_sensitive_in_fp32 = true;
         CPU_REGISTER_PASS_COMMON(manager,
                                  ov::pass::ConvertPrecision,
@@ -755,9 +751,11 @@ void Transformations::Lpt(const std::vector& defaultPrecision
             return LayerTransformation::isAsymmetricQuantization(node, defaultPrecisions) ||
                    WeightableLayerTransformation::isAsymmetricOnWeights(node, defaultPrecisions);
         },
         ConvolutionBackpropDataTransformation);
+#if !defined(OPENVINO_ARCH_ARM64)
     CPU_SET_CALLBACK_COMMON(lptManager,
         [](const_node_ptr& node) -> bool {
             return ov::marked_as_bias(node);
         },
         AddTransformation);
+#endif
     CPU_SET_CALLBACK_X64(lptManager,
         [&](const_node_ptr& node) -> bool {
             const auto& consumers = node->get_output_target_inputs(0);
@@ -1161,7 +1159,7 @@ void Transformations::PostSnippets(void) {
     ov::pass::Manager postSnippetsManager("CPU:PostSnippets");
     postSnippetsManager.set_per_pass_validation(false);
     CPU_REGISTER_PASS_COMMON(postSnippetsManager, ov::pass::FakeQuantizeDecomposition);
-    CPU_SET_CALLBACK_COMMON(postSnippetsManager,
+    CPU_SET_CALLBACK_X64(postSnippetsManager,
         [](const_node_ptr& node) -> bool {
             std::string errMsg;
             return node::FakeQuantize::isSupportedOperation(node, errMsg);