From 964dd8fa96ddb8a537830269fa07d4d54ca6d212 Mon Sep 17 00:00:00 2001
From: Alexandra Sidorova
Date: Thu, 10 Nov 2022 17:25:51 +0400
Subject: [PATCH] Added config parameter to disable MHA ops tokenization

---
 .../interface/ie_internal_plugin_config.hpp |  7 ++++++
 src/plugins/intel_cpu/src/config.cpp        |  8 +++++++
 src/plugins/intel_cpu/src/config.h          |  1 +
 src/plugins/intel_cpu/src/plugin.cpp        | 22 +++++++++++--------
 4 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp b/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp
index 31b77806876670..048f3457fabc5c 100644
--- a/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp
+++ b/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp
@@ -65,6 +65,13 @@ DECLARE_CONFIG_KEY(FORCE_DISABLE_CACHE);
  */
 DECLARE_CONFIG_KEY(CONFIG_DEVICE_ID);
 
+/**
+ * @brief Defines whether MHA-related ops may be tokenized in Snippets
+ *        Softmax and Transpose should be tokenized by Snippets only in tests and as part of the MHA pattern
+ * @ingroup ie_dev_api_plugin_api
+ */
+DECLARE_CONFIG_KEY(SNIPPETS_MHA_OPS_TOKENIZATION_ENABLE);
+
 } // namespace PluginConfigInternalParams
 
 } // namespace InferenceEngine
diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp
index 139466fa3f8c87..6098d0f09868e4 100644
--- a/src/plugins/intel_cpu/src/config.cpp
+++ b/src/plugins/intel_cpu/src/config.cpp
@@ -159,6 +159,14 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
                 IE_THROW() << "Wrong value for property key " << CPUConfigParams::KEY_CPU_DENORMALS_OPTIMIZATION
                            << ". Expected only YES/NO";
             }
+        } else if (key == PluginConfigInternalParams::KEY_SNIPPETS_MHA_OPS_TOKENIZATION_ENABLE) {
+            if (val == PluginConfigParams::YES)
+                tokenizeMHAOpsSnippets = true;
+            else if (val == PluginConfigParams::NO)
+                tokenizeMHAOpsSnippets = false;
+            else
+                IE_THROW() << "Wrong value for property key " << PluginConfigInternalParams::KEY_SNIPPETS_MHA_OPS_TOKENIZATION_ENABLE
Expected only YES/NO"; } else { IE_THROW(NotFound) << "Unsupported property " << key << " by CPU plugin"; } diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h index 9e9defcdb3e45c..f5db35bdf6760c 100644 --- a/src/plugins/intel_cpu/src/config.h +++ b/src/plugins/intel_cpu/src/config.h @@ -32,6 +32,7 @@ struct Config { bool collectPerfCounters = false; bool exclusiveAsyncRequests = false; bool enableDynamicBatch = false; + bool tokenizeMHAOpsSnippets = false; std::string dumpToDot = ""; int batchLimit = 0; size_t rtCacheCapacity = 5000ul; diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index 34fb7135cb2e2d..17f5843cdf7955 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -256,7 +256,7 @@ Engine::~Engine() { } static void TransformationUpToCPUSpecificOpSet(std::shared_ptr nGraphFunc, const bool _enableLPT, const bool _enableBF16, - const bool _enableSnippets, const bool isLegacyApi) { + const bool _enableSnippets, const bool _tokenizeSpecOpsSnippets, const bool isLegacyApi) { ngraph::pass::Manager manager; manager.set_per_pass_validation(false); manager.register_pass(); @@ -635,12 +635,12 @@ static void TransformationUpToCPUSpecificOpSet(std::shared_ptr snippetsManager.register_pass(); snippetsManager.register_pass(); snippetsManager.get_pass_config()->set_callback( - [](const std::shared_ptr& n) -> bool { + [_tokenizeSpecOpsSnippets](const std::shared_ptr& n) -> bool { // CPU Plugin support Swish in Subgraph via conversion to SwichCPU which assumes second input to be constant - if (ov::is_type(n)) { - if (n->inputs().size() > 1 && !ov::is_type(n->get_input_node_shared_ptr(1))) - return true; - } + const bool is_unsupported_swish = ov::is_type(n) && n->inputs().size() > 1 && + !ov::is_type(n->get_input_node_shared_ptr(1)); + const bool is_disabled_softmax_tokenization = + (ov::is_type(n) || ov::is_type(n)) && !_tokenizeSpecOpsSnippets; const auto& inputs = n->inputs(); // todo: clarify whether we can evaluate snippets on const paths const bool has_only_const_inputs = std::all_of(inputs.begin(), inputs.end(), @@ -657,7 +657,7 @@ static void TransformationUpToCPUSpecificOpSet(std::shared_ptr const auto& outputs = n->outputs(); const bool bad_output_rank = std::any_of(outputs.begin(), outputs.end(), [&](const ov::Output& out) {return rank_is_too_large(out.get_tensor());}); - return has_only_const_inputs || bad_input_rank || bad_output_rank; + return has_only_const_inputs || bad_input_rank || bad_output_rank || is_unsupported_swish || is_disabled_softmax_tokenization; }); snippetsManager.register_pass(); snippetsManager.run_passes(nGraphFunc); @@ -829,8 +829,10 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std const bool enableDynamicBatch = (dynamicBatchProp != config.end() && dynamicBatchProp->second == PluginConfigParams::YES) || engConfig.enableDynamicBatch; const bool enableSnippets = !(enableModelCache || enableDynamicBatch || enableBF16); + const auto& mhaOpsSnippetsProp = config.find(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MHA_OPS_TOKENIZATION_ENABLE); + const bool tokenizeMHAOpSnippets = enableSnippets && (mhaOpsSnippetsProp != config.end() && mhaOpsSnippetsProp->second == PluginConfigParams::YES); auto nGraphFunc = clonedNetwork.getFunction(); - TransformationUpToCPUSpecificOpSet(nGraphFunc, enableLPT, enableBF16, enableSnippets, isLegacyAPI()); + TransformationUpToCPUSpecificOpSet(nGraphFunc, enableLPT, enableBF16, 
 
     // need to check that all outputs have static shapes
     // checking that all inputs have static shapes is performed in the common part
@@ -1070,6 +1072,8 @@ QueryNetworkResult Engine::QueryNetwork(const CNNNetwork& network, const std::ma
             || Config::LPTransformsMode::On == engConfig.lpTransformsMode /* or already enabled */;
     const bool enableSnippets = !(conf.cache_dir.empty() || conf.enableDynamicBatch || (conf.enforceBF16
             && dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)));
+    const auto& mhaOpsSnippetsProp = config.find(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MHA_OPS_TOKENIZATION_ENABLE);
+    const bool tokenizeMHAOpSnippets = enableSnippets && (mhaOpsSnippetsProp != config.end() && mhaOpsSnippetsProp->second == PluginConfigParams::YES);
 
     auto model = network.getFunction();
     if (model == nullptr) {
@@ -1078,7 +1082,7 @@ QueryNetworkResult Engine::QueryNetwork(const CNNNetwork& network, const std::ma
 
     auto supported = GetSupportedNodes(model,
         [&](std::shared_ptr<ov::Model>& model) {
-            TransformationUpToCPUSpecificOpSet(model, enableLPT, conf.enforceBF16, enableSnippets, isLegacyAPI());
+            TransformationUpToCPUSpecificOpSet(model, enableLPT, conf.enforceBF16, enableSnippets, tokenizeMHAOpSnippets, isLegacyAPI());
             ConvertToCPUSpecificOpset(model);
         },
         [&](const std::shared_ptr<const ov::Node>& op) {
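
Usage sketch (not part of the patch): the new internal key travels through the regular
LoadNetwork config map, which is the same path Config::readProperties() and the two
config.find() lookups above consume. The model path and the overall setup below are
placeholders, and the dev-API header is assumed to be on the include path, as it is for
plugin functional tests. Note that MHA ops tokenization also requires Snippets to be
enabled at all (no model cache, no dynamic batch, no enforced BF16), since
tokenizeMHAOpSnippets is gated on enableSnippets.

    #include <map>
    #include <string>

    #include <cpp_interfaces/interface/ie_internal_plugin_config.hpp>
    #include <ie_core.hpp>

    int main() {
        InferenceEngine::Core core;
        // Placeholder IR; any model with Softmax/Transpose inside an MHA pattern would do.
        auto network = core.ReadNetwork("model.xml");

        // Opt in to MHA ops tokenization. The default introduced by this patch is off
        // (tokenizeMHAOpsSnippets = false), so Softmax/Transpose stay un-tokenized
        // outside the MHA pattern unless YES is passed explicitly.
        std::map<std::string, std::string> config = {
            {InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MHA_OPS_TOKENIZATION_ENABLE,
             InferenceEngine::PluginConfigParams::YES}};

        auto execNetwork = core.LoadNetwork(network, "CPU", config);
        return 0;
    }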