From 964dd8fa96ddb8a537830269fa07d4d54ca6d212 Mon Sep 17 00:00:00 2001
From: Alexandra Sidorova
Date: Thu, 10 Nov 2022 17:25:51 +0400
Subject: [PATCH] Added config parameter to disable MHA ops tokenization

---
 .../interface/ie_internal_plugin_config.hpp |  7 ++++++
 src/plugins/intel_cpu/src/config.cpp        |  8 +++++++
 src/plugins/intel_cpu/src/config.h          |  1 +
 src/plugins/intel_cpu/src/plugin.cpp        | 22 +++++++++++--------
 4 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp b/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp
index 31b77806876670..048f3457fabc5c 100644
--- a/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp
+++ b/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp
@@ -65,6 +65,13 @@ DECLARE_CONFIG_KEY(FORCE_DISABLE_CACHE);
  */
 DECLARE_CONFIG_KEY(CONFIG_DEVICE_ID);
 
+/**
+ * @brief Defines whether MHA-related ops may be tokenized in Snippets
+ *        Softmax and Transpose should be tokenized by Snippets only in tests and as part of the MHA pattern
+ * @ingroup ie_dev_api_plugin_api
+ */
+DECLARE_CONFIG_KEY(SNIPPETS_MHA_OPS_TOKENIZATION_ENABLE);
+
 } // namespace PluginConfigInternalParams
 
 } // namespace InferenceEngine
diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp
index 139466fa3f8c87..6098d0f09868e4 100644
--- a/src/plugins/intel_cpu/src/config.cpp
+++ b/src/plugins/intel_cpu/src/config.cpp
@@ -159,6 +159,14 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
                 IE_THROW() << "Wrong value for property key " << CPUConfigParams::KEY_CPU_DENORMALS_OPTIMIZATION
                            << ". Expected only YES/NO";
             }
+        } else if (key == PluginConfigInternalParams::KEY_SNIPPETS_MHA_OPS_TOKENIZATION_ENABLE) {
+            if (val == PluginConfigParams::YES)
+                tokenizeMHAOpsSnippets = true;
+            else if (val == PluginConfigParams::NO)
+                tokenizeMHAOpsSnippets = false;
+            else
+                IE_THROW() << "Wrong value for property key " << PluginConfigInternalParams::KEY_SNIPPETS_MHA_OPS_TOKENIZATION_ENABLE
Expected only YES/NO"; } else { IE_THROW(NotFound) << "Unsupported property " << key << " by CPU plugin"; } diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h index 9e9defcdb3e45c..f5db35bdf6760c 100644 --- a/src/plugins/intel_cpu/src/config.h +++ b/src/plugins/intel_cpu/src/config.h @@ -32,6 +32,7 @@ struct Config { bool collectPerfCounters = false; bool exclusiveAsyncRequests = false; bool enableDynamicBatch = false; + bool tokenizeMHAOpsSnippets = false; std::string dumpToDot = ""; int batchLimit = 0; size_t rtCacheCapacity = 5000ul; diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index 34fb7135cb2e2d..17f5843cdf7955 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -256,7 +256,7 @@ Engine::~Engine() { } static void TransformationUpToCPUSpecificOpSet(std::shared_ptr nGraphFunc, const bool _enableLPT, const bool _enableBF16, - const bool _enableSnippets, const bool isLegacyApi) { + const bool _enableSnippets, const bool _tokenizeSpecOpsSnippets, const bool isLegacyApi) { ngraph::pass::Manager manager; manager.set_per_pass_validation(false); manager.register_pass(); @@ -635,12 +635,12 @@ static void TransformationUpToCPUSpecificOpSet(std::shared_ptr snippetsManager.register_pass(); snippetsManager.register_pass(); snippetsManager.get_pass_config()->set_callback( - [](const std::shared_ptr& n) -> bool { + [_tokenizeSpecOpsSnippets](const std::shared_ptr& n) -> bool { // CPU Plugin support Swish in Subgraph via conversion to SwichCPU which assumes second input to be constant - if (ov::is_type(n)) { - if (n->inputs().size() > 1 && !ov::is_type(n->get_input_node_shared_ptr(1))) - return true; - } + const bool is_unsupported_swish = ov::is_type(n) && n->inputs().size() > 1 && + !ov::is_type(n->get_input_node_shared_ptr(1)); + const bool is_disabled_softmax_tokenization = + (ov::is_type(n) || ov::is_type(n)) && !_tokenizeSpecOpsSnippets; const auto& inputs = n->inputs(); // todo: clarify whether we can evaluate snippets on const paths const bool has_only_const_inputs = std::all_of(inputs.begin(), inputs.end(), @@ -657,7 +657,7 @@ static void TransformationUpToCPUSpecificOpSet(std::shared_ptr const auto& outputs = n->outputs(); const bool bad_output_rank = std::any_of(outputs.begin(), outputs.end(), [&](const ov::Output& out) {return rank_is_too_large(out.get_tensor());}); - return has_only_const_inputs || bad_input_rank || bad_output_rank; + return has_only_const_inputs || bad_input_rank || bad_output_rank || is_unsupported_swish || is_disabled_softmax_tokenization; }); snippetsManager.register_pass(); snippetsManager.run_passes(nGraphFunc); @@ -829,8 +829,10 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std const bool enableDynamicBatch = (dynamicBatchProp != config.end() && dynamicBatchProp->second == PluginConfigParams::YES) || engConfig.enableDynamicBatch; const bool enableSnippets = !(enableModelCache || enableDynamicBatch || enableBF16); + const auto& mhaOpsSnippetsProp = config.find(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MHA_OPS_TOKENIZATION_ENABLE); + const bool tokenizeMHAOpSnippets = enableSnippets && (mhaOpsSnippetsProp != config.end() && mhaOpsSnippetsProp->second == PluginConfigParams::YES); auto nGraphFunc = clonedNetwork.getFunction(); - TransformationUpToCPUSpecificOpSet(nGraphFunc, enableLPT, enableBF16, enableSnippets, isLegacyAPI()); + TransformationUpToCPUSpecificOpSet(nGraphFunc, enableLPT, enableBF16, 
 
     // need to check that all outputs have static shapes
     // checking that all inputs have static shapes is performed in the common part
@@ -1070,6 +1072,8 @@ QueryNetworkResult Engine::QueryNetwork(const CNNNetwork& network, const std::ma
             || Config::LPTransformsMode::On == engConfig.lpTransformsMode /* or already enabled */;
     const bool enableSnippets = !(conf.cache_dir.empty() || conf.enableDynamicBatch || (conf.enforceBF16
             && dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)));
+    const auto& mhaOpsSnippetsProp = config.find(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MHA_OPS_TOKENIZATION_ENABLE);
+    const bool tokenizeMHAOpSnippets = enableSnippets && (mhaOpsSnippetsProp != config.end() && mhaOpsSnippetsProp->second == PluginConfigParams::YES);
 
     auto model = network.getFunction();
     if (model == nullptr) {
@@ -1078,7 +1082,7 @@ QueryNetworkResult Engine::QueryNetwork(const CNNNetwork& network, const std::ma
 
     auto supported = GetSupportedNodes(model,
         [&](std::shared_ptr<ov::Model>& model) {
-            TransformationUpToCPUSpecificOpSet(model, enableLPT, conf.enforceBF16, enableSnippets, isLegacyAPI());
+            TransformationUpToCPUSpecificOpSet(model, enableLPT, conf.enforceBF16, enableSnippets, tokenizeMHAOpSnippets, isLegacyAPI());
             ConvertToCPUSpecificOpset(model);
         },
         [&](const std::shared_ptr<const ov::Node>& op) {
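
Usage sketch (not part of the patch): the new internal key travels through the regular
LoadNetwork config map, which is the same path Config::readProperties() and the two
config.find() lookups above consume. The model path and the overall setup below are
placeholders, and the dev-API header is assumed to be on the include path, as it is for
plugin functional tests. Note that MHA ops tokenization also requires Snippets to be
enabled at all (no model cache, no dynamic batch, no enforced BF16), since
tokenizeMHAOpSnippets is gated on enableSnippets.

    #include <map>
    #include <string>

    #include <cpp_interfaces/interface/ie_internal_plugin_config.hpp>
    #include <ie_core.hpp>

    int main() {
        InferenceEngine::Core core;
        // Placeholder IR; any model with Softmax/Transpose inside an MHA pattern would do.
        auto network = core.ReadNetwork("model.xml");

        // Opt in to MHA ops tokenization. The default introduced by this patch is off
        // (tokenizeMHAOpsSnippets = false), so Softmax/Transpose stay un-tokenized
        // outside the MHA pattern unless YES is passed explicitly.
        std::map<std::string, std::string> config = {
            {InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MHA_OPS_TOKENIZATION_ENABLE,
             InferenceEngine::PluginConfigParams::YES}};

        auto execNetwork = core.LoadNetwork(network, "CPU", config);
        return 0;
    }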