From 79d7facb00b41a1d33f2fe08c2e3a2200be19ed7 Mon Sep 17 00:00:00 2001
From: Ivan Novoselov <ivan.novoselov@intel.com>
Date: Sun, 31 Jul 2022 16:18:12 +0100
Subject: [PATCH] Snippets increase subgraph size (#3)

- Implement static TileScheduler to handle compile params processing. Now compile params are accessed only here
- TileScheduler should emit code only for necessary scalar/vector Tiles
- Perform abstract-to-physical register mapping in one place (currently KernelEmitter constructor)
- Implement more precise register mapping, so larger subgraphs could be created (now up to 12 i/o regs instead of 7)

Increments are invalid in some tests because of TileScheduler optimizations

Optimizations fixed, the tests pass Ok

Pass increment and dims to op::Tile constructor

Added support of Convert FP32, BF16, I8, U8

Fixed original input and output types

fixed minor comments

Applied first part

Applied second part
---
 .../snippets/include/snippets/emitter.hpp     |   2 +
 .../snippets/include/snippets/generator.hpp   |  19 +-
 .../include/snippets/op/blockedload.hpp       |  34 -
 .../include/snippets/op/blockedparameter.hpp  |  36 -
 .../snippets/op/convert_saturation.hpp        |  37 +
 .../snippets/op/convert_truncation.hpp        |  36 +
 .../snippets/include/snippets/op/load.hpp     |  17 +-
 .../include/snippets/op/scalarload.hpp        |  34 -
 .../include/snippets/op/scalarstore.hpp       |  34 -
 .../snippets/include/snippets/op/store.hpp    |  15 +-
 .../snippets/include/snippets/op/subgraph.hpp |   9 +-
 .../snippets/include/snippets/op/tile.hpp     |  21 +-
 .../include/snippets/op/tile_scheduler.hpp    |  39 +
 .../include/snippets/op/vectorload.hpp        |  34 -
 .../include/snippets/op/vectorstore.hpp       |  34 -
 .../snippets/pass/assign_registers.hpp        |   2 +-
 .../pass/insert_convert_on_inputs.hpp         |  31 +
 .../snippets/pass/insert_load_store.hpp       |   4 +-
 .../reset_type_relaxed_node_precision.hpp     |  31 +
 .../pass/transform_convert_to_truncation.hpp  |  28 +
 .../snippets/pass/vector_to_scalar.hpp        |  20 +-
 .../include/snippets/snippets_isa.hpp         |   9 +-
 .../include/snippets/snippets_isa_tbl.hpp     |   8 +-
 src/common/snippets/src/generator.cpp         |  61 +-
 src/common/snippets/src/op/blockedload.cpp    |  10 -
 .../snippets/src/op/convert_saturation.cpp    |  19 +
 .../snippets/src/op/convert_truncation.cpp    |  19 +
 src/common/snippets/src/op/load.cpp           |   4 +-
 src/common/snippets/src/op/scalarload.cpp     |  10 -
 src/common/snippets/src/op/scalarstore.cpp    |  10 -
 src/common/snippets/src/op/store.cpp          |   6 +-
 src/common/snippets/src/op/subgraph.cpp       | 106 ++-
 src/common/snippets/src/op/tile.cpp           |   5 +-
 src/common/snippets/src/op/tile_scheduler.cpp |  10 +
 src/common/snippets/src/op/vectorload.cpp     |  10 -
 src/common/snippets/src/op/vectorstore.cpp    |  10 -
 .../snippets/src/pass/assign_registers.cpp    |  69 +-
 .../snippets/src/pass/collapse_subgraph.cpp   |  60 +-
 .../src/pass/insert_convert_on_inputs.cpp     |  72 ++
 .../snippets/src/pass/insert_load_store.cpp   |  12 +-
 .../load_movebroadcast_to_broadcastload.cpp   |   2 +-
 .../reset_type_relaxed_node_precision.cpp     |  31 +
 .../pass/transform_convert_to_truncation.cpp  |  34 +
 .../snippets/src/pass/vector_to_scalar.cpp    |  33 +-
 .../snippets/tests/include/lowering_utils.hpp |   3 +-
 .../set_scalar_count_for_load_and_store.hpp   |  40 ++
 .../snippets/tests/src/lowering_utils.cpp     |   5 +-
 .../tests/src/pass/canonicalization.cpp       |   4 +-
 .../tests/src/pass/collapse_subgraph.cpp      |  38 +
 .../set_scalar_count_for_load_and_store.cpp}  |  49 +-
 src/common/snippets/tests/src/registers.cpp   |  43 +-
 .../intel_cpu/src/emitters/cpu_generator.cpp  |  20 +-
 .../intel_cpu/src/emitters/cpu_generator.hpp  |   2 +
 .../src/emitters/jit_conversion_emitters.cpp  | 313 ++++++++
 .../src/emitters/jit_conversion_emitters.hpp  |  87 +++
 .../intel_cpu/src/emitters/jit_emitter.cpp    |   4 +
 .../intel_cpu/src/emitters/jit_emitter.hpp    |   1 +
 .../src/emitters/jit_load_store_emitters.cpp  | 205 ++++--
 .../src/emitters/jit_load_store_emitters.hpp  |  15 +-
 .../src/emitters/jit_snippets_emitters.cpp    | 669 ++++++++++++++++++
 .../src/emitters/jit_snippets_emitters.hpp    | 598 +++++-----------
 src/plugins/intel_cpu/src/extension.cpp       |   3 +
 .../snippets_mark_skipped.cpp                 |  71 +-
 src/plugins/intel_cpu/src/nodes/subgraph.cpp  | 112 ++-
 src/plugins/intel_cpu/src/nodes/subgraph.h    |   1 +
 src/plugins/intel_cpu/src/plugin.cpp          |   1 -
 .../fuse_load_store_and_convert.cpp           | 121 ++++
 .../fuse_load_store_and_convert.hpp           |  38 +
 .../op/load_store_convert.cpp                 |  56 ++
 .../op/load_store_convert.hpp                 |  76 ++
 .../shared_tests_instances/snippets/add.cpp   |  42 +-
 .../snippets/convert.cpp                      | 162 +++++
 .../snippets/eltwise_two_results.cpp          |  25 +
 .../snippets/max_num_params_eltwise.cpp       |  26 +
 .../snippets/three_inputs_eltwise.cpp         |  36 +-
 .../snippets/two_inputs_and_outputs.cpp       |  45 ++
 .../functional/single_layer_tests/eltwise.cpp |   2 +-
 src/plugins/intel_cpu/thirdparty/onednn       |   2 +-
 .../plugin/shared/include/snippets/add.hpp    |  15 +
 .../shared/include/snippets/convert.hpp       |  76 ++
 .../include/snippets/eltwise_two_results.hpp  |  33 +
 .../snippets/max_num_params_eltwise.hpp       |  31 +
 .../snippets/two_inputs_and_outputs.hpp       |  31 +
 .../plugin/shared/src/snippets/add.cpp        |  92 ++-
 .../plugin/shared/src/snippets/convert.cpp    | 231 ++++++
 .../src/snippets/eltwise_two_results.cpp      |  44 ++
 .../src/snippets/max_num_params_eltwise.cpp   |  49 ++
 .../src/snippets/three_inputs_eltwise.cpp     |  72 +-
 .../src/snippets/two_inputs_and_outputs.cpp   |  43 ++
 .../src/base/snippets_test_utils.cpp          |   5 +
 .../include/subgraph_converts.hpp             | 214 ++++++
 .../include/subgraph_lowered.hpp              |   3 +-
 .../include/subgraph_simple.hpp               |  75 +-
 .../src/subgraph_convert.cpp                  | 241 +++++++
 .../src/subgraph_simple.cpp                   |  93 +++
 95 files changed, 4287 insertions(+), 1103 deletions(-)
 delete mode 100644 src/common/snippets/include/snippets/op/blockedload.hpp
 delete mode 100644 src/common/snippets/include/snippets/op/blockedparameter.hpp
 create mode 100644 src/common/snippets/include/snippets/op/convert_saturation.hpp
 create mode 100644 src/common/snippets/include/snippets/op/convert_truncation.hpp
 delete mode 100644 src/common/snippets/include/snippets/op/scalarload.hpp
 delete mode 100644 src/common/snippets/include/snippets/op/scalarstore.hpp
 create mode 100644 src/common/snippets/include/snippets/op/tile_scheduler.hpp
 delete mode 100644 src/common/snippets/include/snippets/op/vectorload.hpp
 delete mode 100644 src/common/snippets/include/snippets/op/vectorstore.hpp
 create mode 100644 src/common/snippets/include/snippets/pass/insert_convert_on_inputs.hpp
 create mode 100644 src/common/snippets/include/snippets/pass/reset_type_relaxed_node_precision.hpp
 create mode 100644 src/common/snippets/include/snippets/pass/transform_convert_to_truncation.hpp
 delete mode 100644 src/common/snippets/src/op/blockedload.cpp
 create mode 100644 src/common/snippets/src/op/convert_saturation.cpp
 create mode 100644 src/common/snippets/src/op/convert_truncation.cpp
 delete mode 100644 src/common/snippets/src/op/scalarload.cpp
 delete mode 100644 src/common/snippets/src/op/scalarstore.cpp
 create mode 100644 src/common/snippets/src/op/tile_scheduler.cpp
 delete mode 100644 src/common/snippets/src/op/vectorload.cpp
 delete mode 100644 src/common/snippets/src/op/vectorstore.cpp
 create mode 100644 src/common/snippets/src/pass/insert_convert_on_inputs.cpp
 create mode 100644 src/common/snippets/src/pass/reset_type_relaxed_node_precision.cpp
 create mode 100644 src/common/snippets/src/pass/transform_convert_to_truncation.cpp
 create mode 100644 src/common/snippets/tests/include/pass/set_scalar_count_for_load_and_store.hpp
 rename src/common/snippets/tests/src/{vector_scalar.cpp => pass/set_scalar_count_for_load_and_store.cpp} (53%)
 create mode 100644 src/plugins/intel_cpu/src/emitters/jit_conversion_emitters.cpp
 create mode 100644 src/plugins/intel_cpu/src/emitters/jit_conversion_emitters.hpp
 create mode 100644 src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp
 create mode 100644 src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp
 create mode 100644 src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.hpp
 create mode 100644 src/plugins/intel_cpu/src/snippets_transformations/op/load_store_convert.cpp
 create mode 100644 src/plugins/intel_cpu/src/snippets_transformations/op/load_store_convert.hpp
 create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp
 create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/eltwise_two_results.cpp
 create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/max_num_params_eltwise.cpp
 create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp
 create mode 100644 src/tests/functional/plugin/shared/include/snippets/convert.hpp
 create mode 100644 src/tests/functional/plugin/shared/include/snippets/eltwise_two_results.hpp
 create mode 100644 src/tests/functional/plugin/shared/include/snippets/max_num_params_eltwise.hpp
 create mode 100644 src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp
 create mode 100644 src/tests/functional/plugin/shared/src/snippets/convert.cpp
 create mode 100644 src/tests/functional/plugin/shared/src/snippets/eltwise_two_results.cpp
 create mode 100644 src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp
 create mode 100644 src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp
 create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp
 create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_convert.cpp

diff --git a/src/common/snippets/include/snippets/emitter.hpp b/src/common/snippets/include/snippets/emitter.hpp
index 2ba0f85c5deda8..99c09d9d61d1bf 100644
--- a/src/common/snippets/include/snippets/emitter.hpp
+++ b/src/common/snippets/include/snippets/emitter.hpp
@@ -51,5 +51,7 @@ class Emitter {
     virtual ~Emitter() = default;
 };
 
+using AllocatedEmitter = std::pair<std::shared_ptr<Emitter>, ngraph::snippets::RegInfo>;
+
 } // namespace snippets
 } // namespace ngraph
\ No newline at end of file
diff --git a/src/common/snippets/include/snippets/generator.hpp b/src/common/snippets/include/snippets/generator.hpp
index e1a1fdf720a413..b0510c8b13934c 100644
--- a/src/common/snippets/include/snippets/generator.hpp
+++ b/src/common/snippets/include/snippets/generator.hpp
@@ -18,7 +18,7 @@ auto getRegisters(std::shared_ptr<ngraph::Node>& n) -> ngraph::snippets::RegInfo
 
 /**
  * @interface TargetMachine
- * @brief Base class Target machine representation. Target derives from this class to provide generator information about supported emittors
+ * @brief Base class Target machine representation. Target derives from this class to provide generator information about supported emitters
  * @ingroup snippets
  */
 class TargetMachine {
@@ -41,9 +41,10 @@ class TargetMachine {
      */
     virtual size_t get_lanes() const = 0;
 
+
     /**
-     * @brief called by generator to all the emittor for a target machine
-     * @return a map by node's type info with callbacks to create an instance of emmitter for corresponding operation type
+     * @brief called by generator to get the emitter for a target machine
+     * @return a map by node's type info with callbacks to create an instance of emitter for corresponding operation type
      */
     std::function<std::shared_ptr<Emitter>(std::shared_ptr<ngraph::Node>)> get(const ngraph::DiscreteTypeInfo type) const {
         auto jitter = jitters.find(type);
@@ -118,6 +119,18 @@ class Generator {
      */
     code generate(std::shared_ptr<ov::Model>& m, const void* compile_params = nullptr) const;
 
+    /**
+     * @brief gets target machine
+     * @return pointer to constant target machine
+     */
+    std::shared_ptr<const TargetMachine> get_target_machine() const { return target; }
+
+    /**
+     * @brief gets supported element type for execution
+     * @return element type
+     */
+    virtual element::Type get_supported_exec_precision() const = 0;
+
 protected:
     std::shared_ptr<TargetMachine> target;
 };
diff --git a/src/common/snippets/include/snippets/op/blockedload.hpp b/src/common/snippets/include/snippets/op/blockedload.hpp
deleted file mode 100644
index d1ec4c5bdd43dd..00000000000000
--- a/src/common/snippets/include/snippets/op/blockedload.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (C) 2018-2022 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include <ngraph/op/op.hpp>
-#include "load.hpp"
-
-namespace ngraph {
-namespace snippets {
-namespace op {
-
-/**
- * @interface BlockedLoad
- * @brief Generated by Canonicalization step for blocked data (NCHW<X>c) to be loaded
- * @ingroup snippets
- */
-class BlockedLoad : public Load {
-public:
-    OPENVINO_OP("BlockedLoad", "SnippetsOpset", ngraph::snippets::op::Load);
-
-    BlockedLoad(const Output<Node>& x);
-    BlockedLoad() = default;
-
-    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
-        check_new_args_count(this, new_args);
-        return std::make_shared<BlockedLoad>(new_args.at(0));
-    }
-};
-
-} // namespace op
-} // namespace snippets
-} // namespace ngraph
\ No newline at end of file
diff --git a/src/common/snippets/include/snippets/op/blockedparameter.hpp b/src/common/snippets/include/snippets/op/blockedparameter.hpp
deleted file mode 100644
index 34a080d837fcf8..00000000000000
--- a/src/common/snippets/include/snippets/op/blockedparameter.hpp
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright (C) 2018-2022 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include <ngraph/op/op.hpp>
-#include <ngraph/op/parameter.hpp>
-
-namespace ngraph {
-namespace snippets {
-namespace op {
-
-/**
- * @interface BlockedParameter
- * @brief Represents blocked input (NCHW<X>c) for a subgraph
- * @ingroup snippets
- */
-class BlockedParameter : public ngraph::op::Parameter {
-public:
-    OPENVINO_OP("BlockedParameter", "SnippetsOpset", ngraph::op::Parameter);
-
-    BlockedParameter() = default;
-    BlockedParameter(const ngraph::element::Type& element_type, const PartialShape& pshape)
-        : Parameter(element_type, pshape) {
-    }
-
-    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
-        check_new_args_count(this, new_args);
-        return std::make_shared<BlockedParameter>(m_element_type, m_partial_shape);
-    }
-};
-
-} // namespace op
-} // namespace snippets
-} // namespace ngraph
\ No newline at end of file
diff --git a/src/common/snippets/include/snippets/op/convert_saturation.hpp b/src/common/snippets/include/snippets/op/convert_saturation.hpp
new file mode 100644
index 00000000000000..8a26c8fb44818a
--- /dev/null
+++ b/src/common/snippets/include/snippets/op/convert_saturation.hpp
@@ -0,0 +1,37 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <openvino/op/convert.hpp>
+#include "ngraph/op/op.hpp"
+
+namespace ngraph {
+namespace snippets {
+namespace op {
+
+/**
+ * @interface ConvertSaturation
+ * @brief The implementation uses "saturation" conversion.
+ *        It means that if the values are outside the limits
+ *        of the maximum and minimum values of the destination data type, they are clamped.
+ *        For example, int32_t ---> int8_t
+ *                       129   --->  127
+ * @ingroup snippets
+ */
+class ConvertSaturation : public ov::op::v0::Convert {
+public:
+    OPENVINO_OP("ConvertSaturation", "SnippetsOpset", ov::op::v0::Convert);
+
+    ConvertSaturation(const Output<Node>& x, const ov::element::Type& destination_type);
+    ConvertSaturation() = default;
+
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
+
+    bool has_evaluate() const override { return false; }
+};
+
+} // namespace op
+} // namespace snippets
+} // namespace ngraph
diff --git a/src/common/snippets/include/snippets/op/convert_truncation.hpp b/src/common/snippets/include/snippets/op/convert_truncation.hpp
new file mode 100644
index 00000000000000..aa802dffa673bc
--- /dev/null
+++ b/src/common/snippets/include/snippets/op/convert_truncation.hpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <openvino/op/convert.hpp>
+#include "ngraph/op/op.hpp"
+
+namespace ngraph {
+namespace snippets {
+namespace op {
+
+/**
+ * @interface ConvertTruncation
+ * @brief The implementation uses "truncation" conversion.
+ *        It means that if there is an overflow, the values will wrap around.
+ *        For example, int32_t ---> int8_t
+ *                       129   --->  -127
+ * @ingroup snippets
+ */
+class ConvertTruncation : public ov::op::v0::Convert {
+public:
+    OPENVINO_OP("ConvertTruncation", "SnippetsOpset", ov::op::v0::Convert);
+
+    ConvertTruncation(const Output<Node>& x, const ov::element::Type& destination_type);
+    ConvertTruncation() = default;
+
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
+
+    bool has_evaluate() const override { return false; }
+};
+
+} // namespace op
+} // namespace snippets
+} // namespace ngraph
diff --git a/src/common/snippets/include/snippets/op/load.hpp b/src/common/snippets/include/snippets/op/load.hpp
index ab9182be9060a9..7f53240ae21946 100644
--- a/src/common/snippets/include/snippets/op/load.hpp
+++ b/src/common/snippets/include/snippets/op/load.hpp
@@ -12,20 +12,22 @@ namespace op {
 
 /**
  * @interface Load
- * @brief Generated by Canonicalization step where explicit load instruction should be emmiteed
- * ScalarLoad == scalar instruction + post increment
- * Load (VectorLoad) == vector instruction + post increment
- * BroadcastLoad == scalar instruction - post increment
- * BlockedLoad == vector instruction - post increment
+ * @brief Generated by Canonicalization step where explicit instructions should be emitted for data loading
+ *        where number of elements to load is determined by "count"
+ *        Default value is "1" - to load one element
  * @ingroup snippets
  */
 class Load : public ngraph::op::Op {
 public:
     OPENVINO_OP("Load", "SnippetsOpset");
 
-    Load(const Output<Node>& x);
+    Load(const Output<Node>& x, const size_t count = 1lu);
     Load() = default;
 
+    size_t get_count() const { return m_count; }
+
+    void set_count(const size_t count) { m_count = count; }
+
     bool visit_attributes(AttributeVisitor& visitor) override;
 
     std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
@@ -35,6 +37,9 @@ class Load : public ngraph::op::Op {
     OPENVINO_SUPPRESS_DEPRECATED_START
     bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override;
     OPENVINO_SUPPRESS_DEPRECATED_END
+
+protected:
+    size_t m_count = 0lu;
 };
 
 } // namespace op
diff --git a/src/common/snippets/include/snippets/op/scalarload.hpp b/src/common/snippets/include/snippets/op/scalarload.hpp
deleted file mode 100644
index 83088064e8bbb7..00000000000000
--- a/src/common/snippets/include/snippets/op/scalarload.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (C) 2018-2022 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include <ngraph/op/op.hpp>
-#include "load.hpp"
-
-namespace ngraph {
-namespace snippets {
-namespace op {
-
-/**
- * @interface ScalarLoad
- * @brief Generated by Canonicalization for a scalar value load to vector register
- * @ingroup snippets
- */
-class ScalarLoad : public Load {
-public:
-    OPENVINO_OP("ScalarLoad", "SnippetsOpset", ngraph::snippets::op::Load);
-
-    ScalarLoad(const Output<Node>& x);
-    ScalarLoad() = default;
-
-    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
-        check_new_args_count(this, new_args);
-        return std::make_shared<ScalarLoad>(new_args.at(0));
-    }
-};
-
-} // namespace op
-} // namespace snippets
-} // namespace ngraph
\ No newline at end of file
diff --git a/src/common/snippets/include/snippets/op/scalarstore.hpp b/src/common/snippets/include/snippets/op/scalarstore.hpp
deleted file mode 100644
index dc103edf72a7a8..00000000000000
--- a/src/common/snippets/include/snippets/op/scalarstore.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (C) 2018-2022 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include <ngraph/op/op.hpp>
-#include "store.hpp"
-
-namespace ngraph {
-namespace snippets {
-namespace op {
-
-/**
- * @interface ScalarStore
- * @brief Generated by Canonicalization for a scalar value store from vector register
- * @ingroup snippets
- */
-class ScalarStore : public Store {
-public:
-    OPENVINO_OP("ScalarStore", "SnippetsOpset", ngraph::snippets::op::Store);
-
-    ScalarStore(const Output<Node>& x);
-    ScalarStore() = default;
-
-    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
-        check_new_args_count(this, new_args);
-        return std::make_shared<ScalarStore>(new_args.at(0));
-    }
-};
-
-} // namespace op
-} // namespace snippets
-} // namespace ngraph
\ No newline at end of file
diff --git a/src/common/snippets/include/snippets/op/store.hpp b/src/common/snippets/include/snippets/op/store.hpp
index fdf0801e06d92e..0ff5cc3ec8e063 100644
--- a/src/common/snippets/include/snippets/op/store.hpp
+++ b/src/common/snippets/include/snippets/op/store.hpp
@@ -11,17 +11,23 @@ namespace snippets {
 namespace op {
 
 /**
- * @interface Load
- * @brief Generated by Canonicalization step where explicit store instruction should be emmiteed
+ * @interface Store
+ * @brief Generated by Canonicalization step where explicit instructions should be emitted for data storing
+ *        where number of elements to store is determined by "count"
+ *        Default value is "1" - to store one element
  * @ingroup snippets
  */
 class Store : public ngraph::op::Op {
 public:
     OPENVINO_OP("Store", "SnippetsOpset");
 
-    Store(const Output<Node>& x);
+    Store(const Output<Node>& x, const size_t count = 1lu);
     Store() = default;
 
+    size_t get_count() const { return m_count; }
+
+    void set_count(const size_t count) { m_count = count; }
+
     bool visit_attributes(AttributeVisitor& visitor) override;
 
     std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
@@ -31,6 +37,9 @@ class Store : public ngraph::op::Op {
     OPENVINO_SUPPRESS_DEPRECATED_START
     bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override;
     OPENVINO_SUPPRESS_DEPRECATED_END
+
+protected:
+    size_t m_count = 0lu;
 };
 
 } // namespace op
diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp
index 12fc506be926ae..34b8183c69f827 100644
--- a/src/common/snippets/include/snippets/op/subgraph.hpp
+++ b/src/common/snippets/include/snippets/op/subgraph.hpp
@@ -90,12 +90,12 @@ class Subgraph : public ngraph::op::Op {
 
 
     snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes,
-                                ngraph::pass::Manager& opt, const void* compile_params = nullptr);
+                                ngraph::pass::Manager& opt, const ov::element::Type exec_type = ngraph::element::f32, const void* compile_params = nullptr);
     snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes,
-                                const void* compile_params = nullptr);
+                                const ov::element::Type exec_type = ngraph::element::f32, const void* compile_params = nullptr);
     snippets::Schedule generate(ngraph::pass::Manager &opt, const void* compile_params = nullptr);
     snippets::Schedule generate(const void* compile_params = nullptr);
-    Shape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes);
+    Shape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, const ov::element::Type exec_type);
 
     // plugin sets generator for a snippet to some specific generator.
     // it's going to be replaced with Jitters table later
@@ -107,8 +107,11 @@ class Subgraph : public ngraph::op::Op {
     void serialize() const;
 
     static auto wrap_node_as_subgraph(const std::shared_ptr<ngraph::Node>& node) -> std::shared_ptr<Subgraph>;
+    static void fill_empty_output_names(const Output<Node>& target_output_node, const Output<Node>& replacement_output_node);
 
 private:
+    void align_element_types(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes,
+                             const ov::element::Type exec_type);
     void convert_to_snippet_dialect();
     Shape exec_domain;
     std::shared_ptr<ov::Model> m_body;
diff --git a/src/common/snippets/include/snippets/op/tile.hpp b/src/common/snippets/include/snippets/op/tile.hpp
index 9620b81421fdff..ac1d6ef4d1a2b9 100644
--- a/src/common/snippets/include/snippets/op/tile.hpp
+++ b/src/common/snippets/include/snippets/op/tile.hpp
@@ -20,14 +20,27 @@ class Tile : public ngraph::op::Op {
 public:
     OPENVINO_OP("Tile", "SnippetsOpset");
 
-    Tile(const std::vector<std::pair<std::shared_ptr<ngraph::snippets::Emitter>, ngraph::snippets::RegInfo>>& region);
+    /// \brief Construct a Tile
+    /// \param region The vector of pairs: emitters and the corresponding registers
+    /// \param increment Tile size - count of elements to load and store.
+    ///                  Vector Tile should have size of vector register and Scalar Tile should have 1
+    /// \param num_inputs Count of inputs
+    /// \param num_outputs Count of outputs
+    /// \param io_dims Vector of last dimensions of inputs and outputs
+    /// \param io_data_sizes Vector of data type sizes of inputs and outputs
+    Tile(const std::vector<AllocatedEmitter>& region, size_t increment, size_t num_inputs, size_t num_outputs,
+         const std::vector<size_t>& io_dims, const std::vector<size_t>& io_data_sizes);
     Tile() = default;
-    std::vector<std::pair<std::shared_ptr<ngraph::snippets::Emitter>, ngraph::snippets::RegInfo>> region;
+    std::vector<AllocatedEmitter> region;
+    size_t increment = 0;
+    size_t num_inputs = 0;
+    size_t num_outputs = 0;
+    std::vector<size_t> io_dims {};
+    std::vector<size_t> io_data_size {};
 
     std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override {
-        return std::make_shared<Tile>(region);
+        return std::make_shared<Tile>(region, increment, num_inputs, num_outputs, io_dims, io_data_size);
     }
-    const void *compile_params;
 };
 
 } // namespace op
diff --git a/src/common/snippets/include/snippets/op/tile_scheduler.hpp b/src/common/snippets/include/snippets/op/tile_scheduler.hpp
new file mode 100644
index 00000000000000..9d6010f77978b0
--- /dev/null
+++ b/src/common/snippets/include/snippets/op/tile_scheduler.hpp
@@ -0,0 +1,39 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ngraph/op/op.hpp"
+#include "snippets/emitter.hpp"
+#include "tile.hpp"
+
+namespace ngraph {
+namespace snippets {
+namespace op {
+
+/**
+ * @interface TileScheduler
+ * @brief Contains a set of Tiles (currently one vector and one scalar) and performs necessary preparations
+ * before the Tiles can be executed: calculates offsets, sets proper work amounts, decrements pointers if the same data
+ * have to be read several times (broadcasting).
+ * @ingroup snippets
+ */
+class TileScheduler : public ngraph::op::Op {
+public:
+    OPENVINO_OP("TileScheduler", "SnippetsOpset");
+
+    TileScheduler(const AllocatedEmitter& vector_region, const AllocatedEmitter& scalar_region);
+    TileScheduler() = default;
+    AllocatedEmitter vector_region;
+    AllocatedEmitter scalar_region;
+    // todo: this clone_with_new_inputs is irrelevant
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override {
+        return std::make_shared<TileScheduler>(vector_region, scalar_region);
+    }
+    const void *compile_params;
+};
+
+} // namespace op
+} // namespace snippets
+} // namespace ngraph
diff --git a/src/common/snippets/include/snippets/op/vectorload.hpp b/src/common/snippets/include/snippets/op/vectorload.hpp
deleted file mode 100644
index a4a45ae9eb9803..00000000000000
--- a/src/common/snippets/include/snippets/op/vectorload.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (C) 2018-2022 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include <ngraph/op/op.hpp>
-#include "load.hpp"
-
-namespace ngraph {
-namespace snippets {
-namespace op {
-
-/**
- * @interface VectorLoad
- * @brief Generated by Canonicalization for a vector value load to vector register
- * @ingroup snippets
- */
-class VectorLoad : public Load {
-public:
-    OPENVINO_OP("VectorLoad", "SnippetsOpset", ngraph::snippets::op::Load);
-
-    VectorLoad(const Output<Node>& x);
-    VectorLoad() = default;
-
-    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
-        check_new_args_count(this, new_args);
-        return std::make_shared<VectorLoad>(new_args.at(0));
-    }
-};
-
-} // namespace op
-} // namespace snippets
-} // namespace ngraph
\ No newline at end of file
diff --git a/src/common/snippets/include/snippets/op/vectorstore.hpp b/src/common/snippets/include/snippets/op/vectorstore.hpp
deleted file mode 100644
index 7d55d28f2b8611..00000000000000
--- a/src/common/snippets/include/snippets/op/vectorstore.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (C) 2018-2022 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include <ngraph/op/op.hpp>
-#include "store.hpp"
-
-namespace ngraph {
-namespace snippets {
-namespace op {
-
-/**
- * @interface VectorStore
- * @brief Generated by Canonicalization for a vector value store from vector register
- * @ingroup snippets
- */
-class VectorStore : public Store {
-public:
-    OPENVINO_OP("VectorStore", "SnippetsOpset", ngraph::snippets::op::Store);
-
-    VectorStore(const Output<Node>& x);
-    VectorStore() = default;
-
-    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
-        check_new_args_count(this, new_args);
-        return std::make_shared<VectorStore>(new_args.at(0));
-    }
-};
-
-} // namespace op
-} // namespace snippets
-} // namespace ngraph
\ No newline at end of file
diff --git a/src/common/snippets/include/snippets/pass/assign_registers.hpp b/src/common/snippets/include/snippets/pass/assign_registers.hpp
index fb3672fe389536..0eff4bcc7d7033 100644
--- a/src/common/snippets/include/snippets/pass/assign_registers.hpp
+++ b/src/common/snippets/include/snippets/pass/assign_registers.hpp
@@ -18,7 +18,7 @@ namespace pass {
  */
 class AssignRegisters : public ngraph::pass::FunctionPass {
 public:
-    AssignRegisters() {
+    explicit AssignRegisters() {
         set_property(ngraph::pass::PassProperty::REQUIRE_STATIC_SHAPE, true);
     }
     bool run_on_model(const std::shared_ptr<ov::Model>& m) override;
diff --git a/src/common/snippets/include/snippets/pass/insert_convert_on_inputs.hpp b/src/common/snippets/include/snippets/pass/insert_convert_on_inputs.hpp
new file mode 100644
index 00000000000000..d92ff619ede46f
--- /dev/null
+++ b/src/common/snippets/include/snippets/pass/insert_convert_on_inputs.hpp
@@ -0,0 +1,31 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ngraph/pass/graph_rewrite.hpp>
+#include <ngraph/pattern/matcher.hpp>
+
+namespace ngraph {
+namespace snippets {
+namespace pass {
+
+/**
+ * @interface InsertConvertOnInputs
+ * @brief Inserts ConvertSaturation op after Parameters and Scalars to convert data type of inputs
+ *        to supported execution data type.
+ *        Note: ConvertSaturation op isn't covered by specification of "Convert" op
+ *              This op is used for conversion into and from FP32 after the corresponding Load
+ *              and before Store to calculate in FP32 inside subgraph body in CPU Plugin
+ * @ingroup snippets
+ */
+class InsertConvertOnInputs: public ngraph::pass::MatcherPass {
+public:
+    InsertConvertOnInputs(const ov::element::Type exec_type = ov::element::f32);
+};
+
+
+}  // namespace pass
+}  // namespace snippets
+}  // namespace ngraph
diff --git a/src/common/snippets/include/snippets/pass/insert_load_store.hpp b/src/common/snippets/include/snippets/pass/insert_load_store.hpp
index f26416e6ecd27a..dc1bf6b3e68717 100644
--- a/src/common/snippets/include/snippets/pass/insert_load_store.hpp
+++ b/src/common/snippets/include/snippets/pass/insert_load_store.hpp
@@ -19,7 +19,7 @@ namespace pass {
  */
 class InsertLoad: public ngraph::pass::MatcherPass {
 public:
-    InsertLoad();
+    InsertLoad(const size_t count = 1lu);
 };
 
 /**
@@ -30,7 +30,7 @@ class InsertLoad: public ngraph::pass::MatcherPass {
  */
 class InsertStore: public ngraph::pass::MatcherPass {
 public:
-    InsertStore();
+    InsertStore(const size_t count = 1lu);
 };
 
 
diff --git a/src/common/snippets/include/snippets/pass/reset_type_relaxed_node_precision.hpp b/src/common/snippets/include/snippets/pass/reset_type_relaxed_node_precision.hpp
new file mode 100644
index 00000000000000..03e9db7aa78a87
--- /dev/null
+++ b/src/common/snippets/include/snippets/pass/reset_type_relaxed_node_precision.hpp
@@ -0,0 +1,31 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ngraph/pass/graph_rewrite.hpp>
+#include <ngraph/pattern/matcher.hpp>
+
+namespace ngraph {
+namespace snippets {
+namespace pass {
+
+/**
+ * @interface ResetTypeRelaxedNodePrecision
+ * @brief Reset precision for type relaxed nodes inside body to align precision between nodes.
+ *        Should be called after all Convert insertions
+ * @ingroup snippets
+ */
+class ResetTypeRelaxedNodePrecision: public ngraph::pass::FunctionPass {
+public:
+    OPENVINO_RTTI("ResetTypeRelaxedNodePrecision", "0");
+    ResetTypeRelaxedNodePrecision(const ov::element::Type exec_type = ov::element::f32);
+    bool run_on_model(const std::shared_ptr<ngraph::Function>& m) override;
+private:
+    ov::element::Type exec_type;
+};
+
+}  // namespace pass
+}  // namespace snippets
+}  // namespace ngraph
diff --git a/src/common/snippets/include/snippets/pass/transform_convert_to_truncation.hpp b/src/common/snippets/include/snippets/pass/transform_convert_to_truncation.hpp
new file mode 100644
index 00000000000000..219d0bf0d73244
--- /dev/null
+++ b/src/common/snippets/include/snippets/pass/transform_convert_to_truncation.hpp
@@ -0,0 +1,28 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ngraph/pass/graph_rewrite.hpp>
+#include <ngraph/pattern/matcher.hpp>
+
+namespace ngraph {
+namespace snippets {
+namespace pass {
+
+/**
+ * @interface TransformConvertToConvertTruncation
+ * @brief Transform Convert to ConvertTruncation with specification conversion rules
+ *        Note: ConvertTruncation op is covered by specification of "Convert" op
+ *              This op is used for real Convert ops inside subgraph body in CPU Plugin
+ * @ingroup snippets
+ */
+class TransformConvertToConvertTruncation: public ngraph::pass::MatcherPass {
+public:
+    TransformConvertToConvertTruncation();
+};
+
+}  // namespace pass
+}  // namespace snippets
+}  // namespace ngraph
diff --git a/src/common/snippets/include/snippets/pass/vector_to_scalar.hpp b/src/common/snippets/include/snippets/pass/vector_to_scalar.hpp
index e8534cd22a2bfd..bbf428a4393dcb 100644
--- a/src/common/snippets/include/snippets/pass/vector_to_scalar.hpp
+++ b/src/common/snippets/include/snippets/pass/vector_to_scalar.hpp
@@ -12,27 +12,27 @@ namespace snippets {
 namespace pass {
 
 /**
- * @interface ReplaceLoadsWithScalarLoads
- * @brief Replaces vector loads with scalar versions.
- * The pass is used to cange element type of function in a canonical form vector to scalar.
+ * @interface SetScalarCountForLoad
+ * @brief Set count `1` for Load to represent as ScalarLoad
+ * The pass is used to change the element count of a Load to "1" so that it loads a scalar value
  * Used for tail generation
  * @ingroup snippets
  */
-class ReplaceLoadsWithScalarLoads: public ngraph::pass::MatcherPass {
+class SetScalarCountForLoad: public ngraph::pass::MatcherPass {
 public:
-    ReplaceLoadsWithScalarLoads();
+    SetScalarCountForLoad();
 };
 
 /**
- * @interface ReplaceStoresWithScalarStores
- * @brief Replaces vector stores with scalar versions.
- * The pass is used to cange element type of model in a canonical form vector to scalar.
+ * @interface SetScalarCountForStore
+ * @brief Set count `1` for Store to represent as ScalarStore
+ * The pass is used to change the element count of a Store to "1" so that it stores a scalar value
  * Used for tail generation
  * @ingroup snippets
  */
-class ReplaceStoresWithScalarStores: public ngraph::pass::MatcherPass {
+class SetScalarCountForStore: public ngraph::pass::MatcherPass {
 public:
-    ReplaceStoresWithScalarStores();
+    SetScalarCountForStore();
 };
 
 } // namespace pass
diff --git a/src/common/snippets/include/snippets/snippets_isa.hpp b/src/common/snippets/include/snippets/snippets_isa.hpp
index da94fec0980d3a..f1c0e9056d66eb 100644
--- a/src/common/snippets/include/snippets/snippets_isa.hpp
+++ b/src/common/snippets/include/snippets/snippets_isa.hpp
@@ -7,21 +7,18 @@
 #include "ngraph/ops.hpp"
 #include <ngraph/opsets/opset1.hpp>
 
-#include "op/blockedload.hpp"
-#include "op/blockedparameter.hpp"
 #include "op/broadcastload.hpp"
 #include "op/broadcastmove.hpp"
+#include "op/convert_saturation.hpp"
+#include "op/convert_truncation.hpp"
 #include "op/kernel.hpp"
 #include "op/load.hpp"
 #include "op/nop.hpp"
 #include "op/scalar.hpp"
-#include "op/scalarload.hpp"
-#include "op/scalarstore.hpp"
 #include "op/powerstatic.hpp"
 #include "op/store.hpp"
 #include "op/tile.hpp"
-#include "op/vectorload.hpp"
-#include "op/vectorstore.hpp"
+#include "op/tile_scheduler.hpp"
 
 namespace ngraph {
 namespace snippets {
diff --git a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp
index 53504a469e9a48..255a4f3a5e23d1 100644
--- a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp
+++ b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp
@@ -11,14 +11,9 @@
 
 // SnippetS dialect
 NGRAPH_OP(Load, ngraph::snippets::op)
-NGRAPH_OP(ScalarLoad, ngraph::snippets::op)
-NGRAPH_OP(VectorLoad, ngraph::snippets::op)
-NGRAPH_OP(BlockedLoad, ngraph::snippets::op)
 NGRAPH_OP(BroadcastLoad, ngraph::snippets::op)
 
 NGRAPH_OP(Store, ngraph::snippets::op)
-NGRAPH_OP(ScalarStore, ngraph::snippets::op)
-NGRAPH_OP(VectorStore, ngraph::snippets::op)
 
 NGRAPH_OP(BroadcastMove, ngraph::snippets::op)
 NGRAPH_OP(Scalar, ngraph::snippets::op)
@@ -29,9 +24,10 @@ NGRAPH_OP(Nop, ngraph::snippets::op)
 // opset completeness
 NGRAPH_OP(Constant, ngraph::op)
 NGRAPH_OP(Parameter, ngraph::op::v0)
-NGRAPH_OP(BlockedParameter, ngraph::snippets::op)
 NGRAPH_OP(Result, ngraph::op::v0)
 NGRAPH_OP(Broadcast, ngraph::op::v1)
+NGRAPH_OP(ConvertTruncation, ngraph::snippets::op)
+NGRAPH_OP(ConvertSaturation, ngraph::snippets::op)
 
 // unary
 NGRAPH_OP(Abs, ngraph::op::v0)
diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp
index 44a69470134279..0e85fe72861a21 100644
--- a/src/common/snippets/src/generator.cpp
+++ b/src/common/snippets/src/generator.cpp
@@ -17,7 +17,8 @@ auto ngraph::snippets::getRegisters(std::shared_ptr<ngraph::Node>& n) -> ngraph:
     auto rt = n->get_rt_info();
 
     // ToDo: change to reg_t
-    std::vector<size_t> rout;
+    std::vector<size_t> rin, rout;
+
     auto it_rt = rt.find("reginfo");
     if (it_rt != rt.end()) {
         for (auto reg : it_rt->second.as<std::vector<size_t>>()) {
@@ -25,12 +26,11 @@ auto ngraph::snippets::getRegisters(std::shared_ptr<ngraph::Node>& n) -> ngraph:
         }
     }
 
-    std::vector<size_t> rin;
-    for (auto input : n->inputs()) {
+    for (const auto& input : n->inputs()) {
         auto rt = input.get_source_output().get_node_shared_ptr()->get_rt_info();
         auto it_rt = rt.find("reginfo");
         if (it_rt != rt.end()) {
-            for (auto reg : it_rt->second.as<std::vector<size_t>>()) {
+            for (auto& reg : it_rt->second.as<std::vector<size_t>>()) {
                 rin.push_back(reg);
             }
         }
@@ -48,51 +48,56 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptr<ov:
     auto results = m->get_results();
     auto in = params.size();
     auto out = results.size();
-    auto nptrs = in + out;
+    std::vector<size_t> io_last_dims(in + out);
+    std::vector<size_t> io_data_sizes(in + out);
+    std::transform(params.begin(), params.end(), io_last_dims.begin(),
+                   [](const std::shared_ptr<Node>& n){return n->get_output_shape(0).back();});
+    std::transform(results.begin(), results.end(), io_last_dims.begin() + in,
+                   [](const std::shared_ptr<Node>& n){return n->get_input_shape(0).back();});
+    std::transform(params.begin(), params.end(), io_data_sizes.begin(),
+                   [](const std::shared_ptr<Node>& n){return n->get_element_type().size();});
+    std::transform(results.begin(), results.end(), io_data_sizes.begin() + in,
+                   [](const std::shared_ptr<Node>& n){return n->get_element_type().size();});
 
     OV_ITT_TASK_CHAIN(GENERATE, ngraph::pass::itt::domains::SnippetsTransform, "Snippets::Generator", "::VectorTile")
     // vector tile
-    std::vector<std::pair<std::shared_ptr<ngraph::snippets::Emitter>, ngraph::snippets::RegInfo>> lowered;
+    std::vector<AllocatedEmitter> lowered;
     for (auto n : m->get_ordered_ops()) {
-        lowered.push_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)));
+        lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)));
     }
     OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile")
 
     // scalar tile
     auto m_scalar = ov::clone_model(*m.get());
     ngraph::pass::Manager mng;
-    mng.register_pass<ngraph::snippets::pass::ReplaceLoadsWithScalarLoads>();
-    mng.register_pass<ngraph::snippets::pass::ReplaceStoresWithScalarStores>();
+    mng.register_pass<ngraph::snippets::pass::SetScalarCountForLoad>();
+    mng.register_pass<ngraph::snippets::pass::SetScalarCountForStore>();
     mng.run_passes(m_scalar);
     OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile_get")
-    std::vector<std::pair<std::shared_ptr<Emitter>, RegInfo>> scalar_lowered;
+    std::vector<AllocatedEmitter> scalar_lowered;
     for (auto n : m_scalar->get_ordered_ops()) {
-        scalar_lowered.push_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)));
+        scalar_lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)));
     }
-    OV_ITT_TASK_NEXT(GENERATE, "::Tiles1D")
-
+    OV_ITT_TASK_NEXT(GENERATE, "::Tiles1D");
     // wrapping into tiles1D
-    std::vector<std::pair<std::shared_ptr<Emitter>, RegInfo>> tiles1D;
-    auto tile = std::make_shared<ngraph::snippets::op::Tile>(lowered);
-    tile->compile_params = compile_params;
-    tiles1D.push_back(std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(tile),
-                                   std::make_pair(std::vector<size_t>({target->get_lanes(), 0, nptrs, 1}), std::vector<size_t>{})));
-    tile = std::make_shared<ngraph::snippets::op::Tile>(scalar_lowered);
-    tile->compile_params = compile_params;
-    tiles1D.push_back(std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(tile),
-                    std::make_pair(std::vector<size_t>{{1, target->get_lanes(), nptrs, 1}}, std::vector<size_t>{})));
+    //todo: in, out, and io_last_dims should derive naturally from the graph representation
+    const auto& vector_tile = std::make_shared<ngraph::snippets::op::Tile>(lowered, target->get_lanes(), in, out, io_last_dims, io_data_sizes);
+    const auto& vector_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(vector_tile),
+                                   std::make_pair(std::vector<size_t>{}, std::vector<size_t>{}));
+    const auto& scalar_tile = std::make_shared<ngraph::snippets::op::Tile>(scalar_lowered, 1, in, out, io_last_dims, io_data_sizes);
+    const auto& scalar_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(scalar_tile),
+                    std::make_pair(std::vector<size_t>{}, std::vector<size_t>{}));
 
     OV_ITT_TASK_NEXT(GENERATE, "::Tiles2D")
     // wrapping into tiles2D
-    std::vector<std::pair<std::shared_ptr<Emitter>, RegInfo>> tiles2D;
-    tile = std::make_shared<ngraph::snippets::op::Tile>(tiles1D);
-    tile->compile_params = compile_params;
-    tiles2D.push_back(std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(tile),
-                                     std::make_pair(std::vector<size_t>({1, 0, nptrs, 0}), std::vector<size_t>{})));
+    auto tile_scheduler = std::make_shared<ngraph::snippets::op::TileScheduler>(vector_region, scalar_region);
+    tile_scheduler->compile_params = compile_params;
+    const auto& tile_scheduler_region = std::make_pair(target->get(ngraph::snippets::op::TileScheduler::get_type_info_static())(tile_scheduler),
+                                                       std::make_pair(std::vector<size_t>({in, out, target->get_lanes()}), std::vector<size_t>{}));
 
     OV_ITT_TASK_NEXT(GENERATE, "::EmitCode")
     // emission
-    auto tiles2DKernel = std::make_shared<ngraph::snippets::op::Kernel>(tiles2D);
+    auto tiles2DKernel = std::make_shared<ngraph::snippets::op::Kernel>(std::vector<AllocatedEmitter> {tile_scheduler_region});
     tiles2DKernel->compile_params = compile_params;
     std::shared_ptr<Emitter> kernel = target->get(ngraph::snippets::op::Kernel::get_type_info_static())(tiles2DKernel);
     kernel->emit_code({in, out}, {});
diff --git a/src/common/snippets/src/op/blockedload.cpp b/src/common/snippets/src/op/blockedload.cpp
deleted file mode 100644
index 013977b591a6dc..00000000000000
--- a/src/common/snippets/src/op/blockedload.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-// Copyright (C) 2018-2022 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include "snippets/op/blockedload.hpp"
-
-using namespace ngraph;
-
-snippets::op::BlockedLoad::BlockedLoad(const Output<Node>& x) : Load(x) {
-}
diff --git a/src/common/snippets/src/op/convert_saturation.cpp b/src/common/snippets/src/op/convert_saturation.cpp
new file mode 100644
index 00000000000000..115f127dae5626
--- /dev/null
+++ b/src/common/snippets/src/op/convert_saturation.cpp
@@ -0,0 +1,19 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "snippets/itt.hpp"
+
+#include "snippets/op/convert_saturation.hpp"
+
+#include "ngraph/runtime/host_tensor.hpp"
+
+ngraph::snippets::op::ConvertSaturation::ConvertSaturation(const Output<Node>& x, const ov::element::Type& destination_type)
+    : ov::op::v0::Convert({x}, destination_type) {
+}
+
+std::shared_ptr<ngraph::Node> ngraph::snippets::op::ConvertSaturation::clone_with_new_inputs(const OutputVector& new_args) const {
+    INTERNAL_OP_SCOPE(ConvertSaturation_clone_with_new_inputs);
+    check_new_args_count(this, new_args);
+    return std::make_shared<ConvertSaturation>(new_args.at(0), m_destination_type);
+}
diff --git a/src/common/snippets/src/op/convert_truncation.cpp b/src/common/snippets/src/op/convert_truncation.cpp
new file mode 100644
index 00000000000000..a009dc7d5618ad
--- /dev/null
+++ b/src/common/snippets/src/op/convert_truncation.cpp
@@ -0,0 +1,19 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "snippets/itt.hpp"
+
+#include "snippets/op/convert_truncation.hpp"
+
+#include "ngraph/runtime/host_tensor.hpp"
+
+ngraph::snippets::op::ConvertTruncation::ConvertTruncation(const Output<Node>& x, const ov::element::Type& destination_type)
+    : ov::op::v0::Convert({x}, destination_type) {
+}
+
+std::shared_ptr<ngraph::Node> ngraph::snippets::op::ConvertTruncation::clone_with_new_inputs(const OutputVector& new_args) const {
+    INTERNAL_OP_SCOPE(ConvertTruncation_clone_with_new_inputs);
+    check_new_args_count(this, new_args);
+    return std::make_shared<ConvertTruncation>(new_args.at(0), m_destination_type);
+}
diff --git a/src/common/snippets/src/op/load.cpp b/src/common/snippets/src/op/load.cpp
index b1eb539340f5e4..1ac4df725fe75d 100644
--- a/src/common/snippets/src/op/load.cpp
+++ b/src/common/snippets/src/op/load.cpp
@@ -11,7 +11,7 @@
 using namespace std;
 using namespace ngraph;
 
-snippets::op::Load::Load(const Output<Node>& x) : Op({x}) {
+snippets::op::Load::Load(const Output<Node>& x, const size_t count) : Op({x}), m_count(count) {
     constructor_validate_and_infer_types();
 }
 
@@ -22,7 +22,7 @@ bool snippets::op::Load::visit_attributes(AttributeVisitor& visitor) {
 std::shared_ptr<Node> snippets::op::Load::clone_with_new_inputs(const OutputVector& new_args) const {
     INTERNAL_OP_SCOPE(Load);
     check_new_args_count(this, new_args);
-    return std::make_shared<Load>(new_args.at(0));
+    return std::make_shared<Load>(new_args.at(0), m_count);
 }
 
 void snippets::op::Load::validate_and_infer_types() {
diff --git a/src/common/snippets/src/op/scalarload.cpp b/src/common/snippets/src/op/scalarload.cpp
deleted file mode 100644
index 83277647223616..00000000000000
--- a/src/common/snippets/src/op/scalarload.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-// Copyright (C) 2018-2022 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include "snippets/op/scalarload.hpp"
-
-using namespace ngraph;
-
-snippets::op::ScalarLoad::ScalarLoad(const Output<Node>& x) : Load(x) {
-}
diff --git a/src/common/snippets/src/op/scalarstore.cpp b/src/common/snippets/src/op/scalarstore.cpp
deleted file mode 100644
index ee333bfffa2e92..00000000000000
--- a/src/common/snippets/src/op/scalarstore.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-// Copyright (C) 2018-2022 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include "snippets/op/scalarstore.hpp"
-
-using namespace ngraph;
-
-snippets::op::ScalarStore::ScalarStore(const Output<Node>& x) : Store(x) {
-}
diff --git a/src/common/snippets/src/op/store.cpp b/src/common/snippets/src/op/store.cpp
index 1c2d62948bc5b6..db3204df69ab0b 100644
--- a/src/common/snippets/src/op/store.cpp
+++ b/src/common/snippets/src/op/store.cpp
@@ -4,14 +4,14 @@
 
 #include <snippets/itt.hpp>
 
-#include "snippets/op/scalarstore.hpp"
+#include "snippets/op/store.hpp"
 
 #include <ngraph/runtime/host_tensor.hpp>
 
 using namespace std;
 using namespace ngraph;
 
-snippets::op::Store::Store(const Output<Node>& x) : Op({x}) {
+snippets::op::Store::Store(const Output<Node>& x, const size_t count) : Op({x}), m_count(count) {
     constructor_validate_and_infer_types();
 }
 
@@ -22,7 +22,7 @@ bool snippets::op::Store::visit_attributes(AttributeVisitor& visitor) {
 std::shared_ptr<Node> snippets::op::Store::clone_with_new_inputs(const OutputVector& new_args) const {
     INTERNAL_OP_SCOPE(Store);
     check_new_args_count(this, new_args);
-    return std::make_shared<Store>(new_args.at(0));
+    return std::make_shared<Store>(new_args.at(0), m_count);
 }
 
 void snippets::op::Store::validate_and_infer_types() {
diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp
index c2359af2b958d5..980e3dcdba1705 100644
--- a/src/common/snippets/src/op/subgraph.cpp
+++ b/src/common/snippets/src/op/subgraph.cpp
@@ -6,6 +6,7 @@
 #include "snippets/remarks.hpp"
 
 #include "snippets/op/subgraph.hpp"
+#include "snippets/op/convert_saturation.hpp"
 #include "snippets/pass/insert_load_store.hpp"
 #include "snippets/pass/insert_movebroadcast.hpp"
 #include "snippets/pass/load_movebroadcast_to_broadcastload.hpp"
@@ -13,8 +14,15 @@
 #include "snippets/pass/convert_constants_to_scalars.hpp"
 #include "snippets/pass/convert_power_to_powerstatic.hpp"
 #include "snippets/pass/vector_to_scalar.hpp"
+#include "snippets/pass/transform_convert_to_truncation.hpp"
+#include "snippets/pass/insert_convert_on_inputs.hpp"
+#include "snippets/pass/reset_type_relaxed_node_precision.hpp"
+
+#include "transformations/common_optimizations/nop_elimination.hpp"
+#include "transformations/utils/utils.hpp"
 
 #include <ngraph/pass/manager.hpp>
+#include "ngraph/pass/constant_folding.hpp"
 #include <openvino/pass/serialize.hpp>
 
 #include <algorithm>
@@ -92,6 +100,9 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptr<ov::Nod
 
     auto body_node = node->clone_with_new_inputs(body_inputs);
     body_node->set_friendly_name(node->get_friendly_name());
+    for (size_t i = 0; i < node->get_output_size(); i++) {
+        fill_empty_output_names(body_node->output(i), node->output(i));
+    }
 
     if (node->get_output_size() != body_node->get_output_size()) {
         throw ngraph::ngraph_error("original node outputs size and extracted subgraph node outputs size doesn't much");
@@ -118,6 +129,20 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptr<ov::Nod
 
     return subgraph;
 }
+
+void snippets::op::Subgraph::fill_empty_output_names(const Output<Node>& target_output_node, const Output<Node>& replacement_output_node) {
+    NGRAPH_SUPPRESS_DEPRECATED_START
+    auto out_tensor = target_output_node.get_tensor_ptr();
+    const std::string new_name = ngraph::op::util::get_ie_output_name(replacement_output_node);
+    if (out_tensor->get_name().empty()) {
+        out_tensor->set_name(new_name);
+    }
+    if (!replacement_output_node.get_names().empty()) {
+        out_tensor->set_names(replacement_output_node.get_names());
+    }
+    NGRAPH_SUPPRESS_DEPRECATED_END
+}
+
 ///
 /// \brief  Canonization transforms original subgraph and to canonical form suitable for code generation. In particular,
 ///         it handles supported layout conversions, broadcasts inputs and outputs to a single rank and layout. Canonicalization
@@ -125,7 +150,8 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptr<ov::Nod
 ///         Canonicalization currently supports only the following layout conversions:
 ///             * None: all inputs have the same layout
 ///             * Planar + blocked: some inputs have blocked, and some have planar layouts, e.g. <N, C, H, W, c> + <N, C, H, W>
-Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes) {
+///         Canonicalization also aligns element types (precisions) inside the subgraph body
+Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes, const ov::element::Type exec_type) {
     INTERNAL_OP_SCOPE(Subgraph);
     OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::canonicalize")
     NODE_VALIDATION_CHECK(this, inputShapes.size() == m_body->get_parameters().size(),
@@ -176,7 +202,8 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape
                               PartialShape::broadcast_merge_into(tmpPShape, inShape, ::ngraph::op::AutoBroadcastType::NUMPY),
                               "Failed to create broadcastable shapes in snippets canonicalization");
         const auto paramShape = m_body->get_parameters()[i]->get_shape();
-        if (paramShape.size() != inShape.size() || !equal(paramShape.begin(), paramShape.end(), inShape.begin()))
+        const auto paramType =  m_body->get_parameters()[i]->get_element_type();
+        if (paramShape.size() != inShape.size() || !equal(paramShape.begin(), paramShape.end(), inShape.begin()) || paramType != inType)
                 m_body->replace_parameter(i, std::make_shared<opset1::Parameter>(inType, inShape));
     }
 
@@ -213,21 +240,78 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape
                                                                ::ngraph::op::AutoBroadcastType::NUMPY);
         NODE_VALIDATION_CHECK(this, compatibleWithOtherOutputs, "Snippets output shapes must be numpy broadcastable");
     }
+
+    // Insert Converts after Parameters and Constants and before Results
+    // to align the precision inside the Subgraph body with the execution type supported by the plugin
+    align_element_types(outputShapes, inputShapes, exec_type);
+
     exec_domain = outPShape.get_shape();
     return exec_domain;
 }
 
+void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outputShapes,
+                                                 const BlockedShapeVector& inputShapes,
+                                                 const ov::element::Type exec_type) {
+    ngraph::pass::Manager p_manager;
+    p_manager.register_pass<snippets::pass::TransformConvertToConvertTruncation>();
+    p_manager.run_passes(m_body);
+
+    const auto& body_results = m_body->get_results();
+    for (size_t i = 0; i < outputShapes.size(); i++) {
+        const auto needed_out_type = std::get<2>(outputShapes[i]);
+
+        // If there is real Convert from graph (ConvertTruncation) before Result
+        // we should check destination type and insert ConvertSaturation before that if needed.
+        // For example, to return original element type after Convert insertion on inputs
+        std::shared_ptr<ov::Node> first_convert = body_results[i];
+        while (ov::is_type<ngraph::snippets::op::ConvertTruncation>(first_convert->get_input_node_ptr(0))) {
+            first_convert = first_convert->get_input_node_shared_ptr(0);
+        }
+        if (auto existing_convert_t = ngraph::as_type_ptr<ngraph::snippets::op::ConvertTruncation>(first_convert)) {
+            const auto original_input_element_type = existing_convert_t->get_input_element_type(0);
+            if (original_input_element_type != exec_type) {
+                const auto convert = std::make_shared<ngraph::snippets::op::ConvertSaturation>(
+                        existing_convert_t->get_input_node_shared_ptr(0), original_input_element_type);
+                existing_convert_t->set_argument(0, convert);
+            }
+        }
+
+        // We should insert Convert before Results to return original output element type
+        const auto convert = std::make_shared<ngraph::snippets::op::ConvertSaturation>(
+                body_results[i]->get_input_node_shared_ptr(0), needed_out_type);
+        body_results[i]->set_argument(0, convert);
+    }
+
+    // After Convert insertion we should make the following steps:
+    //      - insert ConvertSaturation after inputs and scalar to start aligning of exec data type inside body
+    //      - manually set output element types of type relaxed nodes to align element type inside subgraph body
+    //      - after Convert insertion on inputs and after scalars we should use ConstantFolding pass to convert
+    //        element type of Scalars before inference
+    //      - eliminate redundant Convert that could have been inserted
+    ngraph::pass::Manager manager;
+    manager.register_pass<snippets::pass::InsertConvertOnInputs>(exec_type);
+    manager.register_pass<snippets::pass::ResetTypeRelaxedNodePrecision>(exec_type);
+    manager.register_pass<ngraph::pass::ConstantFolding>();
+    manager.register_pass<ngraph::pass::EliminateConvert>();
+    manager.run_passes(m_body);
+}
+
 void snippets::op::Subgraph::convert_to_snippet_dialect() {
     INTERNAL_OP_SCOPE(Subgraph);
     OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::convert_to_snippet_dialect")
     auto skip_matching_domain = [](const std::shared_ptr<const ov::Node>& n) -> bool {
         return n->get_input_shape(0).back() != 1;
     };
+
+    // At the moment only full-vector and scalar Load/Store are supported, so count is equal to the number of lanes.
+    // Later we are going to support variadic Load/Store with different element counts
+    const size_t count = m_generator->get_target_machine()->get_lanes();
+
     ngraph::pass::Manager manager;
     manager.register_pass<snippets::pass::ConvertConstantsToScalars>();
     manager.register_pass<snippets::pass::ConvertPowerToPowerStatic>();
-    manager.register_pass<snippets::pass::InsertLoad>();
-    manager.register_pass<snippets::pass::InsertStore>();
+    manager.register_pass<snippets::pass::InsertLoad>(count);
+    manager.register_pass<snippets::pass::InsertStore>(count);
     manager.register_pass<snippets::pass::InsertMoveBroadcast>();
     manager.register_pass<snippets::pass::LoadMoveBroadcastToBroadcastLoad>();
     // Note that, BrodacastMove is typically inserted right after the Load. Such cases are typical for
@@ -246,28 +330,30 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() {
     //                        Result
     // Note: Load* should be replaced with ScalarLoad in this example to avoid invalid read in vector Tile.
     if (!exec_domain.empty() && exec_domain.back() != 1) {
-        manager.register_pass<snippets::pass::ReplaceLoadsWithScalarLoads>();
-        manager.register_pass<snippets::pass::ReplaceStoresWithScalarStores>();
+        manager.register_pass<snippets::pass::SetScalarCountForLoad>();
+        manager.register_pass<snippets::pass::SetScalarCountForStore>();
         manager.get_pass_config()->
-        set_callback<ngraph::snippets::pass::ReplaceLoadsWithScalarLoads>(skip_matching_domain);
+        set_callback<ngraph::snippets::pass::SetScalarCountForLoad>(skip_matching_domain);
         manager.get_pass_config()->
-        set_callback<ngraph::snippets::pass::ReplaceStoresWithScalarStores>(skip_matching_domain);
+        set_callback<ngraph::snippets::pass::SetScalarCountForStore>(skip_matching_domain);
     }
     manager.run_passes(m_body);
 }
 
 snippets::Schedule snippets::op::Subgraph::generate(const BlockedShapeVector& output_shapes,
                                                     const BlockedShapeVector& input_shapes,
+                                                    const ov::element::Type exec_type,
                                                     const void* compile_params) {
-    canonicalize(output_shapes, input_shapes);
+    canonicalize(output_shapes, input_shapes, exec_type);
     return generate(compile_params);
 }
 
 snippets::Schedule snippets::op::Subgraph::generate(const BlockedShapeVector& output_shapes,
                                                     const BlockedShapeVector& input_shapes,
                                                     ngraph::pass::Manager& opt,
+                                                    const ov::element::Type exec_type,
                                                     const void* compile_params) {
-    canonicalize(output_shapes, input_shapes);
+    canonicalize(output_shapes, input_shapes, exec_type);
     return generate(opt, compile_params);
 }
 
diff --git a/src/common/snippets/src/op/tile.cpp b/src/common/snippets/src/op/tile.cpp
index c17b0b0c8163c5..b37e212fdcf88d 100644
--- a/src/common/snippets/src/op/tile.cpp
+++ b/src/common/snippets/src/op/tile.cpp
@@ -8,5 +8,8 @@
 using namespace std;
 using namespace ngraph;
 
-snippets::op::Tile::Tile(const std::vector<std::pair<std::shared_ptr<snippets::Emitter>, snippets::RegInfo>>& nested) : Op(), region(nested) {
+snippets::op::Tile::Tile(const std::vector<AllocatedEmitter>& region, size_t increment,
+                         size_t num_inputs, size_t num_outputs,
+                         const std::vector<size_t>& io_dims, const std::vector<size_t>& io_data_sizes) :
+      Op(), region(region), increment(increment), num_inputs(num_inputs), num_outputs(num_outputs), io_dims(io_dims), io_data_size(io_data_sizes) {
 }
diff --git a/src/common/snippets/src/op/tile_scheduler.cpp b/src/common/snippets/src/op/tile_scheduler.cpp
new file mode 100644
index 00000000000000..fd0ba9e6a23223
--- /dev/null
+++ b/src/common/snippets/src/op/tile_scheduler.cpp
@@ -0,0 +1,10 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "snippets/op/tile_scheduler.hpp"
+#include "snippets/generator.hpp"
+
+ngraph::snippets::op::TileScheduler::TileScheduler(const AllocatedEmitter& vector_region, const AllocatedEmitter& scalar_region)
+    : Op(), vector_region{vector_region}, scalar_region{scalar_region} {
+}
diff --git a/src/common/snippets/src/op/vectorload.cpp b/src/common/snippets/src/op/vectorload.cpp
deleted file mode 100644
index 333b310d6cb88e..00000000000000
--- a/src/common/snippets/src/op/vectorload.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-// Copyright (C) 2018-2022 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include "snippets/op/vectorload.hpp"
-
-using namespace ngraph;
-
-snippets::op::VectorLoad::VectorLoad(const Output<Node>& x) : Load(x) {
-}
diff --git a/src/common/snippets/src/op/vectorstore.cpp b/src/common/snippets/src/op/vectorstore.cpp
deleted file mode 100644
index fb4b4e76ef2311..00000000000000
--- a/src/common/snippets/src/op/vectorstore.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-// Copyright (C) 2018-2022 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include "snippets/op/vectorstore.hpp"
-
-using namespace ngraph;
-
-snippets::op::VectorStore::VectorStore(const Output<Node>& x) : Store(x) {
-}
diff --git a/src/common/snippets/src/pass/assign_registers.cpp b/src/common/snippets/src/pass/assign_registers.cpp
index d5703cc2905739..291b60e7cd809b 100644
--- a/src/common/snippets/src/pass/assign_registers.cpp
+++ b/src/common/snippets/src/pass/assign_registers.cpp
@@ -16,7 +16,6 @@
 bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr<ov::Model>& f) {
     RUN_ON_MODEL_SCOPE(AssignRegisters);
     OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::AssignRegisters")
-    int reg64_tmp_start { 8 }; // R8, R9, R10, R11, R12, R13, R14, R15 inputs+outputs+1
     using Reg = size_t;
     auto ops = f->get_ordered_ops();
     decltype(ops) stmts;
@@ -26,8 +25,8 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
 
     size_t rdx = 0;
     std::map<std::shared_ptr<descriptor::Tensor>, Reg> regs;
-    for (auto op : stmts) {
-        for (auto output : op->outputs()) {
+    for (const auto& op : stmts) {
+        for (const auto& output : op->outputs()) {
             regs[output.get_tensor_ptr()] = rdx++;
         }
     }
@@ -35,9 +34,9 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
     std::vector<std::set<Reg>> used;
     std::vector<std::set<Reg>> def;
 
-    for (auto op : stmts) {
+    for (const auto& op : stmts) {
         std::set<Reg> u;
-        for (auto input : op->inputs()) {
+        for (const auto& input : op->inputs()) {
             if (regs.count(input.get_tensor_ptr())) {
                 u.insert(regs[input.get_tensor_ptr()]);
             }
@@ -46,7 +45,7 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
 
         std::set<Reg> d;
         if (!std::dynamic_pointer_cast<snippets::op::Store>(op)) {
-            for (auto output : op->outputs()) {
+            for (const auto& output : op->outputs()) {
                 d.insert(regs[output.get_tensor_ptr()]);
             }
         }
@@ -65,8 +64,8 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
         for (size_t n = 0; n < stmts.size(); n++) {
             auto node = stmts[n];
             if (!std::dynamic_pointer_cast<snippets::op::Store>(node)) {
-                for (auto out : node->outputs()) {
-                    for (auto port : out.get_target_inputs()) {
+                for (const auto& out : node->outputs()) {
+                    for (const auto& port : out.get_target_inputs()) {
                         auto pos = std::find(stmts.begin(), stmts.end(), port.get_node()->shared_from_this());
                         if (pos != stmts.end()) {
                             auto k = pos-stmts.begin();
@@ -136,46 +135,32 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
 
     std::map<std::shared_ptr<descriptor::Tensor>, Reg> physical_regs;
 
-    for (auto reg : regs) {
+    for (const auto& reg : regs) {
         physical_regs[reg.first] = register_map[reg.second];
     }
-
-    size_t constantID = 0;
-
-    for (auto n : f->get_ordered_ops()) {
+    const auto num_parameters = f->get_parameters().size();
+    for (const auto& n : f->get_ordered_ops()) {
         auto& rt = n->get_rt_info();
-        // nothing to do for model signature
-        if (std::dynamic_pointer_cast<opset1::Parameter>(n) || std::dynamic_pointer_cast<opset1::Result>(n)) {
-            continue;
-        }
-
-        // store only effective address
-        if (auto result = std::dynamic_pointer_cast<snippets::op::Store>(n)) {
-            auto ea = reg64_tmp_start+static_cast<int64_t>(f->get_result_index(result) + f->get_parameters().size());
-            rt["effectiveAddress"] = ea;
+        std::vector<size_t> regs;
+        regs.reserve(n->outputs().size());
+        /* The main idea here is that each operation stores its output regs in rt["reginfo"]. Input and output regs are
+         * then derived by parsing node's and parent's rt["reginfo"], look into ngraph::snippets::getRegisters for details.
+         * Note also that Parameter and Result store general-purpose register index, because they work with memory
+         * (memory pointer is stored in gpr). All other "regular" ops store vector regs indexes, since calculations are
+         * performed on registers.
+         */
+        if (is_type<ov::op::v0::Result>(n)) {
             continue;
-        }
-        // store effective address and procced with vector registers
-        if (ov::as_type_ptr<ngraph::snippets::op::Load>(n) || ov::as_type_ptr<ngraph::snippets::op::BroadcastLoad>(n)) {
-            auto source = n->get_input_source_output(0).get_node_shared_ptr();
-
-            if (auto param = ov::as_type_ptr<opset1::Parameter>(source)) {
-                auto ea = reg64_tmp_start+static_cast<int64_t>(f->get_parameter_index(param));
-                rt["effectiveAddress"] = ea;
-            } else if (auto constant = ov::as_type_ptr<opset1::Constant>(source)) {
-                auto ea = reg64_tmp_start+static_cast<int64_t>(f->get_parameters().size() + f->get_results().size() + 1 + constantID);
-                rt["effectiveAddress"] = ea;
-                constantID++;
-            } else {
-                throw ngraph_error("load/broadcast should follow only Parameter or non-Scalar constant");
+        } else if (const auto& param = ov::as_type_ptr<ov::op::v0::Parameter>(n)) {
+            regs.push_back(f->get_parameter_index(param));
+        } else if (const auto& store = ov::as_type_ptr<ngraph::snippets::op::Store>(n)) {
+            regs.push_back(f->get_result_index(store) + num_parameters);
+        } else {
+            for (const auto& output : n->outputs()) {
+                auto allocated = physical_regs[output.get_tensor_ptr()];
+                regs.push_back(allocated);
             }
         }
-
-        std::vector<size_t> regs; regs.reserve(n->outputs().size());
-        for (auto output : n->outputs()) {
-            auto allocated = physical_regs[output.get_tensor_ptr()];
-            regs.push_back(allocated);
-        }
         rt["reginfo"] = regs;
     }
 
diff --git a/src/common/snippets/src/pass/collapse_subgraph.cpp b/src/common/snippets/src/pass/collapse_subgraph.cpp
index 49cb66b610ee8f..20acb0b35237b0 100644
--- a/src/common/snippets/src/pass/collapse_subgraph.cpp
+++ b/src/common/snippets/src/pass/collapse_subgraph.cpp
@@ -99,15 +99,17 @@ auto is_layout_oblivious(const std::shared_ptr<const Node> &n) -> bool {
             || ov::is_type<opset1::Tanh>(n)
             || ov::is_type<ngraph::op::v0::Gelu>(n)
             || ov::is_type<ngraph::op::v7::Gelu>(n)
-            || ov::is_type<ngraph::op::v4::HSwish>(n);
+            || ov::is_type<ngraph::op::v4::HSwish>(n)
+            || ov::is_type<ngraph::op::v0::Convert>(n);
     };
     return is_layout_oblivious_unary(n) || is_layout_oblivious_binary(n);
 }
 
 auto has_supported_in_out(const std::shared_ptr<const Node> &n) -> bool {
     auto supported = [](descriptor::Tensor& t) -> bool {
-        return t.get_element_type() == ngraph::element::f32 &&
-               t.get_partial_shape().is_static();
+        static const std::set<ngraph::element::Type> supported_data_types =
+                { ngraph::element::f32, ngraph::element::i32, ngraph::element::bf16, ngraph::element::i8, ngraph::element::u8 };
+        return t.get_partial_shape().is_static() && supported_data_types.count(t.get_element_type()) != 0;
     };
     const auto & inputs = n->inputs();
     const auto & outputs = n->outputs();
@@ -148,19 +150,9 @@ auto update_out_tensor_name(std::shared_ptr<ngraph::snippets::op::Subgraph> &sub
     for (unsigned int i = 0; i < subgraph->get_output_size() && not_set; i++) {
         for (const auto &in : subgraph->get_output_target_inputs(i)) {
             if (ov::is_type<opset1::Result>(in.get_node())) {
-                auto out_tensor = subgraph->output(i).get_tensor_ptr();
-                NGRAPH_SUPPRESS_DEPRECATED_START
-                if (out_tensor->get_name().empty()) {
-                    const auto& body_result = subgraph->get_body()->get_output_op(i);
-                    const auto& body_result_input = body_result->get_input_source_output(0);
-                    // Note that create_ie_output_name() checks only deprecated output.get_tensor().get_name()
-                    // However output.get_tensor().get_names() should also be updated
-                    if (!body_result_input.get_names().empty())
-                        out_tensor->add_names(body_result_input.get_names());
-                    std::string newTensorName = ngraph::op::util::get_ie_output_name(body_result_input);
-                    out_tensor->set_name(newTensorName);
-                }
-                NGRAPH_SUPPRESS_DEPRECATED_END
+                const auto& body_result = subgraph->get_body()->get_output_op(i);
+                const auto& body_result_input = body_result->get_input_source_output(0);
+                op::Subgraph::fill_empty_output_names(subgraph->output(i), body_result_input);
                 not_set = false;
                 break;
             }
@@ -406,6 +398,40 @@ TokenizeSnippets::TokenizeSnippets() {
                 auto& input_body = clones[input_node];
                 size_t source_output_index = input_value.get_index();
                 auto source_result = input_body->get_results()[source_output_index];
+
+                // We cannot add new node, that is not Convert, after Convert (that is start node) to avoid arithmetic problems with conversion
+                // We can add any new node in Subgraph after Convert (because it comes right after Input)
+                //              Parameter
+                //                  |
+                //               Convert
+                //
+                // We cannot add new node, that isn't Convert, in Subgraph after existing Convert
+                //              Parameter
+                //                Relu
+                //               Convert
+                //
+                // But we can add new Convert in Subgraph after existing Convert
+                //              Parameter
+                //                Relu
+                //               Convert
+                //               Convert
+                //
+                // Thus, the subgraph can grow if Convert is its first node; we must abort if Convert is its last node and the new node isn't a Convert
+                // We have this limitation because at the moment we support only one execution precision inside body, so
+                // if there is Convert with input and output data types that aren't equal to supported exec type,
+                // we can get conversion math errors
+                const auto output_of_subgraph = source_result->get_input_node_shared_ptr(0);
+                if (!ov::is_type<ngraph::op::v0::Convert>(node) && ov::is_type<ngraph::op::v0::Convert>(output_of_subgraph)) {
+                    // Also we can add new node after < Parameter -> Convert -> Convert -> Convert >
+                    auto grandparent = output_of_subgraph->get_input_node_ptr(0);
+                    while (ov::is_type<ngraph::op::v0::Convert>(grandparent)) {
+                        grandparent = grandparent->get_input_node_ptr(0);
+                    }
+
+                    if (!ov::is_type<ngraph::op::v0::Parameter>(grandparent)) {
+                        return abort_with_strategy("Convert supports only as Input and as Result of subgraph. Aborting");
+                    }
+                }
                 // Result op has a single input
                 internal_inputs.push_back(source_result->input_value(0));
             } else {
@@ -477,7 +503,7 @@ TokenizeSnippets::TokenizeSnippets() {
             throw ngraph_error("body results and node results size mismatch during subgraph collaps");
         }
         // todo: move this plugin-specific constraint to the plugin callback
-        if (body_parameters.size() + body_results.size() > 7) {
+        if (body_parameters.size() + body_results.size() > 12) {
             const std::string message_reset = "new subgraph is created. Impossible to schedule subgraph with " +
             std::to_string(body_parameters.size()) + " inputs and " + std::to_string(body_results.size()) + " outputs.";
             const std::string message_abort = "failed to continue subgraph. Impossible to schedule subgraph with " +
diff --git a/src/common/snippets/src/pass/insert_convert_on_inputs.cpp b/src/common/snippets/src/pass/insert_convert_on_inputs.cpp
new file mode 100644
index 00000000000000..4647949145e44a
--- /dev/null
+++ b/src/common/snippets/src/pass/insert_convert_on_inputs.cpp
@@ -0,0 +1,72 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <snippets/itt.hpp>
+#include "snippets/remarks.hpp"
+
+#include "snippets/pass/insert_convert_on_inputs.hpp"
+#include "snippets/snippets_isa.hpp"
+
+#include "ngraph/type.hpp"
+#include "ngraph/node.hpp"
+
+#include <ngraph/opsets/opset1.hpp>
+#include <ngraph/rt_info.hpp>
+#include <ngraph/pattern/op/wrap_type.hpp>
+#include <ngraph/pattern/op/or.hpp>
+
+// We should recursively (passing through whole sequences of ConvertTruncation) go through inputs and
+// insert ConvertSaturation with the supported element type before eltwises
+// NOTE: JUST EXAMPLE:
+//                             Parameter I8
+//                        ConvertTruncation U8
+//                  /              |               \
+// ConvertTruncation F32  ConvertTruncation I32  ConvertTruncation BF16
+//      Eltwise           ConvertSaturation FP32 ConvertTruncation I32
+//        <>                    Eltwise          ConvertSaturation FP32
+//                                 <>                    Eltwise
+bool insertConvertSaturationAfterNode(const std::shared_ptr<ov::Node>& node, const ov::element::Type element_type) {
+    bool rewritten = false;
+    for (const auto& output : node->outputs()) {
+        for (auto consumer : output.get_target_inputs()) {
+            const auto output_shared_node = consumer.get_node()->shared_from_this();
+            // Go down through ConvertTruncation sequence; accumulate the result so an earlier rewrite is not lost
+            if (auto existing_convert_t = ov::as_type_ptr<ngraph::snippets::op::ConvertTruncation>(output_shared_node)) {
+                rewritten |= insertConvertSaturationAfterNode(existing_convert_t, element_type);
+                continue;
+            }
+
+            // Insert ConvertSaturation(element_type) before a non-Result consumer of another type or a ConvertSaturation with another destination type
+            auto existing_convert_s = ov::as_type_ptr<ngraph::snippets::op::ConvertSaturation>(output_shared_node);
+            if ((!existing_convert_s && !ov::is_type<ov::op::v0::Result>(output_shared_node) && consumer.get_element_type() != element_type) ||
+                (existing_convert_s && existing_convert_s->get_destination_type() != element_type)) {
+                const auto convert = std::make_shared<ngraph::snippets::op::ConvertSaturation>(output, element_type);
+                consumer.replace_source_output(convert);
+                rewritten |= true;
+            }
+        }
+    }
+    return rewritten;
+}
+
+ngraph::snippets::pass::InsertConvertOnInputs::InsertConvertOnInputs(const ov::element::Type exec_type) {
+    MATCHER_SCOPE(InsertConvertOnInputs);
+    // The pattern matches either a Parameter or a Constant whose shape holds exactly one element
+    auto param_pattern = ngraph::pattern::wrap_type<ngraph::opset1::Parameter>();
+    auto scalar_pattern = pattern::wrap_type<opset1::Constant>(
+        [=](Output<Node> output) -> bool { return ngraph::shape_size(output.get_shape()) == 1; });
+    auto input = std::make_shared<pattern::op::Or>(OutputVector{ param_pattern, scalar_pattern });
+    // The callback delegates to insertConvertSaturationAfterNode() and reports whether it changed the graph
+    ngraph::matcher_pass_callback callback = [this, exec_type](ngraph::pattern::Matcher& m) {
+        OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertConvertOnInputs")
+        auto root = m.get_match_root();
+
+        auto rewritten = insertConvertSaturationAfterNode(root, exec_type);
+
+        return rewritten;
+    };
+
+    auto m = std::make_shared<ngraph::pattern::Matcher>(input, matcher_name);
+    register_matcher(m, callback);
+}
diff --git a/src/common/snippets/src/pass/insert_load_store.cpp b/src/common/snippets/src/pass/insert_load_store.cpp
index 417458571d6168..827b1f914a793d 100644
--- a/src/common/snippets/src/pass/insert_load_store.cpp
+++ b/src/common/snippets/src/pass/insert_load_store.cpp
@@ -12,11 +12,11 @@
 #include <ngraph/rt_info.hpp>
 #include <ngraph/pattern/op/wrap_type.hpp>
 
-ngraph::snippets::pass::InsertLoad::InsertLoad() {
+ngraph::snippets::pass::InsertLoad::InsertLoad(const size_t count) {
     MATCHER_SCOPE(InsertLoad);
     register_matcher(std::make_shared<ngraph::pattern::Matcher>(
         ngraph::pattern::wrap_type<ngraph::opset1::Parameter>(), matcher_name),
-            [this](ngraph::pattern::Matcher &m) {
+            [this, count](ngraph::pattern::Matcher &m) {
             OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertLoad")
             auto root = m.get_match_root();
 
@@ -29,7 +29,7 @@ ngraph::snippets::pass::InsertLoad::InsertLoad() {
                 }
             }
 
-            auto load = std::make_shared<ngraph::snippets::op::Load> (root);
+            auto load = std::make_shared<ngraph::snippets::op::Load>(root, count);
             ngraph::copy_runtime_info(root, load);
 
             bool rewritten = false;
@@ -46,11 +46,11 @@ ngraph::snippets::pass::InsertLoad::InsertLoad() {
         });
 }
 
-ngraph::snippets::pass::InsertStore::InsertStore() {
+ngraph::snippets::pass::InsertStore::InsertStore(const size_t count) {
     MATCHER_SCOPE(InsertStore);
     register_matcher(std::make_shared<ngraph::pattern::Matcher>(
         ngraph::pattern::wrap_type<ngraph::opset1::Result>(), matcher_name),
-            [this](ngraph::pattern::Matcher &m) {
+            [this, count](ngraph::pattern::Matcher &m) {
             OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertStore")
             auto root = m.get_match_root();
 
@@ -61,7 +61,7 @@ ngraph::snippets::pass::InsertStore::InsertStore() {
                 }
             }
 
-            auto store = std::make_shared<ngraph::snippets::op::Store> (root->input_value(0));
+            auto store = std::make_shared<ngraph::snippets::op::Store> (root->input_value(0), count);
             ngraph::copy_runtime_info(root, store);
             root->set_argument(0, store);
             return true;
diff --git a/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp b/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp
index cf6eb80e484c33..ce632f33608514 100644
--- a/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp
+++ b/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp
@@ -15,7 +15,7 @@
 ngraph::snippets::pass::LoadMoveBroadcastToBroadcastLoad::LoadMoveBroadcastToBroadcastLoad() {
     MATCHER_SCOPE(LoadMoveBroadcastToBroadcastLoad);
     auto param_pattern = ngraph::pattern::wrap_type<ngraph::opset1::Parameter>();
-    auto load_pattern = std::make_shared<ngraph::snippets::op::Load>(param_pattern);
+    auto load_pattern = ngraph::pattern::wrap_type<ngraph::snippets::op::Load>({param_pattern});
     auto fbn = std::make_shared<ngraph::snippets::op::BroadcastMove>(load_pattern, Shape{1});
 
     register_matcher(std::make_shared<ngraph::pattern::Matcher>(fbn, matcher_name),
diff --git a/src/common/snippets/src/pass/reset_type_relaxed_node_precision.cpp b/src/common/snippets/src/pass/reset_type_relaxed_node_precision.cpp
new file mode 100644
index 00000000000000..9cb89933ab0f0e
--- /dev/null
+++ b/src/common/snippets/src/pass/reset_type_relaxed_node_precision.cpp
@@ -0,0 +1,31 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <snippets/itt.hpp>
+
+#include "snippets/op/convert_saturation.hpp"
+#include "snippets/pass/reset_type_relaxed_node_precision.hpp"
+#include "ngraph_ops/type_relaxed.hpp"
+
+#include <ngraph/rt_info.hpp>
+
+
+ngraph::snippets::pass::ResetTypeRelaxedNodePrecision::ResetTypeRelaxedNodePrecision(const ov::element::Type exec_type) : exec_type(exec_type) { }
+
+bool ngraph::snippets::pass::ResetTypeRelaxedNodePrecision::run_on_model(const std::shared_ptr<ov::Model> &m) {
+    RUN_ON_FUNCTION_SCOPE(ResetTypeRelaxedNodePrecision);
+    bool rewritten = false;  // becomes true once any TypeRelaxed output type has been overridden
+    for (auto& op : m->get_ordered_ops()) {
+        if (auto node = std::dynamic_pointer_cast<ngraph::op::TypeRelaxedBase>(op)) {
+            for (size_t i = 0; i < op->get_output_size(); i++) {  // unsigned index: no sign-compare, no temporary vector
+                node->set_overridden_output_type(exec_type, i);
+                rewritten |= true;
+            }
+        } else {
+            op->validate_and_infer_types();  // ops that are not TypeRelaxed only get their types re-inferred
+        }
+    }
+
+    return rewritten;
+}
diff --git a/src/common/snippets/src/pass/transform_convert_to_truncation.cpp b/src/common/snippets/src/pass/transform_convert_to_truncation.cpp
new file mode 100644
index 00000000000000..3ba93d74b4c023
--- /dev/null
+++ b/src/common/snippets/src/pass/transform_convert_to_truncation.cpp
@@ -0,0 +1,34 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "snippets/remarks.hpp"
+#include <snippets/itt.hpp>
+
+#include "snippets/pass/transform_convert_to_truncation.hpp"
+#include "snippets/snippets_isa.hpp"
+
+#include <ngraph/opsets/opset1.hpp>
+#include <ngraph/rt_info.hpp>
+#include <ngraph/pattern/op/wrap_type.hpp>
+
+ngraph::snippets::pass::TransformConvertToConvertTruncation::TransformConvertToConvertTruncation() {
+    MATCHER_SCOPE(TransformConvertToConvertTruncation);
+    register_matcher(std::make_shared<ngraph::pattern::Matcher>(
+        ngraph::pattern::wrap_type<ngraph::opset1::Convert>(), matcher_name),
+            [this](ngraph::pattern::Matcher &m) {
+            OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::TransformConvertToConvertTruncation")
+            const auto root = m.get_match_root();
+            const auto convert = ngraph::as_type_ptr<ngraph::opset1::Convert>(root);
+            if (!convert)
+                return false;
+            // Re-create the node as ConvertTruncation with the same source output and destination type
+            auto convert_truncation = std::make_shared<op::ConvertTruncation>(convert->get_input_source_output(0),
+                                                                              convert->get_destination_type());
+            convert_truncation->set_friendly_name(convert->get_friendly_name());
+            ngraph::copy_runtime_info(convert, convert_truncation);
+            ngraph::replace_node(convert, convert_truncation);
+
+            return true;
+        });
+}
\ No newline at end of file
diff --git a/src/common/snippets/src/pass/vector_to_scalar.cpp b/src/common/snippets/src/pass/vector_to_scalar.cpp
index 0af4d084f73f36..b8de68eafd8258 100644
--- a/src/common/snippets/src/pass/vector_to_scalar.cpp
+++ b/src/common/snippets/src/pass/vector_to_scalar.cpp
@@ -7,40 +7,43 @@
 #include "snippets/pass/vector_to_scalar.hpp"
 #include "snippets/snippets_isa.hpp"
 
-#include <ngraph/opsets/opset1.hpp>
 #include <ngraph/rt_info.hpp>
 #include <ngraph/pattern/op/wrap_type.hpp>
 
-ngraph::snippets::pass::ReplaceLoadsWithScalarLoads::ReplaceLoadsWithScalarLoads() {
-    MATCHER_SCOPE(ReplaceLoadsWithScalarLoads);
+ngraph::snippets::pass::SetScalarCountForLoad::SetScalarCountForLoad() {
+    MATCHER_SCOPE(SetScalarCountForLoad);
     register_matcher(std::make_shared<ngraph::pattern::Matcher>(
         ngraph::pattern::wrap_type<ngraph::snippets::op::Load>(), matcher_name),
             [this](ngraph::pattern::Matcher &m) {
-            OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ReplaceLoadsWithScalarLoads_callback")
+            OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SetScalarCountForLoad_callback")
             auto root = m.get_match_root();
             if (transformation_callback(root))
                 return false;
-            auto load = std::make_shared<ngraph::snippets::op::ScalarLoad> (root->input_value(0));
-            load->set_friendly_name(root->get_friendly_name());
-            ngraph::copy_runtime_info(root, load);
-            ngraph::replace_node(root, load);
+
+            const auto load = ov::as_type_ptr<ngraph::snippets::op::Load>(root);
+            if (!load)
+                return false;
+
+            load->set_count(1lu);
             return true;
         });
 }
 
-ngraph::snippets::pass::ReplaceStoresWithScalarStores::ReplaceStoresWithScalarStores() {
-    MATCHER_SCOPE(ReplaceStoresWithScalarStores);
+ngraph::snippets::pass::SetScalarCountForStore::SetScalarCountForStore() {
+    MATCHER_SCOPE(SetScalarCountForStore);
     register_matcher(std::make_shared<ngraph::pattern::Matcher>(
         ngraph::pattern::wrap_type<ngraph::snippets::op::Store>(), matcher_name),
             [this](ngraph::pattern::Matcher &m) {
-            OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ReplaceStoresWithScalarStores_callback")
+            OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SetScalarCountForStore_callback")
             auto root = m.get_match_root();
             if (transformation_callback(root))
                 return false;
-            auto store = std::make_shared<ngraph::snippets::op::ScalarStore> (root->input_value(0));
-            store->set_friendly_name(root->get_friendly_name());
-            ngraph::copy_runtime_info(root, store);
-            ngraph::replace_node(root, store);
+
+            const auto store = ov::as_type_ptr<ngraph::snippets::op::Store>(root);
+            if (!store)
+                return false;
+
+            store->set_count(1lu);
             return true;
         });
 }
diff --git a/src/common/snippets/tests/include/lowering_utils.hpp b/src/common/snippets/tests/include/lowering_utils.hpp
index 7d822e5853438a..1551f4fe99e311 100644
--- a/src/common/snippets/tests/include/lowering_utils.hpp
+++ b/src/common/snippets/tests/include/lowering_utils.hpp
@@ -29,12 +29,13 @@ class DummyTargetMachine : public ngraph::snippets::TargetMachine {
     DummyTargetMachine();
     bool is_supported() const override { return true; }
     ngraph::snippets::code get_snippet() const override { return nullptr; }
-    size_t get_lanes() const override { return 1; }
+    size_t get_lanes() const override { return 10; }
 };
 
 class DummyGenerator : public ngraph::snippets::Generator {
 public:
     DummyGenerator() : ngraph::snippets::Generator(std::make_shared<DummyTargetMachine>()) {}
+    element::Type get_supported_exec_precision() const override { return ov::element::f32; }
 };
 
 class LoweringTests : public TransformationTestsF {
diff --git a/src/common/snippets/tests/include/pass/set_scalar_count_for_load_and_store.hpp b/src/common/snippets/tests/include/pass/set_scalar_count_for_load_and_store.hpp
new file mode 100644
index 00000000000000..2bc13f3290b30c
--- /dev/null
+++ b/src/common/snippets/tests/include/pass/set_scalar_count_for_load_and_store.hpp
@@ -0,0 +1,40 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "lowering_utils.hpp"
+#include "snippets_helpers.hpp"
+
+/* The main purpose is to test that:
+ * - Load/Store ops are inserted
+ * - Load + BroadcastMove fuses to BroadcastLoad (not the main focus, but still had to cover; overlays with insert_movebroadcast.cpp)
+ * - Proper Load/Stores are converted to scalar form to avoid invalid memory access by vector tile
+ *      (temporarily disabled, since the corresponding PR is not merged yet)
+ */
+
+namespace ov {
+namespace test {
+namespace snippets {
+
+typedef std::tuple<
+        Shape, // Input shape 0
+        Shape, // Input shape 1
+        Shape, // Input shape 2
+        Shape, // Broadcast shape 0
+        Shape, // Broadcast shape 1
+        Shape // Broadcast shape 2
+> insertLoadStoreParams;
+
+class InsertLoadStoreTests : public LoweringTests, public testing::WithParamInterface<insertLoadStoreParams> {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<insertLoadStoreParams> obj);
+protected:
+    void SetUp() override;
+    std::shared_ptr<SnippetsFunctionBase> snippets_function;
+};
+
+}  // namespace snippets
+}  // namespace test
+}  // namespace ov
diff --git a/src/common/snippets/tests/src/lowering_utils.cpp b/src/common/snippets/tests/src/lowering_utils.cpp
index bdbfe41d6dd45c..4aab86d5d7c07c 100644
--- a/src/common/snippets/tests/src/lowering_utils.cpp
+++ b/src/common/snippets/tests/src/lowering_utils.cpp
@@ -23,18 +23,15 @@ DummyTargetMachine::DummyTargetMachine() {
     jitters[op::v1::Multiply::get_type_info_static()] = dummy_functor;
     jitters[op::v1::Multiply::get_type_info_static()] = dummy_functor;
     jitters[ngraph::snippets::op::Load::get_type_info_static()] = dummy_functor;
-    jitters[ngraph::snippets::op::VectorLoad::get_type_info_static()] = dummy_functor;
-    jitters[ngraph::snippets::op::ScalarLoad::get_type_info_static()] = dummy_functor;
     jitters[ngraph::snippets::op::BroadcastLoad::get_type_info_static()] = dummy_functor;
 
     jitters[ngraph::snippets::op::Store::get_type_info_static()] = dummy_functor;
-    jitters[ngraph::snippets::op::VectorStore::get_type_info_static()] = dummy_functor;
-    jitters[ngraph::snippets::op::ScalarStore::get_type_info_static()] = dummy_functor;
 
     jitters[ngraph::snippets::op::Scalar::get_type_info_static()] = dummy_functor;
     jitters[ngraph::snippets::op::BroadcastMove::get_type_info_static()] = dummy_functor;
     jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = dummy_functor;
     jitters[ngraph::snippets::op::Tile::get_type_info_static()] = dummy_functor;
+    jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = dummy_functor;
 }
 
 std::shared_ptr<ngraph::snippets::op::Subgraph> LoweringTests::getSubgraph(const std::shared_ptr<Model>& f) {
diff --git a/src/common/snippets/tests/src/pass/canonicalization.cpp b/src/common/snippets/tests/src/pass/canonicalization.cpp
index 08ced11370cd7b..a9126c0f4216d1 100644
--- a/src/common/snippets/tests/src/pass/canonicalization.cpp
+++ b/src/common/snippets/tests/src/pass/canonicalization.cpp
@@ -49,7 +49,9 @@ TEST_P(CanonicalizationTests, Add) {
     function = snippets_function->getOriginal();
     function_ref = snippets_function->getReference();
     auto subgraph =  getTokenizedSubgraph(function);
-    Shape canonical_output_shape = subgraph->canonicalize(output_blocked_shapes, input_blocked_shapes);
+    subgraph->set_generator(std::make_shared<DummyGenerator>());
+    const auto exec_type = subgraph->get_generator()->get_supported_exec_precision();
+    Shape canonical_output_shape = subgraph->canonicalize(output_blocked_shapes, input_blocked_shapes, exec_type);
     ASSERT_DIMS_EQ(canonical_output_shape, expected_output_shape);
 }
 
diff --git a/src/common/snippets/tests/src/pass/collapse_subgraph.cpp b/src/common/snippets/tests/src/pass/collapse_subgraph.cpp
index 2a1e107df3c107..3e578119b25d19 100644
--- a/src/common/snippets/tests/src/pass/collapse_subgraph.cpp
+++ b/src/common/snippets/tests/src/pass/collapse_subgraph.cpp
@@ -5,6 +5,7 @@
 #include <gtest/gtest.h>
 #include <pass/collapse_subgraph.hpp>
 #include <subgraph_simple.hpp>
+#include <subgraph_converts.hpp>
 #include "snippets/pass/collapse_subgraph.hpp"
 
 namespace ov {
@@ -39,6 +40,43 @@ TEST_F(CollapseSubgraphTests, smoke_Snippets_AvoidLoopEltwise) {
     run();
 }
 
+TEST_F(CollapseSubgraphTests, smoke_Snippets_OneConvert) {
+    const auto &f = ConvertFunction(std::vector<Shape>{{2, 5}});
+    function = f.getOriginal();
+    function_ref = f.getReference();
+    run();
+}
+
+TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertInput) {
+    const auto &f = ConvertInputFunction(std::vector<Shape>{{2, 5}, {1, 5}});
+    function = f.getOriginal();
+    function_ref = f.getReference();
+    run();
+}
+
+TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertOutput) {
+    const auto &f = ConvertOutputFunction(std::vector<Shape>{{2, 5}, {1, 5}});
+    function = f.getOriginal();
+    function_ref = f.getReference();
+    run();
+}
+
+TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertStub) {
+    const auto &f = ConvertStubFunction(std::vector<Shape>{{2, 5, 2}, {1, 5, 1}});
+    function = f.getOriginal();
+    function_ref = f.getReference();
+    run();
+}
+
+TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertPartialInputsAndResults) {
+    const auto &f = ConvertPartialInputsAndResultsFunction(std::vector<Shape>{{2, 5, 1}, {1, 5, 1}, {2, 1, 10}},
+                                                           std::vector<ov::element::Type>{ov::element::i8, ov::element::bf16, ov::element::f32},
+                                                           std::vector<ov::element::Type>{ov::element::f32, ov::element::i8});
+    function = f.getOriginal();
+    function_ref = f.getReference();
+    run();
+}
+
 }  // namespace snippets
 }  // namespace test
 }  // namespace ov
\ No newline at end of file
diff --git a/src/common/snippets/tests/src/vector_scalar.cpp b/src/common/snippets/tests/src/pass/set_scalar_count_for_load_and_store.cpp
similarity index 53%
rename from src/common/snippets/tests/src/vector_scalar.cpp
rename to src/common/snippets/tests/src/pass/set_scalar_count_for_load_and_store.cpp
index 7f46b9f01bc7e1..9305faa50119be 100644
--- a/src/common/snippets/tests/src/vector_scalar.cpp
+++ b/src/common/snippets/tests/src/pass/set_scalar_count_for_load_and_store.cpp
@@ -19,56 +19,81 @@ using namespace ngraph;
 
 //  todo: Rewrite this test using Snippets test infrastructure. See ./include/canonicalization.hpp for example
 
-TEST(TransformationTests, ReplaceLoadsWithScalarLoads) {
+template<typename T>
+size_t get_count(const std::shared_ptr<Function>& f, const std::string& name) {
+    size_t load_count = std::numeric_limits<size_t>::max();
+    for (auto op : f->get_ops()) {
+        if (op->get_friendly_name() == name) {
+            load_count = ov::as_type_ptr<T>(op)->get_count();
+        }
+    }
+    return load_count;
+}
+
+TEST(TransformationTests, SetScalarCountForLoad) {
     std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
+    const auto count = 16;
     {
         auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 2});
-        auto load = std::make_shared<snippets::isa::Load>(data);
+        auto load = std::make_shared<snippets::isa::Load>(data, count);
+        load->set_friendly_name("load");
         auto neg = std::make_shared<opset1::Negative>(load);
-        auto store = std::make_shared<snippets::isa::Store>(neg);
+        auto store = std::make_shared<snippets::isa::Store>(neg, count);
         f = std::make_shared<Function>(NodeVector{store}, ParameterVector{data});
 
         pass::Manager m;
         m.register_pass<pass::InitNodeInfo>();
-        m.register_pass<snippets::pass::ReplaceLoadsWithScalarLoads>();
+        m.register_pass<snippets::pass::SetScalarCountForLoad>();
         m.run_passes(f);
         ASSERT_NO_THROW(check_rt_info(f));
     }
     {
         auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 2});
-        auto load = std::make_shared<snippets::isa::ScalarLoad>(data);
+        auto load = std::make_shared<snippets::isa::Load>(data, 1lu);
+        load->set_friendly_name("load_ref");
         auto neg = std::make_shared<opset1::Negative>(load);
-        auto store = std::make_shared<snippets::isa::Store>(neg);
+        auto store = std::make_shared<snippets::isa::Store>(neg, count);
         f_ref = std::make_shared<Function>(NodeVector{store}, ParameterVector{data});
     }
 
     auto res = compare_functions(f, f_ref);
     ASSERT_TRUE(res.first) << res.second;
+
+    auto load_count = get_count<ngraph::snippets::op::Load>(f, "load");
+    auto load_count_ref = get_count<ngraph::snippets::op::Load>(f_ref, "load_ref");
+    ASSERT_EQ(load_count, load_count_ref);
 }
 
-TEST(TransformationTests, ReplaceStoresWithScalarStores) {
+TEST(TransformationTests, SetScalarCountForStore) {
     std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
+    const auto count = 16;
     {
         auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 2});
-        auto load = std::make_shared<snippets::isa::Load>(data);
+        auto load = std::make_shared<snippets::isa::Load>(data, count);
         auto neg = std::make_shared<opset1::Negative>(load);
-        auto store = std::make_shared<snippets::isa::Store>(neg);
+        auto store = std::make_shared<snippets::isa::Store>(neg, count);
+        store->set_friendly_name("store");
         f = std::make_shared<Function>(NodeVector{store}, ParameterVector{data});
 
         pass::Manager m;
         m.register_pass<pass::InitNodeInfo>();
-        m.register_pass<snippets::pass::ReplaceStoresWithScalarStores>();
+        m.register_pass<snippets::pass::SetScalarCountForStore>();
         m.run_passes(f);
         ASSERT_NO_THROW(check_rt_info(f));
     }
     {
         auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 2});
-        auto load = std::make_shared<snippets::isa::Load>(data);
+        auto load = std::make_shared<snippets::isa::Load>(data, count);
         auto neg = std::make_shared<opset1::Negative>(load);
-        auto store = std::make_shared<snippets::isa::ScalarStore>(neg);
+        auto store = std::make_shared<snippets::isa::Store>(neg, 1lu);
+        store->set_friendly_name("store_ref");
         f_ref = std::make_shared<Function>(NodeVector{store}, ParameterVector{data});
     }
 
     auto res = compare_functions(f, f_ref);
     ASSERT_TRUE(res.first) << res.second;
+
+    int64_t store_count = get_count<ngraph::snippets::op::Store>(f, "store");
+    int64_t store_count_ref = get_count<ngraph::snippets::op::Store>(f_ref, "store_ref");
+    ASSERT_EQ(store_count, store_count_ref);
 }
\ No newline at end of file
diff --git a/src/common/snippets/tests/src/registers.cpp b/src/common/snippets/tests/src/registers.cpp
index 89e4e4768ff60e..2eb5cddd84fb9f 100644
--- a/src/common/snippets/tests/src/registers.cpp
+++ b/src/common/snippets/tests/src/registers.cpp
@@ -25,12 +25,14 @@ TEST(TransformationTests, AssignRegisters) {
     {
         auto p0 = std::make_shared<opset1::Parameter>(element::f32, Shape(1));
         auto p1 = std::make_shared<opset1::Parameter>(element::f32, Shape(1));
+        p0->set_friendly_name("p00");
+        p1->set_friendly_name("p01");
         auto y00 = std::make_shared<snippets::isa::Load>(p0); y00->set_friendly_name("y00");
         auto y01 = std::make_shared<snippets::isa::Load>(p1); y01->set_friendly_name("y01");
         auto y02 = std::make_shared<opset1::Multiply>(y00, y01); y02->set_friendly_name("y02");
-        auto y03 = std::make_shared<snippets::isa::Store>(y02); y03->set_friendly_name("y03");
-
-        f = std::make_shared<Function>(NodeVector{y03}, ParameterVector{p0, p1});
+        auto s00 = std::make_shared<snippets::isa::Store>(y02);
+        s00->set_friendly_name("s00");  // must match the "s00" key of ref_registers
+        f = std::make_shared<Function>(NodeVector{s00}, ParameterVector{p0, p1});
 
         pass::Manager m;
         m.register_pass<pass::InitNodeInfo>();
@@ -39,13 +41,17 @@ TEST(TransformationTests, AssignRegisters) {
         ASSERT_NO_THROW(check_rt_info(f));
     }
 
-    // instead of comparing to a reference function check that registers are correctly assigned
-    // and stored to runtime info
+    /* Instead of comparing to a reference function, check that registers are correctly assigned and stored to runtime
+     * info. Note that the rt_info of Parameters and Store contains gpr indexes, while the rt_info of other ops contains
+     * vector indexes */
     {
         std::map<std::string, size_t> ref_registers {
+            {"p00", 0}, // gpr
+            {"p01", 1}, // gpr
             {"y00", 0},
             {"y01", 1},
-            {"y02", 2}
+            {"y02", 2},
+            {"s00", 2}, // gpr
         };
 
         auto total_ops = 0;
@@ -75,6 +81,14 @@ TEST(TransformationTests, AssignRegisters2) {
         auto p5 = std::make_shared<opset1::Parameter>(ngraph::element::f32, Shape());
         auto p6 = std::make_shared<opset1::Parameter>(ngraph::element::f32, Shape());
         auto p7 = std::make_shared<opset1::Parameter>(ngraph::element::f32, Shape());
+        p0->set_friendly_name("p00");
+        p1->set_friendly_name("p01");
+        p2->set_friendly_name("p02");
+        p3->set_friendly_name("p03");
+        p4->set_friendly_name("p04");
+        p5->set_friendly_name("p05");
+        p6->set_friendly_name("p06");
+        p7->set_friendly_name("p07");
 
         auto c0 = std::make_shared<snippets::isa::Scalar>(ngraph::element::f32, Shape(), 3.14f); c0->set_friendly_name("r00");
         auto c1 = std::make_shared<snippets::isa::Scalar>(ngraph::element::f32, Shape(), 6.6260701e-34f); c1->set_friendly_name("r01");
@@ -102,9 +116,10 @@ TEST(TransformationTests, AssignRegisters2) {
         auto y20 = std::make_shared<opset1::Add>(y17, y18); y20->set_friendly_name("r22");
         auto y21 = std::make_shared<opset1::Add>(y15, y19); y21->set_friendly_name("r23");
         auto y22 = std::make_shared<opset1::Add>(y20, y21); y22->set_friendly_name("r24");
-        auto y23 = std::make_shared<snippets::isa::Store>(y22);
+        auto s00 = std::make_shared<snippets::isa::Store>(y22);
+        s00->set_friendly_name("s00");
 
-        f = std::make_shared<Function>(NodeVector{y23}, ParameterVector{p0, p1, p2, p3, p4, p5, p6, p7});
+        f = std::make_shared<Function>(NodeVector{s00}, ParameterVector{p0, p1, p2, p3, p4, p5, p6, p7});
 
         pass::Manager m;
         m.register_pass<pass::InitNodeInfo>();
@@ -117,10 +132,14 @@ TEST(TransformationTests, AssignRegisters2) {
     // and stored to runtime info
     {
         std::map<std::string, size_t> ref_registers {
-            {"r00", 1}, {"r01", 3}, {"r02", 5}, {"r03", 5}, {"r04", 2}, {"r05", 6}, {"r06", 6}, {"r07", 6},
-            {"r08", 5}, {"r09", 2}, {"r10", 1}, {"r11", 4}, {"r12", 4}, {"r13", 6}, {"r14", 2}, {"r15", 5},
-            {"r16", 0}, {"r17", 4}, {"r18", 0}, {"r19", 2}, {"r20", 4}, {"r21", 1}, {"r22", 0}, {"r23", 6},
-            {"r24", 1}
+            {"p00", 0}, {"p01", 1}, {"p02", 2}, {"p03", 3}, {"p04", 4}, {"p05", 5},
+            {"p06", 6}, {"p07", 7},
+            {"r00", 1}, {"r01", 3}, {"r02", 5}, {"r03", 5}, {"r04", 2}, {"r05", 6},
+            {"r06", 6}, {"r07", 6}, {"r08", 5}, {"r09", 2}, {"r10", 1}, {"r11", 4},
+            {"r12", 4}, {"r13", 6}, {"r14", 2}, {"r15", 5}, {"r16", 0}, {"r17", 4},
+            {"r18", 0}, {"r19", 2}, {"r20", 4}, {"r21", 1}, {"r22", 0}, {"r23", 6},
+            {"r24", 1},
+            {"s00", 8},
         };
 
         auto total_ops = 0;
diff --git a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp
index b6e5fb3b2ec6cd..c35533a0a28990 100644
--- a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp
+++ b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp
@@ -13,6 +13,9 @@
 #include "jit_eltwise_emitters.hpp"
 #include "jit_dnnl_emitters.hpp"
 #include "jit_dnnl_ext_emitters.hpp"
+#include "jit_conversion_emitters.hpp"
+
+#include "snippets_transformations/op/load_store_convert.hpp"
 
 #include <ngraph/opsets/opset5.hpp>
 
@@ -39,25 +42,23 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_
     : TargetMachine(), h(new jit_snippet()), isa(host_isa) {
     // data movement
     jitters[ngraph::opset1::Parameter::get_type_info_static()] = CREATE_EMITTER(NopEmitter);
-    jitters[ngraph::snippets::op::BlockedParameter::get_type_info_static()] = CREATE_EMITTER(NopEmitter);
     jitters[ngraph::opset1::Result::get_type_info_static()] = CREATE_EMITTER(NopEmitter);
     // jitters[ngraph::opset1::Constant::get_type_info_static()] = CREATE_EMITTER(); // Not supported
 
     jitters[ngraph::snippets::op::Load::get_type_info_static()] = CREATE_EMITTER(LoadEmitter);
-    jitters[ngraph::snippets::op::VectorLoad::get_type_info_static()] = CREATE_EMITTER(LoadEmitter);
-    jitters[ngraph::snippets::op::ScalarLoad::get_type_info_static()] = CREATE_EMITTER(ScalarLoadEmitter);
     jitters[ngraph::snippets::op::BroadcastLoad::get_type_info_static()] = CREATE_EMITTER(BroadcastLoadEmitter);
+    jitters[ov::intel_cpu::LoadConvert::get_type_info_static()] = CREATE_EMITTER(LoadConvertEmitter);
 
     jitters[ngraph::snippets::op::Store::get_type_info_static()] = CREATE_EMITTER(StoreEmitter);
-    jitters[ngraph::snippets::op::VectorStore::get_type_info_static()] = CREATE_EMITTER(StoreEmitter);
-    jitters[ngraph::snippets::op::ScalarStore::get_type_info_static()] = CREATE_EMITTER(ScalarStoreEmitter);
+    jitters[ov::intel_cpu::StoreConvert::get_type_info_static()] = CREATE_EMITTER(StoreConvertEmitter);
 
     jitters[ngraph::snippets::op::Scalar::get_type_info_static()] = CREATE_EMITTER(ScalarEmitter);
-    jitters[ngraph::snippets::op::BroadcastMove::get_type_info_static()] = CREATE_EMITTER(FakeBroadcastEmitter);
+    jitters[ngraph::snippets::op::BroadcastMove::get_type_info_static()] = CREATE_EMITTER(BroadcastMoveEmitter);
     // jitters[ngraph::snippets::op::Nop::get_type_info_static()] = CREATE_EMITTER(NopEmitter); // Not supported
     // jitters[ngraph::opset1::Broadcast::get_type_info_static()] = CREATE_EMITTER(); // Not supported
 
-    // jitters[ngraph::opset1::Convert::get_type_info_static()] = CREATE_EMITTER(); // Not supported
+    jitters[ngraph::snippets::op::ConvertTruncation::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_convert_truncation_emitter);
+    jitters[ngraph::snippets::op::ConvertSaturation::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_convert_saturation_emitter);
     // jitters[ngraph::opset1::FakeQuantize::get_type_info_static()] = CREATE_EMITTER(); // not supported
 
     // binary
@@ -118,6 +119,7 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_
 
     jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = CREATE_EMITTER(KernelEmitter);
     jitters[ngraph::snippets::op::Tile::get_type_info_static()] = CREATE_EMITTER(TileEmitter);
+    jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = CREATE_EMITTER(TileSchedulerEmitter);
 }
 
 size_t ov::intel_cpu::CPUTargetMachine::get_lanes() const {
@@ -140,3 +142,7 @@ code ov::intel_cpu::CPUTargetMachine::get_snippet() const {
 
 ov::intel_cpu::CPUGenerator::CPUGenerator(dnnl::impl::cpu::x64::cpu_isa_t isa_) : Generator(std::make_shared<CPUTargetMachine>(isa_)) {
 }
+
+ov::element::Type ov::intel_cpu::CPUGenerator::get_supported_exec_precision() const {
+    return ov::element::f32;  // the CPU generator reports f32 as its execution precision
+}
diff --git a/src/plugins/intel_cpu/src/emitters/cpu_generator.hpp b/src/plugins/intel_cpu/src/emitters/cpu_generator.hpp
index 7301fcb177b93f..9b1fe3bc79935b 100644
--- a/src/plugins/intel_cpu/src/emitters/cpu_generator.hpp
+++ b/src/plugins/intel_cpu/src/emitters/cpu_generator.hpp
@@ -28,6 +28,8 @@ class CPUTargetMachine : public ngraph::snippets::TargetMachine {
 class CPUGenerator : public ngraph::snippets::Generator {
 public:
     CPUGenerator(dnnl::impl::cpu::x64::cpu_isa_t isa);
+
+    element::Type get_supported_exec_precision() const override;
 };
 
 }   // namespace intel_cpu
diff --git a/src/plugins/intel_cpu/src/emitters/jit_conversion_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_conversion_emitters.cpp
new file mode 100644
index 00000000000000..bbb70ee3eafdb7
--- /dev/null
+++ b/src/plugins/intel_cpu/src/emitters/jit_conversion_emitters.cpp
@@ -0,0 +1,313 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "jit_conversion_emitters.hpp"
+#include "utils/bfloat16.hpp"
+#include <cpu/x64/jit_uni_eltwise.hpp>
+#include <ngraph/opsets/opset1.hpp>
+#include <nodes/eltwise.h>
+
+using namespace InferenceEngine;
+using namespace dnnl::impl::utils;
+using namespace dnnl::impl;
+using namespace dnnl::impl::cpu::x64;
+using namespace Xbyak;
+
+namespace ov {
+namespace intel_cpu {
+
+jit_convert_emitter::jit_convert_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr<ngraph::Node>& node, Precision exec_prc)
+: jit_emitter(host, host_isa, node, exec_prc) {
+    input_type = node->get_input_element_type(0);
+    output_type = node->get_output_element_type(0);
+
+    if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
+       emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(host, host_isa));
+}
+
+void jit_convert_emitter::validate_types() const {
+    auto is_supported_type = [this](const ov::element::Type& type) {
+        return any_of(supported_types.begin(), supported_types.end(),
+                      [&type](const ov::element::Type& supported_type) { return supported_type == type; } );
+    };
+
+    if (!is_supported_type(input_type))
+        IE_THROW() << "Unsupported input type: " << input_type.get_type_name();
+    if (!is_supported_type(output_type))
+        IE_THROW() << "Unsupported output type: " << output_type.get_type_name();
+}
+
+size_t jit_convert_emitter::get_inputs_num() const { return 1; }
+
+void jit_convert_emitter::emit_data() const {
+    jit_emitter::emit_data();
+    if (emu_vcvtneps2bf16)
+        emu_vcvtneps2bf16->emit_data();
+}
+
+void jit_convert_emitter::float2bfloat(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    Zmm zmm_src = Zmm(in_vec_idxs[0]);
+    Zmm zmm_dst  = Zmm(out_vec_idxs[0]);
+
+    if (mayiuse(avx512_core_bf16)) {
+        h->vcvtneps2bf16(zmm_dst, zmm_src);
+    } else {
+        if (!emu_vcvtneps2bf16)
+            IE_THROW() << "Converter from float to bf16 isn't initialized!";
+
+        emu_vcvtneps2bf16->emit_code({static_cast<size_t>(zmm_src.getIdx())}, {static_cast<size_t>(zmm_dst.getIdx())});
+    }
+}
+
+jit_convert_truncation_emitter::jit_convert_truncation_emitter(jit_generator *host, cpu_isa_t host_isa,
+                                                               const std::shared_ptr<ngraph::Node>& node, Precision exec_prc)
+        : jit_convert_emitter(host, host_isa, node, exec_prc) {
+    prepare_table();
+}
+
+bool jit_convert_truncation_emitter::is_i8_and_u8_case() const {
+    return one_of(input_type, ov::element::i8, ov::element::u8) &&
+           one_of(output_type, ov::element::i8, ov::element::u8);
+}
+
+void jit_convert_truncation_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                               const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs,
+                                               const emitter_context *emit_context) const {
+    validate_types();
+    if (host_isa_ == cpu::x64::sse41) {
+        emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::x64::avx2) {
+        emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::x64::avx512_core) {
+        emit_isa<cpu::x64::avx512_core>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <dnnl::impl::cpu::x64::cpu_isa_t isa>
+void jit_convert_truncation_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src = Vmm(in_vec_idxs[0]);
+    Vmm vmm_dst  = Vmm(out_vec_idxs[0]);
+
+    // Same type, or i8 <-> u8 with truncation semantics: the bits are unchanged, so a plain register move is enough
+    if ((input_type == output_type) || is_i8_and_u8_case()) {
+        if (vmm_src != vmm_dst) {
+            h->uni_vmovups(vmm_dst, vmm_src);
+        }
+        return;
+    }
+
+    switch (input_type) {  // stage 1: bring the input to a 32-bit-per-element intermediate form in vmm_dst
+        case ov::element::f32:
+            if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8))
+                h->uni_vcvttps2dq(vmm_dst, vmm_src);
+            break;
+        case ov::element::i32:
+            if (one_of(output_type, ov::element::f32, ov::element::bf16))
+                h->uni_vcvtdq2ps(vmm_dst, vmm_src);
+            break;
+        case ov::element::bf16:
+            h->vpmovzxwd(vmm_dst, vmm_src);
+            h->uni_vpslld(vmm_dst, vmm_dst, 16);
+            if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8))
+                h->uni_vcvttps2dq(vmm_dst, vmm_dst);
+            break;
+        case ov::element::i8:
+            h->uni_vpmovsxbd(vmm_dst, vmm_src);
+            break;
+        case ov::element::u8:
+            h->uni_vpmovzxbd(vmm_dst, vmm_src);
+            break;
+        default:
+            assert(!"unsupported input data type");
+    }
+
+    switch (output_type) {  // stage 2: produce the requested output type from the intermediate form
+        case ov::element::f32:
+            if (!one_of(input_type, ov::element::i32, ov::element::bf16)) {
+                h->uni_vcvtdq2ps(vmm_dst, vmm_dst);
+            }
+            break;
+        case ov::element::i32:
+            break;
+        case ov::element::bf16:
+            if (input_type == ov::element::f32) {
+                float2bfloat({static_cast<size_t>(vmm_src.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())});
+            } else {
+                if (one_of(input_type, ov::element::i8, ov::element::u8)) {
+                    h->uni_vcvtdq2ps(vmm_dst, vmm_dst);
+                }
+                float2bfloat({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())});
+            }
+            break;
+        case ov::element::i8:
+        case ov::element::u8:
+            if (input_type == ov::element::i32) {
+                dword2int8<isa>({static_cast<size_t>(vmm_src.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())});
+            } else {
+                dword2int8<isa>({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())});
+            }
+            break;
+        default:
+            assert(!"unsupported output data type");
+    }
+}
+
+void jit_convert_truncation_emitter::register_table_entries() {
+    if (host_isa_ == dnnl::impl::cpu::x64::avx2 &&
+        one_of(output_type, ov::element::i8, ov::element::u8) &&
+        !is_i8_and_u8_case())
+        push_arg_entry_of("mask_byte", 0x000000ff, true);
+}
+
+template <dnnl::impl::cpu::x64::cpu_isa_t isa>
+void jit_convert_truncation_emitter::dword2int8(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src = Vmm(in_vec_idxs[0]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+    Xmm xmm_dst = Xmm(out_vec_idxs[0]);
+    Ymm ymm_dst = Ymm(out_vec_idxs[0]);
+
+    if (isa == dnnl::impl::cpu::x64::avx512_core) {
+        h->vpmovdb(xmm_dst, vmm_src);
+    } else if (isa == dnnl::impl::cpu::x64::avx2) {
+        h->vpand(vmm_dst, vmm_src, table_val("mask_byte"));  // keep the low byte only, so the packs below cannot saturate
+        h->uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
+        h->vpermq(ymm_dst, ymm_dst, 0x08);  // gather the packed words of both 128-bit lanes into the low lane
+        h->uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
+    } else {  // "mask_byte" is registered for avx2 only; fail loudly instead of silently emitting no code
+        IE_THROW() << "Truncation convert emitter doesn't support conversion to i8/u8 on this ISA";
+    }
+}
+
+jit_convert_saturation_emitter::jit_convert_saturation_emitter(jit_generator *host, cpu_isa_t host_isa,
+                                                               const std::shared_ptr<ngraph::Node>& node, Precision exec_prc)
+    : jit_convert_emitter(host, host_isa, node, exec_prc) {
+}
+
+void jit_convert_saturation_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                               const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs,
+                                               const emitter_context *emit_context) const {
+    validate_types();
+    if (host_isa_ == cpu::x64::sse41) {
+        emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::x64::avx2) {
+        emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::x64::avx512_core) {
+        emit_isa<cpu::x64::avx512_core>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <dnnl::impl::cpu::x64::cpu_isa_t isa>
+void jit_convert_saturation_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src = Vmm(in_vec_idxs[0]);
+    Vmm vmm_dst  = Vmm(out_vec_idxs[0]);
+
+    if (input_type == output_type) {  // nothing to convert: just copy the register
+        h->uni_vmovups(vmm_dst, vmm_src);
+        return;
+    }
+
+    switch (input_type) {  // stage 1: bring the input to a 32-bit-per-element intermediate form in vmm_dst
+        case ov::element::f32:
+            if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8))
+                h->uni_vcvtps2dq(vmm_dst, vmm_src);
+            break;
+        case ov::element::i32:
+            if (one_of(output_type, ov::element::f32, ov::element::bf16))
+                h->uni_vcvtdq2ps(vmm_dst, vmm_src);
+            break;
+        case ov::element::bf16:
+            h->vpmovzxwd(vmm_dst, vmm_src);
+            h->uni_vpslld(vmm_dst, vmm_dst, 16);
+            if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8))
+                h->uni_vcvttps2dq(vmm_dst, vmm_dst);
+            break;
+        case ov::element::i8:
+            h->uni_vpmovsxbd(vmm_dst, vmm_src);
+            break;
+        case ov::element::u8:
+            h->uni_vpmovzxbd(vmm_dst, vmm_src);
+            break;
+        default:
+            assert(!"unsupported input data type");
+    }
+
+    switch (output_type) {  // stage 2: produce the requested output type from the intermediate form
+        case ov::element::f32:
+            if (!one_of(input_type, ov::element::i32, ov::element::bf16)) {
+                h->uni_vcvtdq2ps(vmm_dst, vmm_dst);
+            }
+            break;
+        case ov::element::i32:
+            break;
+        case ov::element::bf16:
+            if (input_type == ov::element::f32) {
+                float2bfloat({static_cast<size_t>(vmm_src.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())});
+            } else {
+                if (one_of(input_type, ov::element::i8, ov::element::u8)) {
+                    h->uni_vcvtdq2ps(vmm_dst, vmm_dst);
+                }
+                float2bfloat({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())});
+            }
+            break;
+        case ov::element::i8:
+        case ov::element::u8:
+            if (input_type == ov::element::i32) {
+                dword2int8<isa>({static_cast<size_t>(vmm_src.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())}, output_type.is_signed());
+            } else {
+                dword2int8<isa>({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())}, output_type.is_signed());
+            }
+            break;
+        default:
+            assert(!"unsupported output data type");
+    }
+}
+
+template <dnnl::impl::cpu::x64::cpu_isa_t isa>
+void jit_convert_saturation_emitter::dword2int8(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs, bool is_signed) const {
+    using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src = Vmm(in_vec_idxs[0]);
+
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+    Xmm xmm_dst = Xmm(out_vec_idxs[0]);
+    Ymm ymm_dst = Ymm(out_vec_idxs[0]);
+
+    if (isa == dnnl::impl::cpu::x64::avx512_core) {
+        if (is_signed) {
+            h->vpmovsdb(xmm_dst, vmm_src);
+        } else {
+            Vmm vmm_zero  = Vmm(aux_vec_idxs[0]);
+            h->vpxord(vmm_zero, vmm_zero, vmm_zero);
+            h->vpmaxsd(vmm_dst, vmm_src, vmm_zero);
+            h->vpmovusdb(xmm_dst, vmm_dst);
+        }
+    } else {
+        if (is_signed)
+            h->uni_vpackssdw(vmm_dst, vmm_src, vmm_src);
+        else
+            h->uni_vpackusdw(vmm_dst, vmm_src, vmm_src);
+
+        if (isa != dnnl::impl::cpu::x64::sse41)
+            h->vpermq(ymm_dst, ymm_dst, 0x08);
+
+        if (is_signed)
+            h->uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
+        else
+            h->uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
+    }
+}
+
+size_t jit_convert_saturation_emitter::aux_vecs_count() const {
+    // 1 register is for dword2int8 unsigned
+    return output_type == ov::element::u8 && host_isa_ == dnnl::impl::cpu::x64::avx512_core? 1 : 0;
+}
+
+}   // namespace intel_cpu
+}   // namespace ov
diff --git a/src/plugins/intel_cpu/src/emitters/jit_conversion_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_conversion_emitters.hpp
new file mode 100644
index 00000000000000..71a45f918ea595
--- /dev/null
+++ b/src/plugins/intel_cpu/src/emitters/jit_conversion_emitters.hpp
@@ -0,0 +1,87 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <cpu/x64/jit_generator.hpp>
+#include "jit_emitter.hpp"
+#include "jit_bf16_emitters.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+class jit_convert_emitter : public jit_emitter {  // shared base of jit_convert_truncation_emitter and jit_convert_saturation_emitter
+public:
+    jit_convert_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
+                        const std::shared_ptr<ngraph::Node>& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() const override;
+
+protected:
+    void emit_data() const override;
+    void validate_types() const;  // NOTE(review): presumably checks input_type/output_type against supported_types - confirm in the .cpp
+
+    void float2bfloat(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    ov::element::Type input_type;   // element type the conversion reads
+    ov::element::Type output_type;  // element type the conversion produces
+
+    const ov::element::TypeVector supported_types = {  // element types this emitter family lists as supported
+            ov::element::f32,
+            ov::element::i32,
+            ov::element::bf16,
+            ov::element::i8,
+            ov::element::u8
+    };
+
+    std::shared_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16 = nullptr;  // NOTE(review): presumably set only when fp32->bf16 must be emulated - confirm
+};
+
+// This emitter is covered by specification of "Convert" operation. The implementation uses a "wrap-around" conversion.
+// Example:
+//  int32_t -> int8_t
+//   129   -> -127
+class jit_convert_truncation_emitter : public jit_convert_emitter {  // Convert emitter with wrap-around (truncating) narrowing
+public:
+    jit_convert_truncation_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
+                                   const std::shared_ptr<ngraph::Node>& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+private:
+    void emit_impl(const std::vector<size_t>& in, const std::vector<size_t>& out,
+                   const std::vector<size_t>& pool, const std::vector<size_t>& gpr,
+                   const ov::intel_cpu::emitter_context *emit_context) const override;
+    template <dnnl::impl::cpu::x64::cpu_isa_t isa>  // isa-specific implementation; NOTE(review): presumably dispatched from emit_impl - confirm
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    template <dnnl::impl::cpu::x64::cpu_isa_t isa>  // unlike the saturation emitter's dword2int8, takes no is_signed flag
+    void dword2int8(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    bool is_i8_and_u8_case() const;  // NOTE(review): definition not visible here; the name suggests an i8/u8 type-pair check - confirm
+    void register_table_entries() override;
+};
+
+// This emitter is covered by the common dnnl behavior. The implementation uses a "saturation" conversion.
+// Example:
+//  int32_t -> int8_t
+//   129   -> 127
+class jit_convert_saturation_emitter : public jit_convert_emitter {  // Convert emitter that clamps out-of-range values when narrowing
+public:
+    jit_convert_saturation_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
+                                   const std::shared_ptr<ngraph::Node>& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+private:
+    void emit_impl(const std::vector<size_t>& in, const std::vector<size_t>& out,
+                   const std::vector<size_t>& pool, const std::vector<size_t>& gpr,
+                   const ov::intel_cpu::emitter_context *emit_context) const override;
+    template <dnnl::impl::cpu::x64::cpu_isa_t isa>  // isa-specific implementation; NOTE(review): presumably dispatched from emit_impl - confirm
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    template <dnnl::impl::cpu::x64::cpu_isa_t isa>  // is_signed selects signed vs unsigned saturation of the byte result
+    void dword2int8(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs, bool is_signed) const;
+
+    size_t aux_vecs_count() const override;  // extra vector registers needed as scratch by the emitted code
+};
+
+}   // namespace intel_cpu
+}   // namespace ov
diff --git a/src/plugins/intel_cpu/src/emitters/jit_emitter.cpp b/src/plugins/intel_cpu/src/emitters/jit_emitter.cpp
index 50f2674fb111b4..91079b55da46c8 100644
--- a/src/plugins/intel_cpu/src/emitters/jit_emitter.cpp
+++ b/src/plugins/intel_cpu/src/emitters/jit_emitter.cpp
@@ -46,6 +46,10 @@ size_t jit_emitter::aux_vecs_count() const {
     return 0;
 }
 
+emitter_in_out_map jit_emitter::get_in_out_type() const {
+    return in_out_type_;  // which register kinds (gpr / vec) this emitter takes as inputs and produces as outputs
+}
+
 size_t jit_emitter::aux_gprs_count() const {
     // We need one gpr to load table address
     return entry_map_.empty() ? 0 : 1;
diff --git a/src/plugins/intel_cpu/src/emitters/jit_emitter.hpp b/src/plugins/intel_cpu/src/emitters/jit_emitter.hpp
index f0f460d51713a5..74fe712ddd6f9f 100644
--- a/src/plugins/intel_cpu/src/emitters/jit_emitter.hpp
+++ b/src/plugins/intel_cpu/src/emitters/jit_emitter.hpp
@@ -55,6 +55,7 @@ class jit_emitter : public ngraph::snippets::Emitter {
                       const std::vector<size_t> &pool_vec_idxs = {}, const std::vector<size_t> &pool_gpr_idxs = {});
     virtual size_t get_inputs_num() const = 0;
     virtual size_t aux_vecs_count() const;
+    emitter_in_out_map get_in_out_type() const;
     static std::set<InferenceEngine::Precision> get_supported_precisions();
 
 protected:
diff --git a/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.cpp
index da1589aa4497d4..490f957c1efa99 100644
--- a/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.cpp
+++ b/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.cpp
@@ -547,8 +547,10 @@ void jit_load_emitter::register_table_entries() {
 
 /// STORE ///
 jit_store_emitter::jit_store_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
-                                     Precision src_prc, Precision dst_prc, int store_num, Precision exec_prc, emitter_in_out_map in_out_type)
-: jit_emitter(host, host_isa, exec_prc, in_out_type), store_num_(store_num), src_prc_(src_prc), dst_prc_(dst_prc), name_("unknown") {
+                                     Precision src_prc, Precision dst_prc, int store_num, arithmetic_mode mode, Precision exec_prc,
+                                     emitter_in_out_map in_out_type)
+    : jit_emitter(host, host_isa, exec_prc, in_out_type), store_num_(store_num), src_prc_(src_prc), dst_prc_(dst_prc), mode_(mode), name_("unknown") {
+    prepare_table();
     v_len_elt_ = get_vec_length() / exec_prc.size();
     store_size_ = store_num * dst_prc.size();
     if (!mayiuse(cpu::x64::avx512_core_bf16) && mayiuse(cpu::x64::avx512_core)) {
@@ -556,9 +558,25 @@ jit_store_emitter::jit_store_emitter(dnnl::impl::cpu::x64::jit_generator *host,
     }
 }
 
-// 0 for temp reg for mask store for avx512
+inline bool jit_store_emitter::is_saturation() const {
+    return mode_ == arithmetic_mode::saturation;  // mode_ is set once in the constructor
+}
+
+// SSE and AVX2 case (no avx512_core): a truncating store to a narrower integer type is emulated with an AND mask
+inline bool jit_store_emitter::is_truncation_emulation() const {
+    const bool narrow_int_dst = one_of(dst_prc_, Precision::U16, Precision::I16, Precision::U8, Precision::I8);
+    return !mayiuse(cpu::x64::avx512_core) && !is_saturation() && src_prc_ != dst_prc_ && narrow_int_dst;
+}
+
 size_t jit_store_emitter::aux_gprs_count() const {
-    return get_aux_regs_for_avx512_mask(store_num_ * src_prc_.size());
+    // for temp reg for mask store
+    int count = get_aux_regs_for_avx512_mask(store_num_ * src_prc_.size());
+
+    // for table value in truncation arithmetic mode
+    if (is_truncation_emulation())
+        count++;
+
+    return count;
 }
 
 size_t jit_store_emitter::aux_vecs_count() const {
@@ -580,6 +598,7 @@ size_t jit_store_emitter::aux_vecs_count() const {
 size_t jit_store_emitter::get_inputs_num() const { return 1; }
 
 void jit_store_emitter::emit_data() const {
+    jit_emitter::emit_data();
     if (emu_vcvtneps2bf16_)
         emu_vcvtneps2bf16_->emit_data();
 }
@@ -618,7 +637,11 @@ void jit_store_emitter::emit_isa(const int in_vec_idx, const Xbyak::Reg64 &reg_d
         switch (src_prc_) {
             case Precision::FP32:
                 if ((dst_prc_ != Precision::FP32) && (dst_prc_ != Precision::BF16)) {
-                    h->uni_vcvtps2dq(Vmm(aux_vec_idxs.back()), Vmm(data_idx));
+                    if (is_saturation()) {
+                        h->uni_vcvtps2dq(Vmm(aux_vec_idxs.back()), Vmm(data_idx));
+                    } else {
+                        h->uni_vcvttps2dq(Vmm(aux_vec_idxs.back()), Vmm(data_idx));
+                    }
                     data_idx = aux_vec_idxs.back();
                 }
                 break;
@@ -804,7 +827,7 @@ void jit_store_emitter::store_bytes(const Vmm &vmm, const Xbyak::Reg64 &reg, int
 
 /**
 * store_dword_to_byte_extension is the utility function to
-* 1. convert store_num (0 <= store_num <= 16) dwords in the Xmm/Ymm/Zmm to store_num bytes with singed or unsinged saturation.
+* 1. convert store_num (0 <= store_num <= 16) dwords in the Xmm/Ymm/Zmm to store_num bytes, either with signed/unsigned saturation or with truncation.
 * 2. store the packed byte into the memory referenced by ptr[reg + offset] address.
 */
 template <typename Vmm>
@@ -835,28 +858,37 @@ void jit_store_emitter::store_dword_to_byte_extension(const Vmm &vmm, const Xbya
     };
 
     auto store_dword_to_byte_base = [&]() {
-        // db only available on avx512, need dw+wb to emulate
-        if (is_signed)
-            h->uni_vpackssdw(vmm, vmm, vmm);
-        else
-            h->uni_vpackusdw(vmm, vmm, vmm);
-        // gather 2(cross lane) 64 bits into lower vmm to store
-        // [y_3 y_2 y_1 y_0] |--> [y_0 y_0 y_2 y_0]
-        if (is_ymm) {
-            h->vpermq(ymm, ymm, 0x08);  // 00001000
-        }
+        if (is_saturation()) {
+            // db only available on avx512, need dw+wb to emulate
+            if (is_signed)
+                h->uni_vpackssdw(vmm, vmm, vmm);
+            else
+                h->uni_vpackusdw(vmm, vmm, vmm);
+            // gather 2(cross lane) 64 bits into lower vmm to store
+            // [y_3 y_2 y_1 y_0] |--> [y_0 y_0 y_2 y_0]
+            if (is_ymm) {
+                h->vpermq(ymm, ymm, 0x08);  // 00001000
+            }
 
-        if (is_signed)
-            h->uni_vpacksswb(vmm, vmm, vmm);
-        else
+            if (is_signed)
+                h->uni_vpacksswb(vmm, vmm, vmm);
+            else
+                h->uni_vpackuswb(vmm, vmm, vmm);
+        } else {
+            h->vpand(vmm, vmm, table_val("mask_truncation_byte"));  // to avoid saturation
+            h->uni_vpackssdw(vmm, vmm, vmm);
+            if (is_ymm)
+                h->vpermq(ymm, ymm, 0x08);
             h->uni_vpackuswb(vmm, vmm, vmm);
+        }
 
         store_bytes(vmm, reg, offset, store_num);
     };
 
     switch (store_num) {
-        case 16:
-            // must support avx512F
+    case 16:
+        // must support avx512F
+        if (is_saturation()) {
             if (is_signed) {
                 h->vpmovsdb(addr(0), vmm);
             } else {
@@ -865,9 +897,13 @@ void jit_store_emitter::store_dword_to_byte_extension(const Vmm &vmm, const Xbya
                 h->uni_vpmaxsd(vmm, vmm, zero);
                 h->vpmovusdb(addr(0), vmm);
             }
-            break;
-        case 8:
-            if (mayiuse(cpu::x64::avx512_core)) {  // ymm block on avx512F + VL
+        } else {
+            h->vpmovdb(addr(0), vmm);
+        }
+        break;
+    case 8:
+        if (mayiuse(cpu::x64::avx512_core)) {
+            if (is_saturation()) {  // ymm block on avx512F + VL
                 if (is_signed) {
                     h->vpmovsdb(addr(0), ymm);
                 } else {
@@ -877,11 +913,15 @@ void jit_store_emitter::store_dword_to_byte_extension(const Vmm &vmm, const Xbya
                     h->vpmovusdb(addr(0), ymm);
                 }
             } else {
-                store_dword_to_byte_base();
+                h->vpmovdb(addr(0), ymm);
             }
-            break;
-        case 4:
-            if (mayiuse(cpu::x64::avx512_core)) {  // xmm block on avx512F + VL
+        } else {
+            store_dword_to_byte_base();
+        }
+        break;
+    case 4:
+        if (mayiuse(cpu::x64::avx512_core)) {
+            if (is_saturation()) {// xmm block on avx512F + VL
                 if (is_signed) {
                     h->vpmovsdb(addr(0), xmm);
                 } else {
@@ -891,15 +931,19 @@ void jit_store_emitter::store_dword_to_byte_extension(const Vmm &vmm, const Xbya
                     h->vpmovusdb(addr(0), xmm);
                 }
             } else {
-                store_dword_to_byte_base();
+                h->vpmovdb(addr(0), xmm);
             }
-            break;
-        default:
-            if (is_zmm) {  // avx512F
-                unsigned int mask = 1;
-                mask = (mask << store_num) - mask;
-                h->mov(Reg32(aux_gpr_idxs[0]), mask);
-                h->kmovw(k_mask, Reg32(aux_gpr_idxs[0]));
+        } else {
+            store_dword_to_byte_base();
+        }
+        break;
+    default:
+        if (is_zmm) {  // avx512F
+            unsigned int mask = 1;
+            mask = (mask << store_num) - mask;
+            h->mov(Reg32(aux_gpr_idxs[0]), mask);
+            h->kmovw(k_mask, Reg32(aux_gpr_idxs[0]));
+            if (is_saturation()) {
                 if (is_signed) {
                     h->vpmovsdb(addr(0), vmm | k_mask);
                 } else {
@@ -909,9 +953,12 @@ void jit_store_emitter::store_dword_to_byte_extension(const Vmm &vmm, const Xbya
                     h->vpmovusdb(addr(0), vmm | k_mask);
                 }
             } else {
-                store_dword_to_byte_base();
+                h->vpmovdb(addr(0), vmm | k_mask);
             }
-            break;
+        } else {
+            store_dword_to_byte_base();
+        }
+        break;
     }
 }
 
@@ -946,16 +993,21 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya
     auto zmm = Xbyak::Zmm(vmm.getIdx());
 
     auto store_dword_to_word_base = [&]() {
-        // direct mov_dw available only on avx512, emulate with pack_dw + permute + pure store
-        if (is_signed)
-            h->uni_vpackssdw(vmm, vmm, vmm);
-        else
+        // direct mov_dw available only on avx512
+        if (is_saturation()) {  // emulate with pack_dw + permute + pure store for saturation mode
+            if (is_signed)
+                h->uni_vpackssdw(vmm, vmm, vmm);
+            else
+                h->uni_vpackusdw(vmm, vmm, vmm);
+            // gather 2/4(cross lane) 64 bits into lower vmm to store
+            // [y_3 y_2 y_1 y_0] |--> [y_0 y_0 y_2 y_0]
+            // [  128  |  128  ] |--> [ 128   |  128  ]
+            if (is_ymm) {
+                h->vpermq(ymm, ymm, 0x08);  // 00001000
+            }
+        } else {  // emulate with AND + pure store for truncation mode
+            h->vpand(vmm, vmm, table_val("mask_truncation_word"));
             h->uni_vpackusdw(vmm, vmm, vmm);
-        // gather 2/4(cross lane) 64 bits into lower vmm to store
-        // [y_3 y_2 y_1 y_0] |--> [y_0 y_0 y_2 y_0]
-        // [  128  |  128  ] |--> [ 128   |  128  ]
-        if (is_ymm) {
-            h->vpermq(ymm, ymm, 0x08);  // 00001000
         }
 
         store_bytes(vmm, reg, offset, store_num * 2);
@@ -978,7 +1030,8 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya
         }
     } else {
         switch (store_num) {
-            case 16:
+        case 16:
+            if (is_saturation()) {
                 if (is_signed) {
                     h->vpmovsdw(ptr[reg + offset], vmm);  // singed int32 saturate to signed int16.
                 } else {
@@ -987,9 +1040,13 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya
                     h->uni_vpmaxsd(vmm, zero, vmm);        // if singed bit is 1, set value as 0.
                     h->vpmovusdw(ptr[reg + offset], vmm); // unsinged int32 saturate to unsigned int16.
                 }
-                break;
-            case 8:
-                if (mayiuse(cpu::x64::avx512_core)) {
+            } else {
+                h->vpmovdw(ptr[reg + offset], vmm);
+            }
+            break;
+        case 8:
+            if (mayiuse(cpu::x64::avx512_core)) {
+                if (is_saturation()) {
                     if (is_signed) {
                         h->vpmovsdw(ptr[reg + offset], ymm);
                     } else {
@@ -999,11 +1056,15 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya
                         h->vpmovusdw(ptr[reg + offset], ymm);
                     }
                 } else {
-                    store_dword_to_word_base();
+                    h->vpmovdw(ptr[reg + offset], ymm);
                 }
-                break;
-            case 4:
-                if (mayiuse(cpu::x64::avx512_core)) {
+            } else {
+                store_dword_to_word_base();
+            }
+            break;
+        case 4:
+            if (mayiuse(cpu::x64::avx512_core)) {
+                if (is_saturation()) {
                     if (is_signed) {
                         h->vpmovsdw(ptr[reg + offset], xmm);
                     } else {
@@ -1013,15 +1074,19 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya
                         h->vpmovusdw(ptr[reg + offset], xmm);
                     }
                 } else {
-                   store_dword_to_word_base();
+                    h->vpmovdw(ptr[reg + offset], xmm);
                 }
-                break;
-            default:
-                if (is_zmm) {
-                    unsigned int mask = 1;
-                    mask = (mask << store_num) - mask;
-                    h->mov(Reg32(aux_gpr_idxs[0]), mask);
-                    h->kmovw(k_mask, Reg32(aux_gpr_idxs[0]));
+            } else {
+               store_dword_to_word_base();
+            }
+            break;
+        default:
+            if (is_zmm) {
+                unsigned int mask = 1;
+                mask = (mask << store_num) - mask;
+                h->mov(Reg32(aux_gpr_idxs[0]), mask);
+                h->kmovw(k_mask, Reg32(aux_gpr_idxs[0]));
+                if (is_saturation()) {
                     if (is_signed) {
                         h->vpmovsdw(ptr[reg + offset], vmm | k_mask);
                     } else {
@@ -1031,12 +1096,22 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya
                         h->vpmovusdw(ptr[reg + offset], vmm | k_mask);
                     }
                 } else {
-                    store_dword_to_word_base();
+                    h->vpmovdw(ptr[reg + offset], vmm | k_mask);
                 }
-                break;
+            } else {
+                store_dword_to_word_base();
+            }
+            break;
         }
     }
 }
 
+void jit_store_emitter::register_table_entries() {
+    if (!is_truncation_emulation())
+        return;  // the masks below are only read by the emulated (non-avx512) truncation path
+    push_arg_entry_of("mask_truncation_byte", 0x000000ff, true);  // low 8 bits set
+    push_arg_entry_of("mask_truncation_word", 0x0000ffff, true);  // low 16 bits set
+}
+
 }   // namespace intel_cpu
 }   // namespace ov
diff --git a/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.hpp
index 3784a343d3fbe2..a198eb705c3022 100644
--- a/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.hpp
+++ b/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.hpp
@@ -39,6 +39,12 @@ struct store_emitter_params : public emitter_params {
     int store_num_;
 };
 
+// Arithmetic modes for data type conversion in store_emitter
+enum arithmetic_mode {
+    saturation,  // out-of-range values are clamped to the destination type's range
+    truncation   // only the low bits that fit the destination type are kept (values wrap around)
+};
+
 class jit_load_emitter : public jit_emitter {
 public:
     jit_load_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, Precision src_prc, Precision dst_prc, int load_num,
@@ -101,7 +107,8 @@ class jit_load_emitter : public jit_emitter {
 class jit_store_emitter : public jit_emitter {
 public:
     jit_store_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, Precision src_prc, Precision dst_prc, int store_num,
-                      Precision exec_prc = Precision::FP32, emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_gpr);
+                      arithmetic_mode mode = arithmetic_mode::saturation, Precision exec_prc = Precision::FP32,
+                      emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_gpr);
 
     /**
     * store_num values with src_prc in Vmm[in_vec_idx] is stored to ptr[reg_dst + offset_byte] address as dst_prc data, where offset_byte is in_idxs[1]
@@ -143,15 +150,21 @@ class jit_store_emitter : public jit_emitter {
     template <typename Vmm>
     void store_dword_to_word_extension(const Vmm &vmm, const Xbyak::Reg64 &reg, int offset, bool is_bf16, bool is_signed, int store_size) const;
 
+    void register_table_entries() override;
+
     size_t aux_gprs_count() const override;
     size_t aux_vecs_count() const override;
 
+    inline bool is_saturation() const;
+    inline bool is_truncation_emulation() const;
+
     std::string name_;
     int v_len_elt_;  // 4/8/16
     int store_num_;
     int store_size_;
     Precision src_prc_;
     Precision dst_prc_;
+    arithmetic_mode mode_ = arithmetic_mode::saturation;
     std::shared_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16_;
 };
 
diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp
new file mode 100644
index 00000000000000..a1b3f1b0068a78
--- /dev/null
+++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp
@@ -0,0 +1,669 @@
+// Copyright (C) 2020-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <ngraph/rt_info.hpp>
+#include <ngraph/variant.hpp>
+#include <cpu/x64/jit_generator.hpp>
+
+#include "jit_snippets_emitters.hpp"
+#include "snippets_transformations/op/load_store_convert.hpp"
+
+using namespace Xbyak;
+
+namespace ov {
+namespace intel_cpu {
+
+inline static void transform_idxs_to_regs(const std::vector<size_t>& idxs, std::vector<Reg64>& regs) {
+    regs.resize(idxs.size());  // regs ends up with exactly one Reg64 per index, in the same order
+    for (size_t pos = 0; pos < idxs.size(); pos++) regs[pos] = Reg64(static_cast<int>(idxs[pos]));
+}
+
+jit_container_emitter::jit_container_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
+                      const std::shared_ptr<ov::Node>& n) : jit_emitter(h, isa, n) {
+    in_out_type_ = emitter_in_out_map::gpr_to_gpr;  // containers are utility emitters, so map_abstract_registers maps only their outputs (to gprs)
+}
+
+void jit_container_emitter::map_abstract_registers(const std::vector<size_t> &vec_pool,  const std::vector<size_t> &gpr_pool,
+                                                    std::set<size_t>& vecs_used, std::set<size_t>& gprs_used) {
+    // Replaces abstract register indexes in body (recursively) with pool entries and records every physical register used.
+    if (body.empty())
+        IE_THROW() << "Cannot map registers for jit_container_emitter when its body is empty";
+    auto abstract_to_physical = [](const std::vector<size_t>& abstract_regs, const std::vector<size_t>& regs_pool) {
+        std::vector<size_t> physical_regs(abstract_regs.size());
+        for (size_t i = 0; i < abstract_regs.size(); i++)
+            physical_regs[i] = regs_pool.at(abstract_regs[i]);  // at() throws when the pool has no such entry
+        return physical_regs;
+    };
+    for (auto& code : body) {
+        const auto emitter = std::dynamic_pointer_cast<jit_emitter>(code.first);
+        if (!emitter)  // get_in_out_type() exists only on jit_emitter: fail loudly instead of dereferencing a null pointer
+            IE_THROW() << "jit_container_emitter can map registers only for jit_emitter-based emitters";
+        std::vector<size_t> in_abstract_regs, out_abstract_regs, in_physical_regs, out_physical_regs;
+        std::tie(in_abstract_regs, out_abstract_regs) = code.second;
+        switch (emitter->get_in_out_type()) {
+            case gpr_to_gpr:
+                // High-level utility ops (Kernel/TileScheduler/Tile): inputs hold utility info (num_params, tile increment, etc.), not reg indexes
+                in_physical_regs = std::move(in_abstract_regs);
+                out_physical_regs = abstract_to_physical(out_abstract_regs, gpr_pool);
+                gprs_used.insert(out_physical_regs.begin(), out_physical_regs.end());
+                break;
+            case gpr_to_vec:
+                // Load Emitters
+                in_physical_regs = abstract_to_physical(in_abstract_regs, gpr_pool);
+                out_physical_regs = abstract_to_physical(out_abstract_regs, vec_pool);
+                gprs_used.insert(in_physical_regs.begin(), in_physical_regs.end());
+                vecs_used.insert(out_physical_regs.begin(), out_physical_regs.end());
+                break;
+            case vec_to_gpr:
+                // Store Emitters
+                in_physical_regs = abstract_to_physical(in_abstract_regs, vec_pool);
+                out_physical_regs = abstract_to_physical(out_abstract_regs, gpr_pool);
+                vecs_used.insert(in_physical_regs.begin(), in_physical_regs.end());
+                gprs_used.insert(out_physical_regs.begin(), out_physical_regs.end());
+                break;
+            case vec_to_vec:
+                // Regular operations
+                in_physical_regs = abstract_to_physical(in_abstract_regs, vec_pool);
+                out_physical_regs = abstract_to_physical(out_abstract_regs, vec_pool);
+                vecs_used.insert(in_physical_regs.begin(), in_physical_regs.end());
+                vecs_used.insert(out_physical_regs.begin(), out_physical_regs.end());
+                break;
+            default:
+                IE_THROW() << "Unhandled in_out type";
+        }
+        code.second = std::make_pair(std::move(in_physical_regs), std::move(out_physical_regs));
+        if (auto container = std::dynamic_pointer_cast<jit_container_emitter>(code.first))
+            container->map_abstract_registers(vec_pool, gpr_pool, vecs_used, gprs_used);
+    }
+}
+
+KernelEmitter::KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
+                             const std::shared_ptr<ov::Node>& n) : jit_container_emitter(h, isa, n) {
+    const auto kernel = ov::as_type_ptr<ngraph::snippets::op::Kernel>(n);
+    if (!kernel)
+        IE_THROW() << "KernelEmitter invoked with invalid op argument";
+    if (kernel->region.empty())
+        IE_THROW() << "KernelEmitter invoked with empty body";
+    body = kernel->region;
+    if (!kernel->compile_params)
+        IE_THROW() << "KernelEmitter invoked without compile_params";
+    jcp = *reinterpret_cast<const jit_snippets_compile_args*>(kernel->compile_params);  // compile_params is copied by value into jcp
+    // Initialize pools of gp and vec registers
+    gp_regs_pool.resize(16);
+    vec_regs_pool.resize(16);
+    std::iota(gp_regs_pool.begin(), gp_regs_pool.end(), 0);  // both pools start as indexes 0..15
+    std::iota(vec_regs_pool.begin(), vec_regs_pool.end(), 0);
+    auto remove_regs_from_pool = [](std::vector<size_t>& pool, const std::set<size_t>& to_remove) {
+        // It's important to keep the order of other elements
+        pool.erase(std::remove_if(pool.begin(), pool.end(),
+                                       [&](size_t x) {return to_remove.count(x) != 0;}), pool.end());
+    };
+    // Reserve stack base and pointer for push(...) and pop(...) operations
+    // Reserve abi_param1 and abi_param2, since they'll be used to pass runtime call args to kernel
+    remove_regs_from_pool(gp_regs_pool, {Xbyak::Operand::RSP, Xbyak::Operand::RBP,
+                                         static_cast<size_t>(abi_param1.getIdx()),
+                                         static_cast<size_t>(abi_param2.getIdx())});
+    std::set<size_t> vecs_used, gprs_used;
+    map_abstract_registers(vec_regs_pool, gp_regs_pool, vecs_used, gprs_used);  // also rewrites the register indexes stored in body
+    remove_regs_from_pool(gp_regs_pool, gprs_used);  // what stays in the pools is handed to the nested emitters in emit_impl
+    remove_regs_from_pool(vec_regs_pool, vecs_used);
+    // Remember used gprs to pass it to the TileSchedulerEmitter, so it can init them with appropriate data ptrs
+    gp_regs_used = std::vector<size_t>(gprs_used.begin(), gprs_used.end());  // std::set iteration yields ascending register indexes
+}
+
+void KernelEmitter::emit_code(const std::vector<size_t> &in,
+                              const std::vector<size_t> &out,
+                              const std::vector<size_t> &pool,
+                              const std::vector<size_t> &gpr) const {
+    validate_arguments(in, out, pool, gpr);  // throws when the argument counts are wrong
+    emit_impl(in, out, pool, gpr, nullptr);  // emit_impl does not read the emitter_context
+}
+
+// Checks the Kernel contract: in must be {num_inputs, num_outputs} and out must be empty. pool and gpr are not inspected.
+void KernelEmitter::validate_arguments(const std::vector<size_t> &in,
+                                       const std::vector<size_t> &out,
+                                       const std::vector<size_t> &pool, const std::vector<size_t> &gpr) const {
+    if (in.size() != 2)
+        IE_THROW() << "KernelEmitter got invalid number of inputs. Expected 2, got " << in.size();
+    if (!out.empty())
+        IE_THROW() << "KernelEmitter got invalid number of outputs. Expected 0, got " << out.size();
+}
+
+// Emits code that loads each input/output data pointer from the call args (reg_const_params) into data_ptr_regs[i]
+// and advances it by offset * index for every harness dimension that is not 1 and has a non-zero offset.
+void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params,
+                                              const Reg64& reg_indexes, const Reg64& reg_const_params, const std::vector<Reg64>& data_ptr_regs) const {
+    const int64_t harness_num_dims = jcp.output_dims.size() - 1;
+    auto init_ptrs_with_offsets = [&](Reg64 pointer, const int64_t *offsets, Reg64 reg_tmp) {
+        for (int j = 0; j < harness_num_dims; j++) {
+            if (jcp.output_dims[j] != 1 && offsets[j] != 0) {
+                h->mov(reg_tmp, offsets[j]);
+                h->imul(reg_tmp, h->ptr[reg_indexes + j * sizeof(size_t)]);
+                h->add(pointer, reg_tmp);
+            }
+        }
+    };
+    for (size_t i = 0; i < num_params; i++) {  // size_t index: no signed/unsigned mix with num_params and num_inputs
+        const size_t ptr_off = i < num_inputs ? GET_OFF(src_ptrs) + i * sizeof(void*) : GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*);
+        h->mov(data_ptr_regs[i], h->ptr[reg_const_params + ptr_off]);  // the first num_inputs params are sources, the rest destinations
+        // we can use the last data_ptr_reg as tmp_reg until the last iteration, and reg_const_params then
+        Reg64 reg_tmp = i < num_params-1 ? data_ptr_regs.back() : reg_const_params;
+        init_ptrs_with_offsets(data_ptr_regs[i], &jcp.data_offsets[i * harness_num_dims], reg_tmp);
+    }
+}
+void KernelEmitter::emit_impl(const std::vector<size_t>& in,
+                              const std::vector<size_t>& out,
+                              const std::vector<size_t>& allocated_vec_regs,
+                              const std::vector<size_t>& allocated_gp_regs,
+                              const ov::intel_cpu::emitter_context *emit_context) const {
+    h->preamble();
+
+    const size_t num_inputs = in[0];  // in carries counts, not register indexes
+    const size_t num_outputs = in[1];
+
+    Reg64 reg_indexes = Reg64(abi_param1.getIdx());
+    Reg64 reg_const_params = Reg64(abi_param2.getIdx());
+    std::vector<Reg64> data_ptr_regs;
+    transform_idxs_to_regs(gp_regs_used, data_ptr_regs);
+
+    init_data_pointers(num_inputs, num_inputs + num_outputs, reg_indexes, reg_const_params, data_ptr_regs);
+    // todo: emit_impl is a const method, so we can't just push_back unused regs to the gp_regs_pool.
+    //  we need a more elegant approach to avoid a full copy here
+    auto local_gpr_pool = gp_regs_pool;
+    local_gpr_pool.push_back(static_cast<size_t>(reg_indexes.getIdx()));  // both abi param registers are returned to the pool once the data pointers are set up
+    local_gpr_pool.push_back(static_cast<size_t>(reg_const_params.getIdx()));
+    for (const auto& c : body) {
+        const auto& emitter = c.first;
+        std::vector<size_t> in_regs, out_regs;
+        std::tie(in_regs, out_regs) = c.second;
+        if (auto tile_scheduler = std::dynamic_pointer_cast<TileSchedulerEmitter>(emitter))
+            out_regs = gp_regs_used;  // the scheduler receives the data pointer registers instead of its mapped outputs
+        emitter->emit_code(in_regs, out_regs, vec_regs_pool, local_gpr_pool);
+    }
+    h->postamble();
+}
+
+TileSchedulerEmitter::TileSchedulerEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
+                                           const std::shared_ptr<ov::Node>& n) : jit_container_emitter(h, isa, n) {
+    const auto tile_scheduler = ov::as_type_ptr<ngraph::snippets::op::TileScheduler>(n);  // n must be a TileScheduler op
+    if (!tile_scheduler)
+        IE_THROW() << "TileSchedulerEmitter invoked with invalid op argument";
+    if (!tile_scheduler->compile_params)
+        IE_THROW() << "TileSchedulerEmitter invoked without compile_params";
+    body = {tile_scheduler->vector_region, tile_scheduler->scalar_region};  // always two entries: the vector tile first, then the scalar tile
+    jcp = *reinterpret_cast<const jit_snippets_compile_args*>(tile_scheduler->compile_params);  // compile_params is copied by value into jcp
+}
+void TileSchedulerEmitter::emit_code(const std::vector<size_t> &in,
+                                     const std::vector<size_t> &out,
+                                     const std::vector<size_t> &pool,
+                                     const std::vector<size_t> &gpr) const {  // entry point: validate, then emit
+    validate_arguments(in, out, pool, gpr);  // throws on inconsistent arguments
+    emit_impl(in, out, pool, gpr, nullptr);  // no emitter_context is passed
+}
+void TileSchedulerEmitter::validate_arguments(const std::vector<size_t> &in,
+                                     const std::vector<size_t> &out,
+                                     const std::vector<size_t> &pool,
+                                     const std::vector<size_t> &gpr) const {  // throws if in/out/body are inconsistent
+    if (in.size() != 3)  // emit_impl reads in[0], in[1] and in[2]
+        IE_THROW() << "TileSchedulerEmitter got invalid number of inputs. Expected 3, got " << in.size();
+    if (out.size() != in[0] + in[1])  // one output register index per input and per output
+        IE_THROW() << "TileSchedulerEmitter got invalid number of outputs. Expected " << in[0] + in[1] << " , got " << out.size();
+    if (body.size() != 2)
+        IE_THROW() << "TileSchedulerEmitter got invalid body size, expected 2 (vector & scalar TileEmitter), got " << body.size();
+    if (!(std::dynamic_pointer_cast<TileEmitter>(body[0].first) && std::dynamic_pointer_cast<TileEmitter>(body[1].first)))
+        IE_THROW() << "TileSchedulerEmitter can contain only TileEmitters inside its body";
+}
+
+void TileSchedulerEmitter::emit_tiles(const Reg64& reg_inner_amount, const std::vector<Reg64>& data_ptr_regs, size_t vector_size,
+                                      const std::vector<size_t>& vec_pool, const std::vector<size_t>& gpr_pool) const {
+    // TileAllocatedEmitter is just an alias: dynamic_pointer_cast is performed only once and the result is reused below
+    using TileAllocatedEmitter = std::pair<std::shared_ptr<TileEmitter>, const ngraph::snippets::RegInfo&>;
+    TileAllocatedEmitter vector_tile {std::dynamic_pointer_cast<TileEmitter>(body[0].first), body[0].second};
+    TileAllocatedEmitter scalar_tile {std::dynamic_pointer_cast<TileEmitter>(body[1].first), body[1].second};
+    const size_t inner_work_amount = jcp.scheduler_dims[1];  // read from compile params, i.e. known while emitting
+    auto process_tile =
+        [&](const bool evaluate_once, const TileAllocatedEmitter& tile) {
+            // If Tile is evaluated only once, then we can emit its body directly and skip work_amount decrements and checks
+            if (evaluate_once) {
+                tile.first->emit_body(vec_pool, gpr_pool);
+            } else {
+                std::vector<size_t> in_regs, out_regs;
+                std::tie(in_regs, out_regs) = tile.second;
+                // pass work_amount reg to Tile
+                in_regs.push_back(static_cast<size_t>(reg_inner_amount.getIdx()));
+                for (const auto& reg : data_ptr_regs)  // data pointer registers are passed as the Tile outputs
+                    out_regs.emplace_back(reg.getIdx());
+                tile.first->emit_code(in_regs, out_regs, vec_pool, gpr_pool);
+            }
+        };
+    // todo: these optimizations should be performed on the Tile graph representation in the future
+    bool vector_evaluate_once = false;
+    if (inner_work_amount >= vector_size) {  // the vector Tile is emitted only if at least one full vector fits
+        vector_evaluate_once = inner_work_amount < 2 * vector_size;  // exactly one vector iteration fits
+        // Need to set proper work amount for inner tiles if evaluated multiple times
+        if (!vector_evaluate_once)
+            h->mov(reg_inner_amount, inner_work_amount);
+        process_tile(vector_evaluate_once, vector_tile);
+    }
+    if (inner_work_amount % vector_size >= 1) {  // NOTE(review): vector_size is a divisor here, presumably never 0 - confirm
+        bool scalar_evaluate_once = inner_work_amount % vector_size < 2;  // the remainder is exactly one element
+        if (!scalar_evaluate_once) {
+            // vector_tile is not executed, work_amount is not set
+            if (inner_work_amount < vector_size) {
+                h->mov(reg_inner_amount, inner_work_amount);
+                // (next branch) vector_tile body is emitted once, so work_amount is neither set nor decremented yet
+            } else if (vector_evaluate_once) {
+                vector_tile.first->emit_ptr_increments(data_ptr_regs);  // emit_body() alone does not move the pointers
+                h->mov(reg_inner_amount, inner_work_amount - vector_size);
+            }
+            // else: vector_tile is executed multiple times, so work_amount is already set
+        } else {
+            if (vector_evaluate_once) {
+                vector_tile.first->emit_ptr_increments(data_ptr_regs);  // emit_body() alone does not move the pointers
+            }
+        }
+        process_tile(scalar_evaluate_once, scalar_tile);
+    }
+}
+
+void TileSchedulerEmitter::emit_impl(const std::vector<size_t>& in,
+                                     const std::vector<size_t>& out,
+                                     const std::vector<size_t>& vec_pool,
+                                     const std::vector<size_t>& gpr_pool,
+                                     const ov::intel_cpu::emitter_context *emit_context) const {  // emits inner Tiles + outer loop
+    const size_t num_inputs = in[0];
+    const size_t num_outputs = in[1];
+    const size_t vector_size = in[2];
+    const size_t num_params = num_inputs + num_outputs;
+    std::vector<Reg64> data_ptr_regs;
+    transform_idxs_to_regs(out, data_ptr_regs);  // `out` holds the data pointer register indexes
+    if (gpr_pool.size() < 2)  // two registers are taken from the pool below; back() on an empty vector is UB
+        IE_THROW() << "TileSchedulerEmitter requires at least 2 gp registers in the pool, got " << gpr_pool.size();
+    // todo: emit_impl has const input args, so we can't just pop_back necessary regs from gpr_pool.
+    //  we need a more elegant approach to avoid a full copy here. Similar problem is demonstrated in KernelEmitter
+    auto local_gpr_pool = gpr_pool;
+    Reg64 reg_outer_amount = Reg64(static_cast<int>(local_gpr_pool.back()));  // outer loop counter
+    local_gpr_pool.pop_back();
+    Reg64 reg_inner_amount = Reg64(static_cast<int>(local_gpr_pool.back()));  // work amount passed to inner Tiles
+    local_gpr_pool.pop_back();
+    Label for_body;
+    const size_t outer_work_amount = jcp.scheduler_dims[0];
+    if (outer_work_amount == 1) {
+        // emit code directly without looping over external dim
+        emit_tiles(reg_inner_amount, data_ptr_regs, vector_size, vec_pool, local_gpr_pool);
+    } else if (outer_work_amount > 1) {
+        // We need to create a Loop in this case
+        h->mov(reg_outer_amount, outer_work_amount);
+        h->L(for_body);
+        {
+            emit_tiles(reg_inner_amount, data_ptr_regs, vector_size, vec_pool, local_gpr_pool);
+            // Todo: Load and Store emitters are currently implemented so they ALWAYS increment appropriate pointers
+            //   after reading/writing. This might be a problem if we need to read the same data multiple times (broadcasting shapes).
+            //   To overcome this limitation, we add appropriate negative offsets if necessary.
+            for (size_t i = 0; i < num_params; i++) {  // size_t index: avoids a signed/unsigned comparison
+                if (jcp.scheduler_offsets[i] != 0) {
+                    h->add(data_ptr_regs[i], jcp.scheduler_offsets[i]);
+                }
+            }
+            // Note that outer dimensions are always incremented by 1 (outer tiles are always scalar)
+            h->sub(reg_outer_amount, 1);
+            h->cmp(reg_outer_amount, 1);
+            h->jge(for_body, CodeGenerator::T_NEAR);
+        }
+    }
+}
+
+std::vector<AllocatedEmitter>& TileEmitter::get_nested_code() {  // exposes the nested emitters of this Tile
+    return body;  // returned by non-const reference, so the caller can modify the stored body
+}
+
+TileEmitter::TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
+                         const std::shared_ptr<ov::Node>& n) : jit_container_emitter(h, isa, n) {
+    const auto tile = ov::as_type_ptr<ngraph::snippets::op::Tile>(n);  // null unless n is a Tile op
+    if (!tile)
+        IE_THROW() << "TileEmitter invoked with invalid op argument";
+    body = tile->region;  // emitters nested into this Tile
+    if (body.empty())
+        IE_THROW() << "TileEmitter is invoked with empty body";
+    num_inputs = tile->num_inputs;
+    num_outputs = tile->num_outputs;
+    io_dims = tile->io_dims;  // one entry per input/output (checked below); a value of 1 disables pointer increments
+    io_data_size = tile->io_data_size;  // multiplied by increment to get the pointer step
+    increment = tile->increment;  // amount subtracted from work_amount on every Tile iteration
+    if (io_dims.size() != num_inputs + num_outputs)
+        IE_THROW() << "TileEmitter constructor got inconsistent arguments. Check num_inputs + num_outputs == io_dims.size()";
+}
+
+void TileEmitter::emit_code(const std::vector<size_t> &in,
+                            const std::vector<size_t> &out,
+                            const std::vector<size_t> &pool,
+                            const std::vector<size_t> &gpr) const {  // entry point: validate, then emit the loop
+    validate_arguments(in, out, pool, gpr);  // throws on inconsistent arguments
+    emit_impl(in, out, pool, gpr, nullptr);  // no emitter_context is passed
+}
+
+void TileEmitter::validate_arguments(const std::vector<size_t> &in,
+                                     const std::vector<size_t> &out,
+                                     const std::vector<size_t> &pool,
+                                     const std::vector<size_t> &gpr) const {  // throws if in/out sizes are unexpected
+    if (in.size() != 1)  // the single input is the work amount register index (see emit_impl)
+        IE_THROW() << "TileEmitter got invalid number of inputs. Expected 1, got " << in.size();
+    if (out.size() != io_dims.size())  // one data pointer register per io_dims entry
+        IE_THROW() << "TileEmitter got invalid number of outputs. Expected " << io_dims.size() << " , got " << out.size();
+}
+
+void TileEmitter::emit_body(const std::vector<size_t>& vec_pool, const std::vector<size_t>& gpr_pool) const {
+    for (const auto& nested : body)  // emit every nested emitter, in order, with its own in/out registers
+        nested.first->emit_code(nested.second.first, nested.second.second, vec_pool, gpr_pool);
+}
+
+void TileEmitter::emit_ptr_increments(const std::vector<Reg64>& data_ptr_regs) const {
+    // Emit `add ptr, increment * io_data_size[idx]` for every input/output data pointer that has to move.
+    const auto num_params = num_inputs + num_outputs;
+    for (size_t idx = 0; idx < num_params; ++idx)
+        if (io_dims[idx] != 1)  // those with dims == 1 will be broadcasted, hence don't require increment
+            h->add(data_ptr_regs[idx], increment * io_data_size[idx]);
+}
+
+void TileEmitter::emit_impl(const std::vector<size_t>& in,
+                            const std::vector<size_t>& out,
+                            const std::vector<size_t>& vec_pool,
+                            const std::vector<size_t>& gpr_pool,
+                            const ov::intel_cpu::emitter_context *emit_context) const {  // emits the Tile body as a loop
+    Reg64 work_amount = Reg64(static_cast<int>(in[0]));  // in[0] is the index of the work amount register
+    std::vector<Reg64> data_ptr_regs;
+    transform_idxs_to_regs(out, data_ptr_regs);  // `out` holds the data pointer register indexes
+    Label for_body;
+    // Note that:
+    // * Work amount must be set by the TileScheduler that executes Tiles
+    // * TileScheduler executes a Tile only if it has to perform >= 1 iterations
+    h->L(for_body);
+    emit_body(vec_pool, gpr_pool);
+    emit_ptr_increments(data_ptr_regs);
+    h->sub(work_amount, increment);
+    h->cmp(work_amount, increment);  // jump back while the remaining work amount is >= increment
+    h->jge(for_body, CodeGenerator::T_NEAR);
+}
+
+BroadcastMoveEmitter::BroadcastMoveEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
+                                           const std::shared_ptr<ov::Node>& n) : jit_emitter(h, isa, n) {
+    // Broadcast is needed if the input shape is empty or if the last dimensions of the input and
+    // output shapes differ; otherwise the emitter degenerates into a plain register move.
+    const auto& in_shape = n->get_input_shape(0);
+    use_broadcast = in_shape.empty() || in_shape.back() != n->get_output_shape(0).back();
+
+    const auto& in_type = n->get_input_element_type(0);
+    const auto& out_type = n->get_output_element_type(0);
+    if (in_type != out_type)
+        IE_THROW() << "BroadcastMoveEmitter supports only equal input and output types but gets: "
+            << in_type << " and " << out_type;
+    byte_size = in_type.size();
+}
+
+void BroadcastMoveEmitter::emit_impl(const std::vector<size_t>& in,
+                                     const std::vector<size_t>& out,
+                                     const std::vector<size_t>& pool,
+                                     const std::vector<size_t>& gpr,
+                                     const ov::intel_cpu::emitter_context *emit_context) const {
+    // Pick the emit_isa<> instantiation that corresponds to host_isa_; any other ISA is rejected.
+    namespace cpu_x64 = dnnl::impl::cpu::x64;
+    switch (host_isa_) {
+        case cpu_x64::sse41: emit_isa<cpu_x64::sse41>(in, out); break;
+        case cpu_x64::avx2: emit_isa<cpu_x64::avx2>(in, out); break;
+        case cpu_x64::avx512_core: emit_isa<cpu_x64::avx512_core>(in, out); break;
+        default:
+            IE_THROW() << "BroadcastMove emitter doesn't support " << host_isa_;
+    }
+}
+
+template <dnnl::impl::cpu::x64::cpu_isa_t isa>
+void BroadcastMoveEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
+    using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
+            Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in[0]);
+    Xmm xmm_src0 = Xmm(in[0]);  // same register index viewed as Xmm: the broadcast source operand
+    Vmm vmm_dst  = Vmm(out[0]);
+
+    if (use_broadcast) {
+        switch (byte_size) {  // NOTE(review): vpbroadcastw/b are not uni_ wrappers - confirm they are valid for every isa
+            case 4: h->uni_vbroadcastss(vmm_dst, xmm_src0); break;
+            case 2: h->vpbroadcastw(vmm_dst, xmm_src0); break;
+            case 1: h->vpbroadcastb(vmm_dst, xmm_src0); break;
+            default: IE_THROW() << "BroadcastMove emitter doesn't support data of size " << byte_size << " bytes";
+        }
+    } else {
+        if (vmm_src0 != vmm_dst)  // a move into the same register would be a no-op, so it is skipped
+            h->uni_vmovups(vmm_dst, vmm_src0);
+    }
+}
+
+ScalarEmitter::ScalarEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
+                             const std::shared_ptr<ov::Node>& n) : jit_emitter(h, isa, n) {
+    value = dnnl::impl::cpu::x64::float2int(ov::as_type_ptr<ngraph::snippets::op::Scalar>(n)->cast_vector<float>()[0]);  // NOTE(review): cast result is not null-checked
+    push_arg_entry_of("scalar", value, true);  // registered under the "scalar" key that emit_isa reads via table_val
+    prepare_table();
+}
+
+void ScalarEmitter::emit_impl(const std::vector<size_t>& in,
+                              const std::vector<size_t>& out,
+                              const std::vector<size_t>& pool,
+                              const std::vector<size_t>& gpr,
+                              const ov::intel_cpu::emitter_context *emit_context) const {
+    // Pick the emit_isa<> instantiation that corresponds to host_isa_; any other ISA is rejected.
+    namespace cpu_x64 = dnnl::impl::cpu::x64;
+    switch (host_isa_) {
+        case cpu_x64::sse41: emit_isa<cpu_x64::sse41>(in, out); break;
+        case cpu_x64::avx2: emit_isa<cpu_x64::avx2>(in, out); break;
+        case cpu_x64::avx512_core: emit_isa<cpu_x64::avx512_core>(in, out); break;
+        default:
+            IE_THROW() << "Scalar emitter doesn't support " << host_isa_;
+    }
+}
+
+template <dnnl::impl::cpu::x64::cpu_isa_t isa>
+void ScalarEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
+    using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
+            Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
+    Vmm vmm_dst  = Vmm(out[0]);  // `in` is not used: the value comes from the emitter table
+    h->uni_vbroadcastss(vmm_dst, table_val("scalar"));  // broadcast the "scalar" table entry to the destination
+}
+
+
+MemoryEmitter::MemoryEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
+                             const std::shared_ptr<ov::Node>& n) : jit_emitter(h, isa, n) {
+    src_prc = InferenceEngine::details::convertPrecision(n->get_input_element_type(0));  // precision of input 0
+    dst_prc = InferenceEngine::details::convertPrecision(n->get_output_element_type(0));  // precision of output 0
+}
+
+StoreEmitter::StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
+                           const std::shared_ptr<ov::Node>& n) : MemoryEmitter(h, isa, n) {
+    if (src_prc != dst_prc)  // precision conversion is rejected by this emitter
+        IE_THROW() << "StoreEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name();
+
+    count = ov::as_type_ptr<ngraph::snippets::op::Store>(n)->get_count();  // NOTE(review): cast result is not null-checked
+    in_out_type_ = emitter_in_out_map::vec_to_gpr;
+    store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count));  // the actual work is delegated to it
+}
+
+void StoreEmitter::emit_impl(const std::vector<size_t>& in,
+                             const std::vector<size_t>& out,
+                             const std::vector<size_t>& pool,
+                             const std::vector<size_t>& gpr,
+                             const ov::intel_cpu::emitter_context *emit_context) const {
+    // Pick the emit_isa<> instantiation that corresponds to host_isa_; any other ISA is rejected.
+    namespace cpu_x64 = dnnl::impl::cpu::x64;
+    switch (host_isa_) {
+        case cpu_x64::sse41: emit_isa<cpu_x64::sse41>(in, out); break;
+        case cpu_x64::avx2: emit_isa<cpu_x64::avx2>(in, out); break;
+        case cpu_x64::avx512_core: emit_isa<cpu_x64::avx512_core>(in, out); break;
+        default:
+            IE_THROW() << "Store emitter doesn't support " << host_isa_;
+    }
+}
+
+template <dnnl::impl::cpu::x64::cpu_isa_t isa>
+void StoreEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
+    // Delegates to the wrapped jit_store_emitter: in[0] and out[0] are forwarded to it together
+    // with the auxiliary vec/gpr register indexes of this emitter.
+    if (!store_emitter)
+        IE_THROW() << "Store CPU emitter isn't initialized for StoreEmitter!";
+    store_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs);
+}
+
+void StoreEmitter::emit_data() const {  // forwards emit_data() to the wrapped jit_store_emitter
+    store_emitter->emit_data();  // NOTE(review): store_emitter is not null-checked here, unlike in emit_isa
+}
+
+LoadEmitter::LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
+                         const std::shared_ptr<ov::Node>& n) : MemoryEmitter(h, isa, n) {
+    if (src_prc != dst_prc)  // precision conversion is rejected by this emitter
+        IE_THROW() << "LoadEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name();
+
+    count = ov::as_type_ptr<ngraph::snippets::op::Load>(n)->get_count();  // NOTE(review): cast result is not null-checked
+    in_out_type_ = emitter_in_out_map::gpr_to_vec;
+    load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count));  // the actual work is delegated to it
+}
+
+void LoadEmitter::emit_impl(const std::vector<size_t>& in,
+                            const std::vector<size_t>& out,
+                            const std::vector<size_t>& pool,
+                            const std::vector<size_t>& gpr,
+                            const ov::intel_cpu::emitter_context *emit_context) const {
+    // Pick the emit_isa<> instantiation that corresponds to host_isa_; any other ISA is rejected.
+    namespace cpu_x64 = dnnl::impl::cpu::x64;
+    switch (host_isa_) {
+        case cpu_x64::sse41: emit_isa<cpu_x64::sse41>(in, out); break;
+        case cpu_x64::avx2: emit_isa<cpu_x64::avx2>(in, out); break;
+        case cpu_x64::avx512_core: emit_isa<cpu_x64::avx512_core>(in, out); break;
+        default:
+            IE_THROW() << "Load emitter doesn't support " << host_isa_;
+    }
+}
+
+template <dnnl::impl::cpu::x64::cpu_isa_t isa>
+void LoadEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
+    // Delegates to the wrapped jit_load_emitter: in[0] and out[0] are forwarded to it together
+    // with the auxiliary vec/gpr register indexes of this emitter.
+    if (!load_emitter)
+        IE_THROW() << "Load CPU emitter isn't initialized for LoadEmitter!";
+    load_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs);
+}
+
+void LoadEmitter::emit_data() const {  // forwards emit_data() to the wrapped jit_load_emitter
+    load_emitter->emit_data();  // NOTE(review): load_emitter is not null-checked here, unlike in emit_isa
+}
+
+BroadcastLoadEmitter::BroadcastLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
+                                           const std::shared_ptr<ov::Node>& n) : MemoryEmitter(h, isa, n) {
+    if (src_prc != dst_prc)  // src_prc/dst_prc are set by MemoryEmitter; conversion is rejected here
+            IE_THROW() << "BroadcastEmitters support only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name();
+
+    in_out_type_ = emitter_in_out_map::gpr_to_vec;  // input is a gp register (pointer), output is a vector register
+}
+
+void BroadcastLoadEmitter::emit_impl(const std::vector<size_t>& in,
+                                     const std::vector<size_t>& out,
+                                     const std::vector<size_t>& pool,
+                                     const std::vector<size_t>& gpr,
+                                     const ov::intel_cpu::emitter_context *emit_context) const {
+    // Pick the emit_isa<> instantiation that corresponds to host_isa_; any other ISA is rejected.
+    namespace cpu_x64 = dnnl::impl::cpu::x64;
+    switch (host_isa_) {
+        case cpu_x64::sse41: emit_isa<cpu_x64::sse41>(in, out); break;
+        case cpu_x64::avx2: emit_isa<cpu_x64::avx2>(in, out); break;
+        case cpu_x64::avx512_core: emit_isa<cpu_x64::avx512_core>(in, out); break;
+        default:
+            IE_THROW() << "BroadcastLoad emitter doesn't support " << host_isa_;
+    }
+}
+
+template <dnnl::impl::cpu::x64::cpu_isa_t isa>
+void BroadcastLoadEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
+    using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
+            Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
+    Reg64 in_reg(in[0]);  // gp register holding the source address
+    Vmm vmm_dst = Vmm(out[0]);
+
+    // It doesn't really matter if we broadcast or `movss` for vector tails so keep only one version for `BroadcastLoad`,
+    // key point here is not to add post-increment, it might be fixed by some other approach in future
+    switch (src_prc.size()) {  // NOTE(review): vpbroadcastw/b are not uni_ wrappers - confirm they are valid for every isa
+        case 4: h->uni_vbroadcastss(vmm_dst, h->ptr[in_reg]); break;
+        case 2: h->vpbroadcastw(vmm_dst, h->ptr[in_reg]); break;
+        case 1: h->vpbroadcastb(vmm_dst, h->ptr[in_reg]); break;
+        default: IE_THROW() << "BroadcastLoad emitter doesn't support data of size " << src_prc.size() << " bytes";
+    }
+}
+
+LoadConvertEmitter::LoadConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
+    : MemoryEmitter(h, isa, n) {  // unlike LoadEmitter, src_prc and dst_prc are not required to match
+    count = ov::as_type_ptr<ngraph::snippets::op::Load>(n)->get_count();  // NOTE(review): cast result is not null-checked
+    in_out_type_ = emitter_in_out_map::gpr_to_vec;
+    load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count));  // the actual work is delegated to it
+}
+
+void LoadConvertEmitter::emit_impl(const std::vector<size_t>& in,
+                                   const std::vector<size_t>& out,
+                                   const std::vector<size_t>& pool,
+                                   const std::vector<size_t>& gpr,
+                                   const ov::intel_cpu::emitter_context *emit_context) const {
+    // Pick the emit_isa<> instantiation that corresponds to host_isa_; any other ISA is rejected.
+    namespace cpu_x64 = dnnl::impl::cpu::x64;
+    switch (host_isa_) {
+        case cpu_x64::sse41: emit_isa<cpu_x64::sse41>(in, out); break;
+        case cpu_x64::avx2: emit_isa<cpu_x64::avx2>(in, out); break;
+        case cpu_x64::avx512_core: emit_isa<cpu_x64::avx512_core>(in, out); break;
+        default:
+            IE_THROW() << "LoadConvert emitter doesn't support " << host_isa_;
+    }
+}
+
+template <dnnl::impl::cpu::x64::cpu_isa_t isa>
+void LoadConvertEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
+    if (!load_emitter)  // set in the constructor; the check guards the delegation below
+        IE_THROW() << "Load CPU emitter isn't initialized for LoadConvertEmitter!";
+    load_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs);  // forward in[0]/out[0] and aux registers
+}
+
+void LoadConvertEmitter::emit_data() const {  // forwards emit_data() to the wrapped jit_load_emitter
+    load_emitter->emit_data();  // NOTE(review): load_emitter is not null-checked here, unlike in emit_isa
+}
+
+StoreConvertEmitter::StoreConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
+                                         const std::shared_ptr<ov::Node>& n) : MemoryEmitter(h, isa, n) {
+    count = ov::as_type_ptr<ngraph::snippets::op::Store>(n)->get_count();  // NOTE(review): cast result is not null-checked
+    in_out_type_ = emitter_in_out_map::vec_to_gpr;
+
+    const auto mode = ov::as_type_ptr<ov::intel_cpu::StoreConvert>(n)->get_arithmetic_mode();  // NOTE(review): same, no null check
+    store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count, mode));  // the arithmetic mode is forwarded too
+}
+
+void StoreConvertEmitter::emit_impl(const std::vector<size_t>& in,
+                                    const std::vector<size_t>& out,
+                                    const std::vector<size_t>& pool,
+                                    const std::vector<size_t>& gpr,
+                                    const ov::intel_cpu::emitter_context *emit_context) const {
+    // Pick the emit_isa<> instantiation that corresponds to host_isa_; any other ISA is rejected.
+    namespace cpu_x64 = dnnl::impl::cpu::x64;
+    switch (host_isa_) {
+        case cpu_x64::sse41: emit_isa<cpu_x64::sse41>(in, out); break;
+        case cpu_x64::avx2: emit_isa<cpu_x64::avx2>(in, out); break;
+        case cpu_x64::avx512_core: emit_isa<cpu_x64::avx512_core>(in, out); break;
+        default:
+            IE_THROW() << "StoreConvert emitter doesn't support " << host_isa_;
+    }
+}
+
+template <dnnl::impl::cpu::x64::cpu_isa_t isa>
+void StoreConvertEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
+    if (!store_emitter)  // set in the constructor; the check guards the delegation below
+        IE_THROW() << "Store CPU emitter isn't initialized for StoreConvertEmitter!";
+    store_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs);  // forward in[0]/out[0] and aux registers
+}
+
+void StoreConvertEmitter::emit_data() const {  // forwards emit_data() to the wrapped jit_store_emitter
+    store_emitter->emit_data();  // NOTE(review): store_emitter is not null-checked here, unlike in emit_isa
+}
+
+}   // namespace intel_cpu
+}   // namespace ov
diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp
index c078fa68003cd7..f23efe19e75dbe 100644
--- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp
+++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp
@@ -6,15 +6,19 @@
 
 #include <ngraph/rt_info.hpp>
 #include <ngraph/variant.hpp>
+#include <ie_ngraph_utils.hpp>
 
 #include "jit_emitter.hpp"
+#include "jit_load_store_emitters.hpp"
 
 using namespace Xbyak;
+using ngraph::snippets::AllocatedEmitter;
 
 namespace ov {
 namespace intel_cpu {
 
-#define SNIPPETS_MAX_SNIPPETS_DIMS 7
+
+#define SNIPPETS_MAX_SNIPPETS_DIMS 12
 #define SNIPPETS_MAX_HARNESS_DIMS 5
 #define SNIPPETS_MAX_TILE_RANK 2
 #define GET_OFF(field) offsetof(jit_snippets_call_args, field)
@@ -30,11 +34,27 @@ struct jit_snippets_compile_args {
     std::vector<size_t> output_dims = {};
 };
 ///
-/// \brief    Kernel is the only entry point to Codogen Jit compilation. Kernel calculates appropriate data offsets,
-/// and invokes enclosed outer Tiles. Only 2d Tiles are currently supported, so the emitters should
-/// be organized in the following way:
-/// KernelEmitter {          /* entry point */
-///     TileEmitter {        /* outer tile */
+/// \brief jit_container_emitter is designed to wrap Emitters that contain other Emitters (presently KernelEmitter,
+/// TileSchedulerEmitter and TileEmitter). This is needed to provide a common interface for register mapping
+/// (abstract to physical) and nested code access.
+///
+class jit_container_emitter: public jit_emitter {
+public:
+    jit_container_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
+                          const std::shared_ptr<ov::Node>& n);
+protected:
+    // Maps gpr and vec abstract registers to physical ones. Physical reg indexes are taken from the provided pools
+    // (the first 2 args). All the used gpr and vec registers are also stored in the provided sets (the last 2 args).
+    void map_abstract_registers(const std::vector<size_t>&,  const std::vector<size_t>&,
+                                std::set<size_t>&, std::set<size_t>&);
+    std::vector<AllocatedEmitter> body;  // nested emitters, each paired with its input/output register indexes
+};
+///
+/// \brief    Kernel is the only entry point to Codegen Jit compilation. Kernel performs abstract-to-physical register
+/// mapping and creates pools of available gpr and vec registers. Kernel is expected to contain (at least one)
+/// TileSchedulerEmitter. In general the enclosed emitters should be organized in the following way:
+/// KernelEmitter {          /* entry point, maps registers, creates pools of available registers */
+///     TileSchedulerEmitter { /* executes required inner Tiles, avoids emitting code that won't be executed */
 ///         TileEmitter {    /* inner vector tile */
 ///             ...          /* All the necessary Load/Strore/elementwise emitters */
 ///         }
@@ -43,255 +63,110 @@ struct jit_snippets_compile_args {
 ///         }
 ///     }
 /// }
-/// Note that Kernel params are passed directly to the emit_code(). The vector of inputs should contain 2 arguments, the
-/// output vector should be empty. Input parameters
+/// Note that Kernel doesn't accept any input arguments.
 ///
-/// \param      in[0]       The number of the node inputs
-/// \param      in[1]      The number of the node outputs
-///
-// Todo: Scheduler dims and offsets are currently calculated in Subgraph node and passed to the KernelEmitter.
-//  However, it seems more natural to calculate all the offsets right in the Kernel op, because the calculation is
-//  not device-specific. It is based only on input/output dims (which we already know) and harness num dims
-//  (which we should pass from the plugin). It seems also better to wrap the enclosed emitters in tiles in the Kernel op
-//  and avoid creating empty tiles.
-class KernelEmitter : public jit_emitter {
+class KernelEmitter : public jit_container_emitter {
 public:
     KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
-    const std::shared_ptr<ov::Node>& n)
-    : jit_emitter(h, isa, n) {
-        const auto kernel = ov::as_type_ptr<ngraph::snippets::op::Kernel>(n);
-        if (!kernel)
-            IE_THROW() << "KernelEmitter invoked with invalid op argument";
-        if (!kernel->compile_params)
-            IE_THROW() << "KernelEmitter invoked without compile_params";
-        code = kernel->region;
-        jcp = *reinterpret_cast<const jit_snippets_compile_args*>(kernel->compile_params);
-    }
+                  const std::shared_ptr<ov::Node>& n);
 
     size_t get_inputs_num() const override {return 0;}
-
-    void emit_code(const std::vector<size_t> &in, const std::vector<size_t> &out,
-              const std::vector<size_t> &pool = {}, const std::vector<size_t> &gpr = {}) const override {
-        validate_arguments(in, out, pool, gpr);
-        emit_impl(in, out, pool, gpr, nullptr);
-    }
+    void emit_code(const std::vector<size_t> &in,
+                   const std::vector<size_t> &out,
+                   const std::vector<size_t> &pool,
+                   const std::vector<size_t> &gpr) const override;
 
 private:
-    void validate_arguments(const std::vector<size_t> &in, const std::vector<size_t> &out,
-                            const std::vector<size_t> &pool = {}, const std::vector<size_t> &gpr = {}) const override {
-        if (in.size() != 2)
-            IE_THROW() << "KernelEmitter got invalid number of inputs. Expected 2, got " << in.size();
-        if (out.size() != 0)
-            IE_THROW() << "KernelEmitter got unexpected output arguments.";
-        const size_t num_params = in[0] + in[1];
-        if (num_params > SNIPPETS_MAX_SNIPPETS_DIMS)
-            IE_THROW() << "KernelEmitter supports only up to " << SNIPPETS_MAX_SNIPPETS_DIMS <<
-                       " parameters, got " << num_params;
-        const int64_t harness_num_dims = jcp.output_dims.size() - 1;
-        if (harness_num_dims > SNIPPETS_MAX_HARNESS_DIMS)
-            IE_THROW() << "KernelEmitter supports harness with up to " << SNIPPETS_MAX_HARNESS_DIMS <<
-                       " dims, got " << harness_num_dims;
-    }
+    void validate_arguments(const std::vector<size_t> &in,
+                            const std::vector<size_t> &out,
+                            const std::vector<size_t> &pool,
+                            const std::vector<size_t> &gpr) const override;
+    void emit_impl(const std::vector<size_t>& in,
+                   const std::vector<size_t>& out,
+                   const std::vector<size_t>& pool,
+                   const std::vector<size_t>& gpr,
+                   const ov::intel_cpu::emitter_context *emit_context) const override;
+    void init_data_pointers(size_t, size_t, const Reg64&, const Reg64&, const std::vector<Reg64>&) const;
+
+    jit_snippets_compile_args jcp;
+    std::vector<size_t> gp_regs_pool;
+    std::vector<size_t> gp_regs_used;
+    std::vector<size_t> vec_regs_pool;
+};
+///
+/// \brief  TileSchedulerEmitter contains Tiles to be executed (presently vector and scalar). It calculates data offsets
+/// and work amounts, and performs data pointer decrements if necessary. It also performs some Tile optimizations: scalar/vector
+/// tiles are emitted only if necessary; the Tile body can be emitted directly if only one Tile evaluation is required.
+///
+/// \param      in[0]      The number of the node inputs
+/// \param      in[1]      The number of the node outputs
+/// \param      in[2]      The number of elements that fit into a vector register
+///
+
+class TileSchedulerEmitter : public jit_container_emitter {
+public:
+    TileSchedulerEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
+                         const std::shared_ptr<ov::Node>& n);
 
+    size_t get_inputs_num() const override {return 0;}
+    void emit_code(const std::vector<size_t> &in,
+                   const std::vector<size_t> &out,
+                   const std::vector<size_t> &pool,
+                   const std::vector<size_t> &gpr) const override;
+
+private:
+    void validate_arguments(const std::vector<size_t> &in,
+                            const std::vector<size_t> &out,
+                            const std::vector<size_t> &pool,
+                            const std::vector<size_t> &gpr) const override;
     void emit_impl(const std::vector<size_t>& in,
                    const std::vector<size_t>& out,
                    const std::vector<size_t>& pool,
                    const std::vector<size_t>& gpr,
-                   const ov::intel_cpu::emitter_context *emit_context) const override {
-        const size_t num_inputs = in[0];
-        const size_t num_outputs = in[1];
-        const size_t num_params = num_inputs + num_outputs;
-        int reg64_tmp_start { 8 }; // R8, R9, R10, R11, R12, R13, R14, R15 inputs+outputs+1
-        const int64_t harness_num_dims = jcp.output_dims.size() - 1;
-
-        Reg64 reg_indexes   { dnnl::impl::cpu::x64::abi_param_regs[0] };
-        Reg64 reg_const_params { dnnl::impl::cpu::x64::abi_param_regs[1] };
-        Xbyak::Reg64 reg_tmp_64 { dnnl::impl::cpu::x64::abi_not_param_reg };
-
-        h->preamble();
-
-        std::vector<Reg64> regs(num_params);
-        auto init_ptrs_with_offsets = [&](Reg64 pointer, const int64_t *offsets) {
-            for (int j = 0; j < harness_num_dims; j++) {
-                if (jcp.output_dims[j] != 1 && offsets[j] != 0) {
-                    h->mov(reg_tmp_64, offsets[j]);
-                    h->imul(reg_tmp_64, h->ptr[reg_indexes + j * sizeof(size_t)]);
-                    h->add(pointer, reg_tmp_64);
-                }
-            }
-        };
-        for (auto i = 0; i < num_params; i++) {
-            regs[i] = Reg64(reg64_tmp_start + i);
-            if (i < num_inputs)
-                h->mov(regs[i], h->ptr[reg_const_params + GET_OFF(src_ptrs) + i * sizeof(void*)]);
-            else
-                h->mov(regs[i], h->ptr[reg_const_params + GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)]);
-            init_ptrs_with_offsets(regs[i], &jcp.data_offsets[i * harness_num_dims]);
-        }
-
-        for (auto& c : code) {
-            c.first->emit_code(c.second.first, c.second.second, pool, gpr);
-        }
-
-        h->postamble();
-    }
+                   const ov::intel_cpu::emitter_context *emit_context) const override;
+
+    void emit_tiles(const Reg64&, const std::vector<Reg64>&, size_t, const std::vector<size_t>& , const std::vector<size_t>&) const;
 
     jit_snippets_compile_args jcp;
-    std::vector<std::pair<std::shared_ptr<Emitter>, ngraph::snippets::RegInfo>> code;
 };
+
 ///
 /// \brief    Tile is designed to organize loop over the input and output data. It is essentially a for(...) loop:
-/// it calculates the total number of iterations, performs operations specified by enclosed emitters, advances iteration counters
+/// it performs operations specified by enclosed emitters, advances iteration counters
 /// and breaks when necessary.
 ///
 /// \param      in[0]    The number of input entities (or scheduler counts) processed during one iteration of the tile.
-/// It is expected to be 1 for outer or scalar tiles and vlen for vector tiles.
-/// \param      in[1]    Increment of the previous Tile in current dimension. Must be 0 if this is the first Tile.
-/// So previous_inc is zero for outer and vector tiles (the are the first in dim) and vlen for scalar tiles (they usually go after vector Tiles).
-/// \param      in[2]    sum number inputs and number of outputs of the node.
-/// \param      in[3]    dimension of the tile. Note that only 2d Tile are currently supported, so dim is 0 for outer tiles, 1 for inner tiles.
-///
-// Todo: Inner and outer tiles have different semantics. For example, outer tile always has the increment == 1, and it can contain only
-//  tile emitters (one outer or two inner). So it seems better to create different classes for inner and outer tiles.
-// Todo: Currently data pointers incremented after each read/write in Load/Store emitters, so we have to decrement them here
-//  if the same data needs to be read twice. Better to move all the pointer increments to TileEmitter and avoid the increments if necessary.
-class TileEmitter : public jit_emitter {
+///  It is expected to be 1 for outer or scalar tiles and vlen for vector tiles.
+class TileEmitter : public jit_container_emitter {
 public:
-    TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
-    const std::shared_ptr<ov::Node>& n)
-    : jit_emitter(h, isa, n) {
-        const auto tile = ov::as_type_ptr<ngraph::snippets::op::Tile>(n);
-        if (!tile)
-            IE_THROW() << "TileEmitter invoked with invalid op argument";
-        if (!tile->compile_params)
-            IE_THROW() << "TileEmitter invoked without compile_params";
-        code = tile->region;
-        jcp = *reinterpret_cast<const jit_snippets_compile_args*>(tile->compile_params);
-    }
+    TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
 
     size_t get_inputs_num() const override {return 0;}
+    std::vector<AllocatedEmitter>& get_nested_code();
+    void emit_code(const std::vector<size_t> &in,
+                   const std::vector<size_t> &out,
+                   const std::vector<size_t> &pool,
+                   const std::vector<size_t> &gpr) const override;
 
-    void emit_code(const std::vector<size_t> &in, const std::vector<size_t> &out,
-              const std::vector<size_t> &pool = {}, const std::vector<size_t> &gpr = {}) const override {
-        validate_arguments(in, out, pool, gpr);
-        emit_impl(in, out, pool, gpr, nullptr);
-    }
+    void emit_body(const std::vector<size_t>& vec_pool, const std::vector<size_t>& gpr_pool) const;
+    void emit_ptr_increments(const std::vector<Reg64>& data_ptr_regs) const;
 
 private:
-    void validate_arguments(const std::vector<size_t> &in, const std::vector<size_t> &out,
-                            const std::vector<size_t> &pool = {}, const std::vector<size_t> &gpr = {}) const override {
-        if (in.size() != 4)
-            IE_THROW() << "TileEmitter got invalid number of inputs. Expected 4, got " << in.size();
-        if (out.size() != 0)
-            IE_THROW() << "TileEmitter got unexpected output arguments.";
-        const size_t num_params = in[2];
-        if (num_params > SNIPPETS_MAX_SNIPPETS_DIMS)
-            IE_THROW() << "TileEmitter supports only up to " << SNIPPETS_MAX_SNIPPETS_DIMS <<
-                       " parameters, got " << num_params;
-        const size_t dim = in[3];
-        if (dim >= SNIPPETS_MAX_TILE_RANK)
-            IE_THROW() << "TileEmitter supports tile ranks up to " << SNIPPETS_MAX_TILE_RANK <<
-                       " got " << dim;
-    }
-
+    void validate_arguments(const std::vector<size_t> &in,
+                            const std::vector<size_t> &out,
+                            const std::vector<size_t> &pool,
+                            const std::vector<size_t> &gpr) const override;
     void emit_impl(const std::vector<size_t>& in,
                    const std::vector<size_t>& out,
                    const std::vector<size_t>& pool,
                    const std::vector<size_t>& gpr,
-                   const ov::intel_cpu::emitter_context *emit_context) const override {
-        const size_t inc = in[0];
-        const size_t previous_inc = in[1]; // increment of a previous tile in the same dim (0 if the first tile in the dim)
-        const size_t num_params = in[2];
-        const size_t dim = in[3]; // tile dimension: 0 - outer, 1 - inner
-        const int reg64_tmp_start { 8 }; // R8, R9, R10, R11, R12, R13, R14, R15 inputs+outputs+1
-        Reg64 amount = Reg64(reg64_tmp_start + num_params); // amount
-        std::array<Label, 2> for_body;
-
-        // If R15 is not used, reserve it for use in scalar to avoid redundant push-pop's.
-        // todo: Do we need explicitly check that code contains ScalarEmitter?
-        std::vector<size_t> local_gpr = reg64_tmp_start + num_params < 15 ? std::vector<size_t>{15} : std::vector<size_t>{};
-        std::vector<Reg64> regs(num_params);
-        for (auto i = 0; dim == 0 && i < num_params; i++)
-            regs[i] = Reg64(reg64_tmp_start + i);
-        // Loop processing could be simplified in some cases
-        if (inc > jcp.scheduler_dims[dim]) {
-            return;
-        } else if (inc == jcp.scheduler_dims[dim]) {
-            for (auto& c : code) {
-                c.first->emit_code(c.second.first, c.second.second, pool, local_gpr);
-            }
-        } else {
-            // The previous tile has done nothing, all the work is ours
-            if (previous_inc == 0 || previous_inc > jcp.scheduler_dims[dim]) {
-                h->mov(amount, jcp.scheduler_dims[dim]);
-            // The previous tile has done all the work
-            } else if (jcp.scheduler_dims[dim] % previous_inc == 0) {
-                return;
-            }// else: the previous tile has already set a proper work amount
-            h->cmp(amount, inc);
-            h->jl(for_body[0], CodeGenerator::T_NEAR);
-
-            h->L(for_body[1]);
-            {
-                h->push(amount);
-                for (auto& c : code) {
-                    c.first->emit_code(c.second.first, c.second.second, pool, local_gpr);
-                }
-                h->pop(amount);
-                // Todo: Load and Store emitters are currently implemented so they ALWAYS increment appropriate pointers
-                //   after reading/writing. This might be a problem if we need to read the same data multiple times (broadcasting shapes).
-                //   To overcome this limitation, we add appropriate negative offsets if necessary.
-                for (auto i = 0; dim == 0 && i < num_params; i++) {
-                    if (jcp.scheduler_offsets[i] != 0) {
-                        h->add(regs[i], jcp.scheduler_offsets[i]);
-                    }
-                }
-                    h->sub(amount, inc);
-                    h->cmp(amount, inc);
-                    h->jge(for_body[1], CodeGenerator::T_NEAR);
-            }
-
-            h->L(for_body[0]);
-        }
-    }
+                   const ov::intel_cpu::emitter_context *emit_context) const override;
 
-    // A = <42, 17>
-    // B = < 1, 17>
-    // for (auto k = 0; k < dom_0; k++) { // 42
-    //   for (auto n = 0; n < dom_1; n++) { // 17
-    //     auto a = *ptr0; ptr0 += vlan; // vector/scalar load
-    //     auto b = *ptr1; ptr1 += vlan; // vector/scalar load
-    //   }
-    //   ptr0 -= 0*dom_1;
-    //   ptr1 -= 1*dom_1;
-    // }
-
-    // broadcast by MVD is extra case
-    // A = <42, 17>
-    // B = <42,  1>
-    // for (auto k = 0; k < dom_0; k++) { // 42
-    //   for (auto n = 0; n < dom_1; n++) { // 17
-    //     auto a = *ptr0; ptr0 += vlan; // vector/scalar load
-    //     auto b = *ptr1;  // broadcast load
-    //   }
-    //   ptr0 -= 0*dom_1;
-    //   ptr1 += sizeof(ptr1[0]); //ptr1 -= -sizeof(ptr1[0]);
-    // }
-
-    // A = <42, 17, 31>
-    // B = < 1, 17, 31>
-    // for (auto k = 0; k < dom_0; k++) { // 42
-    //   for (auto n = 0; n < dom_1; n++) { // 17
-    //     for (auto m = 0; m < dom_2; m++) { // 31
-    //       auto a = *ptr0; ptr0 += vlan; // vector/scalar load
-    //       auto b = *ptr1; ptr1 += vlan; // vector/scalar load
-    //     }
-    //   }
-    //   ptr0 -= 0*dom_1*dom2;
-    //   ptr1 -= 1*dom_1*dom2;
-    // }
-    jit_snippets_compile_args jcp;
-    std::vector<std::pair<std::shared_ptr<Emitter>, ngraph::snippets::RegInfo>> code;
+    size_t num_inputs = 0;
+    size_t num_outputs = 0;
+    std::vector<size_t> io_dims {};
+    std::vector<size_t> io_data_size {};
+    size_t increment = 0;
 };
 
 class NopEmitter : public jit_emitter {
@@ -311,17 +186,10 @@ class NopEmitter : public jit_emitter {
     }
 };
 
-class FakeBroadcastEmitter : public jit_emitter {
+class BroadcastMoveEmitter : public jit_emitter {
 public:
-    FakeBroadcastEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
-    : jit_emitter(h, isa, n) {
-        if (n->get_input_shape(0).empty())
-            use_broadcast = true;
-        else if (*n->get_input_shape(0).rbegin() != *n->get_output_shape(0).rbegin())
-            use_broadcast = true;
-        else
-            use_broadcast = false;
-    }
+    BroadcastMoveEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
+
     size_t get_inputs_num() const override {return 1;}
 
 private:
@@ -329,45 +197,19 @@ class FakeBroadcastEmitter : public jit_emitter {
               const std::vector<size_t>& out,
               const std::vector<size_t>& pool,
               const std::vector<size_t>& gpr,
-              const ov::intel_cpu::emitter_context *emit_context) const override {
-        if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
-            emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
-        } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
-            emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
-        } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
-            emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
-        } else {
-            IE_THROW() << host_isa_;
-            assert(!"unsupported isa");
-        }
-    }
+              const ov::intel_cpu::emitter_context *emit_context) const override;
 
     template <dnnl::impl::cpu::x64::cpu_isa_t isa>
-    void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
-        using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
-                                    Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
-        Vmm vmm_src0 = Vmm(in[0]);
-        Vmm vmm_dst  = Vmm(out[0]);
-
-        if (use_broadcast) {
-            h->uni_vbroadcastss(vmm_dst, Xmm(in[0]));
-        } else {
-            h->uni_vmovups(vmm_dst, vmm_src0);
-        }
-    }
+    void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
 
 private:
     bool use_broadcast;
+    size_t byte_size = 0lu;
 };
 
 class ScalarEmitter : public jit_emitter {
 public:
-    ScalarEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
-    : jit_emitter(h, isa, n) {
-        value = dnnl::impl::cpu::x64::float2int(ov::as_type_ptr<ngraph::snippets::op::Scalar>(n)->cast_vector<float>()[0]);
-        push_arg_entry_of("scalar", value, true);
-        prepare_table();
-    }
+    ScalarEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
 
     size_t get_inputs_num() const override {return 0;}
 
@@ -379,26 +221,10 @@ class ScalarEmitter : public jit_emitter {
               const std::vector<size_t>& out,
               const std::vector<size_t>& pool,
               const std::vector<size_t>& gpr,
-              const ov::intel_cpu::emitter_context *emit_context) const override {
-        if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
-            emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
-        } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
-            emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
-        } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
-            emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
-        } else {
-            IE_THROW() << host_isa_;
-            assert(!"unsupported isa");
-        }
-    }
+              const ov::intel_cpu::emitter_context *emit_context) const override;
 
     template <dnnl::impl::cpu::x64::cpu_isa_t isa>
-    void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
-        using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
-                                    Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
-        Vmm vmm_dst  = Vmm(out[0]);
-        h->uni_vbroadcastss(vmm_dst, table_val("scalar"));
-    }
+    void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
 
 private:
     int32_t value;
@@ -415,33 +241,16 @@ class ScalarEmitter : public jit_emitter {
 /// Blocked parameter to tell if input is actually blocked. Broadcast means broadcast by W in other cases no need to substitute load.
 class MemoryEmitter : public jit_emitter  {
 public:
-    MemoryEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
-    : jit_emitter(h, isa, n), ea(getEA(n)) {
-    }
-
-    size_t get_inputs_num() const override {return 1;}
+    MemoryEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
 
 protected:
-    static auto getEA(const std::shared_ptr<ov::Node>& n) -> size_t {
-        auto& rt = n->get_rt_info();
-        size_t ea = 0;
-        auto it = rt.find("effectiveAddress");
-        if (it != rt.end()) {
-            ea = it->second.as<int64_t>();
-        } else {
-            throw ov::Exception("effective address for Load generation cannot be determined");
-        }
-        return ea;
-    }
-
-    size_t ea;
+    Precision src_prc;
+    Precision dst_prc;
 };
 
 class StoreEmitter : public MemoryEmitter  {
 public:
-    StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
-    : MemoryEmitter(h, isa, n) {
-    }
+    StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
 
     size_t get_inputs_num() const override {return 1;}
 
@@ -450,72 +259,42 @@ class StoreEmitter : public MemoryEmitter  {
               const std::vector<size_t>& out,
               const std::vector<size_t>& pool,
               const std::vector<size_t>& gpr,
-              const ov::intel_cpu::emitter_context *emit_context) const override {
-        if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
-            emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
-        } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
-            emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
-        } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
-            emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
-        } else {
-            IE_THROW() << host_isa_;
-            assert(!"unsupported isa");
-        }
-    }
+              const ov::intel_cpu::emitter_context *emit_context) const override;
 
     template <dnnl::impl::cpu::x64::cpu_isa_t isa>
-    void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
-        using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
-                                    Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
-        Reg64 out_reg(ea);
-        Vmm vmm_src0 = Vmm(in[0]);
-        h->uni_vmovups(h->ptr[out_reg], vmm_src0);
-        h->add(out_reg, dnnl::impl::cpu::x64::cpu_isa_traits<isa>::vlen);
-    }
+    void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
+    void emit_data() const override;
+
+private:
+    size_t count;
+    std::unique_ptr<jit_store_emitter> store_emitter = nullptr;
 };
 
-class ScalarStoreEmitter : public MemoryEmitter {
+class LoadEmitter : public MemoryEmitter {
 public:
-    ScalarStoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
-    : MemoryEmitter(h, isa, n) {
-    }
+    LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
 
-    size_t get_inputs_num() const override {return 1;}
+    size_t get_inputs_num() const override {return 0;}
 
 private:
     void emit_impl(const std::vector<size_t>& in,
               const std::vector<size_t>& out,
               const std::vector<size_t>& pool,
               const std::vector<size_t>& gpr,
-              const ov::intel_cpu::emitter_context *emit_context) const override {
-        if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
-            emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
-        } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
-            emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
-        } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
-            emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
-        } else {
-            IE_THROW() << host_isa_;
-            assert(!"unsupported isa");
-        }
-    }
+              const ov::intel_cpu::emitter_context *emit_context) const override;
 
     template <dnnl::impl::cpu::x64::cpu_isa_t isa>
-    void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
-        using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
-                                        Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
-        Reg64 out_reg(ea);
-        Xmm vmm_src0 = Xmm(in[0]);
-        h->uni_vmovss(h->ptr[out_reg], vmm_src0);
-        h->add(out_reg, sizeof(float));
-    }
+    void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
+    void emit_data() const override;
+
+private:
+    size_t count;
+    std::unique_ptr<jit_load_emitter> load_emitter = nullptr;
 };
 
-class LoadEmitter : public MemoryEmitter {
+class BroadcastLoadEmitter : public MemoryEmitter {
 public:
-    LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
-    : MemoryEmitter(h, isa, n), shouldPostIncrement(*n->get_input_shape(0).rbegin() != 1) {
-    }
+    BroadcastLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
 
     size_t get_inputs_num() const override {return 0;}
 
@@ -524,115 +303,54 @@ class LoadEmitter : public MemoryEmitter {
               const std::vector<size_t>& out,
               const std::vector<size_t>& pool,
               const std::vector<size_t>& gpr,
-              const ov::intel_cpu::emitter_context *emit_context) const override {
-        if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
-            emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
-        } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
-            emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
-        } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
-            emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
-        } else {
-            IE_THROW() << host_isa_;
-            assert(!"unsupported isa");
-        }
-    }
+              const ov::intel_cpu::emitter_context *emit_context) const override;
 
     template <dnnl::impl::cpu::x64::cpu_isa_t isa>
-    void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
-        using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
-                                            Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
-        Reg64 in_reg(ea);
-        Vmm vmm_src0 = Vmm(out[0]);
-        h->uni_vmovups(vmm_src0, h->ptr[in_reg]);
-
-        if (shouldPostIncrement) {
-            h->add(in_reg, dnnl::impl::cpu::x64::cpu_isa_traits<isa>::vlen);
-        }
-    }
-
-private:
-    bool shouldPostIncrement;
+    void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
 };
 
-class BroadcastLoadEmitter : public MemoryEmitter {
+class LoadConvertEmitter : public MemoryEmitter {
 public:
-    BroadcastLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
-    : MemoryEmitter(h, isa, n) {
-    }
+    LoadConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
+
     size_t get_inputs_num() const override {return 0;}
 
 private:
     void emit_impl(const std::vector<size_t>& in,
-              const std::vector<size_t>& out,
-              const std::vector<size_t>& pool,
-              const std::vector<size_t>& gpr,
-              const ov::intel_cpu::emitter_context *emit_context) const override {
-        if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
-            emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
-        } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
-            emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
-        } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
-            emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
-        } else {
-            IE_THROW() << host_isa_;
-            assert(!"unsupported isa");
-        }
-    }
+                   const std::vector<size_t>& out,
+                   const std::vector<size_t>& pool,
+                   const std::vector<size_t>& gpr,
+                   const ov::intel_cpu::emitter_context *emit_context) const override;
 
     template <dnnl::impl::cpu::x64::cpu_isa_t isa>
-    void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
-        using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
-                                            Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
-        Reg64 in_reg(ea);
-        Vmm vmm_src0 = Vmm(out[0]);
-
-        // In doesn't really matter if we broadcast or `movss` for vector tails so keep only one version for `BroadcastLoad`,
-        // key point here is not to add post-increment, it might be fixed by some other approach in future
-        h->uni_vbroadcastss(vmm_src0, h->ptr[in_reg]);
-    }
+    void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
+    void emit_data() const override;
+
+private:
+    size_t count;
+    std::unique_ptr<jit_load_emitter> load_emitter = nullptr;
 };
 
-class ScalarLoadEmitter : public MemoryEmitter {
+class StoreConvertEmitter : public MemoryEmitter {
 public:
-    ScalarLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
-    : MemoryEmitter(h, isa, n), shouldPostIncrement(*n->get_input_shape(0).rbegin() != 1) {
-    }
-    size_t get_inputs_num() const override {return 0;}
+    StoreConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
+
+    size_t get_inputs_num() const override {return 1;}
 
 private:
     void emit_impl(const std::vector<size_t>& in,
-              const std::vector<size_t>& out,
-              const std::vector<size_t>& pool,
-              const std::vector<size_t>& gpr,
-              const ov::intel_cpu::emitter_context *emit_context) const override {
-        if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
-            emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
-        } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
-            emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
-        } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
-            emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
-        } else {
-            IE_THROW() << host_isa_;
-            assert(!"unsupported isa");
-        }
-    }
+                   const std::vector<size_t>& out,
+                   const std::vector<size_t>& pool,
+                   const std::vector<size_t>& gpr,
+                   const ov::intel_cpu::emitter_context *emit_context) const override;
 
     template <dnnl::impl::cpu::x64::cpu_isa_t isa>
-    void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
-        using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
-                                            Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
-        Reg64 in_reg(ea);
-        Xmm vmm_src0 = Xmm(out[0]);
-        h->uni_vmovss(vmm_src0, h->ptr[in_reg]);
-
-        // Doesn't work if the same pointer comes with multiple load operations
-        if (shouldPostIncrement) {
-            h->add(in_reg, sizeof(float));
-        }
-    }
+    void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
+    void emit_data() const override;
 
 private:
-    bool shouldPostIncrement;
+    size_t count;
+    std::unique_ptr<jit_store_emitter> store_emitter = nullptr;
 };
 
 }   // namespace intel_cpu
diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp
index 59b7a838ed9fd6..3a846baa390936 100644
--- a/src/plugins/intel_cpu/src/extension.cpp
+++ b/src/plugins/intel_cpu/src/extension.cpp
@@ -7,6 +7,7 @@
 #include "ngraph_transformations/op/leaky_relu.hpp"
 #include "ngraph_transformations/op/power_static.hpp"
 #include "ngraph_transformations/op/swish_cpu.hpp"
+#include "snippets_transformations/op/load_store_convert.hpp"
 
 #include <ngraph/ngraph.hpp>
 #include <ngraph_ops/type_relaxed.hpp>
@@ -40,6 +41,8 @@ std::map<std::string, ngraph::OpSet> Extension::getOpSets() {
         NGRAPH_OP(LeakyReluNode, ov::intel_cpu)
         NGRAPH_OP(PowerStaticNode, ov::intel_cpu)
         NGRAPH_OP(SwishNode, ov::intel_cpu)
+        NGRAPH_OP(LoadConvert, ov::intel_cpu)
+        NGRAPH_OP(StoreConvert, ov::intel_cpu)
 #undef NGRAPH_OP
 
         return opset;
diff --git a/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp b/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp
index 577578914de627..0768dbb34d0821 100644
--- a/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp
+++ b/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp
@@ -180,6 +180,39 @@ bool isSuitableMatMulParent(const std::shared_ptr<const Node> &node) {
     const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1);
     return is_suitable_node && has_only_child;
 }
+// Subtract as ZeroPoints for Convolution
+bool isSuitableSubtractAsZeroPointsParent(const std::shared_ptr<const Node> &node) {
+    const bool is_suitable_node = ov::is_type<ngraph::op::v1::Subtract>(node);
+    const auto out = node->outputs();
+    const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1);
+    const bool has_two_parents = node->get_input_size() == 2;
+    if (!(is_suitable_node && has_only_child && has_two_parents))
+        return false;
+
+    const auto child = node->get_output_target_inputs(0).begin()->get_node()->shared_from_this();
+    const bool is_conv = ov::is_type<ov::op::v1::Convolution>(child);
+    const bool is_group_conv = ov::is_type<ov::op::v1::GroupConvolution>(child);
+    if (!is_conv && !is_group_conv)
+        return false;
+    const auto weight_shape = child->get_input_shape(1);
+    const bool is_depthwise = is_group_conv && weight_shape[1] == 1 && weight_shape[2] == 1;
+    const bool deptwise_is_suitable = implication(is_depthwise, child->get_input_shape(0).size() < 5);
+    if (!(is_conv && deptwise_is_suitable))
+        return false;
+
+    const bool first_input_is_suitable = node->get_input_node_shared_ptr(0)->get_output_element_type(0) == ov::element::u8;
+    const auto zp_weights = node->get_input_node_shared_ptr(1);
+    const auto zp_weight_shape = zp_weights->get_output_shape(0);
+    bool second_input_is_suitable =
+            ov::is_type<ngraph::op::v0::Constant>(zp_weights) &&
+                    zp_weights->get_output_element_type(0) == ov::element::u8 &&
+                    zp_weight_shape.size() >= 2;
+    if (!(first_input_is_suitable && second_input_is_suitable))
+        return false;
+    auto correct_shape = ov::Shape(zp_weight_shape.size(), 1);
+    correct_shape[1] = zp_weight_shape[1];
+    return correct_shape == zp_weight_shape;
+}
 bool isSuitablePoolChild(const std::shared_ptr<const Node> &node) {
     const bool is_suitable_node = ov::is_type<ngraph::op::v1::MaxPool>(node);
     // has a single output, connected to a single child
@@ -225,15 +258,40 @@ bool isSuitableChildForFusingMatMul(const std::shared_ptr<const Node> &node, Nod
     // FuseMatMulAndSimpleOperation or FuseFullyConnectedAndSimpleOperation
     // Invoke SupportsFusingWithConvolution_Simple directly instead of isSuitableChildForFusingSimple to
     // eliminate getNumNonConstInputs() check
-    int fusingAxis;
-    if (can_be_converted_to_FC)
-        fusingAxis = matmul_shape.size() == 3 ? 2 : 1;
-    else
-        fusingAxis = matmul_shape.size() - 1;
+    int fusingAxis = can_be_converted_to_FC ? (matmul_shape.size() == 3 ? 2 : 1) : matmul_shape.size() - 1;
+
     if (SupportsFusingWithConvolution_Simple(node, fusingAxis)) {
         updatedChainType = NodeFusingType::FusedWithMisc;
         return true;
     }
+
+    // canFuse() from MatMul for case with rank > 2
+    // Algorithm::EltwisePowerStatic is ignored
+    if (!can_be_converted_to_FC &&
+        node->get_output_shape(0).size() > 2) {
+        if (ov::is_type<ov::op::v1::Add>(node) ||
+            ov::is_type<ov::op::v1::Multiply>(node) ||
+            ov::is_type<ov::op::v1::Subtract>(node) ||
+            ov::is_type<ov::op::v1::Divide>(node) ||
+            ov::is_type<ov::op::v0::PRelu>(node)) {
+            const auto const1 = ov::is_type<ov::op::v0::Constant>(node->get_input_node_shared_ptr(0));
+            const auto const2 = ov::is_type<ov::op::v0::Constant>(node->get_input_node_shared_ptr(1));
+            int constPort = -1;
+            if (const2) {
+                constPort = 1;
+            } else if (const1) {
+                constPort = 0;
+            }
+
+            if (constPort != -1) {
+                auto const_shape = node->get_input_shape(constPort);
+                if (ov::shape_size(const_shape) != 1) {
+                    return false;
+                }
+            }
+        }
+    }
+
     //    FullyConnectedBiasFusion
     if (!(can_be_converted_to_FC && ov::is_type<ngraph::opset1::Add>(node) &&
         bias_shape.back() == matmul_shape.back() &&
@@ -340,6 +398,9 @@ bool SnippetsMarkSkipped::run_on_model(const std::shared_ptr<ov::Model> &m) {
         } else if (isSuitableMatMulParent(node)) {
             SetNodeFusingType(node, NodeFusingType::FusedWithMatMul);
             continue;
+        } else if (isSuitableSubtractAsZeroPointsParent(node)) {
+            SetSnippetsNodeType(node, snippets::pass::SnippetsNodeType::SkippedByPlugin);
+            continue;
         }
         for (const auto fusingChainType : getContinuableChains(node)) {
             if (isSuitableChildForFusingSimple(node, channelAxis)) {
diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp
index a95e3d6634fe82..b16281417ba9d9 100644
--- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp
+++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp
@@ -22,6 +22,7 @@
 
 #include <snippets/op/subgraph.hpp>
 #include "emitters/cpu_generator.hpp"
+#include "snippets_transformations/fuse_load_store_and_convert.hpp"
 
 using namespace InferenceEngine;
 using namespace dnnl::impl::utils;
@@ -60,7 +61,7 @@ void Snippet::initSupportedPrimitiveDescriptors() {
     if (!supportedPrimitiveDescriptors.empty())
         return;
 
-    const Precision supportedPrecision = Precision::FP32;
+    const std::set<Precision> supportedPrecisions = { Precision::FP32, Precision::I32, Precision::BF16, Precision::I8, Precision::U8 };
 
     bool dimRanksAreEqual = true;
     for (size_t i = 0; dimRanksAreEqual && i < inputShapes.size(); i++) {
@@ -125,18 +126,29 @@ void Snippet::initSupportedPrimitiveDescriptors() {
         config.dynBatchSupport = false;
         config.inConfs.resize(inputShapes.size());
         for (size_t i = 0; i < inputShapes.size(); i++) {
+            auto precision = getOriginalInputPrecisionAtPort(i);
+            if (supportedPrecisions.count(precision) == 0)
+                IE_THROW() << "Subgraph node with name `" << getName() << "` doesn't support " << precision << " precision.";
+
+            const auto equalPrecisions = getOriginalOutputPrecisions().size() == 1 &&
+                    precision == getOriginalOutputPrecisionAtPort(0);
+
             BlockedMemoryDesc::CmpMask inputMask = BLOCKED_DESC_SKIP_OFFSET_MASK;
             PortConfig portConfig;
-            portConfig.inPlace((!i && canBeInPlace()) ? 0 : -1);
+            portConfig.inPlace((!i && canBeInPlace() && equalPrecisions) ? 0 : -1);
             portConfig.constant(false);
             if (inputShapes[i].getDims()[0] == 1) {
                 inputMask.reset(0); // accepts any stride on batch axis
             }
-            portConfig.setMemDesc(createMemoryDesc(inputShapes[i], supportedPrecision, offset), inputMask);
+            portConfig.setMemDesc(createMemoryDesc(inputShapes[i], precision, offset), inputMask);
             config.inConfs[i] = portConfig;
         }
         config.outConfs.resize(outputShapes.size());
         for (size_t i = 0; i < outputShapes.size(); i++) {
+            auto precision = getOriginalOutputPrecisionAtPort(i);
+            if (supportedPrecisions.count(precision) == 0)
+                IE_THROW() << "Subgraph node with name `" << getName() << "` doesn't support " << precision << " precision.";
+
             BlockedMemoryDesc::CmpMask outputMask = BLOCKED_DESC_SKIP_OFFSET_MASK;
             PortConfig portConfig;
             portConfig.inPlace(-1);
@@ -144,7 +156,7 @@ void Snippet::initSupportedPrimitiveDescriptors() {
             if (outputShapes[i].getDims()[0] == 1) {
                 outputMask.reset(0); // accepts any stride on batch axis
             }
-            portConfig.setMemDesc(createMemoryDesc(outputShapes[i], supportedPrecision, offset), outputMask);
+            portConfig.setMemDesc(createMemoryDesc(outputShapes[i], precision, offset), outputMask);
             config.outConfs[i] = portConfig;
         }
 
@@ -203,11 +215,27 @@ bool Snippet::created() const {
     return getType() == Type::Subgraph;
 }
 
+InferenceEngine::Precision Snippet::getRuntimePrecision() const {
+    std::vector<InferenceEngine::Precision> inputPrecisions;
+    for (size_t i = 0; i < getParentEdges().size(); i++) {
+        auto parentEdge = getParentEdgeAt(i);
+        if (parentEdge && parentEdge->getStatus() == Edge::Status::Validated && !parentEdge->getParent()->isConstant()) {
+            inputPrecisions.emplace_back(DnnlExtensionUtils::DataTypeToIEPrecision((parentEdge->getMemoryPtr()->GetDataType())));
+        }
+    }
+
+    return getMaxPrecision(inputPrecisions);
+}
+
 bool Snippet::canBeInPlace() const {
     if (getParentEdgesAtPort(0)[0]->getParent()->getType() == Type::Input) {
         return false;
     }
 
+    if (getChildEdges().size() != 1) {
+        return false;
+    }
+
     for (auto& parentEdge : getParentEdges()) {
         auto parent = parentEdge.lock()->getParent();
         if (parent->getChildEdges().size() != 1)
@@ -271,7 +299,10 @@ void Snippet::define_schedule() {
     ngraph::snippets::op::Subgraph::BlockedShapeVector output_blocked_shapes;
     for (size_t i = 0; i < outputShapes.size(); i++)
         output_blocked_shapes.push_back(edgeToBlockedShape(getChildEdgesAtPort(i)[0]));
-    exec_domain = snippet->canonicalize(output_blocked_shapes, input_blocked_shapes);
+
+    const auto supported_exec_type = snippet->get_generator()->get_supported_exec_precision();
+    exec_domain = snippet->canonicalize(output_blocked_shapes, input_blocked_shapes, supported_exec_type);
+
     // initialize by maximum output dimension. Dimensions of outputs should be broadcastable
     tensorRank = std::max(static_cast<size_t>(rank6D), exec_domain.size());
     // Canonicalization broadcasts inputs and outputs to max input rank, which can be smaller than tensorRank
@@ -287,8 +318,7 @@ void Snippet::define_schedule() {
     }
 
     const auto config = getSelectedPrimitiveDescriptor()->getConfig();
-    const auto dataSize = config.inConfs[0].getMemDesc()->getPrecision().size();
-    auto initOffsets = [this, config, dataSize]() {
+    auto initOffsets = [this, config]() {
         // find max rank input among all outputs
         const size_t inputNum = getParentEdges().size();
         offsets_in.resize(inputNum);
@@ -296,7 +326,7 @@ void Snippet::define_schedule() {
             offsets_in[i].resize(tensorRank, 1);
             offset_calculation(offsets_in[i], dims_in[i], exec_domain);
             for (size_t j = 0; j < tensorRank; j++) {
-                offsets_in[i][j] *= dataSize;
+                offsets_in[i][j] *= config.inConfs[i].getMemDesc()->getPrecision().size();
             }
         }
 
@@ -305,7 +335,8 @@ void Snippet::define_schedule() {
         for (size_t i = 0; i < inputNum; i++) {
             const auto memPtr = getParentEdgeAt(i)->getMemoryPtr();
             srcMemPtrs[i] = memPtr;
-            start_offset_in[i] =  memPtr->GetDescWithType<BlockedMemoryDesc>()->getOffsetPadding() * dataSize;
+            start_offset_in[i] =  memPtr->GetDescWithType<BlockedMemoryDesc>()->getOffsetPadding() *
+                    config.inConfs[i].getMemDesc()->getPrecision().size();
         }
 
         const size_t outputNum = config.outConfs.size();
@@ -314,7 +345,7 @@ void Snippet::define_schedule() {
             offsets_out[i].resize(tensorRank, 1);
             offset_calculation(offsets_out[i], dims_out[i], exec_domain);
             for (size_t j = 0; j < tensorRank; j++) {
-                offsets_out[i][j] *= dataSize;
+                offsets_out[i][j] *= config.outConfs[i].getMemDesc()->getPrecision().size();
             }
         }
 
@@ -323,7 +354,8 @@ void Snippet::define_schedule() {
         for (size_t i = 0; i < outputNum; i++) {
             const auto memPtr = getChildEdgeAt(i)->getMemoryPtr();
             dstMemPtrs[i] = memPtr;
-            start_offset_out[i] = memPtr->GetDescWithType<BlockedMemoryDesc>()->getOffsetPadding() * dataSize;
+            start_offset_out[i] = memPtr->GetDescWithType<BlockedMemoryDesc>()->getOffsetPadding() *
+                    config.outConfs[i].getMemDesc()->getPrecision().size();
         }
     };
 
@@ -373,7 +405,7 @@ void Snippet::define_schedule() {
         return collapsedDims;
     };
 
-    auto initSchedulingInfo = [this, dataSize]() -> void {
+    auto initSchedulingInfo = [this, config]() -> void {
         // initialize scheduling information
         sch_offsets_in.resize(offsets_in.size(), 0);
         sch_offsets_out.resize(offsets_out.size(), 0);
@@ -385,19 +417,38 @@ void Snippet::define_schedule() {
             schedulerWorkAmount /= exec_domain[tensorRank - 2];
             exec_domain[tensorRank - 2] = 1;
 
-            // update offsets for tile 2D because loaders have ptr shifts in some cases and stores have always ptrs shifts
+            // update offsets for tile 2D because loaders and stores have ptr shifts in some cases
+            const int64_t vector_size = snippet->get_generator()->get_target_machine()->get_lanes();
             for (size_t i = 0; i < offsets_in.size(); i++) {
-                int64_t offset = offsets_in[i][tensorRank - 2];
-                if ((offset > dataSize) || (offset == 0 && dims_in[i].back() != 1)) {
-                    sch_offsets_in[i] = offset - exec_domain.back() * dataSize;
-                } else if (offset == dataSize) {
+                const int64_t offset = offsets_in[i][tensorRank - 2];
+                const int64_t data_size = config.inConfs[i].getMemDesc()->getPrecision().size();
+                if (offset == data_size || offset == vector_size * data_size) {
                     sch_offsets_in[i] = offset;
+                } else if ((offset > data_size) || (offset == 0 && dims_in[i].back() != 1 && dims_in[i].back() != vector_size)) {
+                    sch_offsets_in[i] = offset - exec_domain.back() * data_size;
+
+                    // If the scalar tile executes only once, the ptr doesn't move by 1 value,
+                    // so we should absolutely decrease the offset
+                    if (exec_domain.back() % vector_size == 1) {
+                        sch_offsets_in[i] += data_size;
+                    }
                 }
             }
 
             for (size_t i = 0; i < offsets_out.size(); i++) {
-                int64_t offset = offsets_out[i][tensorRank - 2];
-                sch_offsets_out[i] = offset - exec_domain.back() * dataSize;
+                const int64_t offset = offsets_out[i][tensorRank - 2];
+                const size_t data_size = config.outConfs[i].getMemDesc()->getPrecision().size();
+                if (offset == data_size || offset == vector_size * data_size) {
+                    sch_offsets_out[i] = offset;
+                } else if ((offset > data_size) || (offset == 0 && dims_out[i].back() != 1 && dims_out[i].back() != vector_size)) {
+                    sch_offsets_out[i] = offset - exec_domain.back() * data_size;
+
+                    // If the scalar tile executes only once, the ptr doesn't move by 1 value,
+                    // so we should absolutely decrease the offset
+                    if (exec_domain.back() % vector_size == 1) {
+                        sch_offsets_out[i] += data_size;
+                    }
+                }
             }
         }
     };
@@ -434,7 +485,28 @@ void Snippet::generate() {
         auto b = offsets_out[i].begin();
         std::copy(b, b + harness_num_dims, &jcp.data_offsets[(inputShapes.size() + i) * harness_num_dims]);
     }
-    schedule = snippet->generate(reinterpret_cast<void*>(&jcp));
+
+    ov::pass::Manager optManager;
+    optManager.register_pass<ov::intel_cpu::pass::FuseLoadConvert>();
+    optManager.register_pass<ov::intel_cpu::pass::FuseStoreConvert>();
+
+    // LoadConvert uses the Load emitter, which supports conversion from any type to f32 only
+    optManager.get_pass_config()->set_callback<ov::intel_cpu::pass::FuseLoadConvert>(
+            [](const std::shared_ptr<const ov::Node>& n) -> bool {
+                if (const auto& convert = std::dynamic_pointer_cast<const ov::op::v0::Convert>(n))
+                    return convert->get_destination_type() != ov::element::f32;
+                return true;
+            });
+
+    // StoreConvert uses the Store emitter, which supports conversion from f32 only to any type
+    optManager.get_pass_config()->set_callback<ov::intel_cpu::pass::FuseStoreConvert>(
+            [](const std::shared_ptr<const ov::Node>& n) -> bool {
+                if (const auto& convert = std::dynamic_pointer_cast<const ov::op::v0::Convert>(n))
+                    return convert->get_input_element_type(0) != ov::element::f32;
+                return true;
+            });
+
+    schedule = snippet->generate(optManager, reinterpret_cast<void*>(&jcp));
 }
 
 void Snippet::schedule_6d(const jit_snippets_call_args& call_args) const {
diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h
index f92b167209e451..fad68e1287dd27 100644
--- a/src/plugins/intel_cpu/src/nodes/subgraph.h
+++ b/src/plugins/intel_cpu/src/nodes/subgraph.h
@@ -30,6 +30,7 @@ class Snippet : public Node {
     void getSupportedDescriptors() override {};
     void initSupportedPrimitiveDescriptors() override;
     void selectOptimalPrimitiveDescriptor() override;
+    InferenceEngine::Precision getRuntimePrecision() const override;
 
     // Here we convert to canonical for & jit everything
     void createPrimitive() override;
diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp
index 945ed2dacfde93..9dfdee5e0a91a6 100644
--- a/src/plugins/intel_cpu/src/plugin.cpp
+++ b/src/plugins/intel_cpu/src/plugin.cpp
@@ -599,7 +599,6 @@ static void TransformationUpToCPUSpecificOpSet(std::shared_ptr<ngraph::Function>
 
     postLPTPassManager.register_pass<ngraph::pass::ConstantFolding>();
     postLPTPassManager.run_passes(nGraphFunc);
-
     if (!useLpt && _enableSnippets && dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2)) {
         ngraph::pass::Manager tokenization_manager;
         tokenization_manager.register_pass<SnippetsMarkSkipped>();
diff --git a/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp
new file mode 100644
index 00000000000000..397d5f2ce391d0
--- /dev/null
+++ b/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp
@@ -0,0 +1,121 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "snippets/itt.hpp"
+
+#include "fuse_load_store_and_convert.hpp"
+#include "snippets/snippets_isa.hpp"
+
+#include "snippets_transformations/op/load_store_convert.hpp"
+
+
+#include "ngraph/opsets/opset1.hpp"
+#include "ngraph/rt_info.hpp"
+#include "ngraph/pattern/op/wrap_type.hpp"
+
+ov::intel_cpu::pass::FuseLoadConvert::FuseLoadConvert() {
+    MATCHER_SCOPE(FuseLoadConvert);
+    auto param_pattern = ngraph::pattern::wrap_type<ngraph::opset1::Parameter>();
+    auto load_pattern = ngraph::pattern::wrap_type<ngraph::snippets::op::Load>({param_pattern});
+    auto convert_pattern = ngraph::pattern::wrap_type<ngraph::opset1::Convert>({load_pattern});
+
+    auto callback = [=](ngraph::pattern::Matcher& m) {
+        OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::FuseLoadConvert")
+        auto& pm = m.get_pattern_value_map();
+        const auto param = pm.at(param_pattern).get_node_shared_ptr();
+        const auto load_shared = pm.at(load_pattern).get_node_shared_ptr();
+        if (!load_shared || load_shared->output(0).get_target_inputs().size() != 1) {
+            return false;
+        }
+
+        const auto load = std::dynamic_pointer_cast<ngraph::snippets::op::Load>(load_shared);
+        if (!load)
+            return false;
+
+        const auto convert = pm.at(convert_pattern).get_node_shared_ptr();
+        if (transformation_callback(convert))
+            return false;
+
+        std::shared_ptr<ngraph::Node> load_convert = nullptr;
+        if (const auto convert_saturation =
+                std::dynamic_pointer_cast<ngraph::snippets::op::ConvertSaturation>(convert)) {
+            load_convert = std::make_shared<ov::intel_cpu::LoadConvert>(param,
+                                                                        convert_saturation->get_destination_type(),
+                                                                        arithmetic_mode::saturation,
+                                                                        load->get_count());
+        } else if (const auto convert_truncation =
+                std::dynamic_pointer_cast<ngraph::snippets::op::ConvertTruncation>(convert)) {
+            load_convert = std::make_shared<ov::intel_cpu::LoadConvert>(param,
+                                                                        convert_truncation->get_destination_type(),
+                                                                        arithmetic_mode::truncation,
+                                                                        load->get_count());
+        } else {
+            throw ngraph::ngraph_error(
+                "Type of Convert op is undefined. Supports only fusing Load and ConvertTruncation or ConvertSaturation ops");
+        }
+
+        if (!load_convert)
+            return false;
+
+        ngraph::copy_runtime_info(convert, load_convert);
+        ngraph::replace_node(convert, load_convert);
+
+        return true;
+    };
+
+    auto m = std::make_shared<ngraph::pattern::Matcher>(convert_pattern, matcher_name);
+    register_matcher(m, callback);
+}
+
+
+ov::intel_cpu::pass::FuseStoreConvert::FuseStoreConvert() {
+    MATCHER_SCOPE(FuseStoreConvert);
+    auto input_pattern = ngraph::pattern::any_input();
+    auto convert_pattern = ngraph::pattern::wrap_type<ngraph::opset1::Convert>({input_pattern});
+    auto store_pattern = ngraph::pattern::wrap_type<ngraph::snippets::op::Store>({convert_pattern});
+
+    auto callback = [=](ngraph::pattern::Matcher& m) {
+        OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::FuseStoreConvert")
+        auto& pm = m.get_pattern_value_map();
+        const auto input = pm.at(input_pattern).get_node_shared_ptr();
+
+        const auto store = std::dynamic_pointer_cast<ngraph::snippets::op::Store>(pm.at(store_pattern).get_node_shared_ptr());
+        if (!store)
+            return false;
+
+        const auto convert = pm.at(convert_pattern).get_node_shared_ptr();
+        if (convert->output(0).get_target_inputs().size() != 1 || transformation_callback(convert))
+            return false;
+
+        std::shared_ptr<ngraph::Node> store_convert = nullptr;
+        if (const auto convert_saturation =
+                std::dynamic_pointer_cast<ngraph::snippets::op::ConvertSaturation>(convert)) {
+            store_convert = std::make_shared<ov::intel_cpu::StoreConvert>(input,
+                                                                          convert_saturation->get_destination_type(),
+                                                                          arithmetic_mode::saturation,
+                                                                          store->get_count());
+        } else if (const auto convert_truncation =
+                std::dynamic_pointer_cast<ngraph::snippets::op::ConvertTruncation>(convert)) {
+            store_convert = std::make_shared<ov::intel_cpu::StoreConvert>(input,
+                                                                          convert_truncation->get_destination_type(),
+                                                                          arithmetic_mode::truncation,
+                                                                          store->get_count());
+        } else {
+            throw ngraph::ngraph_error(
+                "Type of Convert op is undefined. Supports only fusing Store and ConvertTruncation or ConvertSaturation ops");
+        }
+
+
+        if (!store_convert)
+            return false;
+
+        ngraph::copy_runtime_info(store, store_convert);
+        ngraph::replace_node(store, store_convert);
+
+        return true;
+    };
+
+    auto m = std::make_shared<ngraph::pattern::Matcher>(store_pattern, matcher_name);
+    register_matcher(m, callback);
+}
diff --git a/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.hpp b/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.hpp
new file mode 100644
index 00000000000000..fcadb235e62794
--- /dev/null
+++ b/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.hpp
@@ -0,0 +1,38 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ngraph/pass/graph_rewrite.hpp"
+#include "ngraph/pattern/matcher.hpp"
+
+namespace ov {
+namespace intel_cpu {
+namespace pass {
+
+/**
+ * @interface FuseLoadConvert
+ * @brief Fuse Load and ConvertSaturation/ConvertTruncation into one op LoadConvert with the corresponding mode
+ * @ingroup snippets
+ */
+class FuseLoadConvert: public ngraph::pass::MatcherPass {
+public:
+    OPENVINO_RTTI("FuseLoadConvert", "0");
+    FuseLoadConvert();
+};
+
+/**
+ * @interface FuseStoreConvert
+ * @brief Fuse Store and ConvertSaturation/ConvertTruncation into one op StoreConvert with the corresponding mode
+ * @ingroup snippets
+ */
+class FuseStoreConvert: public ngraph::pass::MatcherPass {
+public:
+    OPENVINO_RTTI("FuseStoreConvert",  "0");
+    FuseStoreConvert();
+};
+
+}  // namespace pass
+}  // namespace intel_cpu
+}  // namespace ov
diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/load_store_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/op/load_store_convert.cpp
new file mode 100644
index 00000000000000..49d37d1b803854
--- /dev/null
+++ b/src/plugins/intel_cpu/src/snippets_transformations/op/load_store_convert.cpp
@@ -0,0 +1,56 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "snippets/itt.hpp"
+
+#include "load_store_convert.hpp"
+
+#include "ngraph/runtime/host_tensor.hpp"
+
+using namespace std;
+using namespace ov;
+
+intel_cpu::LoadConvert::LoadConvert(const Output<Node>& x, const ov::element::Type& destination_type, arithmetic_mode mode, const size_t count) :
+    Load(x, count), m_destination_type(destination_type), m_mode(mode) {
+    constructor_validate_and_infer_types();
+}
+
+bool intel_cpu::LoadConvert::visit_attributes(AttributeVisitor& visitor) {
+    INTERNAL_OP_SCOPE(LoadConvert_visit_attributes);
+    visitor.on_attribute("destination_type", m_destination_type);
+    return true;
+}
+
+void intel_cpu::LoadConvert::validate_and_infer_types() {
+    INTERNAL_OP_SCOPE(LoadConvert_validate_and_infer_types);
+    set_output_type(0, m_destination_type, get_input_partial_shape(0));
+}
+
+std::shared_ptr<Node> intel_cpu::LoadConvert::clone_with_new_inputs(const OutputVector& new_args) const {
+    INTERNAL_OP_SCOPE(LoadConvert_clone_with_new_inputs);
+    check_new_args_count(this, new_args);
+    return std::make_shared<LoadConvert>(new_args.at(0), m_destination_type, m_mode, m_count);
+}
+
+intel_cpu::StoreConvert::StoreConvert(const Output<Node>& x, const ov::element::Type& destination_type, arithmetic_mode mode, const size_t count) :
+        Store(x, count), m_destination_type(destination_type), m_mode(mode) {
+    constructor_validate_and_infer_types();
+}
+
+bool intel_cpu::StoreConvert::visit_attributes(AttributeVisitor& visitor) {
+    INTERNAL_OP_SCOPE(StoreConvert_visit_attributes);
+    visitor.on_attribute("destination_type", m_destination_type);
+    return true;
+}
+
+void intel_cpu::StoreConvert::validate_and_infer_types() {
+    INTERNAL_OP_SCOPE(StoreConvert_validate_and_infer_types);
+    set_output_type(0, m_destination_type, get_input_partial_shape(0));
+}
+
+std::shared_ptr<Node> intel_cpu::StoreConvert::clone_with_new_inputs(const OutputVector& new_args) const {
+    INTERNAL_OP_SCOPE(StoreConvert_clone_with_new_inputs);
+    check_new_args_count(this, new_args);
+    return std::make_shared<StoreConvert>(new_args.at(0), m_destination_type, m_mode, m_count);
+}
diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/load_store_convert.hpp b/src/plugins/intel_cpu/src/snippets_transformations/op/load_store_convert.hpp
new file mode 100644
index 00000000000000..7568003f2c627a
--- /dev/null
+++ b/src/plugins/intel_cpu/src/snippets_transformations/op/load_store_convert.hpp
@@ -0,0 +1,76 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ngraph/op/op.hpp"
+#include "snippets/op/load.hpp"
+
+#include "emitters/jit_load_store_emitters.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+/**
+ * @interface LoadConvert
+ * @brief Fused operation to represent computations equal to consecutive Load and Convert operations.
+ *        The operation is used for peephole optimization during subgraph lowering.
+ * @ingroup snippets
+ */
+
+class LoadConvert : public ngraph::snippets::op::Load {
+public:
+    OPENVINO_OP("LoadConvert", "SnippetsOpset", ngraph::snippets::op::Load);
+
+    LoadConvert(const Output<Node>& x, const ov::element::Type& destination_type, arithmetic_mode mode, const size_t count = 1lu);
+    LoadConvert() = default;
+
+    ov::element::Type get_destination_type() const { return m_destination_type; }
+    arithmetic_mode get_arithmetic_mode() const { return m_mode; }
+
+    bool visit_attributes(AttributeVisitor& visitor) override;
+
+    void validate_and_infer_types() override;
+
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
+
+    bool has_evaluate() const override { return false; }
+
+protected:
+    arithmetic_mode m_mode;
+    ov::element::Type m_destination_type;
+};
+
+/**
+ * @interface StoreConvert
+ * @brief Fused operation to represent computations equal to consecutive Convert and Store operations.
+ *        The operation is used for peephole optimization during subgraph lowering.
+ * @ingroup snippets
+ */
+class StoreConvert : public ngraph::snippets::op::Store {
+public:
+    OPENVINO_OP("StoreConvert", "SnippetsOpset", ngraph::snippets::op::Store);
+
+    StoreConvert(const Output<Node>& x, const ov::element::Type& destination_type, arithmetic_mode mode, const size_t count = 1lu);
+    StoreConvert() = default;
+
+    ov::element::Type get_destination_type() const { return m_destination_type; }
+    arithmetic_mode get_arithmetic_mode() const { return m_mode; }
+
+    bool visit_attributes(AttributeVisitor& visitor) override;
+
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
+
+    void validate_and_infer_types() override;
+
+    bool has_evaluate() const override { return false; }
+
+protected:
+    arithmetic_mode m_mode;
+    ov::element::Type m_destination_type;
+};
+
+
+} // namespace intel_cpu
+} // namespace ov
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp
index 56ba1a51c2b651..d7bc5d0de7e12e 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp
@@ -12,23 +12,31 @@ namespace snippets {
 
 namespace {
 
-    INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, Add,
-                         ::testing::Combine(
-                                 ::testing::Values(ov::Shape {1, 42, 16, 64}),
-                                 ::testing::Values(ov::Shape {1, 42, 16,  1}),
-                                 ::testing::Values(1), // one node - Add
-                                 ::testing::Values(0), // SnippetsMarkSkipped disables tokenization for eltwise chains after inputs
-                                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
-                         Add::getTestCaseName);
-
-    INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinh,
-            ::testing::Combine(
-            ::testing::Values(ov::Shape {1, 42, 16, 64}),
-            ::testing::Values(ov::Shape {1, 42, 16,  1}),
-            ::testing::Values(3), // Add + 2 converts after inputs
-            ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts
-            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
-                             AddSinh::getTestCaseName);
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, Add,
+                     ::testing::Combine(
+                             ::testing::Values(ov::Shape {1, 42, 16, 64}),
+                             ::testing::Values(ov::Shape {1, 42, 16,  1}),
+                             ::testing::Values(1), // one node - Add
+                             ::testing::Values(0), // SnippetsMarkSkipped disables tokenization for eltwise chains after inputs
+                             ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                     Add::getTestCaseName);
+
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinh,
+        ::testing::Combine(
+        ::testing::Values(ov::Shape {1, 42, 16, 64}),
+        ::testing::Values(ov::Shape {1, 42, 16,  1}),
+        ::testing::Values(3), // Add + 2 converts after inputs
+        ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts
+        ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                         AddSinh::getTestCaseName);
+
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinhConst,
+                     ::testing::Combine(
+                             ::testing::Values(ov::Shape {1, 42, 16, 64}),
+                             ::testing::Values(2), // expected num nodes: presumably Subgraph + 1 Sinh after the single input (TODO confirm)
+                             ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts
+                             ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                     AddSinhConst::getTestCaseName);
 
 }  // namespace
 } // namespace snippets
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp
new file mode 100644
index 00000000000000..5c074239077886
--- /dev/null
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp
@@ -0,0 +1,162 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "snippets/convert.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+namespace ov {
+namespace test {
+namespace snippets {
+
+
+namespace {
+
+const std::vector<std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>> types_Convert = {
+        { { ov::element::f32 }, { ov::element::i32 } },
+        { { ov::element::f32 }, { ov::element::bf16 } },
+        { { ov::element::f32 }, { ov::element::u8 } },
+        { { ov::element::f32 }, { ov::element::i8 } },
+
+        { { ov::element::bf16 }, { ov::element::f32 } },
+        { { ov::element::bf16 }, { ov::element::i32 } },
+        { { ov::element::bf16 }, { ov::element::i8 } },
+        { { ov::element::bf16 }, { ov::element::u8 } },
+
+        { { ov::element::i8 }, { ov::element::f32 } },
+        { { ov::element::i8 }, { ov::element::i32 } },
+        { { ov::element::i8 }, { ov::element::bf16 } },
+        { { ov::element::i8 }, { ov::element::u8 }  },
+
+        { { ov::element::u8 }, { ov::element::f32 } },
+        { { ov::element::u8 }, { ov::element::i32 } },
+        { { ov::element::u8 }, { ov::element::bf16 } },
+        { { ov::element::u8 }, { ov::element::i8 } },
+};
+
+const std::vector<std::vector<ov::Shape>> inputShapes_Convert = {
+        { ov::Shape{2, 16} },
+        { ov::Shape{5, 5} },
+        { ov::Shape{2, 12, 1} }
+};
+
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Convert, Convert,
+                         ::testing::Combine(
+                                 ::testing::ValuesIn(inputShapes_Convert),
+                                 ::testing::ValuesIn(types_Convert),
+                                 ::testing::Values(2),  // expected num nodes
+                                 ::testing::Values(1),  // expected num subgraphs
+                                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                         Convert::getTestCaseName);
+
+const std::vector<std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>> types_ConvertInput = {
+        { { ov::element::f32 }, { ov::element::i32 } },
+        { { ov::element::f32 }, { ov::element::bf16 } },
+
+        { { ov::element::bf16 }, { ov::element::f32 } },
+
+        { { ov::element::i8 }, { ov::element::f32 } },
+        { { ov::element::i8 }, { ov::element::i32 } },
+        { { ov::element::i8 }, { ov::element::bf16 } },
+
+        { { ov::element::u8 }, { ov::element::f32 } },
+        { { ov::element::u8 }, { ov::element::i32 } },
+        { { ov::element::u8 }, { ov::element::bf16 } },
+};
+
+const std::vector<std::vector<ov::Shape>> inputShapes_ConvertInput = {
+        { ov::Shape{2, 16}, ov::Shape{1, 16} },
+        { ov::Shape{5, 18}, ov::Shape{5, 1} },
+        { ov::Shape{3, 1}, ov::Shape{3, 21} }
+};
+
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertInput, ConvertInput,
+                         ::testing::Combine(
+                                 ::testing::ValuesIn(inputShapes_ConvertInput),
+                                 ::testing::ValuesIn(types_ConvertInput),
+                                 ::testing::Values(3),  // expected num nodes
+                                 ::testing::Values(1),  // expected num subgraphs
+                                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                         Convert::getTestCaseName);  // name generator declared on the Convert base fixture
+
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertOutput, ConvertOutput,
+                         ::testing::Combine(
+                                 ::testing::ValuesIn(inputShapes_ConvertInput),  // same shape table as the ConvertInput suite
+                                 ::testing::ValuesIn(types_ConvertInput),        // same type table as the ConvertInput suite
+                                 ::testing::Values(3),  // expected num nodes
+                                 ::testing::Values(1),  // expected num subgraphs
+                                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                         Convert::getTestCaseName);  // name generator declared on the Convert base fixture
+
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertStub, ConvertStub,
+                         ::testing::Combine(
+                                 ::testing::ValuesIn(inputShapes_ConvertInput),  // same shape table as the ConvertInput suite
+                                 ::testing::ValuesIn(types_ConvertInput),        // same type table as the ConvertInput suite
+                                 ::testing::Values(4),  // expected num nodes
+                                 ::testing::Values(2),  // expected num subgraphs
+                                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                         Convert::getTestCaseName);  // name generator declared on the Convert base fixture
+
+const std::vector<std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>> types_ConvertPartialInputsAndResults = {
+        { { ov::element::i8, ov::element::i32, ov::element::f32 }, { ov::element::f32, ov::element::i8 } },
+        { { ov::element::bf16, ov::element::u8, ov::element::i32 }, { ov::element::i32, ov::element::bf16 } },
+};
+
+const std::vector<std::vector<ov::Shape>> inputShapes_ConvertPartialInputsAndResults = {
+        { ov::Shape{2, 16}, ov::Shape{1, 16}, ov::Shape{1, 1} },
+        { ov::Shape{5, 18}, ov::Shape{5, 1}, ov::Shape{1, 18} },
+        { ov::Shape{3, 1}, ov::Shape{3, 21}, ov::Shape{3, 1} }
+};
+
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertPartialInputsAndResults, ConvertPartialInputsAndResults,
+                         ::testing::Combine(
+                                 ::testing::ValuesIn(inputShapes_ConvertPartialInputsAndResults),
+                                 ::testing::ValuesIn(types_ConvertPartialInputsAndResults),
+                                 ::testing::Values(6),  // expected num nodes
+                                 ::testing::Values(1),  // expected num subgraphs
+                                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                         Convert::getTestCaseName);  // name generator declared on the Convert base fixture
+
+const std::vector<std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>> types_ConvertMany = {
+        { { ov::element::i32, ov::element::u8}, {} },
+        { { ov::element::i32, ov::element::u8, ov::element::i32 }, {} },
+        { { ov::element::i32, ov::element::f32, ov::element::i32, ov::element::i8 }, {} },
+        { { ov::element::i32, ov::element::i8, ov::element::i32, ov::element::f32 }, {} },
+};
+
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnInputs, ConvertManyOnInputs,
+                         ::testing::Combine(
+                                 ::testing::Values(std::vector<ov::Shape>{ov::Shape{5, 5, 5, 5}}),
+                                 ::testing::ValuesIn(types_ConvertMany),
+                                 ::testing::Values(2),  // expected num nodes
+                                 ::testing::Values(1),  // expected num subgraphs
+                                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                         Convert::getTestCaseName);  // name generator declared on the Convert base fixture
+
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnOutputs, ConvertManyOnOutputs,
+                         ::testing::Combine(
+                                 ::testing::Values(std::vector<ov::Shape>{ov::Shape{5, 5, 5, 5}}),
+                                 ::testing::ValuesIn(types_ConvertMany),
+                                 ::testing::Values(5), // expected num nodes: sinh + subgraph + reorders for sinh
+                                 ::testing::Values(1), // expected num subgraphs
+                                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                         Convert::getTestCaseName);  // name generator declared on the Convert base fixture
+
+const std::vector<std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>> types_ConvertManyIO = {
+        { { ov::element::i32, ov::element::u8}, {ov::element::i32} },
+        { { ov::element::i32, ov::element::u8, ov::element::i32 }, { ov::element::i32, ov::element::i8, ov::element::i32, ov::element::f32 } },
+};
+
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnInputOutput, ConvertManyOnInputOutput,
+                         ::testing::Combine(
+                                 ::testing::Values(std::vector<ov::Shape>{ov::Shape{5, 5, 5, 5}}),
+                                 ::testing::ValuesIn(types_ConvertManyIO),
+                                 ::testing::Values(5), // expected num nodes: sinh + subgraph + reorders for sinh
+                                 ::testing::Values(1), // expected num subgraphs
+                                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                         Convert::getTestCaseName);  // name generator declared on the Convert base fixture
+
+}  // namespace
+} // namespace snippets
+} // namespace test
+} // namespace ov
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/eltwise_two_results.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/eltwise_two_results.cpp
new file mode 100644
index 00000000000000..934a243773a7e8
--- /dev/null
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/eltwise_two_results.cpp
@@ -0,0 +1,25 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "snippets/eltwise_two_results.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+namespace ov {
+namespace test {
+namespace snippets {
+namespace {
+
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, EltwiseTwoResults,
+                        ::testing::Combine(
+                             ::testing::Values(ov::Shape {1, 64, 10, 10}),  // input 0 shape
+                             ::testing::Values(ov::Shape {1, 64, 10,  1}),  // input 1 shape
+                             ::testing::Values(4),  // expected num nodes
+                             ::testing::Values(2),  // expected num subgraphs
+                             ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                         EltwiseTwoResults::getTestCaseName);
+
+}  // namespace
+} // namespace snippets
+} // namespace test
+} // namespace ov
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/max_num_params_eltwise.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/max_num_params_eltwise.cpp
new file mode 100644
index 00000000000000..20c01c02be8fd3
--- /dev/null
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/max_num_params_eltwise.cpp
@@ -0,0 +1,26 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "snippets/max_num_params_eltwise.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+namespace ov {
+namespace test {
+namespace snippets {
+namespace {
+// These shapes are chosen to cover all cases of code emission (none/one/multiple of scalar/vector tiles); the table is read-only
+const std::vector<ov::Shape> input_shapes {{1, 64, 10, 10}, {1, 1, 17, 37}, {1, 1, 1, 1}, {1, 1, 1, 7},
+                                          {1, 1, 1, 128}, {1, 1, 1, 14}, {1, 1, 1, 16}, {1, 1, 1, 30}};
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, MaxNumParamsEltwiseSinh,
+                     ::testing::Combine(
+                             ::testing::ValuesIn(input_shapes),
+                             ::testing::Values(12), // expected num nodes: 10 Sinh after inputs + Subgraph + Concat
+                             ::testing::Values(1),  // expected num subgraphs
+                             ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                         MaxNumParamsEltwiseSinh::getTestCaseName);
+
+}  // namespace
+} // namespace snippets
+} // namespace test
+} // namespace ov
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/three_inputs_eltwise.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/three_inputs_eltwise.cpp
index c0c833268898fb..779db741cd258b 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/three_inputs_eltwise.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/three_inputs_eltwise.cpp
@@ -10,25 +10,25 @@ namespace test {
 namespace snippets {
 namespace {
 
-    INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, ThreeInputsEltwise,
-                         ::testing::Combine(
-                                 ::testing::Values(ov::Shape {1, 64, 10, 10}),
-                                 ::testing::Values(ov::Shape {1, 64, 10,  1}),
-                                 ::testing::Values(ov::Shape {1, 1, 1,  10}),
-                                 ::testing::Values(2), // eltwises fuse only for non-broadcasted shapes
-                                 ::testing::Values(0), // SnippetsMarkSkipped disables tokenization for eltwise chains after inputs
-                                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
-                             ThreeInputsEltwise::getTestCaseName);
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, ThreeInputsEltwise,
+                     ::testing::Combine(
+                             ::testing::Values(ov::Shape {1, 64, 10, 10}),
+                             ::testing::Values(ov::Shape {1, 64, 10,  1}),
+                             ::testing::Values(ov::Shape {1, 1, 1,  10}),
+                             ::testing::Values(2), // eltwises fuse only for non-broadcasted shapes
+                             ::testing::Values(0), // SnippetsMarkSkipped disables tokenization for eltwise chains after inputs
+                             ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                         ThreeInputsEltwise::getTestCaseName);
 
-    INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, ThreeInputsEltwiseSinh,
-            ::testing::Combine(
-            ::testing::Values(ov::Shape {1, 64, 10, 10}),
-            ::testing::Values(ov::Shape {1, 64, 10,  1}),
-            ::testing::Values(ov::Shape {1, 1, 1,  10}),
-            ::testing::Values(4), // Subgraph + 3 converts after inputs
-            ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts
-            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
-                             ThreeInputsEltwiseSinh::getTestCaseName);
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, ThreeInputsEltwiseSinh,
+        ::testing::Combine(
+        ::testing::Values(ov::Shape {1, 64, 10, 10}),
+        ::testing::Values(ov::Shape {1, 64, 10,  1}),
+        ::testing::Values(ov::Shape {1, 1, 1,  10}),
+        ::testing::Values(4), // Subgraph + 3 converts after inputs
+        ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts
+        ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                         ThreeInputsEltwiseSinh::getTestCaseName);
 
 }  // namespace
 } // namespace snippets
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp
new file mode 100644
index 00000000000000..fa182cf548a937
--- /dev/null
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp
@@ -0,0 +1,45 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "snippets/two_inputs_and_outputs.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+namespace ov {
+namespace test {
+namespace snippets {
+namespace {
+
+const std::vector<std::vector<ov::Shape>> input_shapes = {
+        { {5, 5, 256, 1}, {5, 5, 256, 1} },
+        { {5, 5, 16, 35}, {5, 5, 16, 35} },
+        { {5, 5, 256, 1}, {5, 5, 256, 35} },
+        { {5, 5, 256, 1}, {5, 5, 1, 1} },
+
+        { {5, 5, 16, 35}, {5, 5, 1, 1} },
+        { {5, 5, 16, 35}, {5, 5, 16, 1} },
+        { {5, 5, 5, 35}, {5, 5, 1, 35} },
+        { {5, 5, 16, 1}, {5, 5, 1, 35} },
+
+        { {5, 5, 35, 16}, {5, 5, 35, 16} },
+        { {5, 5, 35, 16}, {5, 5, 1, 16} },
+
+        { {5, 5, 35, 17}, {5, 5, 35, 17} },
+        { {5, 5, 35, 17}, {5, 5, 1, 17} },
+
+        { {5, 5, 35, 18}, {5, 5, 35, 18} },
+        { {5, 5, 35, 18}, {5, 5, 1, 18} },
+};
+
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, TwoInputsAndOutputs,
+                         ::testing::Combine(
+                             ::testing::ValuesIn(input_shapes),  // each entry holds the shapes of both inputs
+                             ::testing::Values(4),  // expected num nodes
+                             ::testing::Values(1),  // expected num subgraphs
+                             ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                         TwoInputsAndOutputs::getTestCaseName);
+
+}  // namespace
+} // namespace snippets
+} // namespace test
+} // namespace ov
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/eltwise.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/eltwise.cpp
index 603e78dde07818..4e051b2e715491 100644
--- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/eltwise.cpp
+++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/eltwise.cpp
@@ -731,4 +731,4 @@ const auto params_5D_dyn_param = ::testing::Combine(
 INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D_MemOrder_dyn_param, EltwiseLayerCPUTest, params_5D_dyn_param, EltwiseLayerCPUTest::getTestCaseName);
 
 } // namespace
-} // namespace CPULayerTestsDefinitions
\ No newline at end of file
+} // namespace CPULayerTestsDefinitions
diff --git a/src/plugins/intel_cpu/thirdparty/onednn b/src/plugins/intel_cpu/thirdparty/onednn
index 2a749c577f8a84..f9e363fc1ff471 160000
--- a/src/plugins/intel_cpu/thirdparty/onednn
+++ b/src/plugins/intel_cpu/thirdparty/onednn
@@ -1 +1 @@
-Subproject commit 2a749c577f8a841a396d4bd46eaf311b7e7dc089
+Subproject commit f9e363fc1ff47191c7ddea63b19c7893965a786a
diff --git a/src/tests/functional/plugin/shared/include/snippets/add.hpp b/src/tests/functional/plugin/shared/include/snippets/add.hpp
index a3dbe852cde592..7f7001de94bf5d 100644
--- a/src/tests/functional/plugin/shared/include/snippets/add.hpp
+++ b/src/tests/functional/plugin/shared/include/snippets/add.hpp
@@ -18,6 +18,13 @@ typedef std::tuple<
         std::string                  // Target Device
 > AddParams;
 
+typedef std::tuple<
+        ov::Shape,                   // Input 0 Shape
+        size_t,                      // Expected num nodes
+        size_t,                      // Expected num subgraphs
+        std::string                  // Target Device
+> AddConstParams;
+
 class Add : public testing::WithParamInterface<ov::test::snippets::AddParams>,
             virtual public ov::test::SnippetsTestsCommon {
 public:
@@ -32,6 +39,14 @@ class AddSinh : public Add {
     void SetUp() override;
 };
 
+class AddSinhConst : public testing::WithParamInterface<ov::test::snippets::AddConstParams>,
+                     virtual public ov::test::SnippetsTestsCommon {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<ov::test::snippets::AddConstParams> obj);
+protected:
+    void SetUp() override;
+};
+
 } // namespace snippets
 } // namespace test
 } // namespace ov
\ No newline at end of file
diff --git a/src/tests/functional/plugin/shared/include/snippets/convert.hpp b/src/tests/functional/plugin/shared/include/snippets/convert.hpp
new file mode 100644
index 00000000000000..bd4d7641711a0a
--- /dev/null
+++ b/src/tests/functional/plugin/shared/include/snippets/convert.hpp
@@ -0,0 +1,76 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "shared_test_classes/base/snippets_test_utils.hpp"
+
+namespace ov {
+namespace test {
+namespace snippets {
+
+typedef std::tuple<
+        std::vector<ov::Shape>,                                                     // InputShapes
+        std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>,  // Input and Output data types for Converts
+        size_t,                                                                     // Expected num nodes
+        size_t,                                                                     // Expected num subgraphs
+        std::string                                                                 // Target Device
+> ConvertParams;
+
+using parameters = std::vector<std::tuple<int32_t, int32_t, int32_t>>;
+
+class Convert : public testing::WithParamInterface<ov::test::snippets::ConvertParams>,
+                virtual public ov::test::SnippetsTestsCommon {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<ov::test::snippets::ConvertParams> obj);
+
+protected:
+    void SetUp() override;
+
+    void generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) override;
+    virtual parameters generate_params_random() const;
+
+    ov::element::Type output_type = ov::element::f32;
+};
+
+class ConvertInput : public Convert {
+protected:
+    void SetUp() override;
+
+    parameters generate_params_random() const override;
+};
+
+class ConvertOutput : public ConvertInput {
+protected:
+    void SetUp() override;
+};
+
+class ConvertStub : public ConvertInput {
+protected:
+    void SetUp() override;
+};
+
+class ConvertPartialInputsAndResults : public ConvertInput {
+protected:
+    void SetUp() override;
+};
+
+class ConvertManyOnInputs : public ConvertInput {
+protected:
+    void SetUp() override;
+};
+
+class ConvertManyOnOutputs : public ConvertInput {
+protected:
+    void SetUp() override;
+};
+
+class ConvertManyOnInputOutput : public ConvertInput {
+protected:
+    void SetUp() override;
+};
+
+} // namespace snippets
+} // namespace test
+} // namespace ov
\ No newline at end of file
diff --git a/src/tests/functional/plugin/shared/include/snippets/eltwise_two_results.hpp b/src/tests/functional/plugin/shared/include/snippets/eltwise_two_results.hpp
new file mode 100644
index 00000000000000..59d3e17e5acb18
--- /dev/null
+++ b/src/tests/functional/plugin/shared/include/snippets/eltwise_two_results.hpp
@@ -0,0 +1,33 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "shared_test_classes/base/snippets_test_utils.hpp"
+
+namespace ov {
+namespace test {
+namespace snippets {
+
+typedef std::tuple<
+        ov::Shape,                   // Input 0 Shape
+        ov::Shape,                   // Input 1 Shape
+        size_t,                      // Expected num nodes
+        size_t,                      // Expected num subgraphs
+        std::string                  // Target Device
+> EltwiseTwoResultsParams;
+
+class EltwiseTwoResults : public testing::WithParamInterface<ov::test::snippets::EltwiseTwoResultsParams>,
+                   virtual public ov::test::SnippetsTestsCommon {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<ov::test::snippets::EltwiseTwoResultsParams> obj);
+
+protected:
+    void SetUp() override;
+};
+
+
+} // namespace snippets
+} // namespace test
+} // namespace ov
diff --git a/src/tests/functional/plugin/shared/include/snippets/max_num_params_eltwise.hpp b/src/tests/functional/plugin/shared/include/snippets/max_num_params_eltwise.hpp
new file mode 100644
index 00000000000000..26640e58910512
--- /dev/null
+++ b/src/tests/functional/plugin/shared/include/snippets/max_num_params_eltwise.hpp
@@ -0,0 +1,31 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "shared_test_classes/base/snippets_test_utils.hpp"
+
+namespace ov {
+namespace test {
+namespace snippets {
+
+typedef std::tuple<
+        ov::Shape,                   // Input shape; the same shape is replicated for every input
+        size_t,                      // Expected num nodes
+        size_t,                      // Expected num subgraphs
+        std::string                  // Target Device
+> MaxNumParamsEltwiseParams;
+
+class MaxNumParamsEltwiseSinh : public testing::WithParamInterface<ov::test::snippets::MaxNumParamsEltwiseParams>,
+                   virtual public ov::test::SnippetsTestsCommon {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<ov::test::snippets::MaxNumParamsEltwiseParams> obj);
+
+protected:
+    void SetUp() override;
+};
+
+} // namespace snippets
+} // namespace test
+} // namespace ov
diff --git a/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp b/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp
new file mode 100644
index 00000000000000..0a209de2fe9244
--- /dev/null
+++ b/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp
@@ -0,0 +1,31 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "shared_test_classes/base/snippets_test_utils.hpp"
+
+namespace ov {
+namespace test {
+namespace snippets {
+
+typedef std::tuple<
+        std::vector<ov::Shape>,      // Shapes of all inputs
+        size_t,                      // Expected num nodes
+        size_t,                      // Expected num subgraphs
+        std::string                  // Target Device
+> TwoInputsAndOutputsParams;
+
+class TwoInputsAndOutputs : public testing::WithParamInterface<ov::test::snippets::TwoInputsAndOutputsParams>,
+                             virtual public ov::test::SnippetsTestsCommon {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<ov::test::snippets::TwoInputsAndOutputsParams> obj);
+
+protected:
+    void SetUp() override;
+};
+
+} // namespace snippets
+} // namespace test
+} // namespace ov
diff --git a/src/tests/functional/plugin/shared/src/snippets/add.cpp b/src/tests/functional/plugin/shared/src/snippets/add.cpp
index 896f03e78d05a1..1b8d1f8ecdfc8d 100644
--- a/src/tests/functional/plugin/shared/src/snippets/add.cpp
+++ b/src/tests/functional/plugin/shared/src/snippets/add.cpp
@@ -10,38 +10,61 @@ namespace ov {
 namespace test {
 namespace snippets {
 
-    std::string Add::getTestCaseName(testing::TestParamInfo<ov::test::snippets::AddParams> obj) {
-        ov::Shape inputShapes0, inputShapes1, newInputShapes;
-        std::string targetDevice;
-        size_t num_nodes, num_subgraphs;
-        std::tie(inputShapes0, inputShapes1, num_nodes, num_subgraphs, targetDevice) = obj.param;
-
-        std::ostringstream result;
-        result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_";
-        result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_";
-        result << "#N=" << num_nodes << "_";
-        result << "#S=" << num_subgraphs << "_";
-        result << "targetDevice=" << targetDevice;
-        return result.str();
-    }
-
-    void Add::SetUp() {
-        ov::Shape inputShape0, inputShape1;
-        std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
-        init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}});
-
-        auto f = ov::test::snippets::AddFunction({inputShape0, inputShape1});
-        function = f.getOriginal();
-    }
-
-    void AddSinh::SetUp() {
-        ov::Shape inputShape0, inputShape1;
-        std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
-        init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}});
-
-        auto f = ov::test::snippets::AddSinhFunction({inputShape0, inputShape1});
-        function = f.getOriginal();
-    }
+std::string Add::getTestCaseName(testing::TestParamInfo<ov::test::snippets::AddParams> obj) {
+    ov::Shape inputShapes0, inputShapes1;
+    std::string targetDevice;
+    size_t num_nodes, num_subgraphs;
+    std::tie(inputShapes0, inputShapes1, num_nodes, num_subgraphs, targetDevice) = obj.param;  // unpack the parameter tuple
+
+    std::ostringstream result;  // name layout: IS[0]=..._IS[1]=..._#N=..._#S=..._targetDevice=...
+    result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_";
+    result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_";
+    result << "#N=" << num_nodes << "_";
+    result << "#S=" << num_subgraphs << "_";
+    result << "targetDevice=" << targetDevice;
+    return result.str();
+}
+
+void Add::SetUp() {
+    ov::Shape inputShape0, inputShape1;
+    std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();  // unpack the parameter tuple
+    init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}});  // one entry per input; the leading {} of each entry is left empty
+
+    auto f = ov::test::snippets::AddFunction({inputShape0, inputShape1});
+    function = f.getOriginal();
+}
+
+void AddSinh::SetUp() {
+    ov::Shape inputShape0, inputShape1;
+    std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();  // unpack the parameter tuple
+    init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}});  // one entry per input; the leading {} of each entry is left empty
+
+    auto f = ov::test::snippets::AddSinhFunction({inputShape0, inputShape1});  // same flow as Add::SetUp, different function builder
+    function = f.getOriginal();
+}
+
+std::string AddSinhConst::getTestCaseName(testing::TestParamInfo<ov::test::snippets::AddConstParams> obj) {
+    ov::Shape inputShapes;
+    std::string targetDevice;
+    size_t num_nodes, num_subgraphs;
+    std::tie(inputShapes, num_nodes, num_subgraphs, targetDevice) = obj.param;  // unpack the parameter tuple
+
+    std::ostringstream result;  // name layout: IS[0]=..._#N=..._#S=..._targetDevice=...
+    result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes) << "_";
+    result << "#N=" << num_nodes << "_";
+    result << "#S=" << num_subgraphs << "_";
+    result << "targetDevice=" << targetDevice;
+    return result.str();
+}
+
+void AddSinhConst::SetUp() {
+    ov::Shape inputShape;
+    std::tie(inputShape, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();  // unpack the parameter tuple
+    init_input_shapes({{{}, {inputShape, }}});  // a single input entry; its leading {} is left empty
+
+    auto f = ov::test::snippets::AddSinhConstFunction({inputShape});
+    function = f.getOriginal();
+}
 
 TEST_P(Add, CompareWithRefImpl) {
     run();
@@ -53,6 +76,11 @@ TEST_P(AddSinh, CompareWithRefImpl) {
     validateNumSubgraphs();
 }
 
+TEST_P(AddSinhConst, CompareWithRefImpl) {
+    run();                   // execute the test flow provided by the fixture
+    validateNumSubgraphs();  // then inspect the subgraph/node counts of the compiled model
+}
+
 } // namespace snippets
 } // namespace test
 } // namespace ov
diff --git a/src/tests/functional/plugin/shared/src/snippets/convert.cpp b/src/tests/functional/plugin/shared/src/snippets/convert.cpp
new file mode 100644
index 00000000000000..b4c5c840cb6869
--- /dev/null
+++ b/src/tests/functional/plugin/shared/src/snippets/convert.cpp
@@ -0,0 +1,231 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "common_test_utils/common_utils.hpp"
+#include "snippets/convert.hpp"
+#include "subgraph_converts.hpp"
+#include "common_test_utils/ov_tensor_utils.hpp"
+
+namespace ov {
+namespace test {
+namespace snippets {
+
+std::string Convert::getTestCaseName(testing::TestParamInfo<ov::test::snippets::ConvertParams> obj) {
+    std::vector<ov::Shape> shapes;
+    std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> precisions;
+    std::string device;
+    size_t nodeCount, subgraphCount;
+    std::tie(shapes, precisions, nodeCount, subgraphCount, device) = obj.param;
+    // Name layout: IS=<shape>_..._IT=<input types>_OT=<output types>_#N=.._#S=.._targetDevice=..
+    std::ostringstream name;
+    name << "IS=";
+    for (const auto& shape : shapes)
+        name << CommonTestUtils::vec2str(shape) << "_";
+    name << "IT=" << CommonTestUtils::vec2str(precisions.first) << "_"
+         << "OT=" << CommonTestUtils::vec2str(precisions.second) << "_"
+         << "#N=" << nodeCount << "_"
+         << "#S=" << subgraphCount << "_"
+         << "targetDevice=" << device;
+    return name.str();
+}
+
+void Convert::SetUp() {
+    std::vector<ov::Shape> shapes;
+    std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> precisions;
+    std::tie(shapes, precisions, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
+    init_input_shapes(static_shapes_to_test_representation(shapes));
+    // Only the first input/output precision is used; the output one also drives random data generation.
+    auto modelBuilder = ov::test::snippets::ConvertFunction(shapes, precisions.first[0], precisions.second[0]);
+    function = modelBuilder.getOriginal();
+    output_type = precisions.second.front();
+}
+
+parameters Convert::generate_params_random() const {
+    int32_t resolution = 5;
+    int32_t startFrom = 0, range = 10;  // values used when no case below matches
+    switch (output_type) {
+        case ov::element::f32:
+        case ov::element::i32:
+        case ov::element::bf16:
+        case ov::element::u8:
+            // NOTE(review): u8 shares the negative start value — presumably to hit the lower bound; confirm.
+            startFrom = -10;
+            range = 20;
+            break;
+        case ov::element::i8:
+            // NOTE(review): startFrom + range goes past 127 — presumably to hit the upper bound; confirm.
+            startFrom = 117;
+            range = 20;
+            break;
+        default:
+            break;
+    }
+    // Exactly one (startFrom, range, resolution) entry is returned.
+    return {{ startFrom, range, resolution }};
+}
+
+void Convert::generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) {
+    inputs.clear();
+    const auto& funcInputs = function->inputs();
+    const auto params = generate_params_random();
+    if (params.size() != funcInputs.size()) {
+        IE_THROW() << "Incorrect count of parameters for random generation and inputs of function!";
+    }
+    // One tensor per model input, filled from that input's (startFrom, range, resolution) entry.
+    for (size_t i = 0; i < funcInputs.size(); ++i) {  // size_t: size() is unsigned
+        const auto& funcInput = funcInputs[i];
+        int32_t startFrom, range, resolution;
+        std::tie(startFrom, range, resolution) = params[i];
+        // targetInputStaticShapes is indexed in the same order as the model inputs.
+        const ov::Tensor tensor = ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i],
+                                                                          range, startFrom, resolution);
+        inputs.insert({funcInput.get_node_shared_ptr(), tensor});
+    }
+}
+
+void ConvertInput::SetUp() {
+    std::vector<ov::Shape> shapes;
+    std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> precisions;
+    std::tie(shapes, precisions, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
+    init_input_shapes(static_shapes_to_test_representation(shapes));
+    // The model is built from the first input precision and the first output precision.
+    auto modelBuilder = ov::test::snippets::ConvertInputFunction(shapes, precisions.first[0], precisions.second[0]);
+    function = modelBuilder.getOriginal();
+}
+
+parameters ConvertInput::generate_params_random() const {
+    parameters params;
+    const auto& funcInputs = function->inputs();
+    for (const auto& funcInput : funcInputs) {  // one entry per model input, in input order
+        int32_t startFrom, range, resolution = 1;
+        switch (funcInput.get_element_type()) {
+            case ov::element::f32:
+            case ov::element::bf16:
+                startFrom = -10;
+                range = 20;
+                resolution = 7;  // only the floating-point inputs override the default resolution of 1
+                break;
+            case ov::element::i32:
+            case ov::element::i8:
+                startFrom = -10;
+                range = 20;
+                break;
+            case ov::element::u8:
+                startFrom = 10;  // non-negative start value for the unsigned input type
+                range = 20;
+                break;
+            default:
+                startFrom = 0;
+                range = 10;
+        }
+        params.push_back({ startFrom, range, resolution });
+    }
+    return params;
+}
+
+void ConvertOutput::SetUp() {
+    std::vector<ov::Shape> shapes;
+    std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> precisions;
+    std::tie(shapes, precisions, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
+    init_input_shapes(static_shapes_to_test_representation(shapes));
+    // Model first, then remember the first output precision in output_type.
+    auto modelBuilder = ov::test::snippets::ConvertOutputFunction(shapes, precisions.first[0], precisions.second[0]);
+    function = modelBuilder.getOriginal();
+    output_type = precisions.second.front();
+}
+
+void ConvertStub::SetUp() {
+    std::vector<ov::Shape> shapes;
+    std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> precisions;
+    std::tie(shapes, precisions, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
+    init_input_shapes(static_shapes_to_test_representation(shapes));
+    // ConvertStubFunction gets the first input/output precision; output_type keeps the latter.
+    auto modelBuilder = ov::test::snippets::ConvertStubFunction(shapes, precisions.first[0], precisions.second[0]);
+    function = modelBuilder.getOriginal();
+    output_type = precisions.second.front();
+}
+
+void ConvertPartialInputsAndResults::SetUp() {
+    std::vector<ov::Shape> shapes;
+    std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> precisions;
+    std::tie(shapes, precisions, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
+    init_input_shapes(static_shapes_to_test_representation(shapes));
+    // Both complete precision lists (inputs and outputs) are forwarded to the model builder.
+    auto modelBuilder = ov::test::snippets::ConvertPartialInputsAndResultsFunction(shapes, precisions.first, precisions.second);
+    function = modelBuilder.getOriginal();
+}
+
+void ConvertManyOnInputs::SetUp() {
+    std::vector<ov::Shape> shapes;
+    std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> precisions;
+    std::tie(shapes, precisions, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
+    init_input_shapes(static_shapes_to_test_representation(shapes));
+    // Only the first precision list is consumed here; precisions.second is not used.
+    auto modelBuilder = ov::test::snippets::ConvertManyOnInputsFunction(shapes, precisions.first);
+    function = modelBuilder.getOriginal();
+}
+
+void ConvertManyOnOutputs::SetUp() {
+    std::vector<ov::Shape> shapes;
+    std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> precisions;
+    std::tie(shapes, precisions, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
+    init_input_shapes(static_shapes_to_test_representation(shapes));
+    // Only the first precision list is consumed here; precisions.second is not used.
+    auto modelBuilder = ov::test::snippets::ConvertManyOnOutputsFunction(shapes, precisions.first);
+    function = modelBuilder.getOriginal();
+}
+
+void ConvertManyOnInputOutput::SetUp() {
+    std::vector<ov::Shape> shapes;
+    std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> precisions;
+    std::tie(shapes, precisions, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
+    init_input_shapes(static_shapes_to_test_representation(shapes));
+    // Input-side and output-side precision lists are both handed to the model builder.
+    auto modelBuilder = ov::test::snippets::ConvertManyOnInputOutputFunction(shapes, precisions.first, precisions.second);
+    function = modelBuilder.getOriginal();
+}
+
+TEST_P(Convert, CompareWithRefImpl) {
+    run();                   // execute the test flow provided by the fixture
+    validateNumSubgraphs();  // then inspect the subgraph/node counts of the compiled model
+}
+
+TEST_P(ConvertInput, CompareWithRefImpl) {
+    run();                   // same two-step scenario for the ConvertInput fixture
+    validateNumSubgraphs();
+}
+
+TEST_P(ConvertOutput, CompareWithRefImpl) {
+    run();                   // same two-step scenario for the ConvertOutput fixture
+    validateNumSubgraphs();
+}
+
+TEST_P(ConvertStub, CompareWithRefImpl) {
+    run();                   // same two-step scenario for the ConvertStub fixture
+    validateNumSubgraphs();
+}
+
+TEST_P(ConvertPartialInputsAndResults, CompareWithRefImpl) {
+    run();                   // same two-step scenario for the ConvertPartialInputsAndResults fixture
+    validateNumSubgraphs();
+}
+
+TEST_P(ConvertManyOnInputs, CompareWithRefImpl) {
+    run();                   // same two-step scenario for the ConvertManyOnInputs fixture
+    validateNumSubgraphs();
+}
+
+TEST_P(ConvertManyOnOutputs, CompareWithRefImpl) {
+    run();                   // same two-step scenario for the ConvertManyOnOutputs fixture
+    validateNumSubgraphs();
+}
+
+TEST_P(ConvertManyOnInputOutput, CompareWithRefImpl) {
+    run();                   // same two-step scenario for the ConvertManyOnInputOutput fixture
+    validateNumSubgraphs();
+}
+
+} // namespace snippets
+} // namespace test
+} // namespace ov
diff --git a/src/tests/functional/plugin/shared/src/snippets/eltwise_two_results.cpp b/src/tests/functional/plugin/shared/src/snippets/eltwise_two_results.cpp
new file mode 100644
index 00000000000000..f35f0717155e42
--- /dev/null
+++ b/src/tests/functional/plugin/shared/src/snippets/eltwise_two_results.cpp
@@ -0,0 +1,44 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "common_test_utils/common_utils.hpp"
+#include "snippets/eltwise_two_results.hpp"
+#include "subgraph_simple.hpp"
+
+namespace ov {
+namespace test {
+namespace snippets {
+
+std::string EltwiseTwoResults::getTestCaseName(testing::TestParamInfo<ov::test::snippets::EltwiseTwoResultsParams> obj) {
+    ov::Shape shape0, shape1;
+    std::string device;
+    size_t nodeCount, subgraphCount;
+    std::tie(shape0, shape1, nodeCount, subgraphCount, device) = obj.param;
+    // Name layout: IS[0]=<shape>_IS[1]=<shape>_#N=<nodes>_#S=<subgraphs>_targetDevice=<device>
+    std::ostringstream name;
+    name << "IS[0]=" << CommonTestUtils::vec2str(shape0) << "_"
+         << "IS[1]=" << CommonTestUtils::vec2str(shape1) << "_"
+         << "#N=" << nodeCount << "_"
+         << "#S=" << subgraphCount << "_"
+         << "targetDevice=" << device;
+    return name.str();
+}
+
+void EltwiseTwoResults::SetUp() {
+    ov::Shape lhsShape, rhsShape;
+    std::tie(lhsShape, rhsShape, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
+    init_input_shapes({{{}, {lhsShape, }}, {{}, {rhsShape, }}});
+    // EltwiseTwoResultsFunction receives both shapes; its original model is the one under test.
+    auto modelBuilder = ov::test::snippets::EltwiseTwoResultsFunction({lhsShape, rhsShape});
+    function = modelBuilder.getOriginal();
+}
+
+TEST_P(EltwiseTwoResults, CompareWithRefImpl) {
+    run();                   // execute the test flow provided by the fixture
+    validateNumSubgraphs();  // then inspect the subgraph/node counts of the compiled model
+}
+
+} // namespace snippets
+} // namespace test
+} // namespace ov
diff --git a/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp b/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp
new file mode 100644
index 00000000000000..1140937be63359
--- /dev/null
+++ b/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp
@@ -0,0 +1,49 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "common_test_utils/common_utils.hpp"
+#include "snippets/max_num_params_eltwise.hpp"
+#include "subgraph_simple.hpp"
+
+namespace ov {
+namespace test {
+namespace snippets {
+
+std::string MaxNumParamsEltwiseSinh::getTestCaseName(testing::TestParamInfo<ov::test::snippets::MaxNumParamsEltwiseParams> obj) {
+    ov::Shape shape;
+    std::string device;
+    size_t nodeCount, subgraphCount;
+    std::tie(shape, nodeCount, subgraphCount, device) = obj.param;
+    // Name layout: IS[0]=<shape>_#N=<nodes>_#S=<subgraphs>_targetDevice=<device>
+    std::ostringstream name;
+    name << "IS[0]=" << CommonTestUtils::vec2str(shape) << "_"
+         << "#N=" << nodeCount << "_"
+         << "#S=" << subgraphCount << "_"
+         << "targetDevice=" << device;
+    return name.str();
+}
+
+void MaxNumParamsEltwiseSinh::SetUp() {
+    ov::Shape commonShape;
+    std::tie(commonShape, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
+    // The single shape from the parameters is replicated 10 times, once per model input.
+    const std::vector<ov::Shape> replicatedShapes(10, commonShape);
+    std::vector<InputShape> testShapes;
+    for (const auto& shape : replicatedShapes)
+        testShapes.emplace_back(InputShape {{}, {shape, }});
+    init_input_shapes(testShapes);
+
+    // The model builder gets the same replicated list of shapes.
+    auto modelBuilder = ov::test::snippets::EltwiseMaxNumParamsSinhFunction(replicatedShapes);
+    function = modelBuilder.getOriginal();
+}
+
+TEST_P(MaxNumParamsEltwiseSinh, CompareWithRefImpl) {
+    run();                   // execute the test flow provided by the fixture
+    validateNumSubgraphs();  // then inspect the subgraph/node counts of the compiled model
+}
+
+} // namespace snippets
+} // namespace test
+} // namespace ov
diff --git a/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp b/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp
index ad1c3e74255938..276218e6150c57 100644
--- a/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp
+++ b/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp
@@ -10,42 +10,42 @@ namespace ov {
 namespace test {
 namespace snippets {
 
-    std::string ThreeInputsEltwise::getTestCaseName(testing::TestParamInfo<ov::test::snippets::ThreeInputsEltwiseParams> obj) {
-        ov::Shape inputShapes0, inputShapes1, inputShapes2;
-        std::string targetDevice;
-        size_t num_nodes, num_subgraphs;
-        std::tie(inputShapes0, inputShapes1, inputShapes2,
-                 num_nodes, num_subgraphs, targetDevice) = obj.param;
-
-        std::ostringstream result;
-        result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_";
-        result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_";
-        result << "IS[2]=" << CommonTestUtils::vec2str(inputShapes2) << "_";
-        result << "#N=" << num_nodes << "_";
-        result << "#S=" << num_subgraphs << "_";
-        result << "targetDevice=" << targetDevice;
-        return result.str();
-    }
-
-    void ThreeInputsEltwise::SetUp() {
-        ov::Shape inputShape0, inputShape1, inputShape2;
-        std::tie(inputShape0, inputShape1, inputShape2,
-                 ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
-        init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}, {{}, {inputShape2, }}});
-
-        auto f = ov::test::snippets::EltwiseThreeInputsFunction({inputShape0, inputShape1, inputShape2});
-        function = f.getOriginal();
-    }
-
-    void ThreeInputsEltwiseSinh::SetUp() {
-        ov::Shape inputShape0, inputShape1, inputShape2;
-        std::tie(inputShape0, inputShape1, inputShape2,
-                 ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
-        init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}, {{}, {inputShape2, }}});
-
-        auto f = ov::test::snippets::EltwiseThreeInputsSinhFunction({inputShape0, inputShape1, inputShape2});
-        function = f.getOriginal();
-    }
+std::string ThreeInputsEltwise::getTestCaseName(testing::TestParamInfo<ov::test::snippets::ThreeInputsEltwiseParams> obj) {
+    ov::Shape shape0, shape1, shape2;
+    std::string device;
+    size_t nodeCount, subgraphCount;
+    std::tie(shape0, shape1, shape2,
+             nodeCount, subgraphCount, device) = obj.param;
+    // Name layout: IS[0]=.._IS[1]=.._IS[2]=.._#N=<nodes>_#S=<subgraphs>_targetDevice=<device>
+    std::ostringstream name;
+    name << "IS[0]=" << CommonTestUtils::vec2str(shape0) << "_"
+         << "IS[1]=" << CommonTestUtils::vec2str(shape1) << "_"
+         << "IS[2]=" << CommonTestUtils::vec2str(shape2) << "_"
+         << "#N=" << nodeCount << "_"
+         << "#S=" << subgraphCount << "_"
+         << "targetDevice=" << device;
+    return name.str();
+}
+
+void ThreeInputsEltwise::SetUp() {
+    ov::Shape shape0, shape1, shape2;
+    std::tie(shape0, shape1, shape2,
+             ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
+    init_input_shapes({{{}, {shape0, }}, {{}, {shape1, }}, {{}, {shape2, }}});
+    // All three shapes go to EltwiseThreeInputsFunction; its original model is the one under test.
+    auto modelBuilder = ov::test::snippets::EltwiseThreeInputsFunction({shape0, shape1, shape2});
+    function = modelBuilder.getOriginal();
+}
+
+void ThreeInputsEltwiseSinh::SetUp() {
+    ov::Shape shape0, shape1, shape2;
+    std::tie(shape0, shape1, shape2,
+             ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
+    init_input_shapes({{{}, {shape0, }}, {{}, {shape1, }}, {{}, {shape2, }}});
+    // Same flow as ThreeInputsEltwise::SetUp, but with the Sinh flavour of the model builder.
+    auto modelBuilder = ov::test::snippets::EltwiseThreeInputsSinhFunction({shape0, shape1, shape2});
+    function = modelBuilder.getOriginal();
+}
 
 TEST_P(ThreeInputsEltwise, CompareWithRefImpl) {
     run();
diff --git a/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp b/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp
new file mode 100644
index 00000000000000..205587e1a30f97
--- /dev/null
+++ b/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp
@@ -0,0 +1,43 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "common_test_utils/common_utils.hpp"
+#include "snippets/two_inputs_and_outputs.hpp"
+#include "subgraph_simple.hpp"
+
+namespace ov {
+namespace test {
+namespace snippets {
+
+std::string TwoInputsAndOutputs::getTestCaseName(testing::TestParamInfo<ov::test::snippets::TwoInputsAndOutputsParams> obj) {
+    std::vector<ov::Shape> inputShapes;
+    std::string targetDevice;
+    size_t num_nodes, num_subgraphs;
+    std::tie(inputShapes, num_nodes, num_subgraphs, targetDevice) = obj.param;
+    // Name layout: IS[<i>]=<shape>_ for every shape, then #N=.._#S=.._targetDevice=..
+    std::ostringstream result;
+    for (size_t i = 0; i < inputShapes.size(); i++)  // size_t matches the unsigned size()
+        result << "IS[" << i << "]=" << CommonTestUtils::vec2str(inputShapes[i]) << "_";
+    result << "#N=" << num_nodes << "_";
+    result << "#S=" << num_subgraphs << "_";
+    result << "targetDevice=" << targetDevice;
+    return result.str();
+}
+
+void TwoInputsAndOutputs::SetUp() {
+    std::vector<ov::Shape> shapes;  // the whole list of static input shapes comes from the test parameters
+    std::tie(shapes, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
+    init_input_shapes(static_shapes_to_test_representation(shapes));
+    auto modelBuilder = ov::test::snippets::TwoInputsAndOutputsFunction(shapes);
+    function = modelBuilder.getOriginal();  // the original model is the one under test
+}
+
+TEST_P(TwoInputsAndOutputs, CompareWithRefImpl) {
+    run();                   // execute the test flow provided by the fixture
+    validateNumSubgraphs();  // then inspect the subgraph/node counts of the compiled model
+}
+
+} // namespace snippets
+} // namespace test
+} // namespace ov
diff --git a/src/tests/functional/shared_test_classes/src/base/snippets_test_utils.cpp b/src/tests/functional/shared_test_classes/src/base/snippets_test_utils.cpp
index b7a58321172fd6..4397a88c3157c3 100644
--- a/src/tests/functional/shared_test_classes/src/base/snippets_test_utils.cpp
+++ b/src/tests/functional/shared_test_classes/src/base/snippets_test_utils.cpp
@@ -3,11 +3,16 @@
 //
 
 #include "shared_test_classes/base/snippets_test_utils.hpp"
+#include "functional_test_utils/skip_tests_config.hpp"
 #include "exec_graph_info.hpp"
 
 namespace ov {
 namespace test {
 void SnippetsTestsCommon::validateNumSubgraphs() {
+    bool isCurrentTestDisabled = FuncTestUtils::SkipTestsConfig::currentTestIsDisabled();
+    if (isCurrentTestDisabled)
+        GTEST_SKIP() << "Disabled test due to configuration" << std::endl;
+
     const auto& compiled_model = compiledModel.get_runtime_model();
     size_t num_subgraphs = 0;
     size_t num_nodes = 0;
diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp
new file mode 100644
index 00000000000000..a7c6bd34e0f58e
--- /dev/null
+++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp
@@ -0,0 +1,214 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ngraph/ngraph.hpp"
+#include "./snippets_helpers.hpp"
+
+/* This file contains definitions of relatively simple functions (models) that will be used
+ * to test snippets-specific behavior around Convert. All the functions are expected to be direct descendants of
+ * SnippetsFunctionBase; besides inputShapes, their constructors also take the element type(s) to use.
+ */
+
+namespace ov {
+namespace test {
+namespace snippets {
+/// The most trivial graph: a single Convert between the input and the Result.
+/// Tokenized simply by starting subgraph.
+//    in1
+//  Convert
+//   Result
+class ConvertFunction : public SnippetsFunctionBase {
+public:
+    explicit ConvertFunction(const std::vector<Shape>& inputShapes,
+                             const ov::element::Type inType = ov::element::f32,
+                             const ov::element::Type outType = ov::element::u8)
+    : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) {
+        NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes");  // exactly one input shape
+    }
+protected:
+    std::shared_ptr<ov::Model> initOriginal() const override;
+    std::shared_ptr<ov::Model> initReference() const override;
+    // Element types stored from the constructor arguments (defaults: f32 and u8).
+    ov::element::Type inType;
+    ov::element::Type outType;
+};
+
+
+/// One of the inputs of Add is a Convert.
+/// Tokenized simply by starting subgraph.
+//    in1
+//  Convert    in2
+//       Add
+//      Result
+class ConvertInputFunction : public SnippetsFunctionBase {
+public:
+    explicit ConvertInputFunction(const std::vector<Shape>& inputShapes,
+                                  const ov::element::Type inType = ov::element::f32,
+                                  const ov::element::Type outType = ov::element::u8)
+    : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) {
+        NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes");  // exactly two input shapes
+    }
+protected:
+    std::shared_ptr<ov::Model> initOriginal() const override;
+    std::shared_ptr<ov::Model> initReference() const override;
+    // Element types stored from the constructor arguments (defaults: f32 and u8).
+    ov::element::Type inType;
+    ov::element::Type outType;
+};
+
+/// The output of Sub is followed by a Convert.
+/// Tokenized simply by starting subgraph.
+//    in1     in2
+//       Sub
+//     Convert
+//      Result
+class ConvertOutputFunction : public SnippetsFunctionBase {
+public:
+    explicit ConvertOutputFunction(const std::vector<Shape>& inputShapes,
+                                   const ov::element::Type inType = ov::element::f32,
+                                   const ov::element::Type outType = ov::element::i8)
+    : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) {
+        NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes");  // exactly two input shapes
+    }
+protected:
+    std::shared_ptr<ov::Model> initOriginal() const override;
+    std::shared_ptr<ov::Model> initReference() const override;
+    // Element types stored from the constructor arguments (defaults: f32 and i8).
+    ov::element::Type inType;
+    ov::element::Type outType;
+};
+
+
+/// Two subgraphs are expected: Add + Convert (acting as a stub) and Relu.
+/// Tokenized simply by starting subgraph.
+//    in1    in2           in1     in2
+//       Add                 Subgraph
+//     Convert        ->        |
+//       Relu                Subgraph
+//      Result                Result
+class ConvertStubFunction : public SnippetsFunctionBase {
+public:
+    explicit ConvertStubFunction(const std::vector<Shape>& inputShapes,
+                                 const ov::element::Type inType = ov::element::f32,
+                                 const ov::element::Type outType = ov::element::i8)
+        : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) {
+        NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes");  // exactly two input shapes
+    }
+protected:
+    std::shared_ptr<ov::Model> initOriginal() const override;
+    std::shared_ptr<ov::Model> initReference() const override;
+    // Element types stored from the constructor arguments (defaults: f32 and i8).
+    ov::element::Type inType;
+    ov::element::Type outType;
+};
+
+
+/// Only some of the Inputs and Results have a Convert attached.
+/// Tokenized simply by starting subgraph.
+//    in1      in2
+//  Convert  Convert
+//        Add
+//       Relu        in3
+//  Convert     Sub
+//  Result1  Unsqueeze   <- Added to avoid several Result outputs for one subgraph (a limitation of collapsing)
+//            Result2
+class ConvertPartialInputsAndResultsFunction : public SnippetsFunctionBase {
+public:
+    explicit ConvertPartialInputsAndResultsFunction(const std::vector<Shape>& inputShapes,
+                                                    const std::vector<ov::element::Type>& inTypes = {ov::element::f32},
+                                                    const std::vector<ov::element::Type>& outTypes = {ov::element::f32})
+    : SnippetsFunctionBase(inputShapes), inTypes(inTypes), outTypes(outTypes) {
+        NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes");  // exactly three input shapes
+    }
+protected:
+    std::shared_ptr<ov::Model> initOriginal() const override;
+    std::shared_ptr<ov::Model> initReference() const override;
+    // Copies of the constructor's type lists; their sizes are not validated in this class.
+    std::vector<ov::element::Type> inTypes;
+    std::vector<ov::element::Type> outTypes;
+};
+
+/// A sequence of Converts on the input side, before Relu.
+/// Tokenized simply by starting subgraph.
+//    in           in
+//   Stub         Stub
+//  Convert         |
+//  Convert  ->  Subgraph
+//  Convert         |
+//   Relu         Result
+//  Result
+class ConvertManyOnInputsFunction : public SnippetsFunctionBase {
+public:
+    explicit ConvertManyOnInputsFunction(const std::vector<Shape>& inputShapes, const std::vector<ov::element::Type>& types)
+    : SnippetsFunctionBase(inputShapes), types(types) {
+        NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes");  // exactly one input shape
+        NGRAPH_CHECK(types.size() > 1, "Got invalid number of element types");  // at least two element types
+    }
+protected:
+    std::shared_ptr<ov::Model> initOriginal() const override;
+    std::shared_ptr<ov::Model> initReference() const override;
+    // Copy of the element type list passed to the constructor.
+    std::vector<ov::element::Type> types;
+};
+
+/// A sequence of Converts on the output side, after Relu.
+/// Tokenized simply by starting subgraph.
+//    in           in
+//   Stub         Stub
+//   Relu           |
+//  Convert  ->  Subgraph
+//  Convert         |
+//  Convert         |
+//  Result        Result
+class ConvertManyOnOutputsFunction : public SnippetsFunctionBase {
+public:
+    explicit ConvertManyOnOutputsFunction(const std::vector<Shape>& inputShapes, const std::vector<ov::element::Type>& types)
+    : SnippetsFunctionBase(inputShapes), types(types) {
+        NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes");  // exactly one input shape
+        NGRAPH_CHECK(types.size() > 1, "Got invalid number of element types");  // at least two element types
+    }
+protected:
+    std::shared_ptr<ov::Model> initOriginal() const override;
+    std::shared_ptr<ov::Model> initReference() const override;
+    // Copy of the element type list passed to the constructor.
+    std::vector<ov::element::Type> types;
+};
+
+/// Sequences of Converts on both the input and the output side of Relu.
+/// Tokenized simply by starting subgraph.
+//    in           in
+//   Stub         Stub
+//  Convert         |
+//  Convert         |
+//  Convert         |
+//   Relu    ->  Subgraph
+//  Convert         |
+//  Convert         |
+//  Convert         |
+//  Result        Result
+class ConvertManyOnInputOutputFunction : public SnippetsFunctionBase {
+public:
+    explicit ConvertManyOnInputOutputFunction(const std::vector<Shape>& inputShapes,
+                                              const std::vector<ov::element::Type>& inTypes,
+                                              const std::vector<ov::element::Type>& outTypes)
+    : SnippetsFunctionBase(inputShapes), inTypes(inTypes), outTypes(outTypes) {
+        NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes");  // exactly one input shape
+        NGRAPH_CHECK(inTypes.size() > 1, "Got invalid number of input element types");  // at least two input types
+        NGRAPH_CHECK(outTypes.size() > 0, "Got invalid number of output element types");  // at least one output type
+    }
+protected:
+    std::shared_ptr<ov::Model> initOriginal() const override;
+    std::shared_ptr<ov::Model> initReference() const override;
+    // Copies of the input-side and output-side element type lists passed to the constructor.
+    std::vector<ov::element::Type> inTypes;
+    std::vector<ov::element::Type> outTypes;
+};
+
+
+
+}  // namespace snippets
+}  // namespace test
+}  // namespace ov
diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp
index f35e0e1ecd4b33..fad086acf031e1 100644
--- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp
+++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp
@@ -7,6 +7,7 @@
 #include "ngraph/ngraph.hpp"
 #include "snippets_helpers.hpp"
 #include "subgraph_simple.hpp"
+#include "subgraph_converts.hpp"
 
 /* This file provides lowered representations (after the generate() was calles) for some simple functions.
  * This is required to test snippets lowering and optimization passes. All the functions are expected to be direct
@@ -45,7 +46,7 @@ class EltwiseThreeInputsLoweredFunction : public EltwiseThreeInputsFunction {
 protected:
     std::shared_ptr<ov::Model> initLowered() const override;
 private:
-    std::vector<Shape> broadcast_shapes;;
+    std::vector<Shape> broadcast_shapes;
 };
 
 }  // namespace snippets
diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp
index f67a86966a4bd8..3623db2873f416 100644
--- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp
+++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp
@@ -29,13 +29,14 @@ class AddFunction : public SnippetsFunctionBase {
     std::shared_ptr<ov::Model> initOriginal() const override;
     std::shared_ptr<ov::Model> initReference() const override;
 };
-/// Add separated from inputs by Sin to WA CPU-specific disabling after inputs.
+/// Add separated from inputs by Sinh to WA CPU-specific disabling after inputs.
 /// Works because Sinh is not supported by tokenization yet.
 /// Tokenized simply by starting subgraph.
 //   in1       in2
-//   Sin       Sinh
+//   Sinh       Sinh
 //        Add
 //      Result
+// todo: remove Sinh once "no subgraph after input" limitation is relaxed
 class AddSinhFunction : public SnippetsFunctionBase {
 public:
     explicit AddSinhFunction(const std::vector<Shape>& inputShapes) : SnippetsFunctionBase(inputShapes) {
@@ -45,6 +46,21 @@ class AddSinhFunction : public SnippetsFunctionBase {
     std::shared_ptr<ov::Model> initOriginal() const override;
     std::shared_ptr<ov::Model> initReference() const override;
 };
+/// Like AddSinh, but the second Add operand is a constant (so there is no Sinh on that branch).
+//   in1     Constant   <- NOTE(review): inferred from the summary line and the single-shape check; confirm
+//   Sinh       |
+//        Add
+//      Result
+// todo: remove Sinh once "no subgraph after input" limitation is relaxed
+class AddSinhConstFunction : public SnippetsFunctionBase {
+public:
+    explicit AddSinhConstFunction(const std::vector<Shape>& inputShapes) : SnippetsFunctionBase(inputShapes) {
+        NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes");  // exactly one input shape
+    }
+protected:
+    std::shared_ptr<ov::Model> initOriginal() const override;
+//    std::shared_ptr<ov::Model> initReference() const override;
+};
 /// Simple Eltwise graph fully convertible to Subgraph.
 /// Tokenized simply by attaching eltwises.
 // in1   in2
@@ -77,6 +93,7 @@ class EltwiseThreeInputsFunction : public SnippetsFunctionBase {
 };
 /// EltwiseFunctionThreeInputs with Sinh after inputs to to WA CPU-specific disabling after inputs
 /// See AddSinh for details.
+// todo: remove Sinh once "no subgraph after input" limitation is relaxed
 class EltwiseThreeInputsSinhFunction : public SnippetsFunctionBase {
 public:
     explicit EltwiseThreeInputsSinhFunction(const std::vector<Shape>& inputShapes) :
@@ -86,6 +103,24 @@ class EltwiseThreeInputsSinhFunction : public SnippetsFunctionBase {
 protected:
     std::shared_ptr<ov::Model> initOriginal() const override;
 };
+/// Eltwise graph with 10 inputs and 2 outputs.
+/// Needed to test the maximum allowed number of inputs + outputs.
+// in1   in2   in3 ... in10
+// Sinh  Sinh  Sinh ...Sinh
+// ........................
+//    Subtract -> Power
+//       |        Sinh
+//     Result    Result
+// todo: remove Sinh once "no subgraph after input" limitation is relaxed
+class EltwiseMaxNumParamsSinhFunction : public SnippetsFunctionBase {
+public:
+    explicit EltwiseMaxNumParamsSinhFunction(const std::vector<Shape>& inputShapes) :
+            SnippetsFunctionBase(inputShapes) {
+        NGRAPH_CHECK(input_shapes.size() == 10, "Got invalid number of input shapes");  // exactly ten input shapes are accepted
+    }
+protected:
+    std::shared_ptr<ov::Model> initOriginal() const override;
+};
 /// MatMul with two eltwise branches joined with Add just before the Result.
 /// Tokenized by attaching eltwises to separate subgraphs, and then joining them together.
 //                   in1   in2
@@ -125,7 +160,41 @@ class EltwiseLogLoopFunction : public SnippetsFunctionBase {
     std::shared_ptr<ov::Model> initOriginal() const override;
     std::shared_ptr<ov::Model> initReference() const override;
 };
-
+/// Eltwise graph with 2 results.
+/// So we have 2 subgraphs - Snippets don't support subgraphs with many results
+/// Output tensors are also given names, to check that output names are copied correctly
+//    in1    in2
+//    Sinh   Sinh
+//        Add
+//  HSwish   Result
+//  Relu
+//  Result
+class EltwiseTwoResultsFunction : public SnippetsFunctionBase {
+public:
+    explicit EltwiseTwoResultsFunction(const std::vector<Shape>& inputShapes) : SnippetsFunctionBase(inputShapes) {
+            NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes");  // exactly two input shapes are accepted
+    }
+protected:
+    std::shared_ptr<ov::Model> initOriginal() const override;
+    std::shared_ptr<ov::Model> initReference() const override;
+};
+/// Two different inputs and two different outputs.
+/// This function is used to check that broadcasting is handled correctly
+//        in1       in2
+//        Sin       Sin
+//       HSwish      /
+//  Result      Add
+//              Relu
+//              Sin
+//             Result
+class TwoInputsAndOutputsFunction : public SnippetsFunctionBase {
+public:
+    explicit TwoInputsAndOutputsFunction(const std::vector<Shape>& inputShapes) : SnippetsFunctionBase(inputShapes) {
+        NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes");  // exactly two input shapes are accepted
+    }
+protected:
+    std::shared_ptr<ov::Model> initOriginal() const override;
+};
 }  // namespace snippets
 }  // namespace test
 }  // namespace ov
diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_convert.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_convert.cpp
new file mode 100644
index 00000000000000..5c743cf2006bb3
--- /dev/null
+++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_convert.cpp
@@ -0,0 +1,241 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "subgraph_converts.hpp"
+#include "common_test_utils/data_utils.hpp"
+#include <snippets/op/convert_truncation.hpp>
+#include <snippets/op/subgraph.hpp>
+
+namespace ov {
+namespace test {
+namespace snippets {
+
+std::shared_ptr<ov::Node> createRollAsStub(const std::shared_ptr<ov::Node>& parent) {  // wraps output 0 of `parent` in a Roll node used as a stub by the models below
+    auto shift = std::make_shared<op::v0::Constant>(ov::element::i32, Shape{1}, std::vector<int>{1});  // shift value: 1
+    auto axes = std::make_shared<op::v0::Constant>(ov::element::i32, Shape{1}, std::vector<int>{0});  // rolled axis: 0
+    return std::make_shared<op::v7::Roll>(parent->output(0), shift, axes);
+}
+
+std::shared_ptr<ov::Model> ConvertFunction::initOriginal() const {  // Parameter(inType) -> Roll stub -> Convert(outType)
+    auto data0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
+    auto stub = createRollAsStub(data0);
+    auto convert = std::make_shared<op::v0::Convert>(stub, outType);
+    return std::make_shared<ov::Model>(NodeVector{convert}, ParameterVector{data0});
+}
+std::shared_ptr<ov::Model> ConvertFunction::initReference() const {  // Roll stub feeds a Subgraph whose body is a single ConvertTruncation(outType)
+    auto data0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
+    auto stub = createRollAsStub(data0);
+    auto indata0 = std::make_shared<op::v0::Parameter>(inType, stub->get_shape());  // body Parameter with the stub's output shape
+    auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(NodeVector{stub},
+         std::make_shared<ov::Model>(NodeVector{std::make_shared<ngraph::snippets::op::ConvertTruncation>(indata0, outType)},
+                                     ParameterVector{indata0}));
+    return std::make_shared<ov::Model>(NodeVector{subgraph}, ParameterVector{data0});
+}
+
+std::shared_ptr<ov::Model> ConvertInputFunction::initOriginal() const {  // Add(Convert(stub0 -> outType), stub1)
+    auto data0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
+    auto data1 = std::make_shared<op::v0::Parameter>(outType, input_shapes[1]);  // second input is created with outType already
+    auto stub0 = createRollAsStub(data0);
+    auto stub1 = createRollAsStub(data1);
+    auto convert = std::make_shared<op::v0::Convert>(stub0, outType);
+    auto add = std::make_shared<op::v1::Add>(convert, stub1);
+    return std::make_shared<ov::Model>(NodeVector{add}, ParameterVector{data0, data1});
+}
+std::shared_ptr<ov::Model> ConvertInputFunction::initReference() const {  // both stubs feed one Subgraph: Add(ConvertTruncation(in0), in1)
+    auto data0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
+    auto data1 = std::make_shared<op::v0::Parameter>(outType, input_shapes[1]);
+    auto stub0 = createRollAsStub(data0);
+    auto stub1 = createRollAsStub(data1);
+    auto indata0 = std::make_shared<op::v0::Parameter>(inType, stub0->get_shape());  // body Parameters take the stubs' output shapes
+    auto indata1 = std::make_shared<op::v0::Parameter>(outType, stub1->get_shape());
+    auto convert = std::make_shared<ngraph::snippets::op::ConvertTruncation>(indata0, outType);
+    auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(NodeVector{stub0, stub1},
+        std::make_shared<ov::Model>(
+                NodeVector{std::make_shared<op::v1::Add>(convert, indata1)},
+                ParameterVector{indata0, indata1}));
+    return std::make_shared<ov::Model>(NodeVector{subgraph}, ParameterVector{data0, data1});
+}
+
+std::shared_ptr<ov::Model> ConvertOutputFunction::initOriginal() const {  // Convert(Add(stub0, stub1) -> outType)
+    auto data0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
+    auto data1 = std::make_shared<op::v0::Parameter>(inType, input_shapes[1]);
+    auto stub0 = createRollAsStub(data0);
+    auto stub1 = createRollAsStub(data1);
+    auto add = std::make_shared<op::v1::Add>(stub0, stub1);
+    auto convert = std::make_shared<op::v0::Convert>(add, outType);  // Convert is the model output
+    return std::make_shared<ov::Model>(NodeVector{convert}, ParameterVector{data0, data1});
+}
+std::shared_ptr<ov::Model> ConvertOutputFunction::initReference() const {  // both stubs feed one Subgraph: ConvertTruncation(Add(in0, in1))
+    auto data0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
+    auto data1 = std::make_shared<op::v0::Parameter>(inType, input_shapes[1]);
+    auto stub0 = createRollAsStub(data0);
+    auto stub1 = createRollAsStub(data1);
+    auto indata0 = std::make_shared<op::v0::Parameter>(inType, stub0->get_shape());  // body Parameters take the stubs' output shapes
+    auto indata1 = std::make_shared<op::v0::Parameter>(inType, stub1->get_shape());
+    auto add = std::make_shared<op::v1::Add>(indata0, indata1);
+    auto convert = std::make_shared<ngraph::snippets::op::ConvertTruncation>(add, outType);
+    auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(NodeVector{stub0, stub1},
+                                                                     std::make_shared<ov::Model>(
+                                                                             NodeVector{convert},
+                                                                             ParameterVector{indata0, indata1}));
+    return std::make_shared<ov::Model>(NodeVector{subgraph}, ParameterVector{data0, data1});
+}
+
+std::shared_ptr<ov::Model> ConvertStubFunction::initOriginal() const {  // Relu(Convert(Add(stub0, stub1) -> outType))
+    auto data0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
+    auto data1 = std::make_shared<op::v0::Parameter>(inType, input_shapes[1]);
+    auto stub0 = createRollAsStub(data0);
+    auto stub1 = createRollAsStub(data1);
+    auto add = std::make_shared<op::v1::Add>(stub0, stub1);
+    auto convert = std::make_shared<op::v0::Convert>(add, outType);
+    auto relu = std::make_shared<op::v0::Relu>(convert);  // Relu follows the Convert and is the model output
+    return std::make_shared<ov::Model>(NodeVector{relu}, ParameterVector{data0, data1});
+}
+std::shared_ptr<ov::Model> ConvertStubFunction::initReference() const {  // two chained Subgraphs: {Add, ConvertTruncation} followed by {Relu}
+    auto data0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
+    auto data1 = std::make_shared<op::v0::Parameter>(inType, input_shapes[1]);
+    auto stub0 = createRollAsStub(data0);
+    auto stub1 = createRollAsStub(data1);
+    auto indata0 = std::make_shared<op::v0::Parameter>(inType, stub0->get_shape());  // body Parameters of the first Subgraph
+    auto indata1 = std::make_shared<op::v0::Parameter>(inType, stub1->get_shape());
+    auto add = std::make_shared<op::v1::Add>(indata0, indata1);
+    auto convert = std::make_shared<ngraph::snippets::op::ConvertTruncation>(add, outType);
+    auto subgraph0 = std::make_shared<ngraph::snippets::op::Subgraph>(
+            NodeVector{stub0, stub1}, std::make_shared<ov::Model>(NodeVector{convert}, ParameterVector{indata0, indata1}));
+    auto indata2 = std::make_shared<op::v0::Parameter>(convert->get_destination_type(), convert->get_shape());  // body Parameter of the second Subgraph, typed and shaped like the convert output
+    auto relu = std::make_shared<op::v0::Relu>(indata2);
+    auto subgraph1 = std::make_shared<ngraph::snippets::op::Subgraph>(
+            NodeVector{subgraph0}, std::make_shared<ov::Model>(NodeVector{relu}, ParameterVector{indata2}));
+    return std::make_shared<ov::Model>(NodeVector{subgraph1}, ParameterVector{data0, data1});
+}
+
+std::shared_ptr<ov::Model> ConvertPartialInputsAndResultsFunction::initOriginal() const {  // three inputs, two outputs; Converts on two inputs and on one output
+    auto data0 = std::make_shared<op::v0::Parameter>(inTypes[0], input_shapes[0]);
+    auto data1 = std::make_shared<op::v0::Parameter>(inTypes[1], input_shapes[1]);
+    auto data2 = std::make_shared<op::v0::Parameter>(inTypes[2], input_shapes[2]);
+    auto stub0 = createRollAsStub(data0);
+    auto stub1 = createRollAsStub(data1);
+    auto stub2 = createRollAsStub(data2);
+    auto convert0 = std::make_shared<op::v0::Convert>(stub0, outTypes[0]);  // first two inputs are converted to outTypes[0]
+    auto convert1 = std::make_shared<op::v0::Convert>(stub1, outTypes[0]);
+    auto add = std::make_shared<op::v1::Add>(convert0, convert1);
+    auto relu = std::make_shared<op::v0::Relu>(add);
+    auto sub = std::make_shared<op::v1::Subtract>(relu, stub2);  // third input is used without a Convert
+    auto stub3 = createRollAsStub(sub);
+    auto convert2 = std::make_shared<op::v0::Convert>(relu, outTypes[1]);  // second branch from relu, converted to outTypes[1]
+    return std::make_shared<ov::Model>(NodeVector{convert2, stub3}, ParameterVector{data0, data1, data2});
+}
+std::shared_ptr<ov::Model> ConvertPartialInputsAndResultsFunction::initReference() const {  // same graph with everything between the Roll stubs placed in one two-output Subgraph
+    auto data0 = std::make_shared<op::v0::Parameter>(inTypes[0], input_shapes[0]);
+    auto data1 = std::make_shared<op::v0::Parameter>(inTypes[1], input_shapes[1]);
+    auto data2 = std::make_shared<op::v0::Parameter>(inTypes[2], input_shapes[2]);
+    auto stub0 = createRollAsStub(data0);
+    auto stub1 = createRollAsStub(data1);
+    auto stub2 = createRollAsStub(data2);
+    auto indata0 = std::make_shared<op::v0::Parameter>(inTypes[0], stub0->get_shape());  // body Parameters take the stubs' output shapes
+    auto indata1 = std::make_shared<op::v0::Parameter>(inTypes[1], stub1->get_shape());
+    auto indata2 = std::make_shared<op::v0::Parameter>(inTypes[2], stub2->get_shape());
+    auto convert0 = std::make_shared<ngraph::snippets::op::ConvertTruncation>(indata0, outTypes[0]);
+    auto convert1 = std::make_shared<ngraph::snippets::op::ConvertTruncation>(indata1, outTypes[0]);
+    auto add = std::make_shared<op::v1::Add>(convert0, convert1);
+    auto relu = std::make_shared<op::v0::Relu>(add);
+    auto sub = std::make_shared<op::v1::Subtract>(relu, indata2);
+    auto convert2 = std::make_shared<ngraph::snippets::op::ConvertTruncation>(relu, outTypes[1]);
+    auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(
+            NodeVector{stub0, stub1, stub2}, std::make_shared<ov::Model>(NodeVector{sub, convert2}, ParameterVector{indata0, indata1, indata2}));  // body results: 0 = sub, 1 = convert2
+    auto stub3 = createRollAsStub(subgraph);  // createRollAsStub reads output(0), i.e. the `sub` result
+    return std::make_shared<ov::Model>(OutputVector{subgraph->output(1), stub3->output(0)},
+                                       ParameterVector{data0, data1, data2});
+}
+
+std::shared_ptr<ov::Model> ConvertManyOnInputsFunction::initOriginal() const {  // Roll stub -> Convert chain over types[1..] -> Relu
+    auto data0 = std::make_shared<op::v0::Parameter>(types[0], input_shapes[0]);
+    auto stub0 = createRollAsStub(data0);
+    std::shared_ptr<ov::Node> out = stub0;
+    for (size_t i = 1; i < types.size(); i++) {  // types[0] is the Parameter type, so the chain starts at index 1; size_t avoids a signed/unsigned comparison
+        auto convert = std::make_shared<op::v0::Convert>(out, types[i]);
+        out = convert;
+    }
+    auto relu = std::make_shared<op::v0::Relu>(out);
+    return std::make_shared<ov::Model>(NodeVector{relu}, ParameterVector{data0});
+}
+std::shared_ptr<ov::Model> ConvertManyOnInputsFunction::initReference() const {  // Roll stub feeds a Subgraph: ConvertTruncation chain over types[1..] -> Relu
+    auto data0 = std::make_shared<op::v0::Parameter>(types[0], input_shapes[0]);
+    auto stub0 = createRollAsStub(data0);
+    auto indata0 = std::make_shared<op::v0::Parameter>(types[0], stub0->get_shape());  // body Parameter with the stub's output shape
+    std::shared_ptr<ov::Node> out = indata0;
+    for (size_t i = 1; i < types.size(); i++) {  // types[0] is the Parameter type, so the chain starts at index 1; size_t avoids a signed/unsigned comparison
+        auto convert = std::make_shared<ngraph::snippets::op::ConvertTruncation>(out, types[i]);
+        out = convert;
+    }
+    auto relu = std::make_shared<op::v0::Relu>(out);
+    auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(NodeVector{stub0},
+        std::make_shared<ov::Model>(NodeVector{relu}, ParameterVector{indata0}));
+    return std::make_shared<ov::Model>(NodeVector{subgraph}, ParameterVector{data0});
+}
+
+std::shared_ptr<ov::Model> ConvertManyOnOutputsFunction::initOriginal() const {  // Sinh stub -> Relu -> Convert chain over types[1..]
+    auto data0 = std::make_shared<op::v0::Parameter>(types[0], input_shapes[0]);
+    auto stub0 = std::make_shared<ov::op::v0::Sinh>(data0);
+    auto relu = std::make_shared<op::v0::Relu>(stub0);
+    std::shared_ptr<ov::Node> out = relu;
+    for (size_t i = 1; i < types.size(); i++) {  // types[0] is the Parameter type, so the chain starts at index 1; size_t avoids a signed/unsigned comparison
+        auto convert = std::make_shared<op::v0::Convert>(out, types[i]);
+        out = convert;
+    }
+    return std::make_shared<ov::Model>(NodeVector{out}, ParameterVector{data0});
+}
+std::shared_ptr<ov::Model> ConvertManyOnOutputsFunction::initReference() const {  // Sinh stub feeds a Subgraph: Relu -> ConvertTruncation chain over types[1..]
+    auto data0 = std::make_shared<op::v0::Parameter>(types[0], input_shapes[0]);
+    auto stub0 = std::make_shared<ov::op::v0::Sinh>(data0);
+    auto indata0 = std::make_shared<op::v0::Parameter>(types[0], stub0->get_shape());  // body Parameter with the stub's output shape
+    auto relu = std::make_shared<op::v0::Relu>(indata0);
+    std::shared_ptr<ov::Node> out = relu;
+    for (size_t i = 1; i < types.size(); i++) {  // types[0] is the Parameter type, so the chain starts at index 1; size_t avoids a signed/unsigned comparison
+        auto convert = std::make_shared<ngraph::snippets::op::ConvertTruncation>(out, types[i]);
+        out = convert;
+    }
+    auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(NodeVector{stub0},
+        std::make_shared<ov::Model>(NodeVector{out}, ParameterVector{indata0}));
+    return std::make_shared<ov::Model>(NodeVector{subgraph}, ParameterVector{data0});
+}
+
+std::shared_ptr<ov::Model> ConvertManyOnInputOutputFunction::initOriginal() const {  // Sinh stub -> Converts(inTypes[1..]) -> Relu -> Converts(outTypes)
+    auto data0 = std::make_shared<op::v0::Parameter>(inTypes[0], input_shapes[0]);
+    auto stub0 = std::make_shared<ov::op::v0::Sinh>(data0);
+    std::shared_ptr<ov::Node> out = stub0;
+    for (size_t i = 1; i < inTypes.size(); i++) {  // inTypes[0] is the Parameter type, so the input chain starts at index 1
+        auto convert = std::make_shared<op::v0::Convert>(out, inTypes[i]);
+        out = convert;
+    }
+    auto relu = std::make_shared<op::v0::Relu>(out);  // must consume the converted input; using stub0 here would leave the input Converts dangling
+    out = relu;
+    for (size_t i = 0; i < outTypes.size(); i++) {  // every entry of outTypes adds one Convert after the Relu
+        auto convert = std::make_shared<op::v0::Convert>(out, outTypes[i]);
+        out = convert;
+    }
+    return std::make_shared<ov::Model>(NodeVector{out}, ParameterVector{data0});
+}
+std::shared_ptr<ov::Model> ConvertManyOnInputOutputFunction::initReference() const {  // Sinh stub feeds a Subgraph: ConvertTruncations(inTypes[1..]) -> Relu -> ConvertTruncations(outTypes)
+    auto data0 = std::make_shared<op::v0::Parameter>(inTypes[0], input_shapes[0]);
+    auto stub0 = std::make_shared<ov::op::v0::Sinh>(data0);
+    auto indata0 = std::make_shared<op::v0::Parameter>(inTypes[0], stub0->get_shape());  // body Parameter with the stub's output shape
+    std::shared_ptr<ov::Node> out = indata0;
+    for (size_t i = 1; i < inTypes.size(); i++) {  // ConvertTruncation inside the body, as in the other reference models of this file
+        auto convert = std::make_shared<ngraph::snippets::op::ConvertTruncation>(out, inTypes[i]);
+        out = convert;
+    }
+    auto relu = std::make_shared<op::v0::Relu>(out);  // must stay inside the body: consuming stub0 would disconnect indata0 from the body results
+    out = relu;
+    for (size_t i = 0; i < outTypes.size(); i++) {  // every entry of outTypes adds one ConvertTruncation after the Relu
+        auto convert = std::make_shared<ngraph::snippets::op::ConvertTruncation>(out, outTypes[i]);
+        out = convert;
+    }
+    auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(NodeVector{stub0},
+        std::make_shared<ov::Model>(NodeVector{out}, ParameterVector{indata0}));
+    return std::make_shared<ov::Model>(NodeVector{subgraph}, ParameterVector{data0});
+}
+}  // namespace snippets
+}  // namespace test
+}  // namespace ov
diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp
index 81c267f3745828..6117ffb6c76c68 100644
--- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp
+++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp
@@ -46,6 +46,14 @@ std::shared_ptr<ov::Model> AddSinhFunction::initReference() const {
                                                                       ParameterVector{indata0, indata1}));
     return std::make_shared<ov::Model>(NodeVector{add}, ParameterVector{data0, data1});
 }
+std::shared_ptr<ov::Model> AddSinhConstFunction::initOriginal() const {  // Add(Sinh(Parameter), Constant with generated values)
+    auto input = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
+    const std::vector<float> generated = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[0]), -10., 10.);
+    auto constant = std::make_shared<op::v0::Constant>(precision, input_shapes[0], generated);  // same shape as the Parameter
+    auto sinh = std::make_shared<ov::op::v0::Sinh>(input);
+    auto sum = std::make_shared<op::v1::Add>(sinh, constant);
+    return std::make_shared<ov::Model>(NodeVector{sum}, ParameterVector{input});
+}
 std::shared_ptr<ov::Model> EltwiseFunction::initOriginal() const {
     auto data0 = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
     auto data1 = std::make_shared<op::v0::Parameter>(precision, input_shapes[1]);
@@ -98,6 +106,28 @@ std::shared_ptr<ov::Model> EltwiseThreeInputsSinhFunction::initOriginal() const
     auto mul = std::make_shared<op::v1::Multiply>(add, sub);
     return std::make_shared<ov::Model>(NodeVector{mul}, ParameterVector{data0, data1, data2});
 }
+std::shared_ptr<ov::Model> EltwiseMaxNumParamsSinhFunction::initOriginal() const {
+    ParameterVector inputs;
+    NodeVector stubs;  // one Sinh per input shape
+    for (const auto& in_shape : input_shapes) {
+        inputs.push_back(std::make_shared<op::v0::Parameter>(precision, in_shape));
+        stubs.push_back(std::make_shared<op::v0::Sinh>(inputs.back()));
+    }
+    // Adjacent Sinh outputs are summed pairwise
+    NodeVector sums;
+    for (size_t i = 0; i + 1 < stubs.size(); i += 2) {
+        sums.push_back(std::make_shared<op::v1::Add>(stubs[i], stubs[i + 1]));
+    }
+    // Adjacent Adds are multiplied pairwise; an unpaired trailing Add is left for Power below
+    NodeVector products;
+    for (size_t i = 0; i + 1 < sums.size(); i += 2) {
+        products.push_back(std::make_shared<op::v1::Multiply>(sums[i], sums[i + 1]));
+    }
+    auto difference = std::make_shared<op::v1::Subtract>(products[0], products[1]);
+    auto powered = std::make_shared<op::v1::Power>(sums.back(), difference);
+    auto tail_sinh = std::make_shared<op::v0::Sinh>(powered);
+    return std::make_shared<ov::Model>(NodeVector{difference, tail_sinh}, inputs);
+}
 
 std::shared_ptr<ov::Model> MatMulEltwiseBranchesFunction::initOriginal() const {
     auto data_1 = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
@@ -187,6 +217,69 @@ std::shared_ptr<ov::Model> EltwiseLogLoopFunction::initReference() const {
     return std::make_shared<Model>(NodeVector{mul}, ParameterVector{data0, data1});
 }
 
+std::shared_ptr<ov::Model> EltwiseTwoResultsFunction::initOriginal() const {  // outputs: Add(Sinh, Sinh) and Relu(HSwish(Add)), both with named tensors
+    auto data0 = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
+    auto data1 = std::make_shared<op::v0::Parameter>(precision, input_shapes[1]);
+    auto sinh0 = std::make_shared<op::v0::Sinh>(data0);
+    auto sinh1 = std::make_shared<op::v0::Sinh>(data1);
+    auto add = std::make_shared<op::v1::Add>(sinh0, sinh1);
+    auto hswish = std::make_shared<op::v4::HSwish>(add);
+    auto relu = std::make_shared<op::v0::Relu>(hswish);
+
+    NGRAPH_SUPPRESS_DEPRECATED_START
+    auto& out_tensor0 = add->get_output_tensor(0);  // first model output: the Add
+    out_tensor0.set_name("add_out");
+    out_tensor0.set_names({"add_out", "y0"});
+
+    auto& out_tensor1 = relu->get_output_tensor(0);  // second model output: the Relu
+    out_tensor1.set_name("relu_out");
+    out_tensor1.set_names({"relu_out", "y1"});
+    NGRAPH_SUPPRESS_DEPRECATED_END
+
+    return std::make_shared<Model>(NodeVector{add, relu}, ParameterVector{data0, data1});
+}
+std::shared_ptr<ov::Model> EltwiseTwoResultsFunction::initReference() const {  // subgraph0 = {Add, HSwish}, subgraph1 = {Relu} fed by subgraph0's second output
+    auto data0 = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
+    auto data1 = std::make_shared<op::v0::Parameter>(precision, input_shapes[1]);
+    auto sinh0 = std::make_shared<op::v0::Sinh>(data0);
+    auto sinh1 = std::make_shared<op::v0::Sinh>(data1);
+    auto indata0 = std::make_shared<op::v0::Parameter>(precision, sinh0->get_shape());  // body Parameters of subgraph0
+    auto indata1 = std::make_shared<op::v0::Parameter>(precision, sinh1->get_shape());
+    auto add = std::make_shared<op::v1::Add>(indata0, indata1);
+    auto hswish = std::make_shared<op::v4::HSwish>(add);
+    auto subgraph0 = std::make_shared<ngraph::snippets::op::Subgraph>(NodeVector{sinh0, sinh1},
+                                        std::make_shared<ov::Model>(NodeVector{add, hswish},
+                                                                    ParameterVector{indata0, indata1}));
+    auto indata2 = std::make_shared<op::v0::Parameter>(precision, subgraph0->get_output_shape(1));  // body Parameter of subgraph1, shaped like the HSwish output
+    auto relu = std::make_shared<op::v0::Relu>(indata2);
+    auto subgraph1 = std::make_shared<ngraph::snippets::op::Subgraph>(OutputVector{subgraph0->output(1)},
+                                        std::make_shared<ov::Model>(NodeVector{relu},
+                                                                    ParameterVector{indata2}));
+    NGRAPH_SUPPRESS_DEPRECATED_START
+    auto& out_tensor0 = subgraph0->get_output_tensor(0);  // same names as the Add output in initOriginal
+    out_tensor0.set_name("add_out");
+    out_tensor0.set_names({"add_out", "y0"});
+
+    auto& out_tensor1 = subgraph1->get_output_tensor(0);  // same names as the Relu output in initOriginal
+    out_tensor1.set_name("relu_out");
+    out_tensor1.set_names({"relu_out", "y1"});
+    NGRAPH_SUPPRESS_DEPRECATED_END
+    return std::make_shared<Model>(OutputVector{subgraph0->output(0), subgraph1->output(0)}, ParameterVector{data0, data1});
+}
+
+std::shared_ptr<ov::Model> TwoInputsAndOutputsFunction::initOriginal() const {  // outputs: HSwish(Sin(in1)) and Sin(Relu(Add(HSwish, Sin(in2))))
+    auto data0 = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
+    auto data1 = std::make_shared<op::v0::Parameter>(precision, input_shapes[1]);
+    auto sin0 = std::make_shared<op::v0::Sin>(data0);
+    auto sin1 = std::make_shared<op::v0::Sin>(data1);
+    auto hswish = std::make_shared<op::v4::HSwish>(sin0);  // used both as a model output and as an Add input
+    auto add = std::make_shared<op::v1::Add>(hswish, sin1);
+    auto relu = std::make_shared<op::v0::Relu>(add);
+    auto sin3 = std::make_shared<op::v0::Sin>(relu);
+
+    return std::make_shared<Model>(NodeVector{hswish, sin3}, ParameterVector{data0, data1});
+}
+
 }  // namespace snippets
 }  // namespace test
 }  // namespace ov
\ No newline at end of file