Skip to content

Commit

Permalink
[Snippets][CPU] Applied Vladislav & Ivan comments 3
Browse files Browse the repository at this point in the history
  • Loading branch information
a-sidorova committed Jan 3, 2025
1 parent 1908740 commit f104183
Show file tree
Hide file tree
Showing 22 changed files with 69 additions and 75 deletions.
10 changes: 5 additions & 5 deletions src/common/snippets/include/snippets/op/load.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,17 @@ class Load : public modifier::MemoryAccess, public ov::op::Op {
};

/**
* @interface LoadReshape
* @interface LoadReorder
* @brief It's just Load operation (and it's mapped on LoadEmitter during code generation) that allows to tweak
* shape propagation. We need it to keep correct shape propagation when Transpose is decomposed to
* Load and Store. This is a temporary solution until tokenization of Reshape operation is supported.
* @ingroup snippets
*/
class LoadReshape : public Load {
class LoadReorder : public Load {
public:
OPENVINO_OP("LoadReshape", "SnippetsOpset", Load);
LoadReshape(const Output<Node>& x, size_t count = 1lu, const size_t offset = 0lu, std::vector<size_t> order = {});
LoadReshape() = default;
OPENVINO_OP("LoadReorder", "SnippetsOpset", Load);
LoadReorder(const Output<Node>& x, size_t count = 1lu, const size_t offset = 0lu, std::vector<size_t> order = {});
LoadReorder() = default;

void set_offset(size_t offset) { set_output_offset(offset, 0); }
void set_count(size_t count) { set_output_count(count, 0); }
Expand Down
2 changes: 1 addition & 1 deletion src/common/snippets/include/snippets/snippets_isa_tbl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

// SnippetS dialect
OV_OP(Load, ov::snippets::op)
OV_OP(LoadReshape, ov::snippets::op)
OV_OP(LoadReorder, ov::snippets::op)
OV_OP(LoopBegin, ov::snippets::op)
OV_OP(LoopEnd, ov::snippets::op)
OV_OP(Brgemm, ov::snippets::op)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ static bool is_affecting_op(const ExpressionPtr& expr) {
const auto& node = expr->get_node();
return ov::is_type<ov::snippets::op::Brgemm>(node) ||
ov::is_type<ov::snippets::op::Reshape>(node) ||
ov::is_type<ov::snippets::op::LoadReshape>(node);
ov::is_type<ov::snippets::op::LoadReorder>(node);
}
} // namespace

Expand Down
28 changes: 14 additions & 14 deletions src/common/snippets/src/op/load.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,19 @@ std::shared_ptr<Node> Load::clone_with_new_inputs(const OutputVector& new_args)
return std::make_shared<Load>(new_args.at(0), get_count(), get_offset());
}

LoadReshape::LoadReshape(const Output<ov::Node>& x, const size_t count, const size_t offset, std::vector<size_t> order)
LoadReorder::LoadReorder(const Output<ov::Node>& x, const size_t count, const size_t offset, std::vector<size_t> order)
: Load(x, count, offset), m_order(std::move(order)) {
const auto& in_shape = x.get_partial_shape();
const auto in_shape_size = in_shape.size();
OPENVINO_ASSERT(m_order.size() == in_shape_size, "LoadReshape got new_order of invalid size");
OPENVINO_ASSERT(m_order.size() == in_shape_size, "LoadReorder got new_order of invalid size");
OPENVINO_ASSERT(*std::max_element(m_order.begin(), m_order.end()) == in_shape_size - 1 &&
*std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReshape detected invalid values in new_order");
*std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReorder detected invalid values in new_order");
const std::set<size_t> unique_dims(order.begin(), order.end());
OPENVINO_ASSERT(unique_dims.size() == order.size(), "LoadReshape order must not contain repeated elements");
OPENVINO_ASSERT(unique_dims.size() == order.size(), "LoadReorder order must not contain repeated elements");
constructor_validate_and_infer_types();
}

void LoadReshape::validate_and_infer_types() {
void LoadReorder::validate_and_infer_types() {
validate_memory_access_params();
const auto& old_shape = get_input_partial_shape(0);
ov::PartialShape new_shape;
Expand All @@ -62,23 +62,23 @@ void LoadReshape::validate_and_infer_types() {
set_output_type(0, get_input_element_type(0), new_shape);
}

bool LoadReshape::visit_attributes(AttributeVisitor& visitor) {
bool LoadReorder::visit_attributes(AttributeVisitor& visitor) {
MemoryAccess::visit_attributes(visitor);
visitor.on_attribute("order", m_order);
return true;
}

std::shared_ptr<Node> LoadReshape::clone_with_new_inputs(const OutputVector& new_args) const {
INTERNAL_OP_SCOPE(LoadReshape);
std::shared_ptr<Node> LoadReorder::clone_with_new_inputs(const OutputVector& new_args) const {
INTERNAL_OP_SCOPE(LoadReorder);
check_new_args_count(this, new_args);
return std::make_shared<LoadReshape>(new_args.at(0), get_count(), get_offset(), m_order);
return std::make_shared<LoadReorder>(new_args.at(0), get_count(), get_offset(), m_order);
}
LoadReshape::ShapeInfer::ShapeInfer(const std::shared_ptr<ov::Node>& n) {
const auto& loadReshape = ov::as_type_ptr<LoadReshape>(n);
OPENVINO_ASSERT(loadReshape, "Got invalid node in LoadReshape::ShapeInfer");
m_order = loadReshape->m_order;
LoadReorder::ShapeInfer::ShapeInfer(const std::shared_ptr<ov::Node>& n) {
const auto& loadReorder = ov::as_type_ptr<LoadReorder>(n);
OPENVINO_ASSERT(loadReorder, "Got invalid node in LoadReorder::ShapeInfer");
m_order = loadReorder->m_order;
}
IShapeInferSnippets::Result LoadReshape::ShapeInfer::infer(const std::vector<VectorDimsRef>& input_shapes) {
IShapeInferSnippets::Result LoadReorder::ShapeInfer::infer(const std::vector<VectorDimsRef>& input_shapes) {
OPENVINO_ASSERT(input_shapes.size() == 1, "Got unexpected number of input shapes");
return {{utils::get_planar_vdims(input_shapes[0], m_order)}, ShapeInferStatus::success};
}
Expand Down
2 changes: 1 addition & 1 deletion src/common/snippets/src/op/reorder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ void Reorder::custom_constructor_validate_and_infer_types(std::vector<size_t> or

void Reorder::validate_and_infer_types() {
const auto& input_pshape = get_input_partial_shape(0);
const auto order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout();
const auto& order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout();
OPENVINO_ASSERT(input_pshape.rank().is_static() && input_pshape.size() == order.size(),
"Incompatible shape and order sizes");
const auto output_pshape = utils::get_planar_pshape(get_input_partial_shape(0), order);
Expand Down
4 changes: 2 additions & 2 deletions src/common/snippets/src/pass/transpose_decomposition.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ TransposeDecomposition::TransposeDecomposition() {
const auto subtensor = std::vector<size_t>{1};
const auto& layout = order->cast_vector<size_t>();

// todo: LoadReshape used here is essentially Load + an easy way to maintain correct shape propagation
// todo: LoadReorder used here is essentially Load + an easy way to maintain correct shape propagation
// fix this in future and develop a more consistent shape propagation approach.
auto load = std::make_shared<snippets::op::LoadReshape>(data_input, subtensor[0], 0, layout);
auto load = std::make_shared<snippets::op::LoadReorder>(data_input, subtensor[0], 0, layout);
auto store = std::make_shared<snippets::op::Store>(load, subtensor[0]);

PortDescriptorUtils::set_port_descriptor(load->input(0), subtensor, layout);
Expand Down
21 changes: 12 additions & 9 deletions src/common/snippets/src/runtime_configurator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,15 +120,18 @@ void RuntimeConfigurator::init_data_info(const lowered::LinearIRCPtr& linear_ir)
const auto& shape_infer_seq = utils::get_first_child_shape_infer_expr_seq(param);
ExpressionPtr mem_desc_expr = param;
if (!shape_infer_seq.empty()) {
// If there is Reorder, we should take its desc because it affects on shape by target order
const auto& reordered_reshape_it = std::find_if(shape_infer_seq.cbegin(), shape_infer_seq.cend(),
[](const ExpressionPtr& expr) {
return ov::is_type<op::Reorder>(expr->get_node());
});
if (reordered_reshape_it != shape_infer_seq.cend()) {
const auto& reshape = *reordered_reshape_it;
const auto& etype = reshape->get_node()->get_output_element_type(0);
update_io_parameters(reshape->get_input_port_descriptor(0), etype);
// [160048] Reorder, as any other ShapeInferOp, should just propagate input shape to output using target order
// without data movement. However, currently we have to save desc of input of the Reorder
// to support correct input data offsets calculations and MHAParallelWAOptimizer pass work.
// Please, remove this code part when the mentioned ticket is completed.
const auto& reorder_it = std::find_if(shape_infer_seq.cbegin(), shape_infer_seq.cend(),
[](const ExpressionPtr& expr) {
return ov::is_type<op::Reorder>(expr->get_node());
});
if (reorder_it != shape_infer_seq.cend()) {
const auto& reorder = *reorder_it;
const auto& etype = reorder->get_node()->get_output_element_type(0);
update_io_parameters(reorder->get_input_port_descriptor(0), etype);
continue;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry
// Note that Result has no output PortConnectors, so the shape must be empty
SHAPE_INFER_PREDEFINED(ov::op::v0::Result, EmptyShapeInfer),
//
SHAPE_INFER_OP_SPECIFIC(op::LoadReshape),
SHAPE_INFER_OP_SPECIFIC(op::LoadReorder),
SHAPE_INFER_OP_SPECIFIC(op::Reshape),
SHAPE_INFER_OP_SPECIFIC(op::Reorder),
SHAPE_INFER_OP_SPECIFIC(op::RankNormalization),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -299,15 +299,15 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsImpossible) {
*
* Param0(32,8,1)
* |
* LoadReshape with order (1,2,0)
* LoadReorder with order (1,2,0)
* |
* Store
* |
* Result
*/
{
auto param = linear_ir->push_node<ov::opset10::Parameter>(input_precision, input_shape_0);
auto load_reshape = linear_ir->push_node<ov::snippets::op::LoadReshape>(param.second, 1, 0, layout);
auto load_reshape = linear_ir->push_node<ov::snippets::op::LoadReorder>(param.second, 1, 0, layout);
auto store = linear_ir->push_node<ov::snippets::op::Store>(load_reshape.second, 1, 0);
init_expr_descriptors(*load_reshape.first, {subtensor, subtensor}, {order, layout});
init_expr_descriptors(*store.first, {subtensor, subtensor}, {layout, layout});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "snippets/lowered/loop_manager.hpp"
#include "snippets/utils/utils.hpp"

#ifndef OPENVINO_ARCH_ARM64
#ifdef OPENVINO_ARCH_X86_64
# include "transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp"
# include "transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp"
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig {

class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator {
public:
CPURuntimeConfigurator(ov::intel_cpu::MultiCacheWeakPtr cache = {});
CPURuntimeConfigurator(ov::intel_cpu::MultiCacheWeakPtr cache);

/**
* @brief Calculate Loop parameters of Loop emitters and update these values in CPURuntimeConfig
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ namespace intel_cpu {
struct RepackedInputKernel {
RepackedInputKernel() = default;
virtual ~RepackedInputKernel() = default;
virtual void operator()(const void* args) const = 0;
};

struct RepackedInput {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho
jitters[snippets::op::Reorder::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter);

jitters[snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter);
jitters[snippets::op::LoadReshape::get_type_info_static()] =
jitters[snippets::op::LoadReorder::get_type_info_static()] =
CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter);
jitters[snippets::op::BroadcastLoad::get_type_info_static()] =
CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_broadcast_emitter);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -170,9 +170,11 @@ status_t BrgemmCopyBKernel::create_kernel() {
return code;
}

void BrgemmCopyBKernel::operator()(const call_args* args) const {
void BrgemmCopyBKernel::operator()(const void* args) const {
const auto* call_args = reinterpret_cast<const BrgemmCopyBKernel::call_args*>(args);
OV_CPU_JIT_EMITTER_ASSERT(call_args, "Call arguments are nullptr!");
OV_CPU_JIT_EMITTER_ASSERT(ker_, "Kernel is nullptr");
ker_(args);
ker_(call_args);
}

void BrgemmCopyBKernel::init_brgemm_copy_b_kernel(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ struct BrgemmCopyBKernel : public RepackedInputKernel, public dnnl::impl::cpu::x

dnnl::impl::status_t create_kernel() override;

void operator()(const call_args* args) const;
void operator()(const void* args) const override;

private:
void generate() override;
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/src/extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ class TypeRelaxedExtension : public ov::OpExtension<ov::op::TypeRelaxed<Op>> {
OP_EXTENSION(ov::snippets::op::KernelStatic) \
OP_EXTENSION(ov::snippets::op::KernelDynamic) \
OP_EXTENSION(ov::snippets::op::Load) \
OP_EXTENSION(ov::snippets::op::LoadReshape) \
OP_EXTENSION(ov::snippets::op::LoadReorder) \
OP_EXTENSION(ov::snippets::op::LoopBegin) \
OP_EXTENSION(ov::snippets::op::LoopEnd) \
OP_EXTENSION(ov::snippets::op::Buffer) \
Expand Down
15 changes: 4 additions & 11 deletions src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,7 @@
//

#include "nodes/executors/subgraph.hpp"
#if defined(OPENVINO_ARCH_ARM64)
# include "emitters/snippets/aarch64/cpu_generator.hpp"
#else
# include "emitters/snippets/x64/cpu_generator.hpp"
#endif

#include "common/primitive_hashing_utils.hpp"
#include "openvino/core/parallel.hpp"

Expand Down Expand Up @@ -102,9 +98,8 @@ void SubgraphBaseExecutor::init_parallel_domain(const std::shared_ptr<CPURuntime
std::vector<size_t>& domain) {
init_parallel_domain(snippet_config->master_shape, snippet_config->tensor_rank, snippet_config->tile_rank, domain);
}
void SubgraphBaseExecutor::parallel_for6d(
const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller) {

void SubgraphBaseExecutor::parallel_for6d(const initializer_functor& initializer, const call_functor& caller) {
const auto& dom = m_parallel_exec_domain;

parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) {
Expand Down Expand Up @@ -142,9 +137,7 @@ void SubgraphBaseExecutor::parallel_for6d(
});
}

void SubgraphBaseExecutor::parallel_forNd(
const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller) {
void SubgraphBaseExecutor::parallel_forNd(const initializer_functor& initializer, const call_functor& caller) {
const auto& dom = m_parallel_exec_domain;

parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) {
Expand Down
14 changes: 5 additions & 9 deletions src/plugins/intel_cpu/src/nodes/executors/subgraph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,21 +66,17 @@ class SubgraphBaseExecutor {
protected:
virtual void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) = 0;

virtual void parallel_for6d(
const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller);
virtual void parallel_forNd(
const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller);
using initializer_functor = std::function<void(jit_snippets_call_args&, size_t)>;
using call_functor = std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>;

virtual void parallel_for6d(const initializer_functor& initializer, const call_functor& caller);
virtual void parallel_forNd(const initializer_functor& initializer, const call_functor& caller);

inline void update_scratchpad_ptr(void*& scratchpad_ptr, size_t ithr) const {
if (m_buffer_scratchpad_size > 0)
scratchpad_ptr = m_buffer_scratchpad->getDataAs<uint8_t>() + ithr * m_buffer_scratchpad_size;
}

using initializer_functor = std::function<void(jit_snippets_call_args&, size_t)>;
using call_functor = std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>;

std::shared_ptr<snippets::Schedule> m_schedule;
// Holds index of output used as in execution domain
// it should be compatible with a schedule's work size
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,13 @@ pass::EliminateBrgemmCopyB::EliminateBrgemmCopyB() {
transformation_callback(copy_b_node))
return false;

// If there is non-empty and non-planar layout, we should insert reshape to support shape inference
// If there is a non-planar layout, we should insert a reshape to support shape inference
if (!ov::snippets::utils::is_planar_layout(layout)) {
const auto& subtensor = in_desc->get_subtensor();
const auto& reshape = std::make_shared<ov::snippets::op::Reorder>(copy_b_node->input_value(0), layout);
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->input(0), subtensor, layout);
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->output(0), subtensor);
ov::replace_node_update_name(copy_b_node, reshape);
return true;
return ov::replace_node_update_name(copy_b_node, reshape);
}

// If there is no layout, we can just remove BrgemmCopyB from the subgraph
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,12 +103,12 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin
update_kernel(p.second, shape, layout, N, K, prc);
}

const auto L2_cache_size = dnnl::utils::get_cache_size(2, true);
const auto fit_into_L2 = data_size < L2_cache_size;
// Heuristic: If external repacking data doesn't fit in the cache L2,
const auto cache_size = dnnl::utils::get_cache_size(1, true) + dnnl::utils::get_cache_size(2, true);
const auto fit_into_cache = data_size < cache_size;
// Heuristic: If external repacking data doesn't fit in the caches L1 and L2,
// external repacking should be executed in a separate parallel section before kernel execution.
cpu_config->repacking_impl_type =
fit_into_L2 ? CPURuntimeConfig::RepackingImplType::IN_PARALLEL : CPURuntimeConfig::RepackingImplType::SEPARATE;
cpu_config->repacking_impl_type = fit_into_cache ? CPURuntimeConfig::RepackingImplType::IN_PARALLEL
: CPURuntimeConfig::RepackingImplType::SEPARATE;

const auto is_impl_parallel = cpu_config->repacking_impl_type == CPURuntimeConfig::RepackingImplType::IN_PARALLEL;

Expand All @@ -131,7 +131,7 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin
// Save original input offsets for input before repacking.
// If the shape has not been changed, it means that we already created `RepackedInput` for this input
// on previous pass call and now `cpu_config->io_data_offsets[i]` contains offsets not for original input -
// they were updated for blocked shapes/zeroed for previous initialization and we cannot use them as original
// they were updated for blocked shapes/zeroed for previous initialization and we cannot use them as original
// offsets.
const auto in_offsets =
shape == cpu_config->latest_shapes[i] ? repacked_in.in_offsets() : cpu_config->io_data_offsets[i];
Expand Down
Loading

0 comments on commit f104183

Please sign in to comment.