Commit

fix problem with rnn layer
allnes committed Jan 2, 2024
1 parent b103378 commit d7e1613
Showing 3 changed files with 8 additions and 4 deletions.
@@ -24,8 +24,7 @@ void ACLScheduler::set_num_threads(unsigned int num_threads) {}
 
 void ACLScheduler::schedule_custom(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) {
     const Window & max_window = window;
-    const unsigned int num_iterations =
-        max_window.num_iterations(hints.split_dimension()) == 1 ? 1 : max_window.num_iterations_total();
+    const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
     const auto _num_threads = std::min(num_iterations, static_cast<unsigned int>(parallel_get_num_threads()));
 
     if (num_iterations < 1) {
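
Note: the scheduler change above drops the fallback to max_window.num_iterations_total() and sizes the thread team by the iteration count along the split dimension alone. A minimal sketch of the resulting clamping behaviour, with hypothetical stand-in functions replacing the ACL Window and Hints types (not the actual Arm Compute Library API):

    #include <algorithm>
    #include <iostream>

    // Hypothetical stand-ins: the real values come from
    // Window::num_iterations(split_dimension) and parallel_get_num_threads().
    unsigned int num_iterations_along_split_dim() { return 3; }
    int parallel_get_num_threads() { return 8; }

    int main() {
        const unsigned int num_iterations = num_iterations_along_split_dim();
        // Thread count is clamped so a kernel never gets more threads than
        // it has iterations to distribute along the split dimension.
        const auto num_threads =
            std::min(num_iterations, static_cast<unsigned int>(parallel_get_num_threads()));
        std::cout << "threads used: " << num_threads << "\n"; // prints 3, not 8
    }
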
3 changes: 3 additions & 0 deletions src/plugins/intel_cpu/src/nodes/rnn.cpp
@@ -514,6 +514,9 @@ void RNN::configurePortDataTypes() {
         // onednn doesn't have fp16 instance
         inDataTypes[xIdx] = outDataTypes[yIdx] = outDataTypes[hoIdx] = inDataTypes[hIdx] = memory::data_type::f32; // required by oneDNN.
 
+    if (cell_type == dnnl::algorithm::vanilla_augru && inDataTypes[aIdx] == memory::data_type::f16)
+        inDataTypes[aIdx] = memory::data_type::f32;
+
     if (outDataTypes[yIdx] == memory::data_type::bf16 && one_of(inDataTypes[xIdx], memory::data_type::s8, memory::data_type::u8))
         outDataTypes[yIdx] = memory::data_type::f32; // oneDNN does not support bf16 output precision for quantized rnn primitive yet
 }
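
Note: the new rnn.cpp check forces the attention-gate input of a vanilla AUGRU cell from f16 to f32, matching the comment above it that oneDNN has no fp16 RNN instance. A self-contained sketch of that promotion rule, using plain enums as stand-ins for memory::data_type and the cell algorithm (hypothetical names, not the oneDNN API):

    #include <cassert>

    // Hypothetical stand-ins for memory::data_type and dnnl::algorithm.
    enum class DataType { f16, f32 };
    enum class CellType { vanilla_augru, vanilla_lstm };

    // Mirror of the commit's rule: an f16 attention input to vanilla AUGRU
    // is promoted to f32 because oneDNN lacks an fp16 RNN primitive.
    DataType attention_input_type(CellType cell, DataType requested) {
        if (cell == CellType::vanilla_augru && requested == DataType::f16)
            return DataType::f32;
        return requested;
    }

    int main() {
        assert(attention_input_type(CellType::vanilla_augru, DataType::f16) == DataType::f32);
        assert(attention_input_type(CellType::vanilla_lstm, DataType::f16) == DataType::f16);
    }
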
6 changes: 4 additions & 2 deletions src/plugins/intel_cpu/src/nodes/tensoriterator.cpp
@@ -293,7 +293,8 @@ MemoryPtr DynamicBuffer::create_buffer(const dnnl::engine& eng) {
     const auto estimated_iters = estimate_iters();
     const Shape _shape = Shape({count, static_cast<size_t>(abs_stride * estimated_iters), len/elem_size});
     auto _descCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::ncsp);
-    auto new_buffer_desc = _descCreator->createSharedDesc(from->getDesc().getPrecision(), _shape);
+    auto prec = from->getDesc().getPrecision() == ov::element::f16 ? ov::element::f32 : from->getDesc().getPrecision();
+    auto new_buffer_desc = _descCreator->createSharedDesc(prec, _shape);
 
     auto _ptr = std::make_shared<Memory>(eng, new_buffer_desc);
     return _ptr;
@@ -744,7 +745,8 @@ void TensorIterator::reshapeSubgraphInput() {
         auto &to_mems = input_mems[map_rule.to];
         const auto& body_inshape = to_mems.front()->getShape();
         if (body_inshape.isDynamic() || body_inshape.getDims() != new_dims) {
-            const auto desc = std::make_shared<CpuBlockedMemoryDesc>(to_mems.front()->getDesc().getPrecision(), Shape(new_dims));
+            auto prec = to_mems.front()->getDesc().getPrecision() == ov::element::f16 ? ov::element::f32 : to_mems.front()->getDesc().getPrecision();
+            const auto desc = std::make_shared<CpuBlockedMemoryDesc>(prec, Shape(new_dims));
             redefineToMemories(to_mems, desc);
         }
     }
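
Note: both tensoriterator.cpp hunks apply the same fallback: if the incoming descriptor's precision is f16, the new buffer or memory descriptor is created as f32 instead. A compact sketch of the shared pattern, with a plain enum standing in for ov::element (hypothetical, for illustration only):

    #include <iostream>

    // Hypothetical stand-in for ov::element::Type.
    enum class Element { f16, f32, bf16, i8 };

    // The fallback used in both hunks: widen f16 to f32, pass everything
    // else through unchanged, before building a memory descriptor.
    Element buffer_precision(Element from) {
        return from == Element::f16 ? Element::f32 : from;
    }

    int main() {
        std::cout << (buffer_precision(Element::f16) == Element::f32) << "\n";  // 1
        std::cout << (buffer_precision(Element::bf16) == Element::bf16) << "\n"; // 1
    }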
