openvinotoolkit · ilya-lavrenov · Oct 12, 2024 · Sep 2, 2024 · Oct 11, 2024
diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp
@@ -73,7 +73,6 @@ EncodedResults greedy_decoding(
     bool all_are_eos = std::all_of(eos_met.begin(), eos_met.end(), [](int elem) { return elem == 1; });
     if (!generation_config.ignore_eos && all_are_eos)
         return results;
-
 
     for (size_t i = 0; i < max_new_tokens - 1; ++i) {
         if (position_ids.has_value())

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
@@ -82,12 +82,15 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
             core.set_property(core_plugin_config);
             auto model = core.read_model(model_path / "openvino_model.xml");
             m_adapter_controller = AdapterController(model, m_generation_config.adapters, "base_model.model.model.", device);   // TODO: Make the prefix name configurable
+            utils::slice_matmul_statefull_model(model);
             m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request();
             m_adapter_controller->apply(m_model_runner, m_generation_config.adapters);
         } else {
             auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(plugin_config);
             core.set_property(core_plugin_config);
-            m_model_runner = core.compile_model(model_path / "openvino_model.xml", device, compile_plugin_config).create_infer_request();
+            auto model = core.read_model(model_path / "openvino_model.xml");
+            utils::slice_matmul_statefull_model(model);
+            m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request();
         }
 
         // If eos_token_id was not provided, take value

diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp
@@ -5,6 +5,14 @@
 
 #include <fstream>
 
+#include "openvino/op/add.hpp"
+#include "openvino/op/divide.hpp"
+#include "openvino/op/multiply.hpp"
+#include "openvino/op/matmul.hpp"
+#include "openvino/op/slice.hpp"
+#include "openvino/op/tanh.hpp"
+#include "openvino/op/transpose.hpp"
+
 namespace ov {
 namespace genai {
 namespace utils {
@@ -225,6 +233,60 @@ ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::Token
 
     return {new_input_ids, new_attention_mask};
 }
+
+/**
+ * @brief Inserts a Slice before the MatMul that feeds the first model output so that only the
+ *        last element along axis 1 of its first input is multiplied.
+ *
+ * The MatMul is looked up starting from the producer of output 0. Recognized tails are:
+ *   - MatMul
+ *   - MatMul -> Add
+ *   - MatMul -> Transpose
+ *   - MatMul -> Divide -> Tanh -> Multiply
+ * In each case only input 0 of the intermediate node is followed. If no MatMul is found, or the
+ * rank of its first input is dynamic or not 3, the model is left untouched.
+ *
+ * NOTE(review): axis 1 is presumably the sequence dimension, which would make this compute logits
+ * for the last token only - confirm against the exported model layout.
+ *
+ * @param model Model to modify in place.
+ */
+void slice_matmul_statefull_model(std::shared_ptr<ov::Model> model) {
+    // Producer of the value consumed by the node behind output 0.
+    ov::Node* last_node = model->output(0).get_node()->input_value(0).get_node();
+
+    ov::Node* matmul = dynamic_cast<ov::op::v0::MatMul*>(last_node);
+    if (!matmul) {
+        if (auto add = dynamic_cast<ov::op::v1::Add*>(last_node)) {
+            matmul = dynamic_cast<ov::op::v0::MatMul*>(add->input_value(0).get_node());
+        } else if (auto transpose = dynamic_cast<ov::op::v1::Transpose*>(last_node)) {
+            matmul = dynamic_cast<ov::op::v0::MatMul*>(transpose->input_value(0).get_node());
+        } else if (auto multiply = dynamic_cast<ov::op::v1::Multiply*>(last_node)) {
+            if (auto tanh = dynamic_cast<ov::op::v0::Tanh*>(multiply->input_value(0).get_node())) {
+                if (auto divide = dynamic_cast<ov::op::v1::Divide*>(tanh->input_value(0).get_node())) {
+                    matmul = dynamic_cast<ov::op::v0::MatMul*>(divide->input_value(0).get_node());
+                }
+            }
+        }
+    }
+    if (!matmul) {
+        return;
+    }
+
+    // Rank::get_length() throws for a dynamic rank, so test is_dynamic() first and skip such models.
+    const auto rank = matmul->input(0).get_partial_shape().rank();
+    if (rank.is_dynamic() || rank.get_length() != 3) {
+        return;
+    }
+
+    // start = -1, stop = -2, step = -1 on axis 1 selects exactly the last element of that axis.
+    auto start = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-1});
+    auto stop = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-2});
+    auto step = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-1});
+    auto axis = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{1});
+    auto slice = std::make_shared<ov::op::v8::Slice>(matmul->input_value(0), start, stop, step, axis);
+    matmul->input(0).replace_source_output(slice);
+}
 }  // namespace utils
 }  // namespace genai
 }  // namespace ov
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
@@ -87,6 +87,8 @@ ProcessorConfig from_any_map(
 std::pair<ov::AnyMap, ov::AnyMap> split_core_complile_config(const ov::AnyMap& plugin_config);
 
 ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend);
+
+void slice_matmul_statefull_model(std::shared_ptr<ov::Model> model);  // In place: slices input 0 of the MatMul feeding output 0 to its last element along axis 1 (rank-3 inputs only)
 }  // namespace utils
 }  // namespace genai
 }  // namespace ov