Skip to content

Commit

Permalink
Slice the last matmul in stateful LLM pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
olpipi committed Sep 19, 2024
1 parent 7b81bcb commit c726fe5
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 2 deletions.
1 change: 0 additions & 1 deletion src/cpp/src/greedy_decoding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ EncodedResults greedy_decoding(
bool all_are_eos = std::all_of(eos_met.begin(), eos_met.end(), [](int elem) { return elem == 1; });
if (!generation_config.ignore_eos && all_are_eos)
return results;


for (size_t i = 0; i < max_new_tokens - 1; ++i) {
if (position_ids.has_value())
Expand Down
23 changes: 22 additions & 1 deletion src/cpp/src/llm_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
#include "utils.hpp"
#include "text_callback_streamer.hpp"

#include "openvino/op/matmul.hpp"
#include "openvino/op/slice.hpp"

namespace {

ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& fisrt, const ov::genai::TokenizedInputs& second){
Expand Down Expand Up @@ -65,6 +68,22 @@ std::pair<EncodedResults, int32_t> beam_search(
);

class StatefulLLMPipeline final : public LLMPipelineImplBase {
private:
/// @brief Insert a Slice before the final MatMul so that, during generation,
/// logits are computed only for the last token of the prompt instead of for
/// every position — a pure latency optimization for stateful LLMs.
///
/// The transformation is applied only when it is provably safe:
///  - the node feeding output(0) is a v0::MatMul, and
///  - its first input has a static rank of 3 ([batch, seq_len, hidden]), and
///  - the sequence dimension is not already 1.
///
/// @param model  Model to patch in place before compilation.
void slice_matmul_statefull_model(std::shared_ptr<ov::Model> model) {
    auto last_node = model->output(0).get_node()->input_value(0).get_node();
    if (auto matmul = dynamic_cast<ov::op::v0::MatMul*>(last_node)) {
        auto shape = matmul->input(0).get_partial_shape();
        // rank().is_static() must be checked first: get_length() throws on a
        // dynamic rank. shape[1] may be a dynamic Dimension, in which case
        // `shape[1] != 1` holds and the slice is inserted (desired: the
        // sequence axis of an LLM prompt is typically dynamic).
        if (shape.rank().is_static() && shape.rank().get_length() == 3 && shape[1] != 1) {
            // Take exactly the last element along axis 1: start=-1, stop=-2, step=-1.
            auto start = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-1});
            auto stop = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-2});
            auto step = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-1});
            auto axis = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{1});
            auto slice = std::make_shared<ov::op::v8::Slice>(matmul->input_value(0), start, stop, step, axis);
            matmul->input(0).replace_source_output(slice);
            // Re-run shape inference so downstream nodes see the new [batch, 1, hidden] input.
            model->validate_nodes_and_infer_types();
        }
    }
}

public:
ov::InferRequest m_model_runner;

Expand Down Expand Up @@ -94,7 +113,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
{
ov::Core core;
core.set_property(device, plugin_config);
m_model_runner = core.compile_model(model_path / "openvino_model.xml", device).create_infer_request();
auto model = core.read_model(model_path / "openvino_model.xml");
slice_matmul_statefull_model(model);
m_model_runner = core.compile_model(model, device).create_infer_request();

// If eos_token_id was not provided, take value
if (m_generation_config.eos_token_id == -1)
Expand Down

0 comments on commit c726fe5

Please sign in to comment.