From 3bc91631e726eb67b494927e306a33f192c7df61 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Wed, 2 Oct 2024 18:50:49 +0400 Subject: [PATCH 01/17] Add model type to vlm config --- src/cpp/include/openvino/genai/vlm_config.hpp | 2 ++ src/cpp/src/vlm_config.cpp | 1 + 2 files changed, 3 insertions(+) diff --git a/src/cpp/include/openvino/genai/vlm_config.hpp b/src/cpp/include/openvino/genai/vlm_config.hpp index dd22e422bf..02d0f7c36a 100644 --- a/src/cpp/include/openvino/genai/vlm_config.hpp +++ b/src/cpp/include/openvino/genai/vlm_config.hpp @@ -12,6 +12,8 @@ namespace ov::genai { /// change VLMPipeline's behavior. Corresponds to config.json. class OPENVINO_GENAI_EXPORTS VLMConfig { public: + /// @brief A string denoting model type. + std::string model_type = ""; /// @brief A size of a single embedding returned by a resampler. /// Used to initialize positional embeddings for resampler input. size_t hidden_size = 2304; diff --git a/src/cpp/src/vlm_config.cpp b/src/cpp/src/vlm_config.cpp index 36d997ecbe..a13a0da702 100644 --- a/src/cpp/src/vlm_config.cpp +++ b/src/cpp/src/vlm_config.cpp @@ -10,6 +10,7 @@ ov::genai::VLMConfig::VLMConfig(const std::filesystem::path& json_path) { OPENVINO_ASSERT(stream.is_open(), "Failed to open '" + json_path.string() + "' with processor config"); nlohmann::json parsed = nlohmann::json::parse(stream); using ov::genai::utils::read_json_param; + read_json_param(parsed, "model_type", model_type); // TODO Consider checking supported model type here instead of VisionEncoder constructor read_json_param(parsed, "hidden_size", hidden_size); read_json_param(parsed, "scale_emb", scale_emb); read_json_param(parsed, "query_num", query_num); From 870128869813e2818a7fed2142d2bae86276b425 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Wed, 2 Oct 2024 19:49:09 +0400 Subject: [PATCH 02/17] Add llava specific config params to processor config --- .../openvino/genai/processor_config.hpp | 8 ++++++++ src/cpp/src/processor_config.cpp | 18 +++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/cpp/include/openvino/genai/processor_config.hpp b/src/cpp/include/openvino/genai/processor_config.hpp index 9a70d1f3ae..b42f08db40 100644 --- a/src/cpp/include/openvino/genai/processor_config.hpp +++ b/src/cpp/include/openvino/genai/processor_config.hpp @@ -33,6 +33,14 @@ class OPENVINO_GENAI_EXPORTS ProcessorConfig { /// Applied after norm_mean. /// llava calls it image_std. std::array norm_std{1.0f, 1.0f, 1.0f}; + + // llava specific config params + std::array image_mean{0.0f, 0.0f, 0.0f}; + std::array image_std{1.0f, 1.0f, 1.0f}; + size_t crop_size_height = 336; + size_t crop_size_width = 336; + size_t size_shortest_edge = 336; + /// @brief Default constructor ProcessorConfig() = default; /// @brief Construct ProcessorConfig from values in json_path. 
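The new ProcessorConfig fields above mirror keys of a llava `preprocessor_config.json`, where `crop_size` and `size` are nested objects and `image_mean`/`image_std` are three-element arrays; the processor_config.cpp hunk that follows reads them accordingly. A standalone sketch of that nested-key parsing — the literal values below are typical of a llava-1.5 export and are illustrative only, not taken from this PR:

```cpp
// Standalone sketch of the nested-key parsing performed by the
// processor_config.cpp hunk below. The embedded JSON mimics a typical
// llava-1.5 preprocessor_config.json; the values are illustrative.
#include <array>
#include <cstdio>
#include <nlohmann/json.hpp>

int main() {
    const char* json_text = R"({
        "crop_size": {"height": 336, "width": 336},
        "size": {"shortest_edge": 336},
        "image_mean": [0.48145466, 0.4578275, 0.40821073],
        "image_std": [0.26862954, 0.26130258, 0.27577711]
    })";
    nlohmann::json parsed = nlohmann::json::parse(json_text);

    size_t crop_height = parsed.at("crop_size").at("height");
    size_t crop_width = parsed.at("crop_size").at("width");
    size_t shortest_edge = parsed.at("size").at("shortest_edge");
    auto image_mean = parsed.at("image_mean").get<std::array<float, 3>>();

    std::printf("crop %zux%zu, shortest edge %zu, mean[0] = %.4f\n",
                crop_height, crop_width, shortest_edge, image_mean[0]);
    return 0;
}
```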
diff --git a/src/cpp/src/processor_config.cpp b/src/cpp/src/processor_config.cpp index 33673f7e79..cea7f98fd4 100644 --- a/src/cpp/src/processor_config.cpp +++ b/src/cpp/src/processor_config.cpp @@ -10,7 +10,7 @@ ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_pa OPENVINO_ASSERT(stream.is_open(), "Failed to open '" + json_path.string() + "' with processor config"); nlohmann::json parsed = nlohmann::json::parse(stream); using ov::genai::utils::read_json_param; - read_json_param(parsed, "patch_size", patch_size); + read_json_param(parsed, "patch_size", patch_size); // For llava - stored in config.json vision_config read_json_param(parsed, "scale_resolution", scale_resolution); read_json_param(parsed, "max_slice_nums", max_slice_nums); if (parsed.contains("norm_mean")) { @@ -19,4 +19,20 @@ ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_pa if (parsed.contains("norm_std")) { norm_std = parsed.at("norm_std").get>(); } + + // Setting llava config params + if (parsed.contains("image_mean")) { + image_mean = parsed.at("image_mean").get>(); + } + if (parsed.contains("image_std")) { + image_std = parsed.at("image_std").get>(); + } + + if (parsed.contains("crop_size")) { + crop_size_height = parsed.at("crop_size").at("height"); + crop_size_width = parsed.at("crop_size").at("width"); + } + if (parsed.contains("size")) { + size_shortest_edge = parsed.at("size").at("shortest_edge"); + } } From 5b4f1455a477047e9f98f9335062147aa33bf747 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Wed, 2 Oct 2024 20:25:39 +0400 Subject: [PATCH 03/17] Add model type to vision encoder, separate encode methods for llava and minicpm --- .../include/openvino/genai/vision_encoder.hpp | 17 ++- src/cpp/src/vision_encoder.cpp | 114 ++++++++++++++++-- 2 files changed, 118 insertions(+), 13 deletions(-) diff --git a/src/cpp/include/openvino/genai/vision_encoder.hpp b/src/cpp/include/openvino/genai/vision_encoder.hpp index 7370b7f8aa..3fe80bf24f 100644 --- a/src/cpp/include/openvino/genai/vision_encoder.hpp +++ b/src/cpp/include/openvino/genai/vision_encoder.hpp @@ -41,11 +41,16 @@ struct EncodedImage { /// ov::InferRequest and configured by ProcessorConfig. class OPENVINO_GENAI_EXPORTS VisionEncoder { public: + /// @brief A string denoting model type. + std::string model_type; /// @brief A model for image encoding. ov::InferRequest m_encoder; /// @brief A config to follow. ProcessorConfig m_processor_config; + // LLaVa specific members + ov::InferRequest m_vision_embeddings; + /// @brief Construct from an already compiled model and a config. /// @param encoder Compiled model. /// @param processor_config Initial config. @@ -65,7 +70,8 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder { const std::filesystem::path& model_dir, const std::string& device="CPU", const ov::AnyMap device_config={}, - ov::Core core=ov::Core{} + ov::Core core=ov::Core{}, + std::string model_type="" ); /// @brief Compute embeddings of an image. 
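The llava image preprocessing introduced by this patch (`preprocess_image_llava` in vision_encoder.cpp, further below) first scales the image so that its shortest side matches `size_shortest_edge`, then takes a centered `crop_size_width` x `crop_size_height` window before normalizing. A small standalone sketch of that geometry, using a made-up 672x448 input and the 336 defaults added in the previous patch:

```cpp
// Geometry of the shortest-edge resize followed by a center crop, as in
// preprocess_image_llava below. The 672x448 input size is a made-up example;
// 336 matches the defaults added to ProcessorConfig in the previous patch.
#include <algorithm>
#include <cstdio>

int main() {
    int nx = 672, ny = 448;            // input width x height
    int shortest_edge = 336;           // ProcessorConfig::size_shortest_edge
    int crop_w = 336, crop_h = 336;    // ProcessorConfig::crop_size_{width,height}

    float scale = static_cast<float>(shortest_edge) / std::min(nx, ny);  // 336/448 = 0.75
    int new_w = static_cast<int>(nx * scale);  // 504
    int new_h = static_cast<int>(ny * scale);  // 336

    int start_x = (new_w - crop_w) / 2;  // 84
    int start_y = (new_h - crop_h) / 2;  // 0

    std::printf("resized to %dx%d, crop window starts at (%d, %d)\n",
                new_w, new_h, start_x, start_y);
    return 0;
}
```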
@@ -117,5 +123,14 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder { image, AnyMap{std::forward(properties)...} ); } + +private: + EncodedImage encode_minicpm( + const ov::Tensor& image, const ProcessorConfig& config + ); + + EncodedImage encode_llava( + const ov::Tensor& image, const ProcessorConfig& config + ); }; } diff --git a/src/cpp/src/vision_encoder.cpp b/src/cpp/src/vision_encoder.cpp index a35a5d8db7..f513b433d3 100644 --- a/src/cpp/src/vision_encoder.cpp +++ b/src/cpp/src/vision_encoder.cpp @@ -291,23 +291,94 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o } return {resized_source, resized_source_size, encoded_slices, sliced_sizes}; } + + +ov::Tensor preprocess_image_llava(const ov::Tensor& image, const ProcessorConfig& config) { + bool do_resize = true; + bool do_center_crop = true; + + // ov::Tensor to clip_image_u8 + clip_image_u8 input_image{ + int(image.get_shape().at(3)), + int(image.get_shape().at(2)), + {image.data(), image.data() + image.get_size()} + }; + + // Resize + clip_image_u8 resized_image; + if (do_resize) { + int target_size = config.size_shortest_edge; + float scale = static_cast(target_size) / std::min(input_image.nx, input_image.ny); + int new_width = static_cast(input_image.nx * scale); + int new_height = static_cast(input_image.ny * scale); + bicubic_resize(input_image, resized_image, new_width, new_height); + } else { + resized_image = input_image; + } + + // Center crop + clip_image_u8 cropped_image; + if (do_center_crop) { + int crop_height = config.crop_size_height; + int crop_width = config.crop_size_width; + int start_x = (resized_image.nx - crop_width) / 2; + int start_y = (resized_image.ny - crop_height) / 2; + + cropped_image.nx = crop_width; + cropped_image.ny = crop_height; + cropped_image.buf.resize(3 * crop_width * crop_height); + + for (int y = 0; y < crop_height; ++y) { + for (int x = 0; x < crop_width; ++x) { + for (int c = 0; c < 3; ++c) { + cropped_image.buf[(y * crop_width + x) * 3 + c] = + resized_image.buf[((start_y + y) * resized_image.nx + (start_x + x)) * 3 + c]; + } + } + } + } else { + cropped_image = resized_image; + } + + // Normalize + clip_ctx ctx; + std::copy(config.image_mean.begin(), config.image_mean.end(), ctx.image_mean); + std::copy(config.image_std.begin(), config.image_std.end(), ctx.image_std); + + clip_image_f32 normalized_image = clip_image_preprocess(ctx, cropped_image); + + // Convert clip_image_f32 to ov::Tensor + ov::Tensor result( + ov::element::f32, + {1, 3, size_t(normalized_image.ny), size_t(normalized_image.nx)}, + (void*)(normalized_image.buf.data()) + ); + + return result; +} } -VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const std::string& device, const ov::AnyMap device_config, ov::Core core) : - VisionEncoder{ - core.compile_model( - model_dir / "image_encoder.xml", device, device_config - ).create_infer_request(), - ov::genai::utils::from_config_json_if_exists( +VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const std::string& device, const ov::AnyMap device_config, ov::Core core, std::string model_type) : + model_type(model_type) { + if (model_type == "minicpmv") { + m_encoder = core.compile_model(model_dir / "image_encoder.xml", device, device_config).create_infer_request(); + } else if (model_type == "llava") { + // Vision embeddings model is merged with multi modal projector at model export stage by optimum-intel + m_vision_embeddings = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, 
device_config).create_infer_request(); + } else { + OPENVINO_THROW("Unsupported model type: " + model_type); + } + m_processor_config = ov::genai::utils::from_config_json_if_exists( model_dir, "preprocessor_config.json" - ) - } {} + ); +} EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ProcessorConfig& config) { - clip_ctx ctx_clip; - std::copy(config.norm_mean.begin(), config.norm_mean.end(), ctx_clip.image_mean); - std::copy(config.norm_std.begin(), config.norm_std.end(), ctx_clip.image_std); - return llava_image_embed_make_with_bytes_slice(ctx_clip, image, m_encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums); + if (model_type == "minicpmv") { + return encode_minicpm(image, config); + } else if (model_type == "llava") { + return encode_llava(image, config); + } } EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ov::AnyMap& config_map) { @@ -315,3 +386,22 @@ EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ov::AnyMap& co config_map, m_processor_config )); } + +EncodedImage VisionEncoder::encode_minicpm(const ov::Tensor& image, const ProcessorConfig& config) { + clip_ctx ctx_clip; + std::copy(config.norm_mean.begin(), config.norm_mean.end(), ctx_clip.image_mean); + std::copy(config.norm_std.begin(), config.norm_std.end(), ctx_clip.image_std); + return llava_image_embed_make_with_bytes_slice(ctx_clip, image, m_encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums); +} + +EncodedImage VisionEncoder::encode_llava(const ov::Tensor& image, const ProcessorConfig& config) { + ov::Tensor preprocessed_image = preprocess_image_llava(image, config); + + m_vision_embeddings.set_tensor("pixel_values", preprocessed_image); + m_vision_embeddings.infer(); + + ov::Tensor image_features = m_vision_embeddings.get_output_tensor(); + HeightWidth resized_source_size{config.crop_size_height / config.patch_size, config.crop_size_width / config.patch_size}; + + return {image_features, resized_source_size}; +} From be3fab4233ed2b1dac6a8e4b2e34a5cc18474699 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Wed, 2 Oct 2024 20:28:13 +0400 Subject: [PATCH 04/17] Enable llava model in vlm pipeline, separate preparing inputs embeds for llava and minicpm --- .../include/openvino/genai/vlm_pipeline.hpp | 3 + src/cpp/src/vlm_pipeline.cpp | 91 ++++++++++++++++--- 2 files changed, 81 insertions(+), 13 deletions(-) diff --git a/src/cpp/include/openvino/genai/vlm_pipeline.hpp b/src/cpp/include/openvino/genai/vlm_pipeline.hpp index 85ea9dd661..44754d4b6c 100644 --- a/src/cpp/include/openvino/genai/vlm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/vlm_pipeline.hpp @@ -162,6 +162,9 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { private: class VLMPipelineImpl; std::unique_ptr m_pimpl; + + ov::Tensor get_inputs_embeds_minicpm(const std::string& prompt, const std::vector& images); + ov::Tensor get_inputs_embeds_llava(const std::string& prompt, const std::vector& images); }; /* diff --git a/src/cpp/src/vlm_pipeline.cpp b/src/cpp/src/vlm_pipeline.cpp index 89eb535aa7..f640c9814e 100644 --- a/src/cpp/src/vlm_pipeline.cpp +++ b/src/cpp/src/vlm_pipeline.cpp @@ -312,20 +312,33 @@ VLMPipeline::VLMPipeline( ) }, m_tokenizer{tokenizer}, - m_vision_encoder(model_dir, device, device_config, core), - m_resampler{core.compile_model( - model_dir / "resampler.xml", device, device_config - ).create_infer_request()}, - m_embedding{core.compile_model( - model_dir / "embed_tokens.xml", device, 
device_config - ).create_infer_request()}, - m_language{core.compile_model( - model_dir / "language_model.xml", device, device_config - ).create_infer_request()}, - m_pos_embed_cache{ - get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}) - }, + m_vision_encoder(model_dir, device, device_config, core, m_vlm_config.model_type), m_is_chat_conversation{false} { + if (m_vlm_config.model_type == "minicpmv") { + m_resampler = core.compile_model( + model_dir / "resampler.xml", device, device_config + ).create_infer_request(); + + m_embedding = core.compile_model( + model_dir / "embed_tokens.xml", device, device_config + ).create_infer_request(); + + m_language = core.compile_model( + model_dir / "language_model.xml", device, device_config + ).create_infer_request(); + + m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); + } else if (m_vlm_config.model_type == "llava") { + m_language = core.compile_model( + model_dir / "openvino_language_model.xml", device, device_config + ).create_infer_request(); + + // Reusing the same m_embedding for llava text_embeddings model + m_embedding = core.compile_model( + model_dir / "openvino_text_embeddings_model.xml", device, device_config + ).create_infer_request(); + } + m_language.get_tensor("attention_mask").set_shape({1, 0}); } @@ -448,12 +461,21 @@ DecodedResults VLMPipeline::generate( } } } + + // if (m_vlm_config.model_type == "minicpmv") { + // inputs_embeds = get_inputs_embeds_minicpm(prompt, images); + // } else if (m_vlm_config.model_type == "llava") { + // inputs_embeds = get_inputs_embeds_llava(prompt, images); + // } + m_language.set_tensor("inputs_embeds", inputs_embeds); size_t history_len = m_language.get_tensor("attention_mask").get_shape().at(1); m_language.get_tensor("attention_mask").set_shape({1, history_len + inputs_embeds.get_shape()[1]}); std::fill_n(m_language.get_tensor("attention_mask").data(), m_language.get_tensor("attention_mask").get_size(), 1); + m_language.get_tensor("position_ids").set_shape({1, inputs_embeds.get_shape().at(1)}); std::iota(m_language.get_tensor("position_ids").data(), m_language.get_tensor("position_ids").data() + m_language.get_tensor("position_ids").get_size(), history_len); + m_language.get_tensor("beam_idx").set_shape({ BATCH_SIZE }); m_language.get_tensor("beam_idx").data()[0] = 0; @@ -586,3 +608,46 @@ GenerationConfig VLMPipeline::get_generation_config() const { void VLMPipeline::set_generation_config(const GenerationConfig& new_config) { m_generation_config = new_config; } + +ov::Tensor VLMPipeline::get_inputs_embeds_llava(const std::string& prompt, const std::vector& images) { + std::string image_token = ""; // TODO Consider getting from vlm_config or json + std::string formatted_prompt = "USER: " + (images.empty() ? 
prompt : image_token + "\n" + prompt) + " ASSISTANT:"; + ov::Tensor input_ids = m_tokenizer.encode(formatted_prompt).input_ids; + if (images.empty()) { + return process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb); + } else { + OPENVINO_ASSERT(1 == images.size(), "Only a single image allowed"); + EncodedImage encoded_image = m_vision_encoder.encode(images.at(0)); + ov::Tensor image_embeds = encoded_image.resized_source; + + ov::Tensor text_embeds = process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb); + + int64_t image_token_index = 32000; // TODO Consider getting from m_vlm_config.image_token_index or config.json + + return merge_text_and_image_embeddings(input_ids, text_embeds, image_embeds, image_token_index); + } +} + +ov::Tensor VLMPipeline::get_inputs_embeds_minicpm(const std::string& prompt, const std::vector& images) { + std::string wrapped = images.empty() ? + "<用户>" + prompt + "" : prompt + ""; + ov::Tensor input_ids = m_tokenizer.encode(wrapped).input_ids; + + if (images.empty()) { + //<用户> + prompt + LLM first input + return process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb); + } else { + OPENVINO_ASSERT(1 == images.size(), "Only a single image allowed"); + EncodedImage embeds = m_vision_encoder.encode(images.at(0)); + ov::Tensor imgEmbedTensor = get_image_embedding(embeds, m_tokenizer, m_embedding, *this); + + ov::Shape img_embed_shape = imgEmbedTensor.get_shape(); + OPENVINO_ASSERT( + m_vlm_config.hidden_size == img_embed_shape.at(2), + "Unexpected embedding size"); + + //<用户> + image embedding + prompt + LLM first input + ov::Tensor prompt_tensor = process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb); + return concatenate_mid_dim(imgEmbedTensor, prompt_tensor); + } +} From 2ec5ef8bc381edda1eb81bfaaefdae97f2155d0a Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 4 Oct 2024 15:46:38 +0400 Subject: [PATCH 05/17] Add test for vlm sample with llava model --- .github/workflows/causal_lm_cpp.yml | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 3feb7c8563..bbccf0d22d 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -681,7 +681,7 @@ jobs: diff pred2.txt ref.txt echo "Chat sample python" passed - py-vlm_chat_sample-ubuntu: + cpp-vlm_chat_sample-ubuntu: runs-on: ubuntu-22.04-16-cores steps: - uses: actions/checkout@v4 @@ -700,17 +700,31 @@ jobs: source ./ov/setupvars.sh cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release --target visual_language_chat -j - - name: Download and convert a model and an image + - name: Download and convert MiniCPM-V-2_6 model and an image run: | source ./ov/setupvars.sh python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python ./samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py ./miniCPM-V-2_6/ wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 - - run: > + - name: Run visual_language_chat sample - MiniCPM-V-2_6 + run: > source ./ov/setupvars.sh && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ d5fbbd1a-d484-415c-88cb-9986625b7b11 <<< $'What is on 
the image?\nWhat is special on the image?' + - name: Download and convert LLaVa 1.5 model and an image + run: | + source ./ov/setupvars.sh + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install git+https://github.com/huggingface/optimum-intel.git + optimum-cli export openvino --model llava-hf/llava-1.5-7b-hf ./llava_1_5_7b_ov/ + wget https://llava-vl.github.io/static/images/monalisa.jpg + - name: Run visual_language_chat sample - LLaVa 1.5 + run: > + source ./ov/setupvars.sh + && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./llava_1_5_7b_ov/ monalisa.jpg + <<< $'Who drew this painting?\nWhen did the painter live?' cpp-continuous-batching-ubuntu: runs-on: ubuntu-20.04-8-cores From 49447b93a7b8467f31f1cea621fad237e7254694 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Tue, 8 Oct 2024 20:42:46 +0400 Subject: [PATCH 06/17] Restore function for merging text and image embeds for llava --- src/cpp/src/vlm_pipeline.cpp | 50 +++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/src/cpp/src/vlm_pipeline.cpp b/src/cpp/src/vlm_pipeline.cpp index f640c9814e..23f16d1602 100644 --- a/src/cpp/src/vlm_pipeline.cpp +++ b/src/cpp/src/vlm_pipeline.cpp @@ -294,6 +294,54 @@ ov::Tensor resample(VLMPipeline& pipe, const ov::Tensor& encoded_image, const st pipe.m_resampler.infer(); return pipe.m_resampler.get_output_tensor(); // [N, query_num, new_hidden_size] } + +ov::Tensor merge_text_and_image_embeddings_llava( + const ov::Tensor& input_ids, + const ov::Tensor& text_embeds, + const ov::Tensor& image_embeds, + int64_t image_token_index +) { + auto text_embeds_shape = text_embeds.get_shape(); + auto image_embeds_shape = image_embeds.get_shape(); + + OPENVINO_ASSERT( + text_embeds_shape[2] == image_embeds_shape[2], + "Incompatible shapes between text_embeds and image_embeds" + ); + + size_t text_embeds_seq_length = text_embeds_shape[1]; + size_t hidden_size = text_embeds_shape[2]; + size_t image_embeds_seq_length = image_embeds_shape[1]; + + size_t merged_seq_length = text_embeds_seq_length + (image_embeds_seq_length - 1); + + ov::Tensor merged_embeds(text_embeds.get_element_type(), {BATCH_SIZE, merged_seq_length, hidden_size}); + + const int64_t* input_ids_data = input_ids.data(); + const float* text_embeds_data = text_embeds.data(); + const float* image_embeds_data = image_embeds.data(); + float* merged_data = merged_embeds.data(); + + + size_t merged_idx = 0; + for (size_t s = 0; s < text_embeds_seq_length; ++s) { + if (input_ids_data[s] == image_token_index) { + for (size_t i = 0; i < image_embeds_seq_length; ++i) { + std::copy_n(image_embeds_data + i * hidden_size, + hidden_size, + merged_data + merged_idx * hidden_size); + merged_idx++; + } + } else { + std::copy_n(text_embeds_data + s * hidden_size, + hidden_size, + merged_data + merged_idx * hidden_size); + merged_idx++; + } + } + + return merged_embeds; +} } class ov::genai::VLMPipeline::VLMPipelineImpl { @@ -624,7 +672,7 @@ ov::Tensor VLMPipeline::get_inputs_embeds_llava(const std::string& prompt, const int64_t image_token_index = 32000; // TODO Consider getting from m_vlm_config.image_token_index or config.json - return merge_text_and_image_embeddings(input_ids, text_embeds, image_embeds, 
image_token_index); + return merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, image_token_index); } } From 790981fdfee00a1585fd9f4cdc8bfd525da023a1 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Tue, 8 Oct 2024 21:01:37 +0400 Subject: [PATCH 07/17] Move getting input embeds for minicpm to separate method --- src/cpp/src/vlm_pipeline.cpp | 253 ++++++++++++++++------------------- 1 file changed, 117 insertions(+), 136 deletions(-) diff --git a/src/cpp/src/vlm_pipeline.cpp b/src/cpp/src/vlm_pipeline.cpp index 23f16d1602..4b1d13e984 100644 --- a/src/cpp/src/vlm_pipeline.cpp +++ b/src/cpp/src/vlm_pipeline.cpp @@ -398,123 +398,12 @@ DecodedResults VLMPipeline::generate( const GenerationConfig& generation_config, const StreamerVariant& streamer ) { - std::string images_prompt; - EncodedImage embeds; - if (!rgbs.empty()) { - OPENVINO_ASSERT(1 == rgbs.size(), "TODO: Only a single image allowed"); - embeds = m_vision_encoder.encode(rgbs.at(0)); - if (m_vlm_config.use_image_id) { - images_prompt = m_vlm_config.im_id_start + std::to_string(image_id) + m_vlm_config.im_id_end; - ++image_id; - } - std::string unk64; - for (size_t idx = 0; idx < m_vlm_config.query_num; ++idx) { - unk64 += m_vlm_config.unk; - } - images_prompt += m_vlm_config.im_start + unk64 + m_vlm_config.im_end; - if (embeds.slices) { - ov::Shape slices_shape = embeds.slices.get_shape(); - for (size_t row_idx = 0; row_idx < slices_shape.at(0); ++row_idx) { - for (size_t col_idx = 0; col_idx < slices_shape.at(1); ++col_idx) { - images_prompt += m_vlm_config.slice_start + unk64 + m_vlm_config.slice_end; - } - images_prompt += '\n'; - } - } - if ('\n' != *(images_prompt.end() - 1)) { - // Image wasn't sliced, add \n to the end of image anyway. - // Strangely, \n isn't placed between . - images_prompt += '\n'; - } - } - images_prompt += prompt; - std::string new_templated_chat_history; - if (m_is_chat_conversation) { - // KV cache in model already contains prompts and answers from previous iterations. - // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns - // token_ids = {, ...}. So if tokenizer applies only to the new prompt, - // will be inserted on every iteration. - // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt - // and takes only the difference between them. - // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but - // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. - m_history.push_back({{"role", "user"}, {"content", images_prompt}}); - constexpr bool add_generation_prompt = true; - new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + ov::Tensor inputs_embeds; + if (m_vlm_config.model_type == "minicpmv") { + inputs_embeds = get_inputs_embeds_minicpm(prompt, rgbs); + } else if (m_vlm_config.model_type == "llava") { + inputs_embeds = get_inputs_embeds_llava(prompt, rgbs); } - ov::Tensor special_tokens = m_tokenizer.encode( - m_vlm_config.im_start - + m_vlm_config.im_end - + m_vlm_config.slice_start - + m_vlm_config.slice_end - ).input_ids; - OPENVINO_ASSERT( - 4 == special_tokens.get_shape().at(1), - "Every special token must be represented with a single int." 
- ); - size_t im_start_id = special_tokens.data()[0]; - size_t im_end_id = special_tokens.data()[1]; - size_t slice_start_id = special_tokens.data()[2]; - size_t slice_end_id = special_tokens.data()[3]; - ov::Tensor input_ids = m_tokenizer.encode(new_templated_chat_history).input_ids; - m_embedding.set_input_tensor(input_ids); - m_embedding.infer(); - ov::Tensor inputs_embeds = m_embedding.get_output_tensor(); - OPENVINO_ASSERT( - m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), - "Unexpected embedding size" - ); - if (!rgbs.empty()) { - int64_t* ids = input_ids.data(); - const ov::Tensor& resampled_source = resample(*this, embeds.resized_source, {embeds.resized_source_size}); - float* emb = resampled_source.data(); - bool replacing = false; - for (size_t token_idx = 0; token_idx < inputs_embeds.get_shape().at(1); ++token_idx) { - if (im_start_id == ids[token_idx]) { - replacing = true; - } - if (replacing) { - std::copy_n(emb, resampled_source.get_size(), inputs_embeds.data() + token_idx * m_vlm_config.hidden_size); - token_idx += resampled_source.get_shape().at(1); - replacing = false; - break; - } - } - if (embeds.slices) { - size_t token_idx = 0; - const ov::Shape& slices_shape = embeds.slices.get_shape(); - const std::vector& sliced_sizes = embeds.slices_sizes; - for (size_t i = 0; i < slices_shape.at(0); ++i) { - for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { - size_t d2 = slices_shape.at(2); - size_t d3 = slices_shape.at(3); - ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, embeds.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; - const ov::Tensor& vision_embed_tensor_i_j = resample(*this, encoded_view, {sliced_sizes.at(i * slices_shape.at(1) + ja)}); - for (; token_idx < inputs_embeds.get_shape().at(1); ++token_idx) { - if (slice_start_id == ids[token_idx]) { - replacing = true; - } - if (slice_end_id == ids[token_idx]) { - replacing = false; - break; - } - if (replacing) { - std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds.data() + token_idx * m_vlm_config.hidden_size); - token_idx += vision_embed_tensor_i_j.get_shape().at(1); - replacing = false; - break; - } - } - } - } - } - } - - // if (m_vlm_config.model_type == "minicpmv") { - // inputs_embeds = get_inputs_embeds_minicpm(prompt, images); - // } else if (m_vlm_config.model_type == "llava") { - // inputs_embeds = get_inputs_embeds_llava(prompt, images); - // } m_language.set_tensor("inputs_embeds", inputs_embeds); size_t history_len = m_language.get_tensor("attention_mask").get_shape().at(1); @@ -677,25 +566,117 @@ ov::Tensor VLMPipeline::get_inputs_embeds_llava(const std::string& prompt, const } ov::Tensor VLMPipeline::get_inputs_embeds_minicpm(const std::string& prompt, const std::vector& images) { - std::string wrapped = images.empty() ? 
- "<用户>" + prompt + "" : prompt + ""; - ov::Tensor input_ids = m_tokenizer.encode(wrapped).input_ids; - - if (images.empty()) { - //<用户> + prompt + LLM first input - return process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb); - } else { - OPENVINO_ASSERT(1 == images.size(), "Only a single image allowed"); - EncodedImage embeds = m_vision_encoder.encode(images.at(0)); - ov::Tensor imgEmbedTensor = get_image_embedding(embeds, m_tokenizer, m_embedding, *this); - - ov::Shape img_embed_shape = imgEmbedTensor.get_shape(); - OPENVINO_ASSERT( - m_vlm_config.hidden_size == img_embed_shape.at(2), - "Unexpected embedding size"); - - //<用户> + image embedding + prompt + LLM first input - ov::Tensor prompt_tensor = process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb); - return concatenate_mid_dim(imgEmbedTensor, prompt_tensor); + std::string images_prompt; + EncodedImage embeds; + if (!images.empty()) { + OPENVINO_ASSERT(1 == images.size(), "TODO: Only a single image allowed"); + embeds = m_vision_encoder.encode(images.at(0)); + if (m_vlm_config.use_image_id) { + images_prompt = m_vlm_config.im_id_start + std::to_string(image_id) + m_vlm_config.im_id_end; + ++image_id; + } + std::string unk64; + for (size_t idx = 0; idx < m_vlm_config.query_num; ++idx) { + unk64 += m_vlm_config.unk; + } + images_prompt += m_vlm_config.im_start + unk64 + m_vlm_config.im_end; + if (embeds.slices) { + ov::Shape slices_shape = embeds.slices.get_shape(); + for (size_t row_idx = 0; row_idx < slices_shape.at(0); ++row_idx) { + for (size_t col_idx = 0; col_idx < slices_shape.at(1); ++col_idx) { + images_prompt += m_vlm_config.slice_start + unk64 + m_vlm_config.slice_end; + } + images_prompt += '\n'; + } + } + if ('\n' != *(images_prompt.end() - 1)) { + // Image wasn't sliced, add \n to the end of image anyway. + // Strangely, \n isn't placed between . + images_prompt += '\n'; + } + } + images_prompt += prompt; + std::string new_templated_chat_history; + if (m_is_chat_conversation) { + // KV cache in model already contains prompts and answers from previous iterations. + // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns + // token_ids = {, ...}. So if tokenizer applies only to the new prompt, + // will be inserted on every iteration. + // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt + // and takes only the difference between them. + // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but + // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. + m_history.push_back({{"role", "user"}, {"content", images_prompt}}); + constexpr bool add_generation_prompt = true; + new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + } + ov::Tensor special_tokens = m_tokenizer.encode( + m_vlm_config.im_start + + m_vlm_config.im_end + + m_vlm_config.slice_start + + m_vlm_config.slice_end + ).input_ids; + OPENVINO_ASSERT( + 4 == special_tokens.get_shape().at(1), + "Every special token must be represented with a single int." 
+ ); + size_t im_start_id = special_tokens.data()[0]; + size_t im_end_id = special_tokens.data()[1]; + size_t slice_start_id = special_tokens.data()[2]; + size_t slice_end_id = special_tokens.data()[3]; + ov::Tensor input_ids = m_tokenizer.encode(new_templated_chat_history).input_ids; + m_embedding.set_input_tensor(input_ids); + m_embedding.infer(); + ov::Tensor inputs_embeds = m_embedding.get_output_tensor(); + OPENVINO_ASSERT( + m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), + "Unexpected embedding size" + ); + if (!images.empty()) { + int64_t* ids = input_ids.data(); + const ov::Tensor& resampled_source = resample(*this, embeds.resized_source, {embeds.resized_source_size}); + float* emb = resampled_source.data(); + bool replacing = false; + for (size_t token_idx = 0; token_idx < inputs_embeds.get_shape().at(1); ++token_idx) { + if (im_start_id == ids[token_idx]) { + replacing = true; + } + if (replacing) { + std::copy_n(emb, resampled_source.get_size(), inputs_embeds.data() + token_idx * m_vlm_config.hidden_size); + token_idx += resampled_source.get_shape().at(1); + replacing = false; + break; + } + } + if (embeds.slices) { + size_t token_idx = 0; + const ov::Shape& slices_shape = embeds.slices.get_shape(); + const std::vector& sliced_sizes = embeds.slices_sizes; + for (size_t i = 0; i < slices_shape.at(0); ++i) { + for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { + size_t d2 = slices_shape.at(2); + size_t d3 = slices_shape.at(3); + ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, embeds.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; + const ov::Tensor& vision_embed_tensor_i_j = resample(*this, encoded_view, {sliced_sizes.at(i * slices_shape.at(1) + ja)}); + for (; token_idx < inputs_embeds.get_shape().at(1); ++token_idx) { + if (slice_start_id == ids[token_idx]) { + replacing = true; + } + if (slice_end_id == ids[token_idx]) { + replacing = false; + break; + } + if (replacing) { + std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds.data() + token_idx * m_vlm_config.hidden_size); + token_idx += vision_embed_tensor_i_j.get_shape().at(1); + replacing = false; + break; + } + } + } + } + } } + + return inputs_embeds; } From 1b5435c5511470d164d07e44b9e2c28871ebfc19 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Wed, 9 Oct 2024 15:12:31 +0400 Subject: [PATCH 08/17] Add vlm model type enum class --- .../include/openvino/genai/vision_encoder.hpp | 9 +++--- src/cpp/include/openvino/genai/vlm_config.hpp | 5 +-- .../include/openvino/genai/vlm_model_type.hpp | 31 +++++++++++++++++++ src/cpp/src/vision_encoder.cpp | 12 +++---- src/cpp/src/vlm_config.cpp | 2 +- src/cpp/src/vlm_pipeline.cpp | 10 +++--- 6 files changed, 50 insertions(+), 19 deletions(-) create mode 100644 src/cpp/include/openvino/genai/vlm_model_type.hpp diff --git a/src/cpp/include/openvino/genai/vision_encoder.hpp b/src/cpp/include/openvino/genai/vision_encoder.hpp index 3fe80bf24f..e1c2be0102 100644 --- a/src/cpp/include/openvino/genai/vision_encoder.hpp +++ b/src/cpp/include/openvino/genai/vision_encoder.hpp @@ -5,6 +5,7 @@ #include "openvino/genai/processor_config.hpp" #include +#include "vlm_model_type.hpp" namespace ov::genai { /// @brief A pair describing image size. @@ -41,8 +42,8 @@ struct EncodedImage { /// ov::InferRequest and configured by ProcessorConfig. class OPENVINO_GENAI_EXPORTS VisionEncoder { public: - /// @brief A string denoting model type. - std::string model_type; + /// @brief A enum denoting model type. 
+ VLMModelType model_type; /// @brief A model for image encoding. ov::InferRequest m_encoder; /// @brief A config to follow. @@ -68,10 +69,10 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder { /// @param core ov::Core to be used to compile the model. explicit VisionEncoder( const std::filesystem::path& model_dir, + const VLMModelType model_type, const std::string& device="CPU", const ov::AnyMap device_config={}, - ov::Core core=ov::Core{}, - std::string model_type="" + ov::Core core=ov::Core{} ); /// @brief Compute embeddings of an image. diff --git a/src/cpp/include/openvino/genai/vlm_config.hpp b/src/cpp/include/openvino/genai/vlm_config.hpp index 02d0f7c36a..46983c080a 100644 --- a/src/cpp/include/openvino/genai/vlm_config.hpp +++ b/src/cpp/include/openvino/genai/vlm_config.hpp @@ -6,14 +6,15 @@ #include "openvino/genai/visibility.hpp" #include #include +#include "vlm_model_type.hpp" namespace ov::genai { /// @brief A Configuration class passed to VLMPipeline and used to /// change VLMPipeline's behavior. Corresponds to config.json. class OPENVINO_GENAI_EXPORTS VLMConfig { public: - /// @brief A string denoting model type. - std::string model_type = ""; + /// @brief A enum denoting model type. + VLMModelType model_type; /// @brief A size of a single embedding returned by a resampler. /// Used to initialize positional embeddings for resampler input. size_t hidden_size = 2304; diff --git a/src/cpp/include/openvino/genai/vlm_model_type.hpp b/src/cpp/include/openvino/genai/vlm_model_type.hpp new file mode 100644 index 0000000000..9e35d543a7 --- /dev/null +++ b/src/cpp/include/openvino/genai/vlm_model_type.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include + +#include "openvino/genai/visibility.hpp" +#include + +namespace ov::genai { + +enum class OPENVINO_GENAI_EXPORTS VLMModelType { + MINICPM, + LLAVA, +}; + +inline VLMModelType to_vlm_model_type(const std::string& value) { + static const std::unordered_map model_types_map = { + {"minicpm", VLMModelType::MINICPM}, + {"llava", VLMModelType::LLAVA} + }; + + auto it = model_types_map.find(value); + if (it != model_types_map.end()) { + return it->second; + } + OPENVINO_THROW("Unsupported '", value, "' VLM model type"); +} +} \ No newline at end of file diff --git a/src/cpp/src/vision_encoder.cpp b/src/cpp/src/vision_encoder.cpp index f513b433d3..02f7793297 100644 --- a/src/cpp/src/vision_encoder.cpp +++ b/src/cpp/src/vision_encoder.cpp @@ -358,15 +358,13 @@ ov::Tensor preprocess_image_llava(const ov::Tensor& image, const ProcessorConfig } } -VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const std::string& device, const ov::AnyMap device_config, ov::Core core, std::string model_type) : +VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const VLMModelType model_type, const std::string& device, const ov::AnyMap device_config, ov::Core core) : model_type(model_type) { - if (model_type == "minicpmv") { + if (model_type == VLMModelType::MINICPM) { m_encoder = core.compile_model(model_dir / "image_encoder.xml", device, device_config).create_infer_request(); - } else if (model_type == "llava") { + } else if (model_type == VLMModelType::LLAVA) { // Vision embeddings model is merged with multi modal projector at model export stage by optimum-intel m_vision_embeddings = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request(); - } else { - 
OPENVINO_THROW("Unsupported model type: " + model_type); } m_processor_config = ov::genai::utils::from_config_json_if_exists( model_dir, "preprocessor_config.json" @@ -374,9 +372,9 @@ VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const std:: } EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ProcessorConfig& config) { - if (model_type == "minicpmv") { + if (model_type == VLMModelType::MINICPM) { return encode_minicpm(image, config); - } else if (model_type == "llava") { + } else if (model_type == VLMModelType::LLAVA) { return encode_llava(image, config); } } diff --git a/src/cpp/src/vlm_config.cpp b/src/cpp/src/vlm_config.cpp index a13a0da702..8d7585f2bb 100644 --- a/src/cpp/src/vlm_config.cpp +++ b/src/cpp/src/vlm_config.cpp @@ -10,7 +10,7 @@ ov::genai::VLMConfig::VLMConfig(const std::filesystem::path& json_path) { OPENVINO_ASSERT(stream.is_open(), "Failed to open '" + json_path.string() + "' with processor config"); nlohmann::json parsed = nlohmann::json::parse(stream); using ov::genai::utils::read_json_param; - read_json_param(parsed, "model_type", model_type); // TODO Consider checking supported model type here instead of VisionEncoder constructor + model_type = to_vlm_model_type(parsed.at("model_type")); read_json_param(parsed, "hidden_size", hidden_size); read_json_param(parsed, "scale_emb", scale_emb); read_json_param(parsed, "query_num", query_num); diff --git a/src/cpp/src/vlm_pipeline.cpp b/src/cpp/src/vlm_pipeline.cpp index 4b1d13e984..8160bd4e6a 100644 --- a/src/cpp/src/vlm_pipeline.cpp +++ b/src/cpp/src/vlm_pipeline.cpp @@ -360,9 +360,9 @@ VLMPipeline::VLMPipeline( ) }, m_tokenizer{tokenizer}, - m_vision_encoder(model_dir, device, device_config, core, m_vlm_config.model_type), + m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config, core), m_is_chat_conversation{false} { - if (m_vlm_config.model_type == "minicpmv") { + if (m_vlm_config.model_type == VLMModelType::MINICPM) { m_resampler = core.compile_model( model_dir / "resampler.xml", device, device_config ).create_infer_request(); @@ -376,7 +376,7 @@ VLMPipeline::VLMPipeline( ).create_infer_request(); m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); - } else if (m_vlm_config.model_type == "llava") { + } else if (m_vlm_config.model_type == VLMModelType::LLAVA) { m_language = core.compile_model( model_dir / "openvino_language_model.xml", device, device_config ).create_infer_request(); @@ -399,9 +399,9 @@ DecodedResults VLMPipeline::generate( const StreamerVariant& streamer ) { ov::Tensor inputs_embeds; - if (m_vlm_config.model_type == "minicpmv") { + if (m_vlm_config.model_type == VLMModelType::MINICPM) { inputs_embeds = get_inputs_embeds_minicpm(prompt, rgbs); - } else if (m_vlm_config.model_type == "llava") { + } else if (m_vlm_config.model_type == VLMModelType::LLAVA) { inputs_embeds = get_inputs_embeds_llava(prompt, rgbs); } From 8912b56d529175fd95b2ab2e7727356eb70237c2 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Wed, 9 Oct 2024 16:03:06 +0400 Subject: [PATCH 09/17] Fix typo in minicpm model type --- src/cpp/include/openvino/genai/vlm_model_type.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/include/openvino/genai/vlm_model_type.hpp b/src/cpp/include/openvino/genai/vlm_model_type.hpp index 9e35d543a7..0f811a116a 100644 --- a/src/cpp/include/openvino/genai/vlm_model_type.hpp +++ b/src/cpp/include/openvino/genai/vlm_model_type.hpp @@ -18,7 +18,7 @@ enum class OPENVINO_GENAI_EXPORTS VLMModelType { 
inline VLMModelType to_vlm_model_type(const std::string& value) { static const std::unordered_map model_types_map = { - {"minicpm", VLMModelType::MINICPM}, + {"minicpmv", VLMModelType::MINICPM}, {"llava", VLMModelType::LLAVA} }; From 683917511e87f841dc7c7a4aa16c334ba1478f94 Mon Sep 17 00:00:00 2001 From: guozhong wang Date: Fri, 11 Oct 2024 17:09:35 +0800 Subject: [PATCH 10/17] Update optimum-intel (#945) Co-authored-by: Andrei Kochin Co-authored-by: Ekaterina Aidova --- .github/workflows/llm_bench-python.yml | 4 ++-- llm_bench/python/requirements.txt | 2 +- llm_bench/python/who_what_benchmark/requirements.txt | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml index be309c732d..878ec77433 100644 --- a/.github/workflows/llm_bench-python.yml +++ b/.github/workflows/llm_bench-python.yml @@ -62,11 +62,11 @@ jobs: python ./llm_bench/python/benchmark.py -m tiny-random-qwen -d cpu -n 1 -f pt - name: Test tiny-random-baichuan2 on Linux run: | - python ./llm_bench/python/convert.py --model_id katuni4ka/tiny-random-baichuan2 --output_dir ./ov_models/tiny-random-baichuan2 --precision FP16 + optimum-cli export openvino --model katuni4ka/tiny-random-baichuan2 --trust-remote-code --weight-format fp16 ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16 python ./llm_bench/python/benchmark.py -m ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16/ -d cpu -n 1 - name: Test tiny-stable-diffusion on Linux run: | - python ./llm_bench/python/convert.py --model_id segmind/tiny-sd --output_dir ./ov_models/tiny-sd --precision FP16 + optimum-cli export openvino --model segmind/tiny-sd --trust-remote-code --weight-format fp16 ./ov_models/tiny-sd/pytorch/dldt/FP16/ python ./llm_bench/python/benchmark.py -m ./ov_models/tiny-sd/pytorch/dldt/FP16/ -pf ./llm_bench/python/prompts/stable-diffusion.jsonl -d cpu -n 1 - name: WWB Tests run: | diff --git a/llm_bench/python/requirements.txt b/llm_bench/python/requirements.txt index e9ab7d794a..6139bf843c 100644 --- a/llm_bench/python/requirements.txt +++ b/llm_bench/python/requirements.txt @@ -10,7 +10,7 @@ torch transformers>=4.40.0 diffusers>=0.22.0 #optimum is in dependency list of optimum-intel -git+https://github.com/huggingface/optimum-intel.git@f34bd61df89f57f61c282c02297980299981ee78#egg=optimum-intel +git+https://github.com/huggingface/optimum-intel.git@main#egg=optimum-intel git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf packaging psutil diff --git a/llm_bench/python/who_what_benchmark/requirements.txt b/llm_bench/python/who_what_benchmark/requirements.txt index aa85f09ead..caae595e69 100644 --- a/llm_bench/python/who_what_benchmark/requirements.txt +++ b/llm_bench/python/who_what_benchmark/requirements.txt @@ -2,7 +2,7 @@ transformers>=4.35.2 sentence-transformers>=2.2.2 openvino>=2024.3.0 openvino-telemetry -optimum-intel>=1.14 +optimum-intel @ git+https://github.com/huggingface/optimum-intel.git openvino-tokenizers pandas>=2.0.3 numpy>=1.23.5 From 729b06386b598105ab0c30efe040eaa66b5151e3 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 11 Oct 2024 13:58:33 +0400 Subject: [PATCH 11/17] Add llava to supported models --- src/docs/SUPPORTED_MODELS.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md index 1232a081dd..bc57e2863f 100644 --- a/src/docs/SUPPORTED_MODELS.md +++ b/src/docs/SUPPORTED_MODELS.md @@ -167,14 +167,23 @@ The pipeline can work with other similar 
topologies produced by `optimum-intel` Example HuggingFace Models - MiniCPM-V-2_6 MiniCPMV + MiniCPM-V-2_6 + + LLaVA + LLaVA-v1.5 + + + + From 18b49c72f3d4a6cbcce27036c07ed2ed12db1c57 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 11 Oct 2024 14:53:42 +0400 Subject: [PATCH 12/17] Switch to optimum-intel from git in requirements --- README.md | 3 +-- samples/requirements.txt | 2 +- tests/python_tests/requirements.txt | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9a4d73802b..163768b18e 100644 --- a/README.md +++ b/README.md @@ -40,10 +40,9 @@ Continuous batching functionality is used within OpenVINO Model Server (OVMS) to # Install optimum-intel to be able to download, convert and optimize LLMs from Hugging Face # Optimum is not required to run models, only to convert and compress - pip install optimum[openvino] + pip install optimum-intel@git+https://github.com/huggingface/optimum-intel.git # (Optional) Install (TBD) to be able to download models from Model Scope - #pip install optimum[openvino] ``` ## Performing text generation diff --git a/samples/requirements.txt b/samples/requirements.txt index 4821d6dbef..a61fb6d68f 100644 --- a/samples/requirements.txt +++ b/samples/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/cpu -optimum[openvino]==1.22.0 +optimum-intel @ git+https://github.com/huggingface/optimum-intel.git einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen diffusers==0.30.3 diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt index 8c49f7c1e6..eab7f0f4c3 100644 --- a/tests/python_tests/requirements.txt +++ b/tests/python_tests/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/cpu -optimum[openvino]==1.22.0 +optimum-intel @ git+https://github.com/huggingface/optimum-intel.git onnx==1.16.1 pytest llm_bench/python/who_what_benchmark From 4592cd6623ec28760bc27a318749bf25c1a2b282 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 11 Oct 2024 15:00:42 +0400 Subject: [PATCH 13/17] Remove redundant optimum install --- .github/workflows/causal_lm_cpp.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 23437b6f67..b8fbe397d2 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -715,7 +715,6 @@ jobs: source ./ov/setupvars.sh python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install git+https://github.com/huggingface/optimum-intel.git optimum-cli export openvino --model llava-hf/llava-1.5-7b-hf ./llava_1_5_7b_ov/ wget https://llava-vl.github.io/static/images/monalisa.jpg - name: Run visual_language_chat sample - LLaVa 1.5 From 8f304288a24c79c02fefc2292c55a7eeb702029a Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 11 Oct 2024 15:34:04 +0400 Subject: [PATCH 14/17] Reorder supported vlm models --- src/docs/SUPPORTED_MODELS.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md index bc57e2863f..fb6df36950 100644 --- a/src/docs/SUPPORTED_MODELS.md +++ b/src/docs/SUPPORTED_MODELS.md @@ -167,20 +167,20 @@ The pipeline can work with other 
similar topologies produced by `optimum-intel` Example HuggingFace Models - MiniCPMV - MiniCPM-V-2_6 + LLaVA + LLaVA-v1.5 - LLaVA - LLaVA-v1.5 + MiniCPMV + MiniCPM-V-2_6 From 04a0014834dbdab131c5af7611058a8db9fc1f37 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 11 Oct 2024 15:44:11 +0400 Subject: [PATCH 15/17] Reuse m_vision_encoder --- src/cpp/include/openvino/genai/vision_encoder.hpp | 7 ++----- src/cpp/src/vision_encoder.cpp | 12 ++++++------ 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/cpp/include/openvino/genai/vision_encoder.hpp b/src/cpp/include/openvino/genai/vision_encoder.hpp index 5a5d0c37b0..902557d316 100644 --- a/src/cpp/include/openvino/genai/vision_encoder.hpp +++ b/src/cpp/include/openvino/genai/vision_encoder.hpp @@ -45,20 +45,17 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder { /// @brief A enum denoting model type. VLMModelType model_type; /// @brief A model for image encoding. - ov::InferRequest m_encoder; + ov::InferRequest m_vision_encoder; /// @brief A config to follow. ProcessorConfig m_processor_config; - // LLaVa specific members - ov::InferRequest m_vision_embeddings; - /// @brief Construct from an already compiled model and a config. /// @param encoder Compiled model. /// @param processor_config Initial config. explicit VisionEncoder( const ov::InferRequest& encoder, const ProcessorConfig& processor_config=ProcessorConfig{} - ) : m_encoder{encoder}, m_processor_config{processor_config} {} + ) : m_vision_encoder{encoder}, m_processor_config{processor_config} {} /// @brief Construct the encoder from model_dir. /// @param model_dir A folder containing openvino_embedding.xml and diff --git a/src/cpp/src/vision_encoder.cpp b/src/cpp/src/vision_encoder.cpp index 856db0b96e..6c926e0ed8 100644 --- a/src/cpp/src/vision_encoder.cpp +++ b/src/cpp/src/vision_encoder.cpp @@ -432,10 +432,10 @@ ov::Tensor preprocess_image_llava(const ov::Tensor& image, const ProcessorConfig VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const VLMModelType model_type, const std::string& device, const ov::AnyMap device_config, ov::Core core) : model_type(model_type) { if (model_type == VLMModelType::MINICPM) { - m_encoder = core.compile_model(model_dir / "image_encoder.xml", device, device_config).create_infer_request(); + m_vision_encoder = core.compile_model(model_dir / "image_encoder.xml", device, device_config).create_infer_request(); } else if (model_type == VLMModelType::LLAVA) { // Vision embeddings model is merged with multi modal projector at model export stage by optimum-intel - m_vision_embeddings = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request(); + m_vision_encoder = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request(); } m_processor_config = ov::genai::utils::from_config_json_if_exists( model_dir, "preprocessor_config.json" @@ -462,16 +462,16 @@ EncodedImage VisionEncoder::encode_minicpm(const ov::Tensor& image, const Proces ctx_clip.image_size = m_processor_config.image_size; std::copy(config.norm_mean.begin(), config.norm_mean.end(), ctx_clip.image_mean); std::copy(config.norm_std.begin(), config.norm_std.end(), ctx_clip.image_std); - return llava_image_embed_make_with_bytes_slice(ctx_clip, image, m_encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums); + return llava_image_embed_make_with_bytes_slice(ctx_clip, image, m_vision_encoder, 
config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums); } EncodedImage VisionEncoder::encode_llava(const ov::Tensor& image, const ProcessorConfig& config) { ov::Tensor preprocessed_image = preprocess_image_llava(image, config); - m_vision_embeddings.set_tensor("pixel_values", preprocessed_image); - m_vision_embeddings.infer(); + m_vision_encoder.set_tensor("pixel_values", preprocessed_image); + m_vision_encoder.infer(); - ov::Tensor image_features = m_vision_embeddings.get_output_tensor(); + ov::Tensor image_features = m_vision_encoder.get_output_tensor(); ImageSize resized_source_size{config.crop_size_height / config.patch_size, config.crop_size_width / config.patch_size}; return {image_features, resized_source_size}; From d9feaead7a1c3e474417a8c5e54b551dd6bec9b0 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 11 Oct 2024 16:21:28 +0400 Subject: [PATCH 16/17] Fix samples requirements with lowering numpy for macos --- samples/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/samples/requirements.txt b/samples/requirements.txt index a61fb6d68f..df71d0cbb1 100644 --- a/samples/requirements.txt +++ b/samples/requirements.txt @@ -1,5 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cpu optimum-intel @ git+https://github.com/huggingface/optimum-intel.git +numpy<2.0.0; sys_platform == 'darwin' einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen diffusers==0.30.3 From 8f1e347da687f52e0ab4e5ed7c85035e8be86b5c Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 11 Oct 2024 17:17:56 +0400 Subject: [PATCH 17/17] Fix python tests requirements with numpy for macos --- tests/python_tests/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt index eab7f0f4c3..372d3ac950 100644 --- a/tests/python_tests/requirements.txt +++ b/tests/python_tests/requirements.txt @@ -1,5 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cpu optimum-intel @ git+https://github.com/huggingface/optimum-intel.git +numpy<2.0.0; sys_platform == 'darwin' onnx==1.16.1 pytest llm_bench/python/who_what_benchmark
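For reference, the embedding merge introduced in patch 06 (`merge_text_and_image_embeddings_llava`) expands the single image-token position in the text sequence into the full image-embedding sequence, so the merged length is text_len + (image_len - 1). A toy, self-contained sketch of that bookkeeping, with made-up ids and dimensions:

```cpp
// Toy illustration of merge_text_and_image_embeddings_llava from patch 06:
// the one image-token position in the text sequence is expanded into the
// whole image-embedding sequence. Ids and values are made up; only the
// index bookkeeping matters. 32000 is the image token id used in the patch.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int64_t image_token_index = 32000;
    std::vector<int64_t> input_ids = {1, 32000, 306, 2};  // BOS, image token, two text tokens

    // One embedding row per text token and three "patch" rows for the image.
    std::vector<std::vector<float>> text_embeds = {{0, 0}, {9, 9}, {1, 1}, {2, 2}};
    std::vector<std::vector<float>> image_embeds = {{7, 7}, {8, 8}, {6, 6}};

    std::vector<std::vector<float>> merged;
    for (size_t s = 0; s < input_ids.size(); ++s) {
        if (input_ids[s] == image_token_index) {
            merged.insert(merged.end(), image_embeds.begin(), image_embeds.end());
        } else {
            merged.push_back(text_embeds[s]);
        }
    }
    // Expected: 4 text positions + (3 - 1) extra image positions = 6 rows.
    std::printf("merged sequence length: %zu\n", merged.size());
    return 0;
}
```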