Added VLM bindings and a Python sample. #62

Closed
wants to merge 68 commits into from
Changes from 10 commits
Commits (68)
2d86b93
Added VLM bindings and python sample.
popovaan Sep 27, 2024
05d8506
Minor correction.
popovaan Sep 27, 2024
1f6d8ca
Minor correction.
popovaan Sep 27, 2024
d78c2b1
Minor correction.
popovaan Sep 27, 2024
c66b936
Fixed segfault, used kwargs in sample.
popovaan Sep 30, 2024
a5acd95
Minor correction.
popovaan Sep 30, 2024
5b70dfb
Added Troubleshooting, image example, docstring.
popovaan Sep 30, 2024
36a8038
Minor correction.
popovaan Sep 30, 2024
a50c8bc
Merged Wovchena/miniCPM-V-2_6.
popovaan Oct 1, 2024
9fbf892
Removed update_config_from_kwargs().
popovaan Oct 2, 2024
78ddbaa
Added comments, added sample launch in CI.
popovaan Oct 2, 2024
3f53501
Small correction.
popovaan Oct 2, 2024
1374ff3
Path correction.
popovaan Oct 2, 2024
14f8517
Add torchvision to requirements.
popovaan Oct 2, 2024
f900a2c
Used larger runner.
popovaan Oct 2, 2024
c9afdf5
Added genai build to PYTHONPATH.
popovaan Oct 2, 2024
1a34c7b
Merge remote-tracking branch 'Wovchena/miniCPM-V-2_6' into vlm_python…
popovaan Oct 2, 2024
9177756
convert.py script deprecation and llm-bench README update (#916)
andrei-kochin Oct 3, 2024
3f6f426
Added OPENVINO_GENAI_EXPORTS for generation config property.
popovaan Oct 3, 2024
b332eb6
Attempt to fix ci.
popovaan Oct 4, 2024
ae62343
Test change.
popovaan Oct 4, 2024
d85bf94
ov package changed.
popovaan Oct 4, 2024
20bfc5f
Extended timeout.
popovaan Oct 4, 2024
4c18a34
Minor change.
popovaan Oct 4, 2024
52da9bc
Reverted not needed changes.
popovaan Oct 4, 2024
a015411
Error fixed.
popovaan Oct 4, 2024
b11f0d9
StaticLLMPipeline: Enable DQ (#878)
TolyaTalamanov Oct 7, 2024
41f1e7b
LoRA in Text2ImagePipeline (#911)
slyalin Oct 7, 2024
82131f3
Added second prompt in sample launch.
popovaan Oct 7, 2024
6877837
Applied comments.
popovaan Oct 7, 2024
db14fd0
WWB: Add comparison for SD models (#901)
AlexKoff88 Oct 8, 2024
ff38f90
Remove excess comma in src/cpp/CMakeLists.txt (#927)
ilya-lavrenov Oct 8, 2024
a0b78c0
Update README.md
rkazants Oct 8, 2024
8c9a240
Update samples/cpp/text2image/README.md
rkazants Oct 8, 2024
abbc695
Fix the misprint (#928)
andrei-kochin Oct 8, 2024
93927b5
MiniCPM-V-2_6 with image input (#912)
Wovchena Oct 8, 2024
a5fb3a6
Fix misprint (#929)
Wovchena Oct 8, 2024
aa7bfd6
fix cb llm bench for gpu, allow string config
eaidova Oct 8, 2024
117d790
fix cb llm bench for gpu, allow string config (#931)
andrei-kochin Oct 8, 2024
aaf731c
fix linting issue in llm bench (#932)
eaidova Oct 8, 2024
07184ae
Fixed conflicts.
popovaan Oct 8, 2024
ee31d0a
Errors fixed.
popovaan Oct 8, 2024
ac69d63
Minor correction.
popovaan Oct 8, 2024
14df316
disable md5 check assert for CB
eaidova Oct 8, 2024
09c5742
disable md5 check assert for CB (#933)
andrei-kochin Oct 8, 2024
26009c4
Update .github/workflows/causal_lm_cpp.yml
ilya-lavrenov Oct 8, 2024
724b27f
Merge branch 'master' into vlm_python_bindings
Wovchena Oct 9, 2024
4465727
Use older MSVC toolchain version
ilya-lavrenov Oct 8, 2024
7f9a579
Use OpenVINO runners
ilya-lavrenov Oct 8, 2024
9d1e7e3
Apply suggestions from code review
ilya-lavrenov Oct 8, 2024
2c56899
Try to fix Windows
ilya-lavrenov Oct 8, 2024
4aa01ca
Added &
ilya-lavrenov Oct 8, 2024
0afb553
Update stable_diffusion_1_5_cpp.yml
ilya-lavrenov Oct 8, 2024
4ae6b18
Update stable_diffusion_1_5_cpp.yml
ilya-lavrenov Oct 8, 2024
3e772fd
Apply suggestions from code review
ilya-lavrenov Oct 8, 2024
0b55cd0
Apply suggestions from code review
ilya-lavrenov Oct 8, 2024
22c573d
Apply suggestions from code review
ilya-lavrenov Oct 8, 2024
fcd6670
Apply suggestions from code review
ilya-lavrenov Oct 8, 2024
e7c1371
Apply suggestions from code review
ilya-lavrenov Oct 8, 2024
78bbf62
Update .github/workflows/stable_diffusion_1_5_cpp.yml
ilya-lavrenov Oct 8, 2024
581e2c1
Apply suggestions from code review
ilya-lavrenov Oct 8, 2024
d43773c
SDXL Pipeline, Euler Discrete scheduler
likholat Sep 27, 2024
0663533
text2image Readme update
likholat Oct 7, 2024
f4e90aa
num_images_per_prompt>1 for demo, unet reshape fix, num_hidden_layers…
likholat Oct 8, 2024
ea8b9fa
Readme update
likholat Oct 8, 2024
df2161d
codestyle fixes
likholat Oct 8, 2024
8db0301
Apply suggestions from code review
ilya-lavrenov Oct 8, 2024
9518b08
Merge branch 'master' into vlm_python_bindings
ilya-lavrenov Oct 9, 2024
38 changes: 38 additions & 0 deletions samples/python/vlm_chat_sample/README.md
@@ -0,0 +1,38 @@
# Python vlm_chat_sample that supports VLM models

This example showcases inference of text-generation Vision Language Models (VLMs): `miniCPM-V-2_6` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `openvino_genai.VLMPipeline` and configures it for the chat scenario. #TODO: add a link to notebook when available
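
The core of the sample, in brief: build a `VLMPipeline`, open a chat session, and stream generated tokens. A minimal sketch of that flow follows (the image path is a placeholder; the full script is shown further below):

```python
import numpy as np
import openvino_genai
from PIL import Image
from openvino import Tensor


def streamer(subword: str) -> bool:
    print(subword, end='', flush=True)
    return False  # False means "keep generating"


# Wrap the picture's raw pixel data into an openvino Tensor of shape (1, 3, height, width).
pic = Image.open('sample_image.jpg')  # placeholder path
image = Tensor(np.array(pic.getdata()).reshape(1, 3, pic.size[1], pic.size[0]).astype(np.byte))

pipe = openvino_genai.VLMPipeline('./miniCPM-V-2_6/', 'CPU')  # model directory and device

config = openvino_genai.GenerationConfig()
config.max_new_tokens = 100

pipe.start_chat()
pipe('Describe the image.', image=image, generation_config=config, streamer=streamer)
pipe.finish_chat()
```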

## Download and convert the model and tokenizers

The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.

It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported.

```sh
pip install --upgrade-strategy eager -r ../../requirements.txt
```
# TODO: add optimum cli command for miniCPM-V-2_6 when available

## Run:
https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 can be used as a sample image.

`vlm_chat_sample.py ./miniCPM-V-2_6/ 319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg`


Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. # TODO: examples of larger models
Modify the source code to change the device for inference to the GPU.
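
For example, the device string in `vlm_chat_sample.py` can be switched from CPU to GPU (a one-line change; sketch only):

```python
device = 'GPU'  # instead of 'CPU'
pipe = openvino_genai.VLMPipeline(args.model_dir, device)
```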

See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models.

### Troubleshooting

#### Unicode characters encoding error on Windows

Example error:
```
UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined>
```

If you encounter this error while the sample prints output to the Windows console, it is likely because the default Windows encoding does not support certain Unicode characters. To resolve this:
1. Enable Unicode characters for Windows cmd: open `Region` settings from `Control Panel`, go to `Administrative` -> `Change system locale`, check `Beta: Use Unicode UTF-8 for worldwide language support`, click `OK`, and reboot.
2. Enable UTF-8 mode by setting the environment variable `PYTHONIOENCODING="utf8"`, as shown below.
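
For example, in a Windows `cmd` session (the image file name matches the one used above):

```sh
set PYTHONIOENCODING=utf8
python vlm_chat_sample.py ./miniCPM-V-2_6/ 319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg
```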
59 changes: 59 additions & 0 deletions samples/python/vlm_chat_sample/vlm_chat_sample.py
@@ -0,0 +1,59 @@
#!/usr/bin/env python3
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import argparse

import numpy as np
import openvino_genai
from PIL import Image
from openvino import Tensor


def streamer(subword: str):
    '''
    Prints each generated sub-word as soon as it is produced.

    Args:
        subword: sub-word of the generated text.

    Returns:
        False, signalling that generation should continue; returning True would stop it.
    '''
    print(subword, end='', flush=True)

    return False


def read_image(path):
    # Read the picture and wrap its raw pixel data into an openvino Tensor of shape (1, 3, height, width).
    pic = Image.open(path)
    image_data = np.array(pic.getdata()).reshape(1, 3, pic.size[1], pic.size[0]).astype(np.byte)
    return Tensor(image_data)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('model_dir')
    parser.add_argument('image_dir')
    args = parser.parse_args()

    image = read_image(args.image_dir)

    device = 'CPU'  # GPU can be used as well
    pipe = openvino_genai.VLMPipeline(args.model_dir, device)

    config = openvino_genai.GenerationConfig()
    config.max_new_tokens = 100

    pipe.start_chat()
    while True:
        try:
            prompt = input('question:\n')
        except EOFError:
            break
        pipe(prompt, image=image, generation_config=config, streamer=streamer)
        print('\n----------')
    pipe.finish_chat()


if '__main__' == __name__:
    main()
20 changes: 19 additions & 1 deletion src/cpp/src/vlm_pipeline.cpp
@@ -551,7 +551,25 @@ DecodedResults VLMPipeline::generate(
}
m_language.get_tensor("attention_mask").set_shape({1, 0});
}
return {{m_tokenizer.decode(generated)}};

DecodedResults results;
results.texts = {m_tokenizer.decode(generated)};

// TODO: implement performance metrics
results.perf_metrics = ov::genai::PerfMetrics();
results.perf_metrics.m_evaluated = false;
results.perf_metrics.generate_duration = {0, 0};
results.perf_metrics.inference_duration= {0, 0};
results.perf_metrics.tokenization_duration = {0, 0};
results.perf_metrics.detokenization_duration= {0, 0};
results.perf_metrics.ttft = {0, 0};
results.perf_metrics.tpot= {0, 0};
results.perf_metrics.ipot= {0, 0};
results.perf_metrics.throughput= {0, 0};
results.perf_metrics.num_generated_tokens = generated.size();
results.perf_metrics.num_input_tokens= 0;

return results;
}

DecodedResults VLMPipeline::generate(
2 changes: 1 addition & 1 deletion src/python/CMakeLists.txt
@@ -18,7 +18,7 @@ if(NOT pybind11_POPULATED)
add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR})
endif()

pybind11_add_module(py_generate_pipeline py_generate_pipeline.cpp py_whisper_pipeline.cpp utils.cpp)
pybind11_add_module(py_generate_pipeline py_vlm_pipeline.cpp py_generate_pipeline.cpp py_whisper_pipeline.cpp utils.cpp)
target_link_libraries(py_generate_pipeline PRIVATE openvino::genai)
set_target_properties(py_generate_pipeline PROPERTIES
ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>"
3 changes: 2 additions & 1 deletion src/python/openvino_genai/__init__.py
@@ -17,7 +17,8 @@
EncodedResults,
GenerationConfig,
GenerationResult,
LLMPipeline,
LLMPipeline,
VLMPipeline,
PerfMetrics,
RawPerfMetrics,
SchedulerConfig,
88 changes: 7 additions & 81 deletions src/python/py_generate_pipeline.cpp
@@ -50,6 +50,7 @@ std::vector<float> get_ms(const T& instance, U T::*member) {
}

void init_whisper_pipeline(py::module_& m);
void init_vlm_pipeline(py::module_& m);

namespace {

@@ -310,68 +311,6 @@ auto cache_eviction_config_docstring = R"(
:type aggregation_mode: openvino_genai.AggregationMode
)";

OptionalGenerationConfig update_config_from_kwargs(const OptionalGenerationConfig& config, const py::kwargs& kwargs) {
if(!config.has_value() && kwargs.empty())
return std::nullopt;

GenerationConfig res_config;
if(config.has_value())
res_config = *config;

for (const auto& item : kwargs) {
std::string key = py::cast<std::string>(item.first);
py::object value = py::cast<py::object>(item.second);

if (item.second.is_none()) {
// Even if argument key name does not fit GenerationConfig name
// it's not an error if it's not defined.
// Some HF configs can have parameters for methods currently unsupported in ov_genai
// but if their values are not set / None, then this should not block
// us from reading such configs, e.g. {"typical_p": None, 'top_p': 1.0,...}
return res_config;
}

if (key == "max_new_tokens") {
res_config.max_new_tokens = py::cast<int>(item.second);
} else if (key == "max_length") {
res_config.max_length = py::cast<int>(item.second);
} else if (key == "ignore_eos") {
res_config.ignore_eos = py::cast<bool>(item.second);
} else if (key == "num_beam_groups") {
res_config.num_beam_groups = py::cast<int>(item.second);
} else if (key == "num_beams") {
res_config.num_beams = py::cast<int>(item.second);
} else if (key == "diversity_penalty") {
res_config.diversity_penalty = py::cast<float>(item.second);
} else if (key == "length_penalty") {
res_config.length_penalty = py::cast<float>(item.second);
} else if (key == "num_return_sequences") {
res_config.num_return_sequences = py::cast<int>(item.second);
} else if (key == "no_repeat_ngram_size") {
res_config.no_repeat_ngram_size = py::cast<int>(item.second);
} else if (key == "stop_criteria") {
res_config.stop_criteria = py::cast<StopCriteria>(item.second);
} else if (key == "temperature") {
res_config.temperature = py::cast<float>(item.second);
} else if (key == "top_p") {
res_config.top_p = py::cast<float>(item.second);
} else if (key == "top_k") {
res_config.top_k = py::cast<int>(item.second);
} else if (key == "do_sample") {
res_config.do_sample = py::cast<bool>(item.second);
} else if (key == "repetition_penalty") {
res_config.repetition_penalty = py::cast<float>(item.second);
} else if (key == "eos_token_id") {
res_config.set_eos_token_id(py::cast<int>(item.second));
} else {
throw(std::invalid_argument("'" + key + "' is incorrect GenerationConfig parameter name. "
"Use help(openvino_genai.GenerationConfig) to get list of acceptable parameters."));
}
}

return res_config;
}

py::list handle_utf8_results(const std::vector<std::string>& decoded_res) {
// pybind11 decodes strings similarly to Python's
// bytes.decode('utf-8'). It raises if the decoding fails.
@@ -392,26 +331,10 @@ py::object call_common_generate(
const utils::PyBindStreamerVariant& py_streamer,
const py::kwargs& kwargs
) {
auto updated_config = update_config_from_kwargs(config, kwargs);
auto updated_config = ov::genai::pybind::utils::update_config_from_kwargs(config, kwargs);
py::object results;
EncodedInputs tensor_data;
StreamerVariant streamer = std::monostate();

std::visit(utils::overloaded {
[&streamer](const std::function<bool(py::str)>& py_callback){
// Wrap python streamer with manual utf-8 decoding. Do not rely
// on pybind automatic decoding since it raises exceptions on incomplete strings.
auto callback_wrapped = [&py_callback](std::string subword) -> bool {
auto py_str = PyUnicode_DecodeUTF8(subword.data(), subword.length(), "replace");
return py_callback(py::reinterpret_borrow<py::str>(py_str));
};
streamer = callback_wrapped;
},
[&streamer](std::shared_ptr<StreamerBase> streamer_cls){
streamer = streamer_cls;
},
[](std::monostate none){ /*streamer is already a monostate */ }
}, py_streamer);
StreamerVariant streamer = ov::genai::pybind::utils::pystreamer_to_streamer(py_streamer);

// Call suitable generate overload for each type of input.
std::visit(utils::overloaded {
@@ -635,7 +558,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) {
// Binding for GenerationConfig
py::class_<GenerationConfig>(m, "GenerationConfig", generation_config_docstring)
.def(py::init<std::string>(), py::arg("json_path"), "path where generation_config.json is stored")
.def(py::init([](py::kwargs kwargs) { return *update_config_from_kwargs(GenerationConfig(), kwargs); }))
.def(py::init([](py::kwargs kwargs) { return *ov::genai::pybind::utils::update_config_from_kwargs(GenerationConfig(), kwargs); }))
.def_readwrite("max_new_tokens", &GenerationConfig::max_new_tokens)
.def_readwrite("max_length", &GenerationConfig::max_length)
.def_readwrite("ignore_eos", &GenerationConfig::ignore_eos)
@@ -840,4 +763,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) {

// init whisper bindings
init_whisper_pipeline(m);

// init vlm pipeline
init_vlm_pipeline(m);
}