diff --git a/samples/python/chat_sample/chat_sample.py b/samples/python/chat_sample/chat_sample.py
index 8d305086e5..8a6cc25ac2 100755
--- a/samples/python/chat_sample/chat_sample.py
+++ b/samples/python/chat_sample/chat_sample.py
@@ -6,9 +6,8 @@
 import openvino_genai
 
 
-def streamer(subword) -> openvino_genai.StreamerRunningStatus:
+def streamer(subword):
     print(subword, end='', flush=True)
-    return openvino_genai.StreamerRunningStatus.RUNNING
 
 
 def main():
diff --git a/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py b/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py
index 726391ba9b..9325a77d00 100755
--- a/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py
+++ b/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py
@@ -5,11 +5,10 @@
 import argparse
 import openvino_genai
 
-def streamer(subword):
-    print(subword, end='', flush=True)
-    # Return flag corresponds whether generation should be stopped.
-    # False means continue generation.
-    return False
+def streamer(subword):
+    print(subword, end='', flush=True)
+    # Return flag indicates whether generation should be stopped.
+    return openvino_genai.StreamerRunningStatus.RUNNING
 
 def main():
     parser = argparse.ArgumentParser()
diff --git a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py
index 217b8a2730..7f82f9e1b7 100755
--- a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py
+++ b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py
@@ -8,10 +8,9 @@
 import threading
 
 def streamer(subword):
-    print(subword, end='', flush=True)
-    # Return flag corresponds whether generation should be stopped.
-    # False means continue generation.
-    return False
+    print(subword, end='', flush=True)
+    # Return flag indicates whether generation should be stopped.
+    return openvino_genai.StreamerRunningStatus.RUNNING
 
 def main():
     parser = argparse.ArgumentParser()
diff --git a/samples/python/visual_language_chat/visual_language_chat.py b/samples/python/visual_language_chat/visual_language_chat.py
index 5dd7b83b3b..aa621d18df 100755
--- a/samples/python/visual_language_chat/visual_language_chat.py
+++ b/samples/python/visual_language_chat/visual_language_chat.py
@@ -11,7 +11,7 @@
 from pathlib import Path
 
 
-def streamer(subword: str) -> bool:
+def streamer(subword: str) -> openvino_genai.StreamerRunningStatus:
     '''
 
     Args:
@@ -25,6 +25,8 @@ def streamer(subword: str) -> bool:
     # No value is returned as in this example we don't want to stop the generation in this method.
     # "return None" will be treated the same as "return False".
+    return openvino_genai.StreamerRunningStatus.RUNNING
+
 
 
 def read_image(path: str) -> Tensor:
     '''
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index be2a677625..82bf0a544d 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -187,9 +187,6 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, prompt_ids, generation_config, block_size);
         requests.push_back(sequence_group);
 
-        OPENVINO_ASSERT((!m_is_chat_conversation || !std::get_if<std::function<ov::genai::StreamerRunningStatus(std::string)>>(&streamer)),
-            "For chat mode, please, use Steamer as StreamerBase class or as callback with a bool return value.");
-
         std::shared_ptr<StreamerBase> streamer_ptr = std::visit(overloaded{
             [&m_tokenizer = m_tokenizer](
                 const std::function<bool(std::string)>& callback
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index bd3136d089..f46431d6e8 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -940,7 +940,7 @@ class LLMPipeline:
     """
     This class is used for generation with LLMs
     """
-    def __call__(self, inputs: openvino._pyopenvino.Tensor | TokenizedInputs | str | list[str], generation_config: GenerationConfig | None = None, streamer: typing.Callable[[str], StreamerRunningStatus] | typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> EncodedResults | DecodedResults:
+    def __call__(self, inputs: openvino._pyopenvino.Tensor | TokenizedInputs | str | list[str], generation_config: GenerationConfig | None = None, streamer: typing.Callable[[str], StreamerRunningStatus] | StreamerBase | None = None, **kwargs) -> EncodedResults | DecodedResults:
         """
         Generates sequences or tokens for LLMs. If input is a string or list of strings then resulting sequences will be already detokenized.
 
@@ -1025,7 +1025,7 @@ class LLMPipeline:
         """
     def finish_chat(self) -> None:
        ...
-    def generate(self, inputs: openvino._pyopenvino.Tensor | TokenizedInputs | str | list[str], generation_config: GenerationConfig | None = None, streamer: typing.Callable[[str], StreamerRunningStatus] | typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> EncodedResults | DecodedResults:
+    def generate(self, inputs: openvino._pyopenvino.Tensor | TokenizedInputs | str | list[str], generation_config: GenerationConfig | None = None, streamer: typing.Callable[[str], StreamerRunningStatus] | StreamerBase | None = None, **kwargs) -> EncodedResults | DecodedResults:
         """
         Generates sequences or tokens for LLMs. If input is a string or list of strings then resulting sequences will be already detokenized.
 
@@ -1845,7 +1845,7 @@ class VLMPipeline:
    def finish_chat(self) -> None:
        ...
    @typing.overload
-    def generate(self, prompt: str, images: list[openvino._pyopenvino.Tensor], generation_config: GenerationConfig, streamer: typing.Callable[[str], StreamerRunningStatus] | typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> VLMDecodedResults:
+    def generate(self, prompt: str, images: list[openvino._pyopenvino.Tensor], generation_config: GenerationConfig, streamer: typing.Callable[[str], StreamerRunningStatus] | StreamerBase | None = None, **kwargs) -> VLMDecodedResults:
         """
         Generates sequences for VLMs.
@@ -1858,8 +1858,8 @@ class VLMPipeline:
         :param generation_config: generation_config
         :type generation_config: GenerationConfig or a Dict
 
-        :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped
-        :type : Callable[[str], bool], Callable[[str], ov.genai.StreamerRunningStatus], ov.genai.StreamerBase
+        :param streamer: streamer either as a lambda returning a StreamerRunningStatus flag that indicates whether generation should be stopped. Please be aware that the CANCELLED status is not supported and works as STOP.
+        :type : Callable[[str], ov.genai.StreamerRunningStatus], ov.genai.StreamerBase
 
         :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields.
         :type : Dict
@@ -1868,7 +1868,7 @@ class VLMPipeline:
         :rtype: VLMDecodedResults
         """
    @typing.overload
-    def generate(self, prompt: str, images: openvino._pyopenvino.Tensor, generation_config: GenerationConfig, streamer: typing.Callable[[str], StreamerRunningStatus] | typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> VLMDecodedResults:
+    def generate(self, prompt: str, images: openvino._pyopenvino.Tensor, generation_config: GenerationConfig, streamer: typing.Callable[[str], StreamerRunningStatus] | StreamerBase | None = None, **kwargs) -> VLMDecodedResults:
         """
         Generates sequences for VLMs.
 
@@ -1881,8 +1881,8 @@ class VLMPipeline:
         :param generation_config: generation_config
         :type generation_config: GenerationConfig or a Dict
 
-        :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped
-        :type : Callable[[str], bool], Callable[[str], ov.genai.StreamerRunningStatus], ov.genai.StreamerBase
+        :param streamer: streamer either as a lambda returning a StreamerRunningStatus flag that indicates whether generation should be stopped. Please be aware that the CANCELLED status is not supported and works as STOP.
+        :type : Callable[[str], ov.genai.StreamerRunningStatus], ov.genai.StreamerBase
 
         :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields.
         :type : Dict
@@ -1904,7 +1904,8 @@ class VLMPipeline:
             image: ov.Tensor - input image,
             images: List[ov.Tensor] - input images,
             generation_config: GenerationConfig,
-            streamer: Callable[[str], bool], Callable[[str], ov.genai.StreamerRunningStatus], ov.genai.StreamerBase - streamer either as a lambda with a boolean returning flag whether generation should be stopped
+            streamer: Callable[[str], ov.genai.StreamerRunningStatus], ov.genai.StreamerBase - streamer either as a lambda returning a StreamerRunningStatus flag that indicates whether generation should be stopped.
+            Please be aware that the CANCELLED status is not supported and works as STOP.
 
         :return: return results in decoded form
         :rtype: VLMDecodedResults
diff --git a/src/python/py_utils.cpp b/src/python/py_utils.cpp
index 17f5b475fe..52ce9df3bb 100644
--- a/src/python/py_utils.cpp
+++ b/src/python/py_utils.cpp
@@ -336,15 +336,6 @@ ov::genai::StreamerVariant pystreamer_to_streamer(const PyBindStreamerVariant& py_streamer) {
     ov::genai::StreamerVariant streamer = std::monostate();
 
     std::visit(overloaded {
-    [&streamer](const std::function<bool(py::str)>& py_callback){
-        // Wrap python streamer with manual utf-8 decoding. Do not rely
-        // on pybind automatic decoding since it raises exceptions on incomplete strings.
-        auto callback_wrapped = [py_callback](std::string subword) -> bool {
-            auto py_str = PyUnicode_DecodeUTF8(subword.data(), subword.length(), "replace");
-            return py_callback(py::reinterpret_borrow<py::str>(py_str));
-        };
-        streamer = callback_wrapped;
-    },
     [&streamer](const std::function<ov::genai::StreamerRunningStatus(py::str)>& py_callback){
         // Wrap python streamer with manual utf-8 decoding. Do not rely
         // on pybind automatic decoding since it raises exceptions on incomplete strings.
diff --git a/src/python/py_utils.hpp b/src/python/py_utils.hpp
index ec9997b14d..f085a5c922 100644
--- a/src/python/py_utils.hpp
+++ b/src/python/py_utils.hpp
@@ -18,7 +18,7 @@ namespace ov::genai::pybind::utils {
 
 // When StreamerVariant is used utf-8 decoding is done by pybind and can lead to exception on incomplete texts.
 // Therefore strings decoding should be handled with PyUnicode_DecodeUTF8(..., "replace") to not throw errors.
-using PyBindStreamerVariant = std::variant<std::function<bool(py::str)>, std::function<ov::genai::StreamerRunningStatus(py::str)>, std::shared_ptr<StreamerBase>, std::monostate>;
+using PyBindStreamerVariant = std::variant<std::function<ov::genai::StreamerRunningStatus(py::str)>, std::shared_ptr<StreamerBase>, std::monostate>;
 
 template <typename... Ts>
 struct overloaded : Ts... {
diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp
index a45a99a1d9..9ebf3f2df5 100644
--- a/src/python/py_vlm_pipeline.cpp
+++ b/src/python/py_vlm_pipeline.cpp
@@ -31,8 +31,8 @@ auto vlm_generate_docstring = R"(
     :param generation_config: generation_config
     :type generation_config: GenerationConfig or a Dict
 
-    :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped
-    :type : Callable[[str], bool], Callable[[str], ov.genai.StreamerRunningStatus], ov.genai.StreamerBase
+    :param streamer: streamer either as a lambda returning a StreamerRunningStatus flag that indicates whether generation should be stopped. Please be aware that the CANCELLED status is not supported and works as STOP.
+    :type : Callable[[str], ov.genai.StreamerRunningStatus], ov.genai.StreamerBase
 
     :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields.
     :type : Dict
@@ -53,7 +53,8 @@ auto vlm_generate_kwargs_docstring = R"(
     image: ov.Tensor - input image,
     images: List[ov.Tensor] - input images,
     generation_config: GenerationConfig,
-    streamer: Callable[[str], bool], Callable[[str], ov.genai.StreamerRunningStatus], ov.genai.StreamerBase - streamer either as a lambda with a boolean returning flag whether generation should be stopped
+    streamer: Callable[[str], ov.genai.StreamerRunningStatus], ov.genai.StreamerBase - streamer either as a lambda returning a StreamerRunningStatus flag that indicates whether generation should be stopped.
+    Please be aware that the CANCELLED status is not supported and works as STOP.
 
     :return: return results in decoded form
     :rtype: VLMDecodedResults
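
For reference, a minimal sketch of how a streamer callback looks under the streamer API as updated above: the callback receives each decoded subword and returns an openvino_genai.StreamerRunningStatus value instead of a bool. The pattern follows the updated Python samples in this patch; the model directory, device, prompt, and max_new_tokens below are illustrative placeholders, not part of the change.

    import openvino_genai

    def streamer(subword: str) -> openvino_genai.StreamerRunningStatus:
        print(subword, end='', flush=True)
        # RUNNING continues generation; STOP stops it. Per the docstrings above,
        # the VLM pipeline treats CANCELLED the same as STOP.
        return openvino_genai.StreamerRunningStatus.RUNNING

    # Placeholder model directory and device.
    pipe = openvino_genai.LLMPipeline('TinyLlama-1.1B-Chat-v1.0', 'CPU')
    pipe.generate('Why is the Sun yellow?', max_new_tokens=100, streamer=streamer)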