diff --git a/samples/python/chat_sample/README.md b/samples/python/chat_sample/README.md index 66fe4b0d93..dc2c39b3a5 100644 --- a/samples/python/chat_sample/README.md +++ b/samples/python/chat_sample/README.md @@ -41,4 +41,4 @@ If you encounter an exception indicating a missing "chat template" when launchin The following template can be used as a default, but it may not work properly with every model: ``` "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}", -``` \ No newline at end of file +``` diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index d05d928df6..c4ff0a90ab 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -161,8 +161,10 @@ StaticLLMPipeline::StaticLLMPipeline( */ ov::Core core; // (1) Read the template model - this will be kvcache model - auto kvcache_model = core.read_model(path / "openvino_model.xml"); - // (2) TODO: Expose KV-cache input and output layers from kvcache model + m_kvcache_model = core.read_model(path / "openvino_model.xml"); + // (2) Expose KV-cache input and output layers from kvcache model + ov::pass::StatefulToStateless().run_on_model(m_kvcache_model); + align_u4_zp_constants(m_kvcache_model); // (3) Clone the model - this will be prefill m_prefill_model = m_kvcache_model->clone(); m_prefill_model->set_friendly_name(m_kvcache_model->get_friendly_name() + "_prefill"); @@ -179,7 +181,7 @@ StaticLLMPipeline::StaticLLMPipeline( m_prefill_model, device, extract_config_or_default(config, "PREFILL_CONFIG") ).create_infer_request(); m_kvcache_request = core.compile_model( - kvcache_model, device, extract_config_or_default(config, "GENERATE_CONFIG") + m_kvcache_model, device, extract_config_or_default(config, "GENERATE_CONFIG") ).create_infer_request(); // (7) Initialize tensors prepare_for_new_conversation();