From a4c97bc7f0ca66678c58fbece68f8bf36eedce23 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Thu, 7 Nov 2024 11:12:45 +0000 Subject: [PATCH 01/20] SDPA reproducer --- src/cpp/src/llm_pipeline.cpp | 119 +++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 26221fd5c7..b0f50a255f 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -21,6 +21,119 @@ #include "sampler.hpp" #include "lm_encoding.hpp" + +#include "openvino/pass/matcher_pass.hpp" +#include "openvino/pass/manager.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/opsets/opset13.hpp" + +class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("ScaledDotProductAttentionDecomposition", "0"); + ScaledDotProductAttentionDecomposition() { + auto pattern_node = ov::pass::pattern::wrap_type(); + + ov::matcher_pass_callback callback = [&](ov::pass::pattern::Matcher& m) { + auto& pattern_to_output = m.get_pattern_value_map(); + auto node = ov::as_type_ptr( + pattern_to_output.at(pattern_node).get_node_shared_ptr()); + + if (node == nullptr || transformation_callback(node)) { + return false; + } + + auto new_output_node = decompose(node); + ov::replace_node(node, new_output_node); + return true; + }; + + auto m = std::make_shared(pattern_node, "ScaledDotProductAttentionDecomposition"); + register_matcher(m, callback); + } + std::shared_ptr decompose(std::shared_ptr node) { + using namespace ov::op; + using namespace ov; + auto query = node->input_value(0); + auto key = node->input_value(1); + auto value = node->input_value(2); + auto q_shape = register_new_node(query, element::i32); + auto k_shape = register_new_node(key, element::i32); + auto minus_one = register_new_node(v0::Constant::create(element::i32, Shape{}, {-1})); + auto minus_two = register_new_node(v0::Constant::create(element::i32, Shape{}, {-2})); + auto zero_i = register_new_node(v0::Constant::create(element::i32, Shape{}, {0})); + auto one_i = register_new_node(v0::Constant::create(element::i32, Shape{}, {1})); + auto one_f = register_new_node(one_i, query); + auto zero_f = register_new_node(zero_i, query); + + Output scale; + if (node->get_input_size() < 5) { + scale = register_new_node(q_shape, minus_one, zero_i)->output(0); + scale = register_new_node(scale, query); + auto sqrt_scale = register_new_node(scale); + scale = register_new_node(one_f, sqrt_scale); + } else { + scale = node->input_value(4); + } + + auto q_scaled = register_new_node(query, scale); + auto k_rank = register_new_node(k_shape, element::i32)->output(0); + auto k_last_dim = register_new_node(k_rank, minus_one); + auto k_next_dim = register_new_node(k_rank, minus_two)->output(0); + k_rank = register_new_node(k_rank, zero_i); + auto minus_inf = + register_new_node(v0::Constant::create(element::f32, Shape{}, {-std::numeric_limits::infinity()})) + ->output(0); + auto keep_dim_last = register_new_node(k_next_dim, zero_i); + auto k_dims_before_transpose = register_new_node(zero_i, keep_dim_last, one_i, element::i32); + + auto transpose_dims = + register_new_node(OutputVector{k_dims_before_transpose, k_last_dim, k_next_dim}, 0); + auto k_transposed = register_new_node(key, transpose_dims); + auto scaled_atten = register_new_node(q_scaled, k_transposed)->output(0); + minus_inf = register_new_node(minus_inf, scaled_atten); + + if (node->get_causal() || node->get_input_size() > 3) { + Output mask; + Output atten_mask; 
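+        // NB: Two mask flavours are handled below: an explicit mask from the 4th
+        // input (a boolean mask selects which positions may attend, a float mask is
+        // added to the attention scores), or, when the causal flag is set, a mask
+        // built inline by comparing a horizontal index range against a vertical one
+        // and selecting -inf strictly above the diagonal. For
+        // target_len == source_len == 4 the resulting additive mask would be:
+        //   [[0, -inf, -inf, -inf],
+        //    [0,    0, -inf, -inf],
+        //    [0,    0,    0, -inf],
+        //    [0,    0,    0,    0]]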
+ if (!node->get_causal()) { + mask = node->input_value(3); + + // two types of masks are supported. A boolean mask where a value of True indicates that the element should + // take part in attention. A float mask of the same type as query, key, value that is added to the attention + // score. + if (mask.get_element_type() == element::boolean) { + atten_mask = register_new_node(mask, scaled_atten); + auto inv_mask = register_new_node(mask); + atten_mask = register_new_node(inv_mask, atten_mask, minus_inf); + } else { + atten_mask = mask; + } + } else { + auto target_s_len = register_new_node(q_shape, minus_two, zero_i); + auto source_s_len = register_new_node(k_shape, minus_two, zero_i); + auto ssl = register_new_node(source_s_len, zero_i); + auto tsl = register_new_node(target_s_len, zero_i); + auto mask_shape = register_new_node(OutputVector{tsl, ssl}, 0); + mask = register_new_node(minus_inf, mask_shape); + auto horizontal_range = register_new_node(zero_i, source_s_len, one_i, element::i32)->output(0); + horizontal_range = register_new_node(horizontal_range, zero_i); + auto stop = register_new_node(target_s_len, one_i); + auto vertical_range = register_new_node(one_i, stop, one_i, element::i32)->output(0); + vertical_range = register_new_node(vertical_range, one_i); + auto triu = register_new_node(horizontal_range, vertical_range); + atten_mask = register_new_node(triu, mask, zero_f); + } + scaled_atten = register_new_node(scaled_atten, atten_mask); + } + + scaled_atten = register_new_node(scaled_atten, -1); + auto result = register_new_node(scaled_atten, value); + result->set_friendly_name(node->get_friendly_name()); + copy_runtime_info(node, get_new_nodes()); + return result; + } +}; + namespace ov { namespace genai { @@ -74,6 +187,12 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(plugin_config); core.set_property(core_plugin_config); auto model = core.read_model(models_path / "openvino_model.xml"); + + std::cout << "[LOG_DEBUG] Apply SDPA..." << std::endl; + ov::pass::Manager manager; + manager.register_pass(); + manager.run_passes(model); + utils::slice_matmul_statefull_model(model); m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request(); } From 6e01dbbde5a25897c7c42059b962e544a959df7c Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Thu, 7 Nov 2024 11:22:41 +0000 Subject: [PATCH 02/20] Snapshot --- src/cpp/src/llm_pipeline.cpp | 8 +- src/cpp/src/llm_pipeline_static.cpp | 153 ++++++++++++++++++++++++++-- 2 files changed, 151 insertions(+), 10 deletions(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index b0f50a255f..edaca9b147 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -188,10 +188,10 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { core.set_property(core_plugin_config); auto model = core.read_model(models_path / "openvino_model.xml"); - std::cout << "[LOG_DEBUG] Apply SDPA..." << std::endl; - ov::pass::Manager manager; - manager.register_pass(); - manager.run_passes(model); + //std::cout << "[LOG_DEBUG] Apply SDPA..." 
<< std::endl; + //ov::pass::Manager manager; + //manager.register_pass(); + //manager.run_passes(model); utils::slice_matmul_statefull_model(model); m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request(); diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 7174321ff5..5da31a2f20 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -6,6 +6,11 @@ #include #include "openvino/pass/stateful_to_stateless.hpp" + +#include "openvino/pass/matcher_pass.hpp" +#include "openvino/pass/manager.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" + #include "openvino/runtime/core.hpp" #include "openvino/opsets/opset13.hpp" #include "openvino/core/preprocess/pre_post_process.hpp" @@ -18,6 +23,113 @@ #include "json_utils.hpp" #include "utils.hpp" +class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("ScaledDotProductAttentionDecomposition", "0"); + ScaledDotProductAttentionDecomposition() { + auto pattern_node = ov::pass::pattern::wrap_type(); + + ov::matcher_pass_callback callback = [&](ov::pass::pattern::Matcher& m) { + auto& pattern_to_output = m.get_pattern_value_map(); + auto node = ov::as_type_ptr( + pattern_to_output.at(pattern_node).get_node_shared_ptr()); + + if (node == nullptr || transformation_callback(node)) { + return false; + } + + auto new_output_node = decompose(node); + ov::replace_node(node, new_output_node); + return true; + }; + + auto m = std::make_shared(pattern_node, "ScaledDotProductAttentionDecomposition"); + register_matcher(m, callback); + } + std::shared_ptr decompose(std::shared_ptr node) { + using namespace ov::op; + using namespace ov; + auto query = node->input_value(0); + auto key = node->input_value(1); + auto value = node->input_value(2); + auto q_shape = register_new_node(query, element::i32); + auto k_shape = register_new_node(key, element::i32); + auto minus_one = register_new_node(v0::Constant::create(element::i32, Shape{}, {-1})); + auto minus_two = register_new_node(v0::Constant::create(element::i32, Shape{}, {-2})); + auto zero_i = register_new_node(v0::Constant::create(element::i32, Shape{}, {0})); + auto one_i = register_new_node(v0::Constant::create(element::i32, Shape{}, {1})); + auto one_f = register_new_node(one_i, query); + auto zero_f = register_new_node(zero_i, query); + + Output scale; + if (node->get_input_size() < 5) { + scale = register_new_node(q_shape, minus_one, zero_i)->output(0); + scale = register_new_node(scale, query); + auto sqrt_scale = register_new_node(scale); + scale = register_new_node(one_f, sqrt_scale); + } else { + scale = node->input_value(4); + } + + auto q_scaled = register_new_node(query, scale); + auto k_rank = register_new_node(k_shape, element::i32)->output(0); + auto k_last_dim = register_new_node(k_rank, minus_one); + auto k_next_dim = register_new_node(k_rank, minus_two)->output(0); + k_rank = register_new_node(k_rank, zero_i); + auto minus_inf = + register_new_node(v0::Constant::create(element::f32, Shape{}, {-std::numeric_limits::infinity()})) + ->output(0); + auto keep_dim_last = register_new_node(k_next_dim, zero_i); + auto k_dims_before_transpose = register_new_node(zero_i, keep_dim_last, one_i, element::i32); + + auto transpose_dims = + register_new_node(OutputVector{k_dims_before_transpose, k_last_dim, k_next_dim}, 0); + auto k_transposed = register_new_node(key, transpose_dims); + auto scaled_atten = register_new_node(q_scaled, 
k_transposed)->output(0); + minus_inf = register_new_node(minus_inf, scaled_atten); + + if (node->get_causal() || node->get_input_size() > 3) { + Output mask; + Output atten_mask; + if (!node->get_causal()) { + mask = node->input_value(3); + + // two types of masks are supported. A boolean mask where a value of True indicates that the element should + // take part in attention. A float mask of the same type as query, key, value that is added to the attention + // score. + if (mask.get_element_type() == element::boolean) { + atten_mask = register_new_node(mask, scaled_atten); + auto inv_mask = register_new_node(mask); + atten_mask = register_new_node(inv_mask, atten_mask, minus_inf); + } else { + atten_mask = mask; + } + } else { + auto target_s_len = register_new_node(q_shape, minus_two, zero_i); + auto source_s_len = register_new_node(k_shape, minus_two, zero_i); + auto ssl = register_new_node(source_s_len, zero_i); + auto tsl = register_new_node(target_s_len, zero_i); + auto mask_shape = register_new_node(OutputVector{tsl, ssl}, 0); + mask = register_new_node(minus_inf, mask_shape); + auto horizontal_range = register_new_node(zero_i, source_s_len, one_i, element::i32)->output(0); + horizontal_range = register_new_node(horizontal_range, zero_i); + auto stop = register_new_node(target_s_len, one_i); + auto vertical_range = register_new_node(one_i, stop, one_i, element::i32)->output(0); + vertical_range = register_new_node(vertical_range, one_i); + auto triu = register_new_node(horizontal_range, vertical_range); + atten_mask = register_new_node(triu, mask, zero_f); + } + scaled_atten = register_new_node(scaled_atten, atten_mask); + } + + scaled_atten = register_new_node(scaled_atten, -1); + auto result = register_new_node(scaled_atten, value); + result->set_friendly_name(node->get_friendly_name()); + copy_runtime_info(node, get_new_nodes()); + return result; + } +}; + namespace { uint32_t align_to(uint32_t value, uint32_t alignment) { @@ -255,7 +367,7 @@ std::optional extract_npu_descriptor(ov::Core& core) { ov::AnyMap get_baseline_common_config() { ov::AnyMap config = { { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm" }, - { "NPUW_DEVICES", "NPU" }, + { "NPUW_DEVICES", "NPU,CPU" }, { "NPU_USE_NPUW", "YES" }, { "NPUW_FOLD", "YES" }, { "NPUW_DCOFF_TYPE", "f16" }, @@ -298,9 +410,9 @@ ov::AnyMap get_default_generate_config(const std::shared_ptr& model, const std::optional& npudesc, const GenerateHint hint) { auto config = get_default_common_config(model); - if (hint == GenerateHint::BEST_PERF) { - config.emplace("NPUW_ONLINE_PIPELINE", "NONE"); - } + //if (hint == GenerateHint::BEST_PERF) { + config.emplace("NPUW_ONLINE_PIPELINE", "NONE"); + //} // NB: Unconditionally set for generation model config.emplace("NPUW_DQ", "YES"); if (npudesc.has_value() && npudesc->arch == "4000") { @@ -353,6 +465,23 @@ void drop_cache_dir(ov::AnyMap& config) { } } +//std::shared_ptr redirect_new_kv_to_output(const std::shared_ptr& model) { + //const auto kStartOutputKVCacheLayers = 1u; + //for (int i = kStartOutputKVCacheLayers; i < model->outputs().size(); ++i) { + //auto kvout = model->output(i); + //auto kvrslt = kvout.get_node(); + //auto kvcat = kvrslt->inputs()[0].get_source_output().get_node(); + //auto kvval = kvcat->inputs()[1].get_source_output(); + //kvval.set_names({kvout.get_any_name()}); + //kvrslt->inputs()[0].replace_source_output(kvval); + //} + //model->validate_nodes_and_infer_types(); + //return model; +//} + +//std::shared_ptr 
redirect_new_kv_to_output(const std::shared_ptr& model) { +//} + } // anonymous namespace namespace ov { @@ -416,6 +545,11 @@ void StaticLLMPipeline::setupAndCompileModels( // (1) Read the template model - this will be kvcache model m_kvcache_model = core.read_model((models_path / "openvino_model.xml").string()); + + ov::pass::Manager manager; + manager.register_pass(); + manager.run_passes(m_kvcache_model); + // (2) Expose KV-cache input and output layers from kvcache model ov::pass::StatefulToStateless().run_on_model(m_kvcache_model); // (3) Align u4 ZP constants @@ -449,11 +583,18 @@ void StaticLLMPipeline::setupAndCompileModels( drop_cache_dir(prefill_config); drop_cache_dir(generate_config); + //m_kvcache_request = core.compile_model( + //m_kvcache_model, device, generate_config + //).create_infer_request(); + //m_prefill_request = core.compile_model( + //m_prefill_model, device, prefill_config + //).create_infer_request(); + m_kvcache_request = core.compile_model( - m_kvcache_model, device, generate_config + m_kvcache_model, "CPU" ).create_infer_request(); m_prefill_request = core.compile_model( - m_prefill_model, device, prefill_config + m_prefill_model, "CPU" ).create_infer_request(); } From 0448e22898c6eb2e27ca2f4a0e8d0dd0238e30a2 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Thu, 7 Nov 2024 16:05:14 +0000 Subject: [PATCH 03/20] Snapshot --- samples/cpp/chat_sample/chat_sample.cpp | 2 +- src/cpp/src/llm_pipeline.cpp | 119 ---------------- src/cpp/src/llm_pipeline_static.cpp | 172 +++++++++++++++++------- 3 files changed, 128 insertions(+), 165 deletions(-) diff --git a/samples/cpp/chat_sample/chat_sample.cpp b/samples/cpp/chat_sample/chat_sample.cpp index 41d63fc0f1..70ecb93821 100644 --- a/samples/cpp/chat_sample/chat_sample.cpp +++ b/samples/cpp/chat_sample/chat_sample.cpp @@ -10,7 +10,7 @@ int main(int argc, char* argv[]) try { std::string prompt; std::string models_path = argv[1]; - std::string device = "CPU"; // GPU, NPU can be used as well + std::string device = "NPU"; // GPU, NPU can be used as well ov::genai::LLMPipeline pipe(models_path, device); ov::genai::GenerationConfig config; diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index edaca9b147..26221fd5c7 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -21,119 +21,6 @@ #include "sampler.hpp" #include "lm_encoding.hpp" - -#include "openvino/pass/matcher_pass.hpp" -#include "openvino/pass/manager.hpp" -#include "openvino/pass/pattern/op/wrap_type.hpp" -#include "openvino/opsets/opset13.hpp" - -class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("ScaledDotProductAttentionDecomposition", "0"); - ScaledDotProductAttentionDecomposition() { - auto pattern_node = ov::pass::pattern::wrap_type(); - - ov::matcher_pass_callback callback = [&](ov::pass::pattern::Matcher& m) { - auto& pattern_to_output = m.get_pattern_value_map(); - auto node = ov::as_type_ptr( - pattern_to_output.at(pattern_node).get_node_shared_ptr()); - - if (node == nullptr || transformation_callback(node)) { - return false; - } - - auto new_output_node = decompose(node); - ov::replace_node(node, new_output_node); - return true; - }; - - auto m = std::make_shared(pattern_node, "ScaledDotProductAttentionDecomposition"); - register_matcher(m, callback); - } - std::shared_ptr decompose(std::shared_ptr node) { - using namespace ov::op; - using namespace ov; - auto query = node->input_value(0); - auto key = node->input_value(1); - auto value = 
node->input_value(2); - auto q_shape = register_new_node(query, element::i32); - auto k_shape = register_new_node(key, element::i32); - auto minus_one = register_new_node(v0::Constant::create(element::i32, Shape{}, {-1})); - auto minus_two = register_new_node(v0::Constant::create(element::i32, Shape{}, {-2})); - auto zero_i = register_new_node(v0::Constant::create(element::i32, Shape{}, {0})); - auto one_i = register_new_node(v0::Constant::create(element::i32, Shape{}, {1})); - auto one_f = register_new_node(one_i, query); - auto zero_f = register_new_node(zero_i, query); - - Output scale; - if (node->get_input_size() < 5) { - scale = register_new_node(q_shape, minus_one, zero_i)->output(0); - scale = register_new_node(scale, query); - auto sqrt_scale = register_new_node(scale); - scale = register_new_node(one_f, sqrt_scale); - } else { - scale = node->input_value(4); - } - - auto q_scaled = register_new_node(query, scale); - auto k_rank = register_new_node(k_shape, element::i32)->output(0); - auto k_last_dim = register_new_node(k_rank, minus_one); - auto k_next_dim = register_new_node(k_rank, minus_two)->output(0); - k_rank = register_new_node(k_rank, zero_i); - auto minus_inf = - register_new_node(v0::Constant::create(element::f32, Shape{}, {-std::numeric_limits::infinity()})) - ->output(0); - auto keep_dim_last = register_new_node(k_next_dim, zero_i); - auto k_dims_before_transpose = register_new_node(zero_i, keep_dim_last, one_i, element::i32); - - auto transpose_dims = - register_new_node(OutputVector{k_dims_before_transpose, k_last_dim, k_next_dim}, 0); - auto k_transposed = register_new_node(key, transpose_dims); - auto scaled_atten = register_new_node(q_scaled, k_transposed)->output(0); - minus_inf = register_new_node(minus_inf, scaled_atten); - - if (node->get_causal() || node->get_input_size() > 3) { - Output mask; - Output atten_mask; - if (!node->get_causal()) { - mask = node->input_value(3); - - // two types of masks are supported. A boolean mask where a value of True indicates that the element should - // take part in attention. A float mask of the same type as query, key, value that is added to the attention - // score. 
- if (mask.get_element_type() == element::boolean) { - atten_mask = register_new_node(mask, scaled_atten); - auto inv_mask = register_new_node(mask); - atten_mask = register_new_node(inv_mask, atten_mask, minus_inf); - } else { - atten_mask = mask; - } - } else { - auto target_s_len = register_new_node(q_shape, minus_two, zero_i); - auto source_s_len = register_new_node(k_shape, minus_two, zero_i); - auto ssl = register_new_node(source_s_len, zero_i); - auto tsl = register_new_node(target_s_len, zero_i); - auto mask_shape = register_new_node(OutputVector{tsl, ssl}, 0); - mask = register_new_node(minus_inf, mask_shape); - auto horizontal_range = register_new_node(zero_i, source_s_len, one_i, element::i32)->output(0); - horizontal_range = register_new_node(horizontal_range, zero_i); - auto stop = register_new_node(target_s_len, one_i); - auto vertical_range = register_new_node(one_i, stop, one_i, element::i32)->output(0); - vertical_range = register_new_node(vertical_range, one_i); - auto triu = register_new_node(horizontal_range, vertical_range); - atten_mask = register_new_node(triu, mask, zero_f); - } - scaled_atten = register_new_node(scaled_atten, atten_mask); - } - - scaled_atten = register_new_node(scaled_atten, -1); - auto result = register_new_node(scaled_atten, value); - result->set_friendly_name(node->get_friendly_name()); - copy_runtime_info(node, get_new_nodes()); - return result; - } -}; - namespace ov { namespace genai { @@ -187,12 +74,6 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(plugin_config); core.set_property(core_plugin_config); auto model = core.read_model(models_path / "openvino_model.xml"); - - //std::cout << "[LOG_DEBUG] Apply SDPA..." 
<< std::endl; - //ov::pass::Manager manager; - //manager.register_pass(); - //manager.run_passes(model); - utils::slice_matmul_statefull_model(model); m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request(); } diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 5da31a2f20..03ec5d426d 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -7,6 +7,7 @@ #include "openvino/pass/stateful_to_stateless.hpp" +// NB: decompose SDPA #include "openvino/pass/matcher_pass.hpp" #include "openvino/pass/manager.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" @@ -410,9 +411,11 @@ ov::AnyMap get_default_generate_config(const std::shared_ptr& model, const std::optional& npudesc, const GenerateHint hint) { auto config = get_default_common_config(model); - //if (hint == GenerateHint::BEST_PERF) { - config.emplace("NPUW_ONLINE_PIPELINE", "NONE"); - //} + if (hint == GenerateHint::BEST_PERF) { + config.emplace("NPUW_ONLINE_PIPELINE", "NONE"); + } + std::cout << "[LOG_DEBUG] SUMP SUBS" << std::endl; + config.emplace("NPUW_DUMP_SUBS", "YES"); // NB: Unconditionally set for generation model config.emplace("NPUW_DQ", "YES"); if (npudesc.has_value() && npudesc->arch == "4000") { @@ -465,22 +468,98 @@ void drop_cache_dir(ov::AnyMap& config) { } } -//std::shared_ptr redirect_new_kv_to_output(const std::shared_ptr& model) { - //const auto kStartOutputKVCacheLayers = 1u; - //for (int i = kStartOutputKVCacheLayers; i < model->outputs().size(); ++i) { - //auto kvout = model->output(i); - //auto kvrslt = kvout.get_node(); - //auto kvcat = kvrslt->inputs()[0].get_source_output().get_node(); - //auto kvval = kvcat->inputs()[1].get_source_output(); - //kvval.set_names({kvout.get_any_name()}); - //kvrslt->inputs()[0].replace_source_output(kvval); - //} - //model->validate_nodes_and_infer_types(); - //return model; -//} - -//std::shared_ptr redirect_new_kv_to_output(const std::shared_ptr& model) { -//} +std::shared_ptr transpose_value_tensors(const std::shared_ptr& model) { + const auto original_parameters = model->get_parameters(); + + std::vector> new_params; + + for (auto node : model->get_ops()) { + if (node->get_friendly_name().find("Concat") != std::string::npos) { + auto param = node->input(0).get_source_output().get_node(); + auto transpose = node->input(1).get_source_output().get_node(); + + std::string in0_name = param->get_type_name(); + std::string in1_name = transpose->get_type_name(); + + if (in0_name.find("Parameter") != std::string::npos && + in1_name.find("Transpose") != std::string::npos) { + // Create new param + auto shape = param->get_shape(); + std::swap(shape[2], shape[3]); + + auto new_param = std::make_shared(param->get_element_type(), shape); + new_param->set_friendly_name(param->get_friendly_name()); + + new_params.push_back(new_param); + for (auto input_port : param->output(0).get_target_inputs()) { + input_port.replace_source_output(new_param); + } + + auto order_cst = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{4}, {0, 2, 3, 1}); + auto new_transpose = std::make_shared(transpose->input_value(0), + order_cst->output(0)); + new_transpose->set_friendly_name(transpose->get_friendly_name()); + node->input(1).replace_source_output(new_transpose->output(0)); + + auto new_concat = std::make_shared( + ov::OutputVector{new_param->output(0), new_transpose->output(0)}, 3 + ); + new_concat->set_friendly_name(node->get_friendly_name()); + for (auto input_port : 
node->output(0).get_target_inputs()) { + input_port.replace_source_output(new_concat); + } + } + } + + if (ov::is_type(node)) { + auto softmax = node->input(0).get_source_output().get_node(); + auto concat = node->input(1).get_source_output().get_node(); + if (std::string{softmax->get_type_name()}.find("Softmax") != std::string::npos && + std::string{concat->get_type_name()}.find("Concat") != std::string::npos) { + auto matmul = std::static_pointer_cast(node); + matmul->set_transpose_b(true); + } + } + + } + model->add_parameters(new_params); + return model; +} + +namespace opp = ov::pass::pattern; +class TransposeVMatMul : public ov::pass::MatcherPass { +public: + TransposeVMatMul() { + std::cout << "create pattern" << std::endl; + auto param = opp::wrap_type(); + auto transpose = opp::wrap_type({opp::any_input(), opp::any_input()}); + auto concat = opp::wrap_type({param, transpose}); + auto softmax = opp::wrap_type({opp::any_input()}); + auto matmul = opp::wrap_type({softmax, concat}); + + auto callback = [=](ov::pass::pattern::Matcher& m) { + std::cout << "FOUND MATCH" << std::endl; + auto& node_to_output = m.get_pattern_value_map(); + + auto matched_node_param = node_to_output.at(param).get_node_shared_ptr(); + auto matched_node_concat = node_to_output.at(concat).get_node_shared_ptr(); + auto matched_node_transpose = node_to_output.at(transpose).get_node_shared_ptr(); + auto matched_node_matmul = node_to_output.at(matmul).get_node_shared_ptr(); + + auto matched_param = std::static_pointer_cast(matched_node_param); + auto matched_concat = std::static_pointer_cast(matched_node_concat); + auto matched_transpose = std::static_pointer_cast(matched_node_transpose); + auto matched_matmul = std::static_pointer_cast(matched_node_matmul); + + std::cout << "passed all checks, pattern is found" << std::endl; + throw 1; + + return true; + }; + register_matcher(std::make_shared(matmul, "TransposeVMatMul"), std::move(callback)); + } +}; + } // anonymous namespace @@ -532,8 +611,8 @@ void StaticLLMPipeline::setupAndCompileModels( 1) Read the template model - this will be kvcache model 2) Expose KV-cache input and output layers from kvcache model 3) Align u4 ZP constants - TODO: get rid of this step in future - 4) Replace KV-cache tensors for the entire cache to tensors only for new token (before concat) - 5) Clone the model - this will be prefill + 4) Clone the model - this will be prefill + 5) Replace KV-cache tensors for the entire cache to tensors only for new token (before concat) 6) Reshape both models to static shape 7) Compile both models */ @@ -542,25 +621,20 @@ void StaticLLMPipeline::setupAndCompileModels( // NB: Get information about NPU if available auto npudesc = extract_npu_descriptor(core); - // (1) Read the template model - this will be kvcache model m_kvcache_model = core.read_model((models_path / "openvino_model.xml").string()); - - ov::pass::Manager manager; - manager.register_pass(); - manager.run_passes(m_kvcache_model); - // (2) Expose KV-cache input and output layers from kvcache model ov::pass::StatefulToStateless().run_on_model(m_kvcache_model); // (3) Align u4 ZP constants align_u4_zp_constants(m_kvcache_model); - // (4) Replace KV-tensors for the entire cache to tensors only for new token - m_kvcache_model = redirect_new_kv_to_output(m_kvcache_model); - // (5) Convert kvcache tensors to fp16 precision - m_kvcache_model = cvt_kvcache_to_fp16(m_kvcache_model); - // (6) Clone the model - this will be prefill + // (4) Clone the model - this will be prefill m_prefill_model = 
m_kvcache_model->clone(); m_prefill_model->set_friendly_name(m_kvcache_model->get_friendly_name() + "_prefill"); + // (5) Replace KV-tensors for the entire cache to tensors only for new token and decompose SDPA + std::cout << "[LOG_DEBUG] Apply SDPA..." << std::endl; + ov::pass::Manager manager; + manager.register_pass(); + manager.run_passes(m_kvcache_model); // (7) Reshape both models to static shape const uint32_t kMaxPromptLen = align_to(pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u), 64u); const uint32_t kMinResponseLen = align_to(pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u), 64u); @@ -568,6 +642,20 @@ void StaticLLMPipeline::setupAndCompileModels( m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len }; reshape_to_static(m_prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes); reshape_to_static(m_kvcache_model, 1u, m_kvcache_desc.total_size, axes); + + //ov::save_model(m_kvcache_model, "kvcache-model.xml"); + + std::cout << "transpose tensors" << std::endl; + ov::pass::Manager m2; + m2.register_pass(); + m2.run_passes(m_kvcache_model); + //transpose_value_tensors(m_kvcache_model); + std::cout << "transpose tensors - done" << std::endl; + + m_kvcache_model = redirect_new_kv_to_output(m_kvcache_model); + // (6) Convert kvcache tensors to fp16 precision + m_kvcache_model = cvt_kvcache_to_fp16(m_kvcache_model); + m_prefill_model = cvt_kvcache_to_fp16(m_prefill_model); // (8) Compile both model auto prefill_config = pop_or_default( properties, "PREFILL_CONFIG", get_default_prefill_config(m_prefill_model, npudesc) @@ -583,19 +671,13 @@ void StaticLLMPipeline::setupAndCompileModels( drop_cache_dir(prefill_config); drop_cache_dir(generate_config); - //m_kvcache_request = core.compile_model( - //m_kvcache_model, device, generate_config - //).create_infer_request(); - //m_prefill_request = core.compile_model( - //m_prefill_model, device, prefill_config - //).create_infer_request(); - - m_kvcache_request = core.compile_model( - m_kvcache_model, "CPU" - ).create_infer_request(); - m_prefill_request = core.compile_model( - m_prefill_model, "CPU" - ).create_infer_request(); + ov::save_model(m_kvcache_model, "model-wo-sdpa.xml"); + m_kvcache_request = core.compile_model( + m_kvcache_model, device, generate_config + ).create_infer_request(); + m_prefill_request = core.compile_model( + m_prefill_model, device, prefill_config + ).create_infer_request(); } void StaticLLMPipeline::setupAndImportModels( From 393b91c322647112b950ed743bbd6cd6994d5477 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Fri, 8 Nov 2024 09:48:47 +0000 Subject: [PATCH 04/20] Snapshot --- samples/cpp/chat_sample/chat_sample.cpp | 4 +- src/cpp/src/llm_pipeline_static.cpp | 141 +++++++++++++----------- src/cpp/src/llm_pipeline_static.hpp | 3 +- 3 files changed, 82 insertions(+), 66 deletions(-) diff --git a/samples/cpp/chat_sample/chat_sample.cpp b/samples/cpp/chat_sample/chat_sample.cpp index 70ecb93821..5a13b97784 100644 --- a/samples/cpp/chat_sample/chat_sample.cpp +++ b/samples/cpp/chat_sample/chat_sample.cpp @@ -11,7 +11,9 @@ int main(int argc, char* argv[]) try { std::string models_path = argv[1]; std::string device = "NPU"; // GPU, NPU can be used as well - ov::genai::LLMPipeline pipe(models_path, device); + // + ov::AnyMap pipeline_config = { { "USE_OPT_LAYOUT", "YES" } }; + ov::genai::LLMPipeline pipe(models_path, device, pipeline_config); ov::genai::GenerationConfig config; config.max_new_tokens = 100; diff --git 
a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 03ec5d426d..627c403920 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -24,6 +24,33 @@ #include "json_utils.hpp" #include "utils.hpp" +template +ov::Tensor rotate_clockwise(const ov::Tensor& input) { + ov::Shape shape = input.get_shape(); + ov::Shape new_shape{shape[0], shape[1], shape[3], shape[2]}; + + ov::Tensor output(input.get_element_type(), new_shape); + + const auto* in_p = input.data(); + auto* out_p = output.data(); + + const int C = shape[1]; + const int H = shape[2]; + const int W = shape[3]; + + for (size_t c = 0; c < C; ++c) { + for (size_t i = 0; i < H; ++i) { + for (size_t j = 0; j < W; ++j) { + size_t in_idx = (c * H * W) + (i * W) + j; + size_t out_idx = (c * H * W) + (j * H) + i; + out_p[out_idx] = in_p[in_idx]; + } + } + } + + return output; +} + class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass { public: OPENVINO_RTTI("ScaledDotProductAttentionDecomposition", "0"); @@ -414,8 +441,6 @@ ov::AnyMap get_default_generate_config(const std::shared_ptr& model, if (hint == GenerateHint::BEST_PERF) { config.emplace("NPUW_ONLINE_PIPELINE", "NONE"); } - std::cout << "[LOG_DEBUG] SUMP SUBS" << std::endl; - config.emplace("NPUW_DUMP_SUBS", "YES"); // NB: Unconditionally set for generation model config.emplace("NPUW_DQ", "YES"); if (npudesc.has_value() && npudesc->arch == "4000") { @@ -469,10 +494,12 @@ void drop_cache_dir(ov::AnyMap& config) { } std::shared_ptr transpose_value_tensors(const std::shared_ptr& model) { - const auto original_parameters = model->get_parameters(); + auto original_parameters = model->get_parameters(); std::vector> new_params; + std::map param_name_to_idx; + for (auto node : model->get_ops()) { if (node->get_friendly_name().find("Concat") != std::string::npos) { auto param = node->input(0).get_source_output().get_node(); @@ -489,8 +516,10 @@ std::shared_ptr transpose_value_tensors(const std::shared_ptr(param->get_element_type(), shape); new_param->set_friendly_name(param->get_friendly_name()); + new_param->outputs().begin()->get_tensor().set_names(param->outputs().begin()->get_tensor().get_names()); new_params.push_back(new_param); + param_name_to_idx[param->get_friendly_name()] = new_params.size() - 1; for (auto input_port : param->output(0).get_target_inputs()) { input_port.replace_source_output(new_param); } @@ -522,43 +551,14 @@ std::shared_ptr transpose_value_tensors(const std::shared_ptradd_parameters(new_params); - return model; -} - -namespace opp = ov::pass::pattern; -class TransposeVMatMul : public ov::pass::MatcherPass { -public: - TransposeVMatMul() { - std::cout << "create pattern" << std::endl; - auto param = opp::wrap_type(); - auto transpose = opp::wrap_type({opp::any_input(), opp::any_input()}); - auto concat = opp::wrap_type({param, transpose}); - auto softmax = opp::wrap_type({opp::any_input()}); - auto matmul = opp::wrap_type({softmax, concat}); - - auto callback = [=](ov::pass::pattern::Matcher& m) { - std::cout << "FOUND MATCH" << std::endl; - auto& node_to_output = m.get_pattern_value_map(); - - auto matched_node_param = node_to_output.at(param).get_node_shared_ptr(); - auto matched_node_concat = node_to_output.at(concat).get_node_shared_ptr(); - auto matched_node_transpose = node_to_output.at(transpose).get_node_shared_ptr(); - auto matched_node_matmul = node_to_output.at(matmul).get_node_shared_ptr(); - - auto matched_param = std::static_pointer_cast(matched_node_param); - auto 
matched_concat = std::static_pointer_cast(matched_node_concat); - auto matched_transpose = std::static_pointer_cast(matched_node_transpose); - auto matched_matmul = std::static_pointer_cast(matched_node_matmul); - - std::cout << "passed all checks, pattern is found" << std::endl; - throw 1; - - return true; - }; - register_matcher(std::make_shared(matmul, "TransposeVMatMul"), std::move(callback)); + for (int i = 0; i < original_parameters.size(); ++i) { + if (auto it = param_name_to_idx.find(original_parameters[i]->get_friendly_name()); it != param_name_to_idx.end()) { + original_parameters[i] = new_params[it->second]; + } } -}; + + return std::make_shared(model->get_results(), ov::SinkVector{}, original_parameters); +} } // anonymous namespace @@ -630,31 +630,30 @@ void StaticLLMPipeline::setupAndCompileModels( // (4) Clone the model - this will be prefill m_prefill_model = m_kvcache_model->clone(); m_prefill_model->set_friendly_name(m_kvcache_model->get_friendly_name() + "_prefill"); - // (5) Replace KV-tensors for the entire cache to tensors only for new token and decompose SDPA - std::cout << "[LOG_DEBUG] Apply SDPA..." << std::endl; - ov::pass::Manager manager; - manager.register_pass(); - manager.run_passes(m_kvcache_model); // (7) Reshape both models to static shape const uint32_t kMaxPromptLen = align_to(pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u), 64u); const uint32_t kMinResponseLen = align_to(pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u), 64u); KVAxesPosition axes = get_kv_axes(get_model_type_from_json(models_path / "config.json")); - m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len }; + m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len, axes.seq_len }; reshape_to_static(m_prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes); reshape_to_static(m_kvcache_model, 1u, m_kvcache_desc.total_size, axes); - //ov::save_model(m_kvcache_model, "kvcache-model.xml"); - - std::cout << "transpose tensors" << std::endl; - ov::pass::Manager m2; - m2.register_pass(); - m2.run_passes(m_kvcache_model); - //transpose_value_tensors(m_kvcache_model); - std::cout << "transpose tensors - done" << std::endl; + const bool opt_layout = pop_or_default(properties, "USE_OPT_LAYOUT", "NO") == "YES"; + if (opt_layout) { + std::cout << "[LOG_DEBUG] Will enable opt layout " << std::endl; + // (5) Replace KV-tensors for the entire cache to tensors only for new token and decompose SDPA + ov::pass::Manager manager; + manager.register_pass(); + manager.run_passes(m_kvcache_model); + m_kvcache_model = transpose_value_tensors(m_kvcache_model); + m_kvcache_desc.v_seq_len = 3; + } m_kvcache_model = redirect_new_kv_to_output(m_kvcache_model); + // (6) Convert kvcache tensors to fp16 precision m_kvcache_model = cvt_kvcache_to_fp16(m_kvcache_model); + m_prefill_model = cvt_kvcache_to_fp16(m_prefill_model); // (8) Compile both model auto prefill_config = pop_or_default( @@ -672,12 +671,12 @@ void StaticLLMPipeline::setupAndCompileModels( drop_cache_dir(generate_config); ov::save_model(m_kvcache_model, "model-wo-sdpa.xml"); - m_kvcache_request = core.compile_model( - m_kvcache_model, device, generate_config - ).create_infer_request(); - m_prefill_request = core.compile_model( - m_prefill_model, device, prefill_config - ).create_infer_request(); + m_kvcache_request = core.compile_model( + m_kvcache_model, device, generate_config + ).create_infer_request(); + 
m_prefill_request = core.compile_model( + m_prefill_model, device, prefill_config + ).create_infer_request(); } void StaticLLMPipeline::setupAndImportModels( @@ -924,8 +923,18 @@ EncodedResults StaticLLMPipeline::generate( const auto& output_name = kvcache_compiled.outputs()[kStartOutputKVCacheLayers + i].get_any_name(); auto prefill_out_tensor = m_prefill_request.get_tensor(output_name); + + // FIXME: ... + auto kv_dim = m_kvcache_desc.k_seq_len; + if (kv_dim != m_kvcache_desc.v_seq_len && + output_name.find("value") != std::string::npos) { + auto rotated = rotate_clockwise(prefill_out_tensor); + prefill_out_tensor = rotated; + kv_dim = m_kvcache_desc.v_seq_len; + } + auto prefill_out_slice = make_tensor_slice( - prefill_out_tensor, m_kvcache_desc.dim, m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens, m_kvcache_desc.max_prompt_size + prefill_out_tensor, kv_dim, m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens, m_kvcache_desc.max_prompt_size ); const auto& input_name = kvcache_compiled.inputs()[kStartInputKVCacheLayers + i].get_any_name(); @@ -933,7 +942,7 @@ EncodedResults StaticLLMPipeline::generate( fill_tensor(kvcache_in_tensor, 0); auto kvcache_in_slice = make_tensor_slice( - kvcache_in_tensor, m_kvcache_desc.dim, 0u, m_kvcache_desc.num_stored_tokens + kvcache_in_tensor, kv_dim, 0u, m_kvcache_desc.num_stored_tokens ); prefill_out_slice.copy_to(kvcache_in_slice); @@ -976,12 +985,16 @@ EncodedResults StaticLLMPipeline::generate( // NB: Write KV-cache for the new token to the correct input position for the next iteration for (int i = 0; i < kvcache_compiled.outputs().size() - 1; ++i) { - const auto& input_name = kvcache_compiled.inputs()[kStartInputKVCacheLayers + i].get_any_name(); + const auto& input_name = kvcache_compiled.inputs()[kStartInputKVCacheLayers + i].get_any_name(); + const auto& output_name = kvcache_compiled.outputs()[kStartOutputKVCacheLayers + i].get_any_name(); + + const auto kv_dim = + output_name.find("value") != std::string::npos ? 
m_kvcache_desc.v_seq_len : m_kvcache_desc.k_seq_len; + auto kvcache_in_tensor = m_kvcache_request.get_tensor(input_name); auto kvcache_in_slice = make_tensor_slice( - kvcache_in_tensor, m_kvcache_desc.dim, m_kvcache_desc.num_stored_tokens - 1, m_kvcache_desc.num_stored_tokens + kvcache_in_tensor, kv_dim, m_kvcache_desc.num_stored_tokens - 1, m_kvcache_desc.num_stored_tokens ); - const auto& output_name = kvcache_compiled.outputs()[kStartOutputKVCacheLayers + i].get_any_name(); m_kvcache_request.get_tensor(output_name).copy_to(kvcache_in_slice); } } diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp index 55b75ae3b3..24213b304e 100644 --- a/src/cpp/src/llm_pipeline_static.hpp +++ b/src/cpp/src/llm_pipeline_static.hpp @@ -57,7 +57,8 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { uint32_t max_prompt_size; uint32_t total_size; uint32_t num_stored_tokens; - uint32_t dim; + uint32_t k_seq_len; + uint32_t v_seq_len; }; // FIXME: Ideally, we don't need to keep those From 9b09844147468e1175902343be5a5eea342fb223 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Fri, 8 Nov 2024 16:57:04 +0000 Subject: [PATCH 05/20] Remove ugly pass --- src/cpp/src/llm_pipeline_static.cpp | 205 ++++++++++++++-------------- 1 file changed, 105 insertions(+), 100 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 627c403920..b34da5332f 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -4,12 +4,14 @@ #include "llm_pipeline_static.hpp" #include +#include #include "openvino/pass/stateful_to_stateless.hpp" // NB: decompose SDPA #include "openvino/pass/matcher_pass.hpp" #include "openvino/pass/manager.hpp" +#include "openvino/pass/graph_rewrite.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" #include "openvino/runtime/core.hpp" @@ -24,32 +26,69 @@ #include "json_utils.hpp" #include "utils.hpp" -template -ov::Tensor rotate_clockwise(const ov::Tensor& input) { - ov::Shape shape = input.get_shape(); - ov::Shape new_shape{shape[0], shape[1], shape[3], shape[2]}; +namespace { - ov::Tensor output(input.get_element_type(), new_shape); +struct Context { + std::vector> new_params; + std::vector> old_params; + using Ref = std::reference_wrapper; +}; - const auto* in_p = input.data(); - auto* out_p = output.data(); +namespace opp = ov::pass::pattern; +class TransposeValueTensors : public ov::pass::MatcherPass { +public: + TransposeValueTensors(Context::Ref ctx) { + auto param = opp::wrap_type(); + auto transpose = opp::wrap_type({opp::any_input(), opp::any_input()}); + auto concat = opp::wrap_type({param, transpose}); + auto softmax = opp::wrap_type({opp::any_input()}); + auto matmul = opp::wrap_type({softmax, concat}); + + auto callback = [=](ov::pass::pattern::Matcher& m) { + auto& node_to_output = m.get_pattern_value_map(); + + auto matched_node_param = node_to_output.at(param).get_node_shared_ptr(); + auto matched_node_concat = node_to_output.at(concat).get_node_shared_ptr(); + auto matched_node_transpose = node_to_output.at(transpose).get_node_shared_ptr(); + auto matched_node_matmul = node_to_output.at(matmul).get_node_shared_ptr(); + + auto matched_param = std::static_pointer_cast(matched_node_param); + auto matched_concat = std::static_pointer_cast(matched_node_concat); + auto matched_transpose = std::static_pointer_cast(matched_node_transpose); + auto matched_matmul = std::static_pointer_cast(matched_node_matmul); + + auto shape = matched_param->get_partial_shape(); + 
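// NB: matched_param is the past value tensor in [batch, num_heads, seq_len, emb_size]
+            // layout; swapping its two last dims below yields
+            // [batch, num_heads, emb_size, seq_len], so cached values are concatenated
+            // with new tokens along the last axis and fed to the SoftMax x V MatMul
+            // with transpose_b = true, avoiding a runtime Transpose on the hot path. +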
std::swap(shape[2], shape[3]); + auto new_param = std::make_shared(matched_param->get_element_type(), shape); + new_param->set_friendly_name(matched_param->get_friendly_name()); + new_param->outputs().begin()->get_tensor().set_names(matched_param->outputs().begin()->get_tensor().get_names()); + //for (auto input_port : matched_param->output(0).get_target_inputs()) { + for (auto input_port : matched_node_param->output(0).get_target_inputs()) { + input_port.replace_source_output(new_param); + } + ctx.get().new_params.push_back(new_param); + ctx.get().old_params.push_back(matched_param); + new_param->validate_and_infer_types(); + + auto order_cst = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{4}, {0, 2, 3, 1}); + auto new_transpose = std::make_shared(matched_transpose->input_value(0), + order_cst->output(0)); + new_transpose->set_friendly_name(matched_transpose->get_friendly_name()); + ov::replace_node(matched_transpose, new_transpose); + + auto new_concat = std::make_shared( + ov::OutputVector{new_param->output(0), new_transpose->output(0)}, 3u + ); + new_concat->set_friendly_name(matched_concat->get_friendly_name()); + ov::replace_node(matched_concat, new_concat); - const int C = shape[1]; - const int H = shape[2]; - const int W = shape[3]; + matched_matmul->set_transpose_b(true); - for (size_t c = 0; c < C; ++c) { - for (size_t i = 0; i < H; ++i) { - for (size_t j = 0; j < W; ++j) { - size_t in_idx = (c * H * W) + (i * W) + j; - size_t out_idx = (c * H * W) + (j * H) + i; - out_p[out_idx] = in_p[in_idx]; - } - } + return true; + }; + register_matcher(std::make_shared(matmul, "TransposeValueTensors"), std::move(callback)); } - - return output; -} +}; class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass { public: @@ -57,7 +96,7 @@ class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass { ScaledDotProductAttentionDecomposition() { auto pattern_node = ov::pass::pattern::wrap_type(); - ov::matcher_pass_callback callback = [&](ov::pass::pattern::Matcher& m) { + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { auto& pattern_to_output = m.get_pattern_value_map(); auto node = ov::as_type_ptr( pattern_to_output.at(pattern_node).get_node_shared_ptr()); @@ -158,7 +197,32 @@ class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass { } }; -namespace { +template +ov::Tensor rotate_clockwise(const ov::Tensor& input) { + ov::Shape shape = input.get_shape(); + ov::Shape new_shape{shape[0], shape[1], shape[3], shape[2]}; + + ov::Tensor output(input.get_element_type(), new_shape); + + const auto* in_p = input.data(); + auto* out_p = output.data(); + + const int C = shape[1]; + const int H = shape[2]; + const int W = shape[3]; + + for (size_t c = 0; c < C; ++c) { + for (size_t i = 0; i < H; ++i) { + for (size_t j = 0; j < W; ++j) { + size_t in_idx = (c * H * W) + (i * W) + j; + size_t out_idx = (c * H * W) + (j * H) + i; + out_p[out_idx] = in_p[in_idx]; + } + } + } + + return output; +} uint32_t align_to(uint32_t value, uint32_t alignment) { return (value + alignment - 1) & ~(alignment - 1); @@ -443,6 +507,7 @@ ov::AnyMap get_default_generate_config(const std::shared_ptr& model, } // NB: Unconditionally set for generation model config.emplace("NPUW_DQ", "YES"); + config.emplace("NPUW_DUMP_SUBS", "YES"); if (npudesc.has_value() && npudesc->arch == "4000") { config.emplace("NPU_DPU_GROUPS", 4); } @@ -493,74 +558,6 @@ void drop_cache_dir(ov::AnyMap& config) { } } -std::shared_ptr transpose_value_tensors(const 
std::shared_ptr& model) { - auto original_parameters = model->get_parameters(); - - std::vector> new_params; - - std::map param_name_to_idx; - - for (auto node : model->get_ops()) { - if (node->get_friendly_name().find("Concat") != std::string::npos) { - auto param = node->input(0).get_source_output().get_node(); - auto transpose = node->input(1).get_source_output().get_node(); - - std::string in0_name = param->get_type_name(); - std::string in1_name = transpose->get_type_name(); - - if (in0_name.find("Parameter") != std::string::npos && - in1_name.find("Transpose") != std::string::npos) { - // Create new param - auto shape = param->get_shape(); - std::swap(shape[2], shape[3]); - - auto new_param = std::make_shared(param->get_element_type(), shape); - new_param->set_friendly_name(param->get_friendly_name()); - new_param->outputs().begin()->get_tensor().set_names(param->outputs().begin()->get_tensor().get_names()); - - new_params.push_back(new_param); - param_name_to_idx[param->get_friendly_name()] = new_params.size() - 1; - for (auto input_port : param->output(0).get_target_inputs()) { - input_port.replace_source_output(new_param); - } - - auto order_cst = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{4}, {0, 2, 3, 1}); - auto new_transpose = std::make_shared(transpose->input_value(0), - order_cst->output(0)); - new_transpose->set_friendly_name(transpose->get_friendly_name()); - node->input(1).replace_source_output(new_transpose->output(0)); - - auto new_concat = std::make_shared( - ov::OutputVector{new_param->output(0), new_transpose->output(0)}, 3 - ); - new_concat->set_friendly_name(node->get_friendly_name()); - for (auto input_port : node->output(0).get_target_inputs()) { - input_port.replace_source_output(new_concat); - } - } - } - - if (ov::is_type(node)) { - auto softmax = node->input(0).get_source_output().get_node(); - auto concat = node->input(1).get_source_output().get_node(); - if (std::string{softmax->get_type_name()}.find("Softmax") != std::string::npos && - std::string{concat->get_type_name()}.find("Concat") != std::string::npos) { - auto matmul = std::static_pointer_cast(node); - matmul->set_transpose_b(true); - } - } - - } - for (int i = 0; i < original_parameters.size(); ++i) { - if (auto it = param_name_to_idx.find(original_parameters[i]->get_friendly_name()); it != param_name_to_idx.end()) { - original_parameters[i] = new_params[it->second]; - } - } - - return std::make_shared(model->get_results(), ov::SinkVector{}, original_parameters); -} - - } // anonymous namespace namespace ov { @@ -638,14 +635,20 @@ void StaticLLMPipeline::setupAndCompileModels( reshape_to_static(m_prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes); reshape_to_static(m_kvcache_model, 1u, m_kvcache_desc.total_size, axes); - const bool opt_layout = pop_or_default(properties, "USE_OPT_LAYOUT", "NO") == "YES"; - if (opt_layout) { + const bool opt_layout = pop_or_default(properties, "DISABLE_OPT_LAYOUT", "NO") == "YES"; + if (opt_layout || true) { std::cout << "[LOG_DEBUG] Will enable opt layout " << std::endl; // (5) Replace KV-tensors for the entire cache to tensors only for new token and decompose SDPA - ov::pass::Manager manager; - manager.register_pass(); - manager.run_passes(m_kvcache_model); - m_kvcache_model = transpose_value_tensors(m_kvcache_model); + Context ctx; + ov::pass::GraphRewrite rewr; + rewr.add_matcher(); + rewr.add_matcher(std::ref(ctx)); + rewr.run_on_model(m_kvcache_model); + m_kvcache_model->add_parameters(ctx.new_params); + for 
(auto old_param : ctx.old_params) { + m_kvcache_model->remove_parameter(old_param); + } + ov::pass::Validate().run_on_model(m_kvcache_model); m_kvcache_desc.v_seq_len = 3; } @@ -914,7 +917,7 @@ EncodedResults StaticLLMPipeline::generate( // Inputs: input_ids, attention_mask, position_ids, ... // Outputs: logits, ... - const auto kStartInputKVCacheLayers = 3u; + //const auto kStartInputKVCacheLayers = 3u; const auto kStartOutputKVCacheLayers = 1u; // NB: Copy KV-cache tensors from prefill model to kvcache model @@ -937,7 +940,9 @@ EncodedResults StaticLLMPipeline::generate( prefill_out_tensor, kv_dim, m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens, m_kvcache_desc.max_prompt_size ); - const auto& input_name = kvcache_compiled.inputs()[kStartInputKVCacheLayers + i].get_any_name(); + //const auto& input_name = kvcache_compiled.inputs()[kStartInputKVCacheLayers + i].get_any_name(); + std::string input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values"); + auto kvcache_in_tensor = m_kvcache_request.get_tensor(input_name); fill_tensor(kvcache_in_tensor, 0); @@ -985,8 +990,8 @@ EncodedResults StaticLLMPipeline::generate( // NB: Write KV-cache for the new token to the correct input position for the next iteration for (int i = 0; i < kvcache_compiled.outputs().size() - 1; ++i) { - const auto& input_name = kvcache_compiled.inputs()[kStartInputKVCacheLayers + i].get_any_name(); const auto& output_name = kvcache_compiled.outputs()[kStartOutputKVCacheLayers + i].get_any_name(); + std::string input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values"); const auto kv_dim = output_name.find("value") != std::string::npos ? m_kvcache_desc.v_seq_len : m_kvcache_desc.k_seq_len; From 7109db4574b5a6edd13c593b1e74ade4fdfcc01c Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Mon, 11 Nov 2024 12:47:29 +0000 Subject: [PATCH 06/20] Clean up --- samples/cpp/chat_sample/chat_sample.cpp | 6 +- src/cpp/src/llm_pipeline_static.cpp | 185 +++++++++++++----------- src/cpp/src/llm_pipeline_static.hpp | 4 +- 3 files changed, 105 insertions(+), 90 deletions(-) diff --git a/samples/cpp/chat_sample/chat_sample.cpp b/samples/cpp/chat_sample/chat_sample.cpp index 5a13b97784..41d63fc0f1 100644 --- a/samples/cpp/chat_sample/chat_sample.cpp +++ b/samples/cpp/chat_sample/chat_sample.cpp @@ -10,10 +10,8 @@ int main(int argc, char* argv[]) try { std::string prompt; std::string models_path = argv[1]; - std::string device = "NPU"; // GPU, NPU can be used as well - // - ov::AnyMap pipeline_config = { { "USE_OPT_LAYOUT", "YES" } }; - ov::genai::LLMPipeline pipe(models_path, device, pipeline_config); + std::string device = "CPU"; // GPU, NPU can be used as well + ov::genai::LLMPipeline pipe(models_path, device); ov::genai::GenerationConfig config; config.max_new_tokens = 100; diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index b34da5332f..e2d0679af7 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -28,15 +28,15 @@ namespace { -struct Context { - std::vector> new_params; - std::vector> old_params; - using Ref = std::reference_wrapper; -}; - namespace opp = ov::pass::pattern; class TransposeValueTensors : public ov::pass::MatcherPass { public: + struct Context { + std::vector> new_params; + std::vector> old_params; + using Ref = std::reference_wrapper; + }; + TransposeValueTensors(Context::Ref ctx) { auto param = opp::wrap_type(); auto transpose = opp::wrap_type({opp::any_input(), 
opp::any_input()}); @@ -62,13 +62,10 @@ class TransposeValueTensors : public ov::pass::MatcherPass { auto new_param = std::make_shared(matched_param->get_element_type(), shape); new_param->set_friendly_name(matched_param->get_friendly_name()); new_param->outputs().begin()->get_tensor().set_names(matched_param->outputs().begin()->get_tensor().get_names()); - //for (auto input_port : matched_param->output(0).get_target_inputs()) { - for (auto input_port : matched_node_param->output(0).get_target_inputs()) { - input_port.replace_source_output(new_param); - } + ov::replace_node(matched_param, new_param); + // NB: Save in order to add/remove to the model later on ctx.get().new_params.push_back(new_param); ctx.get().old_params.push_back(matched_param); - new_param->validate_and_infer_types(); auto order_cst = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{4}, {0, 2, 3, 1}); auto new_transpose = std::make_shared(matched_transpose->input_value(0), @@ -197,30 +194,61 @@ class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass { } }; -template -ov::Tensor rotate_clockwise(const ov::Tensor& input) { - ov::Shape shape = input.get_shape(); - ov::Shape new_shape{shape[0], shape[1], shape[3], shape[2]}; - - ov::Tensor output(input.get_element_type(), new_shape); +bool transpose_value_tensors(std::shared_ptr model) { + ov::pass::GraphRewrite rewr; + rewr.add_matcher(); + TransposeValueTensors::Context ctx; + rewr.add_matcher(std::ref(ctx)); + rewr.run_on_model(model); - const auto* in_p = input.data(); - auto* out_p = output.data(); + model->add_parameters(ctx.new_params); + for (auto old_param : ctx.old_params) { + model->remove_parameter(old_param); + } + ov::pass::Validate().run_on_model(model); - const int C = shape[1]; - const int H = shape[2]; - const int W = shape[3]; + // NB: if new_params is not empty - pass has been applied + return !ctx.new_params.empty(); +} - for (size_t c = 0; c < C; ++c) { - for (size_t i = 0; i < H; ++i) { - for (size_t j = 0; j < W; ++j) { - size_t in_idx = (c * H * W) + (i * W) + j; - size_t out_idx = (c * H * W) + (j * H) + i; - out_p[out_idx] = in_p[in_idx]; +ov::Tensor rotate_clockwise(const ov::Tensor& input, ov::Tensor& output) { + ov::Shape in_shape = input.get_shape(); + OPENVINO_ASSERT(in_shape.size() == 4u); + OPENVINO_ASSERT(in_shape[0] == 1u); + + const auto in_strides = input.get_strides(); + const auto IC = in_shape[1]; + const auto IH = in_shape[2]; + const auto IW = in_shape[3]; + const auto IS_C = in_strides[1]; + const auto IS_H = in_strides[2]; + const auto IS_W = in_strides[3]; + + ov::Shape out_shape = output.get_shape(); + OPENVINO_ASSERT(out_shape == ov::Shape({1u, IC, IW, IH})); + + auto out_strides = output.get_strides(); + const auto OS_C = out_strides[1]; + const auto OS_H = out_strides[2]; + const auto OS_W = out_strides[3]; + + const auto* in_p = static_cast(input.data()); + auto* out_p = static_cast(output.data()); + + const auto elem_size = input.get_byte_size() / input.get_size(); + + // FIXME: Scalar implementation needs to be optimized! 
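+    // A byte-wise strided transpose of the two last dims:
+    // output[c][j][i] = input[c][i][j]. Offsets are computed from the tensors'
+    // byte strides, and the innermost loop copies each element one byte at a
+    // time (elem_size bytes), keeping the copy independent of the element type.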
+    for (size_t c = 0; c < IC; ++c) {
+        for (size_t i = 0; i < IH; ++i) {
+            for (size_t j = 0; j < IW; ++j) {
+                for (size_t b = 0; b < elem_size; ++b) {
+                    const size_t in_idx  = (c * IS_C) + (i * IS_H) + (j * IS_W) + b;
+                    const size_t out_idx = (c * OS_C) + (i * OS_W) + (j * OS_H) + b;
+                    out_p[out_idx] = in_p[in_idx];
+                }
             }
         }
     }
-
     return output;
 }

@@ -459,7 +487,7 @@ std::optional<NPUDesc> extract_npu_descriptor(ov::Core& core) {
 ov::AnyMap get_baseline_common_config() {
     ov::AnyMap config = {
         { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm" },
-        { "NPUW_DEVICES", "NPU,CPU" },
+        { "NPUW_DEVICES", "NPU" },
         { "NPU_USE_NPUW", "YES" },
         { "NPUW_FOLD", "YES" },
         { "NPUW_DCOFF_TYPE", "f16" },
@@ -502,12 +530,11 @@ ov::AnyMap get_default_generate_config(const std::shared_ptr<ov::Model>& model,
                                        const std::optional<NPUDesc>& npudesc,
                                        const GenerateHint hint) {
     auto config = get_default_common_config(model);
-    if (hint == GenerateHint::BEST_PERF) {
-        config.emplace("NPUW_ONLINE_PIPELINE", "NONE");
-    }
+    if (hint == GenerateHint::BEST_PERF) {
+        config.emplace("NPUW_ONLINE_PIPELINE", "NONE");
+    }
     // NB: Unconditionally set for generation model
     config.emplace("NPUW_DQ", "YES");
-    config.emplace("NPUW_DUMP_SUBS", "YES");
     if (npudesc.has_value() && npudesc->arch == "4000") {
         config.emplace("NPU_DPU_GROUPS", 4);
     }
@@ -609,9 +636,11 @@ void StaticLLMPipeline::setupAndCompileModels(
        2) Expose KV-cache input and output layers from kvcache model
        3) Align u4 ZP constants - TODO: get rid of this step in future
        4) Clone the model - this will be prefill
-       5) Replace KV-cache tensors for the entire cache to tensors only for new token (before concat)
-       6) Reshape both models to static shape
-       7) Compile both models
+       5) Reshape both models to static shape
+       6) Apply layout optimization if applicable
+       7) Replace KV-cache tensors for the entire cache to tensors only for new token (before concat)
+       8) Convert kv-cache tensors to f16 precision
+       9) Compile both models
     */
     ov::Core core;

@@ -627,38 +656,34 @@ void StaticLLMPipeline::setupAndCompileModels(
     // (4) Clone the model - this will be prefill
     m_prefill_model = m_kvcache_model->clone();
     m_prefill_model->set_friendly_name(m_kvcache_model->get_friendly_name() + "_prefill");
-    // (7) Reshape both models to static shape
+    // (5) Reshape both models to static shape
    const uint32_t kMaxPromptLen = align_to(pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u), 64u);
     const uint32_t kMinResponseLen = align_to(pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u), 64u);
     KVAxesPosition axes = get_kv_axes(get_model_type_from_json(models_path / "config.json"));
-    m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len, axes.seq_len };
+    m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len, false};
     reshape_to_static(m_prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes);
     reshape_to_static(m_kvcache_model, 1u, m_kvcache_desc.total_size, axes);
-
-    const bool opt_layout = pop_or_default(properties, "DISABLE_OPT_LAYOUT", "NO") == "YES";
-    if (opt_layout || true) {
-        std::cout << "[LOG_DEBUG] Will enable opt layout " << std::endl;
-        // (5) Replace KV-tensors for the entire cache to tensors only for new token and decompose SDPA
-        Context ctx;
-        ov::pass::GraphRewrite rewr;
-        rewr.add_matcher<ScaledDotProductAttentionDecomposition>();
-        rewr.add_matcher<TransposeValueTensors>(std::ref(ctx));
-        rewr.run_on_model(m_kvcache_model);
-        m_kvcache_model->add_parameters(ctx.new_params);
-        for (auto old_param : ctx.old_params) {
-            m_kvcache_model->remove_parameter(old_param);
+    // (6) Apply opt layout if applicable
+    const bool disable_opt_layout = pop_or_default(properties, "DISABLE_OPT_LAYOUT", "NO") == "YES";
+    // NB: Try to apply opt transpose by default for all models that have
+    // KV-cache tensors in format [batch, num_heads, seq_len, emb_size]
+    if (m_kvcache_desc.seq_len == 2 && !disable_opt_layout) {
+        std::cout << "[LOG_DEBUG] Try to apply opt layout" << std::endl;
+        if (transpose_value_tensors(m_kvcache_model)) {
+            // NB: Check if TransposeValueTensors transformation was applied
+            std::cout << "[LOG DEBUG] Success: opt layout has been applied" << std::endl;
+            m_kvcache_desc.v_tensors_transposed = true;
+        } else {
+            // FIXME: Otherwise fuse SDPA back?
+            std::cout << "[LOG DEBUG] Failed: opt layout has not been applied" << std::endl;
         }
-        ov::pass::Validate().run_on_model(m_kvcache_model);
-        m_kvcache_desc.v_seq_len = 3;
     }
-
+    // (7) Replace KV-cache tensors for the entire cache to tensors only for new token (before concat)
     m_kvcache_model = redirect_new_kv_to_output(m_kvcache_model);
-
-    // (6) Convert kvcache tensors to fp16 precision
+    // (8) Convert kvcache tensors to fp16 precision
     m_kvcache_model = cvt_kvcache_to_fp16(m_kvcache_model);
-
     m_prefill_model = cvt_kvcache_to_fp16(m_prefill_model);
-    // (8) Compile both model
+    // (9) Compile both models
     auto prefill_config = pop_or_default(
         properties, "PREFILL_CONFIG", get_default_prefill_config(m_prefill_model, npudesc)
     );
@@ -673,7 +698,6 @@ void StaticLLMPipeline::setupAndCompileModels(
     drop_cache_dir(prefill_config);
     drop_cache_dir(generate_config);

-    ov::save_model(m_kvcache_model, "model-wo-sdpa.xml");
     m_kvcache_request = core.compile_model(
         m_kvcache_model, device, generate_config
     ).create_infer_request();
@@ -915,42 +939,35 @@ EncodedResults StaticLLMPipeline::generate(
         return results;
     }

-
     // Inputs: input_ids, attention_mask, position_ids, ...
     // Outputs: logits, ...
-    //const auto kStartInputKVCacheLayers = 3u;
     const auto kStartOutputKVCacheLayers = 1u;
-
     // NB: Copy KV-cache tensors from prefill model to kvcache model
     const auto& kvcache_compiled = m_kvcache_request.get_compiled_model();
     for (int i = 0; i < kvcache_compiled.outputs().size() - 1; ++i) {
         const auto& output_name = kvcache_compiled.outputs()[kStartOutputKVCacheLayers + i].get_any_name();
-        auto prefill_out_tensor = m_prefill_request.get_tensor(output_name);
-
-        // FIXME: ...
-        auto kv_dim = m_kvcache_desc.k_seq_len;
-        if (kv_dim != m_kvcache_desc.v_seq_len &&
-            output_name.find("value") != std::string::npos) {
-            auto rotated = rotate_clockwise<ov::float16>(prefill_out_tensor);
-            prefill_out_tensor = rotated;
-            kv_dim = m_kvcache_desc.v_seq_len;
-        }
+        const auto input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values");
+        auto prefill_out_tensor = m_prefill_request.get_tensor(output_name);
         auto prefill_out_slice = make_tensor_slice(
-            prefill_out_tensor, kv_dim, m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens, m_kvcache_desc.max_prompt_size
+            prefill_out_tensor, m_kvcache_desc.seq_len, m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens, m_kvcache_desc.max_prompt_size
         );

         auto kvcache_in_tensor = m_kvcache_request.get_tensor(input_name);
         fill_tensor<ov::float16>(kvcache_in_tensor, 0);

-        auto kvcache_in_slice = make_tensor_slice(
-            kvcache_in_tensor, kv_dim, 0u, m_kvcache_desc.num_stored_tokens
-        );
-
-        prefill_out_slice.copy_to(kvcache_in_slice);
+        if (output_name.find("value") != std::string::npos &&
+            m_kvcache_desc.v_tensors_transposed) {
+            auto kvcache_in_slice = make_tensor_slice(
+                kvcache_in_tensor, 3u, 0u, m_kvcache_desc.num_stored_tokens
+            );
+            rotate_clockwise(prefill_out_slice, kvcache_in_slice);
+        } else {
+            auto kvcache_in_slice = make_tensor_slice(
+                kvcache_in_tensor, m_kvcache_desc.seq_len, 0u, m_kvcache_desc.num_stored_tokens
+            );
+            prefill_out_slice.copy_to(kvcache_in_slice);
+        }
     }

     auto* input_ids_data = m_kvcache_request.get_tensor("input_ids").data<int64_t>();
@@ -993,8 +1010,8 @@ EncodedResults StaticLLMPipeline::generate(
         const auto& output_name = kvcache_compiled.outputs()[kStartOutputKVCacheLayers + i].get_any_name();
         std::string input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values");

-        const auto kv_dim =
-            output_name.find("value") != std::string::npos ? m_kvcache_desc.v_seq_len : m_kvcache_desc.k_seq_len;
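+        // NB: Once V-tensors are stored transposed, their sequence axis is the
+        // innermost one (3); K-tensors (and the non-transposed case) keep the
+        // axis recorded in m_kvcache_desc.seq_len.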
+        const auto kv_dim = (output_name.find("value") != std::string::npos &&
+            m_kvcache_desc.v_tensors_transposed) ? 3u : m_kvcache_desc.seq_len;

         auto kvcache_in_tensor = m_kvcache_request.get_tensor(input_name);
         auto kvcache_in_slice = make_tensor_slice(
diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp
index 24213b304e..29ddbf8ed1 100644
--- a/src/cpp/src/llm_pipeline_static.hpp
+++ b/src/cpp/src/llm_pipeline_static.hpp
@@ -57,8 +57,8 @@ class StaticLLMPipeline final : public LLMPipelineImplBase {
         uint32_t max_prompt_size;
         uint32_t total_size;
         uint32_t num_stored_tokens;
-        uint32_t k_seq_len;
-        uint32_t v_seq_len;
+        uint32_t seq_len;
+        uint32_t v_tensors_transposed;
     };

     // FIXME: Ideally, we don't need to keep those

From 1e64ad8a89bd73ea94507e492be7d8012adc0776 Mon Sep 17 00:00:00 2001
From: Dmitry Matveev
Date: Mon, 11 Nov 2024 18:57:45 +0000
Subject: [PATCH 07/20] NPU Static pipeline: Unroll SDPA - move K transpose to MatMul

---
 src/cpp/src/llm_pipeline_static.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index e2d0679af7..937a509315 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -146,10 +146,7 @@ class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass {
         auto keep_dim_last = register_new_node<v0::Unsqueeze>(k_next_dim, zero_i);
         auto k_dims_before_transpose = register_new_node<v4::Range>(zero_i, keep_dim_last, one_i, element::i32);

-        auto transpose_dims =
-            register_new_node<v0::Concat>(OutputVector{k_dims_before_transpose, k_last_dim, k_next_dim}, 0);
-        auto k_transposed = register_new_node<v1::Transpose>(key, transpose_dims);
-        auto scaled_atten = register_new_node<v0::MatMul>(q_scaled, k_transposed)->output(0);
+        auto scaled_atten = register_new_node<v0::MatMul>(q_scaled, key, false, true)->output(0);
         minus_inf = register_new_node<v1::ConvertLike>(minus_inf, scaled_atten);

         if (node->get_causal() || node->get_input_size() > 3) {

From 975e59999a214953e965227e68aa03881c87bdd9 Mon Sep 17 00:00:00 2001
From: Anatoliy Talamanov
Date: Tue, 12 Nov 2024 08:48:59 +0000
Subject: [PATCH 08/20] Update llm_pipeline_static.hpp

---
 src/cpp/src/llm_pipeline_static.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp
index 29ddbf8ed1..2f9969f5d7 100644
--- a/src/cpp/src/llm_pipeline_static.hpp
+++ b/src/cpp/src/llm_pipeline_static.hpp
@@ -58,7 +58,7 @@ class StaticLLMPipeline final : public LLMPipelineImplBase {
         uint32_t total_size;
         uint32_t num_stored_tokens;
         uint32_t seq_len;
-        uint32_t v_tensors_transposed;
+        bool v_tensors_transposed;
     };

     // FIXME: Ideally, we don't need to keep those

From 3d82051c8324e3a0e77bbe7a88c1aa7ecd68ed96 Mon Sep 17 00:00:00 2001
From: TolyaTalamanov
Date: Tue, 12 Nov 2024 11:00:25 +0000
Subject: [PATCH 09/20] Enable opt layout only for llama-2-7b-hf model

---
 src/cpp/src/llm_pipeline_static.cpp | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index e2d0679af7..c1f0c1f0f7 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -413,12 +413,25 @@ KVAxesPosition get_kv_axes(const std::string& model_type) {
     return axes;
 }

-std::string get_model_type_from_json(const std::filesystem::path& filepath) {
+struct ModelDesc {
+    std::string type;
+    std::string name_or_path;
+    int num_key_value_heads;
+};
+
+ModelDesc get_modeldesc_from_json(const std::filesystem::path& filepath) {
     std::ifstream file(filepath);
     OPENVINO_ASSERT(file.is_open(), "Could not open file: " + filepath.string());
"Could not open file: " + filepath.string()); nlohmann::json config_data = nlohmann::json::parse(file); - std::string model_type = config_data["model_type"].get(); - return model_type; + + ModelDesc desc; + desc.type = config_data["model_type"].get(); + // NB: In case _name_or_path field isn't presented in config.json + if (config_data.contains("_name_or_path")) { + desc.name_or_path = config_data["_name_or_path"].get(); + } + desc.num_key_value_heads = config_data["num_key_value_heads"].get(); + return desc; } void reshape_to_static(std::shared_ptr model, @@ -659,15 +672,17 @@ void StaticLLMPipeline::setupAndCompileModels( // (5) Reshape both models to static shape const uint32_t kMaxPromptLen = align_to(pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u), 64u); const uint32_t kMinResponseLen = align_to(pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u), 64u); - KVAxesPosition axes = get_kv_axes(get_model_type_from_json(models_path / "config.json")); + ModelDesc model_desc = get_modeldesc_from_json(models_path / "config.json"); + KVAxesPosition axes = get_kv_axes(model_desc.type); m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len, false}; reshape_to_static(m_prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes); reshape_to_static(m_kvcache_model, 1u, m_kvcache_desc.total_size, axes); // (6) Apply opt layout if applicable const bool disable_opt_layout = pop_or_default(properties, "DISABLE_OPT_LAYOUT", "NO") == "YES"; - // NB: Try to apply opt transpose by default for all models that have - // KV-cache tensors in format [batch, num_heads, seq_len, emb_size] - if (m_kvcache_desc.seq_len == 2 && !disable_opt_layout) { + // NB: Try to apply opt transpose only for Llama-2-7b-chat-hf model + if ((model_desc.name_or_path == "meta-llama/Llama-2-7b-chat-hf" || + (model_desc.type == "llama" && model_desc.num_key_value_heads == 32)) + && !disable_opt_layout) { std::cout << "[LOG_DEBUG] Try to apply opt layout" << std::endl; if (transpose_value_tensors(m_kvcache_model)) { // NB: Check if TransposeValueTensors transformation was applied From c1d334070e3943eee82654ef2727ba3097aaaa36 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Tue, 12 Nov 2024 12:48:22 +0000 Subject: [PATCH 10/20] Use model-level tranpose instead scalar one --- src/cpp/src/llm_pipeline_static.cpp | 91 +++++++++-------------------- 1 file changed, 26 insertions(+), 65 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index c8d93f1403..54f34a1175 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -191,7 +191,19 @@ class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass { } }; -bool transpose_value_tensors(std::shared_ptr model) { +std::shared_ptr cvt_value_tensors_layout(std::shared_ptr model) { + ov::preprocess::PrePostProcessor ppp(model); + for (auto tensor : model->outputs()) { + if (tensor.get_any_name().find("value") != std::string::npos) { + // NB: [batch, num_heads, seq_len, emb_size] -> [batch, num_heads, emb_size, seq_len] + ppp.output(tensor.get_any_name()).model().set_layout(ov::Layout("BHSE")); + ppp.output(tensor.get_any_name()).tensor().set_layout(ov::Layout("BHES")); + } + } + return ppp.build(); +} + +bool optimize_value_tensors(std::shared_ptr model) { ov::pass::GraphRewrite rewr; rewr.add_matcher(); TransposeValueTensors::Context ctx; @@ -208,47 +220,6 @@ bool transpose_value_tensors(std::shared_ptr 
+bool optimize_value_tensors(std::shared_ptr<ov::Model> model) {
     ov::pass::GraphRewrite rewr;
     rewr.add_matcher<ScaledDotProductAttentionDecomposition>();
     TransposeValueTensors::Context ctx;
     rewr.add_matcher<TransposeValueTensors>(std::ref(ctx));
     rewr.run_on_model(model);
@@ -208,47 +220,6 @@ bool transpose_value_tensors(std::shared_ptr<ov::Model> model) {
     return !ctx.new_params.empty();
 }

-ov::Tensor rotate_clockwise(const ov::Tensor& input, ov::Tensor& output) {
-    ov::Shape in_shape = input.get_shape();
-    OPENVINO_ASSERT(in_shape.size() == 4u);
-    OPENVINO_ASSERT(in_shape[0] == 1u);
-
-    const auto in_strides = input.get_strides();
-    const auto IC = in_shape[1];
-    const auto IH = in_shape[2];
-    const auto IW = in_shape[3];
-    const auto IS_C = in_strides[1];
-    const auto IS_H = in_strides[2];
-    const auto IS_W = in_strides[3];
-
-    ov::Shape out_shape = output.get_shape();
-    OPENVINO_ASSERT(out_shape == ov::Shape({1u, IC, IW, IH}));
-
-    auto out_strides = output.get_strides();
-    const auto OS_C = out_strides[1];
-    const auto OS_H = out_strides[2];
-    const auto OS_W = out_strides[3];
-
-    const auto* in_p = static_cast<const uint8_t*>(input.data());
-    auto* out_p = static_cast<uint8_t*>(output.data());
-
-    const auto elem_size = input.get_byte_size() / input.get_size();
-
-    // FIXME: Scalar implementation needs to be optimized!
-    for (size_t c = 0; c < IC; ++c) {
-        for (size_t i = 0; i < IH; ++i) {
-            for (size_t j = 0; j < IW; ++j) {
-                for (size_t b = 0; b < elem_size; ++b) {
-                    const size_t in_idx = (c * IS_C) + (i * IS_H) + (j * IS_W) + b;
-                    const size_t out_idx = (c * OS_C) + (i * OS_W) + (j * OS_H) + b;
-                    out_p[out_idx] = in_p[in_idx];
-                }
-            }
-        }
-    }
-    return output;
-}
-
 uint32_t align_to(uint32_t value, uint32_t alignment) {
     return (value + alignment - 1) & ~(alignment - 1);
 }
@@ -677,17 +648,12 @@ void StaticLLMPipeline::setupAndCompileModels(
     // (6) Apply opt layout if applicable
     const bool disable_opt_layout = pop_or_default(properties, "DISABLE_OPT_LAYOUT", "NO") == "YES";
     // NB: Try to apply opt transpose only for Llama-2-7b-chat-hf model
-    if ((model_desc.name_or_path == "meta-llama/Llama-2-7b-chat-hf" ||
-        (model_desc.type == "llama" && model_desc.num_key_value_heads == 32))
-        && !disable_opt_layout) {
-        std::cout << "[LOG_DEBUG] Try to apply opt layout" << std::endl;
-        if (transpose_value_tensors(m_kvcache_model)) {
+    if (model_desc.name_or_path == "meta-llama/Llama-2-7b-chat-hf" ||
+        (model_desc.type == "llama" && model_desc.num_key_value_heads == 32)) {
+        if (optimize_value_tensors(m_kvcache_model)) {
             // NB: Check if TransposeValueTensors transformation was applied
-            std::cout << "[LOG DEBUG] Success: opt layout has been applied" << std::endl;
             m_kvcache_desc.v_tensors_transposed = true;
-        } else {
-            // FIXME: Otherwise fuse SDPA back?
-            std::cout << "[LOG DEBUG] Failed: opt layout has not been applied" << std::endl;
+            m_prefill_model = cvt_value_tensors_layout(m_prefill_model);
         }
     }
@@ -956,30 +922,25 @@ EncodedResults StaticLLMPipeline::generate(
     // NB: Copy KV-cache tensors from prefill model to kvcache model
     const auto& kvcache_compiled = m_kvcache_request.get_compiled_model();
     for (int i = 0; i < kvcache_compiled.outputs().size() - 1; ++i) {
-
         const auto& output_name = kvcache_compiled.outputs()[kStartOutputKVCacheLayers + i].get_any_name();
         const auto input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values");
+        const auto kv_dim = (output_name.find("value") != std::string::npos &&
+            m_kvcache_desc.v_tensors_transposed) ? 3u : m_kvcache_desc.seq_len;
+
         auto prefill_out_tensor = m_prefill_request.get_tensor(output_name);
         auto prefill_out_slice = make_tensor_slice(
-            prefill_out_tensor, m_kvcache_desc.seq_len, m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens, m_kvcache_desc.max_prompt_size
+            prefill_out_tensor, kv_dim, m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens, m_kvcache_desc.max_prompt_size
         );

         auto kvcache_in_tensor = m_kvcache_request.get_tensor(input_name);
         fill_tensor<ov::float16>(kvcache_in_tensor, 0);

-        if (output_name.find("value") != std::string::npos &&
-            m_kvcache_desc.v_tensors_transposed) {
-            auto kvcache_in_slice = make_tensor_slice(
-                kvcache_in_tensor, 3u, 0u, m_kvcache_desc.num_stored_tokens
-            );
-            rotate_clockwise(prefill_out_slice, kvcache_in_slice);
-        } else {
-            auto kvcache_in_slice = make_tensor_slice(
-                kvcache_in_tensor, m_kvcache_desc.seq_len, 0u, m_kvcache_desc.num_stored_tokens
-            );
-            prefill_out_slice.copy_to(kvcache_in_slice);
-        }
+        auto kvcache_in_slice = make_tensor_slice(
+            kvcache_in_tensor, kv_dim, 0u, m_kvcache_desc.num_stored_tokens
+        );
+
+        prefill_out_slice.copy_to(kvcache_in_slice);
     }

     auto* input_ids_data = m_kvcache_request.get_tensor("input_ids").data<int64_t>();

From 03b5e25df95b96dafc7a60b8f3724cb667e42ff6 Mon Sep 17 00:00:00 2001
From: TolyaTalamanov
Date: Tue, 12 Nov 2024 13:18:20 +0000
Subject: [PATCH 11/20] Remove disable opt layout

---
 src/cpp/src/llm_pipeline_static.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 54f34a1175..ee76000a65 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -646,7 +646,6 @@ void StaticLLMPipeline::setupAndCompileModels(
     reshape_to_static(m_prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes);
     reshape_to_static(m_kvcache_model, 1u, m_kvcache_desc.total_size, axes);
     // (6) Apply opt layout if applicable
-    const bool disable_opt_layout = pop_or_default(properties, "DISABLE_OPT_LAYOUT", "NO") == "YES";
     // NB: Try to apply opt transpose only for Llama-2-7b-chat-hf model
     if (model_desc.name_or_path == "meta-llama/Llama-2-7b-chat-hf" ||
        (model_desc.type == "llama" && model_desc.num_key_value_heads == 32)) {

From 856c23a6b5a844fc5b86093a2d5510eda0979b5c Mon Sep 17 00:00:00 2001
From: TolyaTalamanov
Date: Wed, 13 Nov 2024 11:09:44 +0000
Subject: [PATCH 12/20] Copy columns by row chunks

---
 src/cpp/src/llm_pipeline_static.cpp | 36 ++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index ee76000a65..fe90aaf77a 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -566,6 +566,36 @@ void drop_cache_dir(ov::AnyMap& config) {
     }
 }

+void copy_columns_by_row_chunks(const ov::Tensor& src, ov::Tensor& dst) {
+    const auto src_shape = src.get_shape();
+
+    OPENVINO_ASSERT(src_shape.size() == 4u);
+    OPENVINO_ASSERT(src_shape == dst.get_shape());
+    OPENVINO_ASSERT(src.get_byte_size() == dst.get_byte_size());
+
+    const auto src_strides = src.get_strides();
+    const auto dst_strides = dst.get_strides();
+    const auto elem_size = src.get_byte_size() / src.get_size();
+
+    const auto C = src_shape[1];
+    const auto H = src_shape[2];
+    const auto W = src_shape[3];
+
+    const auto IS_H = src_strides[2];
+    const auto OS_H = dst_strides[2];
+
+    const size_t chunk_byte_size = W * elem_size;
+
+    const auto* src_p = static_cast<const uint8_t*>(src.data());
+    auto* dst_p = static_cast<uint8_t*>(dst.data());
+
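+    // NB: Each (channel, row) pair owns W contiguous elements in both src and
+    // dst, so one std::copy_n per row moves chunk_byte_size bytes at once
+    // instead of storing element-by-element.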
+    for (size_t i = 0; i < C*H; ++i) {
+        const size_t src_offset = i * IS_H;
+        const size_t dst_offset = i * OS_H;
+        std::copy_n(src_p + src_offset, chunk_byte_size, dst_p + dst_offset);
+    }
+}
+
 }  // anonymous namespace

 namespace ov {
@@ -939,7 +969,11 @@ EncodedResults StaticLLMPipeline::generate(
             kvcache_in_tensor, kv_dim, 0u, m_kvcache_desc.num_stored_tokens
         );

-        prefill_out_slice.copy_to(kvcache_in_slice);
+        if (kv_dim == 3u) {
+            copy_columns_by_row_chunks(prefill_out_slice, kvcache_in_slice);
+        } else {
+            prefill_out_slice.copy_to(kvcache_in_slice);
+        }
     }

     auto* input_ids_data = m_kvcache_request.get_tensor("input_ids").data<int64_t>();

From 97e9ceb3c0b931e6108c8750f0737e8ff44d1b0f Mon Sep 17 00:00:00 2001
From: TolyaTalamanov
Date: Wed, 13 Nov 2024 14:11:25 +0000
Subject: [PATCH 13/20] Add parallel_for for kvcache copy

---
 src/cpp/CMakeLists.txt              | 4 +++-
 src/cpp/src/llm_pipeline_static.cpp | 6 ++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt
index 49b640763d..9ed1f36873 100644
--- a/src/cpp/CMakeLists.txt
+++ b/src/cpp/CMakeLists.txt
@@ -56,6 +56,8 @@ ov_genai_build_jinja2cpp()

 # Library

+find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading)
+
 file(GLOB_RECURSE SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/*.c")

 set(TARGET_NAME openvino_genai)
@@ -71,7 +73,7 @@ target_include_directories(${TARGET_NAME}

 target_include_directories(${TARGET_NAME} SYSTEM PRIVATE "${safetensors.h_SOURCE_DIR}")

-target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE nlohmann_json::nlohmann_json jinja2cpp)
+target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime openvino::threading PRIVATE nlohmann_json::nlohmann_json jinja2cpp)

 target_compile_features(${TARGET_NAME} PUBLIC cxx_std_17)

diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index fe90aaf77a..58b06e8293 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -19,6 +19,7 @@
 #include "openvino/core/preprocess/pre_post_process.hpp"
 #include "openvino/runtime/properties.hpp"
 #include "openvino/runtime/intel_npu/properties.hpp"
+#include "openvino/core/parallel.hpp"

 #include <regex>

@@ -950,7 +951,8 @@ EncodedResults StaticLLMPipeline::generate(
     const auto kStartOutputKVCacheLayers = 1u;
     // NB: Copy KV-cache tensors from prefill model to kvcache model
     const auto& kvcache_compiled = m_kvcache_request.get_compiled_model();
-    for (int i = 0; i < kvcache_compiled.outputs().size() - 1; ++i) {
+
+    ov::parallel_for(kvcache_compiled.outputs().size() - 1, [&](size_t i) {
         const auto& output_name = kvcache_compiled.outputs()[kStartOutputKVCacheLayers + i].get_any_name();
         const auto input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values");

@@ -974,7 +976,7 @@ EncodedResults StaticLLMPipeline::generate(
         } else {
             prefill_out_slice.copy_to(kvcache_in_slice);
         }
-    }
+    });

     auto* input_ids_data = m_kvcache_request.get_tensor("input_ids").data<int64_t>();
     auto* position_ids_data = m_kvcache_request.get_tensor("position_ids").data<int64_t>();

From ec3b0d203da4e09d4a7c37e63e4d09e7ffa9f8d7 Mon Sep 17 00:00:00 2001
From: TolyaTalamanov
Date: Wed, 13 Nov 2024 16:47:06 +0000
Subject: [PATCH 14/20] Move threading find_package to a different file

---
 CMakeLists.txt         | 2 ++
 src/cpp/CMakeLists.txt | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b08debbcd5..a5e7d6ddb8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,6 +31,8 @@ project(OpenVINOGenAI
         HOMEPAGE_URL "https://github.com/openvinotoolkit/openvino.genai"
         LANGUAGES CXX C)

+find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading)
+
 # Find OpenVINODeveloperPackage first to compile with SDL flags
 find_package(OpenVINODeveloperPackage ${OpenVINOGenAI_VERSION} QUIET
              COMPONENTS Runtime
diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt
index 9ed1f36873..c967f71ece 100644
--- a/src/cpp/CMakeLists.txt
+++ b/src/cpp/CMakeLists.txt
@@ -56,8 +56,6 @@ ov_genai_build_jinja2cpp()

 # Library

-find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading)
-
 file(GLOB_RECURSE SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/*.c")

 set(TARGET_NAME openvino_genai)

From 91fa973fcdcf44c5ce6314e56936538c76fd3902 Mon Sep 17 00:00:00 2001
From: TolyaTalamanov
Date: Wed, 13 Nov 2024 18:21:22 +0000
Subject: [PATCH 15/20] Link threading privately

---
 CMakeLists.txt         | 2 --
 src/cpp/CMakeLists.txt | 4 +++-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a5e7d6ddb8..b08debbcd5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,8 +31,6 @@ project(OpenVINOGenAI
         HOMEPAGE_URL "https://github.com/openvinotoolkit/openvino.genai"
         LANGUAGES CXX C)

-find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading)
-
 # Find OpenVINODeveloperPackage first to compile with SDL flags
 find_package(OpenVINODeveloperPackage ${OpenVINOGenAI_VERSION} QUIET
              COMPONENTS Runtime
diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt
index c967f71ece..b5507a4555 100644
--- a/src/cpp/CMakeLists.txt
+++ b/src/cpp/CMakeLists.txt
@@ -56,6 +56,8 @@ ov_genai_build_jinja2cpp()

 # Library

+find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading)
+
 file(GLOB_RECURSE SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/*.c")

 set(TARGET_NAME openvino_genai)
@@ -71,7 +73,7 @@ target_include_directories(${TARGET_NAME}

 target_include_directories(${TARGET_NAME} SYSTEM PRIVATE "${safetensors.h_SOURCE_DIR}")

-target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime openvino::threading PRIVATE nlohmann_json::nlohmann_json jinja2cpp)
+target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE openvino::threading nlohmann_json::nlohmann_json jinja2cpp)

 target_compile_features(${TARGET_NAME} PUBLIC cxx_std_17)

From 55e10b8a0300307c94fc1afb08b97ede87ecaee1 Mon Sep 17 00:00:00 2001
From: TolyaTalamanov
Date: Thu, 14 Nov 2024 08:11:20 +0000
Subject: [PATCH 16/20] Add assert with comment

---
 src/cpp/src/llm_pipeline_static.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 58b06e8293..5588eff1a7 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -59,6 +59,9 @@ class TransposeValueTensors : public ov::pass::MatcherPass {
         auto matched_matmul = std::static_pointer_cast<ov::op::v0::MatMul>(matched_node_matmul);

         auto shape = matched_param->get_partial_shape();
+        OPENVINO_ASSERT(shape.size() == 4u);
+        // NB: Transpose the Parameter that corresponds to the V-tensor; this
+        // speeds up its multiplication with the attention scores
         std::swap(shape[2], shape[3]);
         auto new_param = std::make_shared<ov::op::v0::Parameter>(matched_param->get_element_type(), shape);
         new_param->set_friendly_name(matched_param->get_friendly_name());

From af6728b8ea50d32c3e66bb3b94be9eda8c40b756 Mon Sep 17 00:00:00 2001
From: Anatoliy Talamanov
Date: Mon, 18 Nov 2024 13:33:13 +0000
Subject: [PATCH 17/20] Update src/cpp/src/llm_pipeline_static.cpp

Co-authored-by: Vladimir Zlobin
---
 src/cpp/src/llm_pipeline_static.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 5588eff1a7..8989be3006 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -112,8 +112,8 @@ class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass {
         };

         auto m = std::make_shared<ov::pass::pattern::Matcher>(pattern_node, "ScaledDotProductAttentionDecomposition");
-        register_matcher(m, callback);
-    }
+        register_matcher(m, std::move(callback));
+    }
     std::shared_ptr<ov::Node> decompose(std::shared_ptr<ov::op::v13::ScaledDotProductAttention> node) {
         using namespace ov::op;
         using namespace ov;

From 0ae6c1e5d5152694483e7fd9b6c41bce7ca3ec30 Mon Sep 17 00:00:00 2001
From: Anatoliy Talamanov
Date: Mon, 18 Nov 2024 13:37:31 +0000
Subject: [PATCH 18/20] Update CMakeLists.txt

---
 src/cpp/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt
index b5507a4555..4348880b89 100644
--- a/src/cpp/CMakeLists.txt
+++ b/src/cpp/CMakeLists.txt
@@ -56,7 +56,6 @@ ov_genai_build_jinja2cpp()

 # Library

-find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading)

 file(GLOB_RECURSE SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/*.c")

 set(TARGET_NAME openvino_genai)

From 604aea907cff80d9225c5625a4a67a4be8a427a6 Mon Sep 17 00:00:00 2001
From: Anatoliy Talamanov
Date: Mon, 18 Nov 2024 13:37:49 +0000
Subject: [PATCH 19/20] Update CMakeLists.txt

---
 src/cpp/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt
index 4348880b89..6a18bc969c 100644
--- a/src/cpp/CMakeLists.txt
+++ b/src/cpp/CMakeLists.txt
@@ -56,7 +56,6 @@ ov_genai_build_jinja2cpp()

 # Library
-
 file(GLOB_RECURSE SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/*.c")

 set(TARGET_NAME openvino_genai)

From 6aba78c2fcda3f92a65e743fa22b3b7181173b13 Mon Sep 17 00:00:00 2001
From: TolyaTalamanov
Date: Mon, 18 Nov 2024 13:48:28 +0000
Subject: [PATCH 20/20] Address review comments

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b08debbcd5..3c6e56d427 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,11 +33,11 @@ project(OpenVINOGenAI

 # Find OpenVINODeveloperPackage first to compile with SDL flags
 find_package(OpenVINODeveloperPackage ${OpenVINOGenAI_VERSION} QUIET
-             COMPONENTS Runtime
+             COMPONENTS Runtime Threading
              PATHS "${OpenVINO_DIR}")
 if(NOT OpenVINODeveloperPackage_FOUND)
     find_package(OpenVINO ${OpenVINOGenAI_VERSION} REQUIRED
-                 COMPONENTS Runtime)
+                 COMPONENTS Runtime Threading)
 endif()

 include(cmake/features.cmake)
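
For a quick end-to-end sanity check of the static pipeline exercised by this series,
a minimal driver along the lines below is enough. It is a sketch only: the model
folder path is a placeholder, while "NPU", MAX_PROMPT_LEN and MIN_RESPONSE_LEN are
the device and properties consumed by StaticLLMPipeline::setupAndCompileModels()
(the properties default to 1024 and 128 and are aligned up to a multiple of 64).

    #include "openvino/genai/llm_pipeline.hpp"

    #include <iostream>

    int main() {
        // Placeholder path: an exported meta-llama/Llama-2-7b-chat-hf folder is assumed
        ov::AnyMap properties = {
            { "MAX_PROMPT_LEN",   1024u },  // prompt budget, aligned up to 64
            { "MIN_RESPONSE_LEN", 128u  }   // response budget, aligned up to 64
        };
        ov::genai::LLMPipeline pipe("./Llama-2-7b-chat-hf", "NPU", properties);

        ov::genai::GenerationConfig config;
        config.max_new_tokens = 100;
        std::cout << pipe.generate("What is OpenVINO?", config) << '\n';
        return 0;
    }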