From a8c290a2cdb1ccdd3b5aee185dcf4c6077b90d8c Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 11 Dec 2024 11:37:41 +0100 Subject: [PATCH 01/31] Dynamic KV-cache allocation. --- src/cpp/src/block_manager.hpp | 61 ++++++++- src/cpp/src/cache_manager.hpp | 116 ++++++++++++++++-- src/cpp/src/continuous_batching_impl.cpp | 8 +- src/cpp/src/device_config.hpp | 32 +++-- src/cpp/src/scheduler.hpp | 24 +++- .../utils/paged_attention_transformations.cpp | 4 +- tests/cpp/cache_manager.cpp | 16 +-- tests/cpp/scheduler.cpp | 22 ++-- 8 files changed, 224 insertions(+), 59 deletions(-) diff --git a/src/cpp/src/block_manager.hpp b/src/cpp/src/block_manager.hpp index dc82897dc8..a36133fbac 100644 --- a/src/cpp/src/block_manager.hpp +++ b/src/cpp/src/block_manager.hpp @@ -195,6 +195,7 @@ class BlockAllocator { size_t m_num_layers; bool m_enable_prefix_caching; ov::genai::OverwritableBlocksHashStore m_overwriteable_blocks; + bool m_initialized = false; public: /** * Constructs the BlockAllocator. @@ -205,13 +206,17 @@ class BlockAllocator { * Blocks returned will be vectors with this size, each vector entry to be associated with a separate layer's KV cache. */ BlockAllocator(size_t num_blocks, bool enable_prefix_caching, size_t num_layers = 1) : - m_free_blocks_num(num_layers, num_blocks), m_total_num_blocks(num_blocks), m_num_layers(num_layers), m_enable_prefix_caching(enable_prefix_caching), m_overwriteable_blocks(num_layers) { + m_total_num_blocks(num_blocks), m_num_layers(num_layers), m_enable_prefix_caching(enable_prefix_caching), m_overwriteable_blocks(num_layers) { OPENVINO_ASSERT(num_layers != 0, "num_layers must be non-zero"); m_free_blocks.resize(m_num_layers); - for (auto& per_layer_block_list : m_free_blocks) { - for (int block_id = 0; block_id < m_total_num_blocks; ++block_id) { - per_layer_block_list.push_back(std::make_shared(block_id)); + if (num_blocks > 0) { + m_free_blocks_num = std::vector(num_layers, num_blocks); + for (auto& per_layer_block_list : m_free_blocks) { + for (int block_id = 0; block_id < m_total_num_blocks; ++block_id) { + per_layer_block_list.push_back(std::make_shared(block_id)); + } } + m_initialized = true; } } @@ -220,6 +225,28 @@ class BlockAllocator { // OPENVINO_ASSERT(m_total_num_blocks == m_free_blocks.size()); } + void increase_kv_blocks_number(size_t new_kv_blocks_count) { + OPENVINO_ASSERT(new_kv_blocks_count > m_total_num_blocks, "New blocks number should be more than previous blocks number."); + if (!m_initialized) { + m_free_blocks_num = std::vector(m_num_layers, 0); + m_initialized = true; + } + size_t added_blocks = new_kv_blocks_count - m_total_num_blocks; + for (auto idx = 0; idx < m_free_blocks_num.size(); idx++) { + m_free_blocks_num[idx] = added_blocks; + } + for (auto& per_layer_block_list : m_free_blocks) { + for (int block_id = m_total_num_blocks; block_id < new_kv_blocks_count; ++block_id) { + per_layer_block_list.push_back(std::make_shared(block_id)); + } + } + m_total_num_blocks = new_kv_blocks_count; + } + + bool is_inilialized() const { + return m_initialized; + } + /** * Returns the number of free blocks for a given layer. * @param layer_idx Index of the layer. @@ -459,6 +486,13 @@ class BlockAllocator { for (size_t layer_idx = 0; layer_idx < m_num_layers; layer_idx++) sum += num_free_blocks(layer_idx); return static_cast(m_num_layers * m_total_num_blocks - sum) / (m_num_layers * m_total_num_blocks) * 100; } + + /** + * @return The total number of KV blocks . 
+ */ + size_t get_total_number_of_kv_blocks() const { + return m_total_num_blocks; + } }; /** @@ -631,6 +665,10 @@ class BlockManager { return m_allocator.num_free_blocks(0); // relying on the invariant that all layers have identical number of blocks } + bool block_allocator_initialized() const { + return m_allocator.is_inilialized(); + } + /** * @param num_blocks A number of KV cache blocks * @return Whether this number of KV cache blocks may be assigned to new sequences. @@ -713,6 +751,21 @@ class BlockManager { return m_allocator.get_used_percentage(); } + /** + * Increases the number of KV blocks. + * @param num_blocks The new number of KV-blocks. + */ + void increase_kv_blocks_number(size_t num_blocks) { + m_allocator.increase_kv_blocks_number(num_blocks); + } + + /** + * @return The total number of KV blocks . + */ + size_t get_total_number_of_kv_blocks() const { + return m_allocator.get_total_number_of_kv_blocks(); + } + /** * @brief Forks a sequence, establishing a new sequence from an existing one, reusing * currently allocated blocks of the existing sequence. diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp index a7444555ab..096f4bfadd 100644 --- a/src/cpp/src/cache_manager.hpp +++ b/src/cpp/src/cache_manager.hpp @@ -15,20 +15,46 @@ class CacheManager { DeviceConfig m_device_config; std::vector m_key_cache; std::vector m_value_cache; + size_t m_num_allocated_kv_blocks = 0; ov::Core m_core; + ov::InferRequest m_request; public: - explicit CacheManager(const DeviceConfig &device_config, ov::Core core) : + explicit CacheManager(const DeviceConfig &device_config, ov::InferRequest request, ov::Core core) : m_device_config(device_config), + m_request(request), m_core(core) { m_key_cache.reserve(m_device_config.get_num_layers()); m_value_cache.reserve(m_device_config.get_num_layers()); - const std::string device_name = device_config.get_device(); + //allocate_cache_if_needed(1); + } + + ov::Shape set_first_dim_and_make_static(const ov::PartialShape& shape, size_t dim) { + ov::PartialShape res_shape = shape; + res_shape[0] = dim; + OPENVINO_ASSERT(res_shape.is_static()); + return res_shape.to_shape(); + } + + void allocate_cache_if_needed(size_t num_kv_blocks) { + if (m_num_allocated_kv_blocks >= num_kv_blocks) { + return; + } + if (m_num_allocated_kv_blocks > 0) { + increase_cache(num_kv_blocks); + return; + } + m_num_allocated_kv_blocks = num_kv_blocks; + ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), num_kv_blocks); + ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), num_kv_blocks); + + const std::string device_name = m_device_config.get_device(); + if (device_name.find("GPU") == std::string::npos) {// Allocate KV caches for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { - ov::Tensor key_cache(device_config.get_cache_precision(), device_config.get_key_cache_shape()); - ov::Tensor value_cache(device_config.get_cache_precision(), device_config.get_value_cache_shape()); + ov::Tensor key_cache(m_device_config.get_cache_precision(), key_cache_shape); + ov::Tensor value_cache(m_device_config.get_cache_precision(), value_cache_shape); // force allocation std::memset(key_cache.data(), 0, key_cache.get_byte_size()); @@ -40,15 +66,85 @@ class CacheManager { } else { auto remote_context = m_core.get_default_context(device_name); for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); 
++decoder_layer_id) { - ov::Tensor key_cache = remote_context.create_tensor(device_config.get_cache_precision(), - device_config.get_key_cache_shape()); - ov::Tensor value_cache = remote_context.create_tensor(device_config.get_cache_precision(), - device_config.get_value_cache_shape()); + ov::Tensor key_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), + key_cache_shape); + ov::Tensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), + value_cache_shape); m_key_cache.emplace_back(key_cache); m_value_cache.emplace_back(value_cache); } } + update_request_tensor(); + } + + void update_request_tensor() { + for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { + m_request.set_tensor(std::string("key_cache.") + std::to_string(decoder_layer_id), m_key_cache[decoder_layer_id]); + m_request.set_tensor(std::string("value_cache.") + std::to_string(decoder_layer_id), m_value_cache[decoder_layer_id]); + } + } + + void increase_cache(size_t num_kv_blocks) { + OPENVINO_ASSERT(num_kv_blocks > m_num_allocated_kv_blocks); + ov::Shape new_value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), num_kv_blocks); + ov::Shape new_key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), num_kv_blocks); + + const std::string device_name = m_device_config.get_device(); + ov::Coordinate start_key{0,0,0,0}; + ov::Coordinate start_value{0,0,0,0}; + + if (device_name.find("GPU") == std::string::npos) { + for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { + + auto old_byte_size_key = m_key_cache[decoder_layer_id].get_byte_size(); + auto old_byte_size_value = m_value_cache[decoder_layer_id].get_byte_size(); + ov::Coordinate end_key(m_key_cache[decoder_layer_id].get_shape()); + ov::Coordinate end_value(m_value_cache[decoder_layer_id].get_shape()); + + ov::Tensor key_cache(m_device_config.get_cache_precision(), new_value_cache_shape); + ov::Tensor value_cache(m_device_config.get_cache_precision(), new_key_cache_shape); + + // force allocation + std::memset(key_cache.data(), 0, key_cache.get_byte_size()); + std::memset(value_cache.data(), 0, value_cache.get_byte_size()); + + // copy current cache data + ov::Tensor dst_key_roi(key_cache, start_key, end_key); + ov::Tensor dst_value_roi(value_cache, start_value, end_value); + m_key_cache[decoder_layer_id].copy_to(dst_key_roi); + m_value_cache[decoder_layer_id].copy_to(dst_value_roi); + + // set new cache tensors + m_key_cache[decoder_layer_id] = key_cache; + m_value_cache[decoder_layer_id] = value_cache; + } + } else { + auto remote_context = m_core.get_default_context(device_name); + for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { + + auto old_byte_size_key = m_key_cache[decoder_layer_id].get_byte_size(); + auto old_byte_size_value = m_value_cache[decoder_layer_id].get_byte_size(); + ov::Coordinate end_key(m_key_cache[decoder_layer_id].get_shape()); + ov::Coordinate end_value(m_value_cache[decoder_layer_id].get_shape()); + + ov::Tensor key_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), new_value_cache_shape); + ov::Tensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), new_key_cache_shape); + + // copy current cache data + ov::Tensor dst_key_roi(key_cache, start_key, end_key); + ov::Tensor dst_value_roi(value_cache, 
start_value, end_value); + m_key_cache[decoder_layer_id].copy_to(dst_key_roi); + m_value_cache[decoder_layer_id].copy_to(dst_value_roi); + + // set new cache tensors + m_key_cache[decoder_layer_id] = key_cache; + m_value_cache[decoder_layer_id] = value_cache; + } + } + update_request_tensor(); + + m_num_allocated_kv_blocks = num_kv_blocks; } ov::Tensor get_key_cache(size_t decoder_layer_id) const { @@ -62,8 +158,8 @@ class CacheManager { } void copy_blocks(const std::map>& block_copy_map) { - ov::Shape key_shape = m_device_config.get_key_cache_shape(); - ov::Shape value_shape = m_device_config.get_value_cache_shape(); + ov::Shape key_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), m_num_allocated_kv_blocks); + ov::Shape value_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), m_num_allocated_kv_blocks); ov::Coordinate key_src_start_roi(key_shape.size(), 0); ov::Coordinate key_src_end_roi = key_shape; diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 901c5c64be..4db9a66586 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -50,11 +50,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init( ov::InferRequest infer_request = core.compile_model(model, device_config.get_device(), properties).create_infer_request(); // setup KV caches - m_cache_manager = std::make_shared(device_config, core); - for (size_t decoder_layer_id = 0; decoder_layer_id < device_config.get_num_layers(); ++decoder_layer_id) { - infer_request.set_tensor(std::string("key_cache.") + std::to_string(decoder_layer_id), m_cache_manager->get_key_cache(decoder_layer_id)); - infer_request.set_tensor(std::string("value_cache.") + std::to_string(decoder_layer_id), m_cache_manager->get_value_cache(decoder_layer_id)); - } + m_cache_manager = std::make_shared(device_config, infer_request, core); SchedulerConfig updated_config = scheduler_config; // update KV blocks number in scheduler config @@ -69,7 +65,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init( can_use_partial_preemption = false; } - m_scheduler = std::make_shared(device_config.get_block_size(), updated_config, device_config.get_num_layers(), can_use_partial_preemption); + m_scheduler = std::make_shared(device_config.get_block_size(), m_cache_manager, updated_config, device_config.get_num_layers(), can_use_partial_preemption); // and finally create model runner bool is_use_cache_eviction = m_scheduler->get_config().use_cache_eviction; m_model_runner = std::make_shared(infer_request, m_scheduler->get_block_size(), device_config.get_num_layers(), is_use_cache_eviction); diff --git a/src/cpp/src/device_config.hpp b/src/cpp/src/device_config.hpp index 2af4559ef1..62cfea02ee 100644 --- a/src/cpp/src/device_config.hpp +++ b/src/cpp/src/device_config.hpp @@ -12,7 +12,7 @@ namespace ov::genai { class DeviceConfig { ov::element::Type m_kv_cache_type; - ov::Shape m_key_cache_shape, m_value_cache_shape; + ov::PartialShape m_key_cache_shape, m_value_cache_shape; ov::Shape::value_type m_num_kv_heads, m_head_size, m_num_decoder_layers; size_t m_num_kv_blocks = 0; size_t m_block_size = 0; @@ -80,11 +80,10 @@ class DeviceConfig { OPENVINO_THROW(m_device, " is not supported by OpenVINO Continuous Batching"); } - OPENVINO_ASSERT(scheduling_config.num_kv_blocks > 0 || scheduling_config.cache_size > 0, "num_kv_blocks or cache_size should be more than zero."); if (scheduling_config.num_kv_blocks > 0) { 
m_num_kv_blocks = scheduling_config.num_kv_blocks; } - else { + else if (scheduling_config.cache_size > 0) { m_cache_size = scheduling_config.cache_size; } } @@ -104,23 +103,22 @@ class DeviceConfig { m_head_size += 8; } - if (m_num_kv_blocks == 0) { - OPENVINO_ASSERT(m_cache_size > 0, "num_kv_blocks or cache_size should be more than zero."); + if (m_num_kv_blocks == 0 && m_cache_size > 0) { size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024; m_num_kv_blocks = size_in_bytes / (m_num_decoder_layers * 2 * m_num_kv_heads * m_block_size * m_head_size * m_kv_cache_type.size()); } - m_key_cache_shape = m_value_cache_shape = ov::Shape{m_num_kv_blocks, - m_num_kv_heads, - m_block_size, - m_head_size}; + m_key_cache_shape = m_value_cache_shape = ov::PartialShape{ov::Dimension::dynamic(), + ov::Dimension(m_num_kv_heads), + ov::Dimension(m_block_size), + ov::Dimension(m_head_size)}; if (m_device.find("GPU") != std::string::npos) { // Update key shape, as the key's shape is different from the value's shape - m_key_cache_shape = ov::Shape{m_num_kv_blocks, - m_num_kv_heads, - m_head_size, - m_block_size}; + m_key_cache_shape = ov::PartialShape{ov::Dimension::dynamic(), + ov::Dimension(m_num_kv_heads), + ov::Dimension(m_head_size), + ov::Dimension(m_block_size)}; } } @@ -136,13 +134,13 @@ class DeviceConfig { return m_num_decoder_layers; } - ov::Shape get_key_cache_shape() const { - OPENVINO_ASSERT(!m_key_cache_shape.empty()); + ov::PartialShape get_key_cache_shape() const { + OPENVINO_ASSERT(m_key_cache_shape.size()); return m_key_cache_shape; } - ov::Shape get_value_cache_shape() const { - OPENVINO_ASSERT(!m_value_cache_shape.empty()); + ov::PartialShape get_value_cache_shape() const { + OPENVINO_ASSERT(m_value_cache_shape.size()); return m_value_cache_shape; } diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp index 6de4adaa47..5dedce563c 100644 --- a/src/cpp/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -11,6 +11,7 @@ #include "device_config.hpp" #include "block_manager.hpp" #include "sequence_group.hpp" +#include "cache_manager.hpp" namespace ov::genai { class Scheduler { @@ -19,6 +20,11 @@ class Scheduler { SchedulerConfig m_config; BlockManager m_block_manager; friend class CacheStateDumper; + std::shared_ptr m_cache_manager; + const size_t m_kv_blocks_initial_multiplier = 1; + const float m_cache_increase_rate = 2; + const float m_precentage_threshold_for_cache_increase = 80; + bool m_dynamic_memory_allocation = false; public: struct Output { @@ -36,7 +42,8 @@ class Scheduler { float m_cache_usage = 0.0; }; - explicit Scheduler(size_t block_size, const SchedulerConfig & config = {}, size_t num_layers = 1, bool can_use_partial_preemption = true) : + explicit Scheduler(size_t block_size, std::shared_ptr cache_manager, const SchedulerConfig & config = {}, size_t num_layers = 1, bool can_use_partial_preemption = true) : + m_cache_manager(cache_manager), m_can_use_partial_preemption(can_use_partial_preemption), m_config(config), m_block_manager(m_config.num_kv_blocks, m_config.enable_prefix_caching, block_size, num_layers) { @@ -46,6 +53,21 @@ class Scheduler { Output schedule(std::vector& sequence_groups) { Output scheduler_output; + if (!m_block_manager.block_allocator_initialized()) { + size_t prompt_sum_size = 0; + for (auto idx = 0; idx < sequence_groups.size(); idx++) { + prompt_sum_size += sequence_groups[idx]->get_prompt_len(); + } + size_t initial_kv_cache_size = prompt_sum_size * m_kv_blocks_initial_multiplier; + 
m_block_manager.increase_kv_blocks_number(initial_kv_cache_size); + m_dynamic_memory_allocation = true; + } + else if (m_dynamic_memory_allocation && m_block_manager.get_used_percentage() > m_precentage_threshold_for_cache_increase) { + size_t new_cache_size = (size_t)(m_block_manager.get_total_number_of_kv_blocks() * m_cache_increase_rate); + m_block_manager.increase_kv_blocks_number(new_cache_size); + } + m_cache_manager->allocate_cache_if_needed(m_block_manager.get_total_number_of_kv_blocks()); + if (m_config.dynamic_split_fuse) { // deepspeed-mii case // generation phase is always scheduled first diff --git a/src/cpp/src/utils/paged_attention_transformations.cpp b/src/cpp/src/utils/paged_attention_transformations.cpp index 53690f770c..dfb9c0cc5d 100644 --- a/src/cpp/src/utils/paged_attention_transformations.cpp +++ b/src/cpp/src/utils/paged_attention_transformations.cpp @@ -66,8 +66,8 @@ void set_kv_cache_type_and_shape(std::shared_ptr model, DeviceConfig& it_k->second->set_element_type(device_config.get_cache_precision()); it_v->second->set_element_type(device_config.get_cache_precision()); // TODO: CVS-145270 - it_k->second->set_partial_shape(to_partial_with_dyn_0_dim(device_config.get_key_cache_shape())); - it_v->second->set_partial_shape(to_partial_with_dyn_0_dim(device_config.get_value_cache_shape())); + it_k->second->set_partial_shape(device_config.get_key_cache_shape()); + it_v->second->set_partial_shape(device_config.get_value_cache_shape()); } model->validate_nodes_and_infer_types(); diff --git a/tests/cpp/cache_manager.cpp b/tests/cpp/cache_manager.cpp index b2a5396d5f..d902a37254 100644 --- a/tests/cpp/cache_manager.cpp +++ b/tests/cpp/cache_manager.cpp @@ -21,14 +21,14 @@ TEST(TestCacheManager, general_test) { size_t num_decoder_layers = 12; device_config.set_model_params(12, 64, num_decoder_layers); - auto cache_manager = std::make_shared(device_config, core); + // auto cache_manager = std::make_shared(device_config, core); - size_t allocated_bytes = 0; - for (size_t i = 0; i < num_decoder_layers; i++) { - auto key_cache = cache_manager->get_key_cache(i); - auto value_cache = cache_manager->get_value_cache(i); - allocated_bytes += key_cache.get_byte_size() + value_cache.get_byte_size(); - } + // size_t allocated_bytes = 0; + // for (size_t i = 0; i < num_decoder_layers; i++) { + // auto key_cache = cache_manager->get_key_cache(i); + // auto value_cache = cache_manager->get_value_cache(i); + // allocated_bytes += key_cache.get_byte_size() + value_cache.get_byte_size(); + // } - ASSERT_EQ(allocated_bytes, 2146959360); + // ASSERT_EQ(allocated_bytes, 2146959360); } diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp index 40c3e73747..af4fcfc10e 100644 --- a/tests/cpp/scheduler.cpp +++ b/tests/cpp/scheduler.cpp @@ -43,7 +43,7 @@ TEST(TestScheduler, general_test) { // schedule 3 sequence groups that use 6 kv blocks - Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, nullptr, scheduler_config); auto out1 = scheduler.schedule(requests); std::vector ref_ids = {0, 1, 2}; @@ -144,7 +144,7 @@ TEST_P(AppendSlotsSchedulerTest, test_append_slots_considers_all_sequences) { auto idx1 = (*sequence_group2)[0]->get_id(); std::vector requests = {sequence_group1, sequence_group2}; - Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, nullptr, scheduler_config); auto out1 = scheduler.schedule(requests); std::vector ref_ids = {0, 1}; @@ -212,7 +212,7 @@ TEST_P(PartialPreemptionSchedulerTest, 
test_partial_preemption) { // schedule 2 sequence groups that use 5 kv blocks - Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, nullptr, scheduler_config); auto out0 = scheduler.schedule(requests); for (auto seq: requests) { @@ -297,7 +297,7 @@ TEST(TestScheduler, test_partial_preemption_beam_search) { sequence_group->set_sequence_group_ptr(sequence_group); std::vector requests = {sequence_group}; - Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, nullptr, scheduler_config); auto out = scheduler.schedule(requests); for (auto sequence: sequence_group->get_not_finished_sequences()) { sequence->append_token(token, 0.7); @@ -409,7 +409,7 @@ TEST(TestScheduler, test_partially_preempted_prompt) { // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks. - Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, nullptr, scheduler_config); auto out1 = scheduler.schedule(requests); for (auto seq: requests) { @@ -503,7 +503,7 @@ TEST(TestScheduler, prefix_caching_test) { std::vector prompt_tokens = {0,1,2,3,4,5,6,7}; std::vector histrory_tokens = {}; // schedule prompt - Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, nullptr, scheduler_config); size_t chat_iterations = 10; @@ -566,7 +566,7 @@ TEST(TestScheduler, prefix_caching_test_two_identical_sequences) { std::vector prompt_tokens = {0,1,2,3,4,5,6,7}; std::vector histrory_tokens = {}; // schedule prompt - Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, nullptr, scheduler_config); size_t chat_iterations = 10; @@ -640,7 +640,7 @@ TEST(TestScheduler, prefix_caching_with_max_new_tokens_equal_1) { for (auto scheduler_config: configs) { std::vector prompt_tokens = {0,1,2,3,4,5,6,7}; // schedule prompt - Scheduler scheduler = Scheduler(32, scheduler_config); + Scheduler scheduler = Scheduler(32, nullptr, scheduler_config); size_t chat_iterations = 2; @@ -701,7 +701,7 @@ TEST(TestScheduler, test_partially_preempted_prompt_not_allowed) { // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks. const bool can_use_partial_preemption = false; - Scheduler scheduler = Scheduler(4, scheduler_config, 1, can_use_partial_preemption); + Scheduler scheduler = Scheduler(4, nullptr, scheduler_config, 1, can_use_partial_preemption); auto out1 = scheduler.schedule(requests); for (auto req : requests) @@ -775,7 +775,7 @@ TEST(TestScheduler, test_partially_preempted_prompt_not_allowed2) { // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks. 
const bool can_use_partial_preemption = false; - Scheduler scheduler = Scheduler(4, scheduler_config, 1, can_use_partial_preemption); + Scheduler scheduler = Scheduler(4, nullptr, scheduler_config, 1, can_use_partial_preemption); scheduler.schedule(requests); for (auto req: requests) req->finish_iteration(); @@ -890,7 +890,7 @@ TEST(TestScheduler, FullyPreemptsCacheEvictedSequences) { std::vector requests = {sequence_group1, sequence_group2}; - Scheduler scheduler = Scheduler(2, scheduler_config); + Scheduler scheduler = Scheduler(2, nullptr, scheduler_config); // prompt phase - schedules 1 block for seq 1, 5 blocks for seq 2 auto out = scheduler.schedule(requests); From 8b68108994fcf91239f2fbf7ac4ac66bb99e7f0a Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 11 Dec 2024 11:41:02 +0100 Subject: [PATCH 02/31] Minor corrections. --- src/cpp/src/scheduler.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp index 5dedce563c..80b6c19286 100644 --- a/src/cpp/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -21,7 +21,7 @@ class Scheduler { BlockManager m_block_manager; friend class CacheStateDumper; std::shared_ptr m_cache_manager; - const size_t m_kv_blocks_initial_multiplier = 1; + const size_t m_kv_blocks_initial_multiplier = 4; const float m_cache_increase_rate = 2; const float m_precentage_threshold_for_cache_increase = 80; bool m_dynamic_memory_allocation = false; From 5c384a010159b37af32c316c5aa012f5695e31eb Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 12 Dec 2024 15:41:07 +0100 Subject: [PATCH 03/31] Fixed cpp tests, added tests of dynamic allocation. --- src/cpp/src/cache_manager.hpp | 2 - src/cpp/src/scheduler.hpp | 1 + tests/cpp/cache_manager.cpp | 119 ++++++++++++++++-- tests/cpp/scheduler.cpp | 60 ++++++--- .../python_tests/test_cache_optimizations.py | 14 ++- 5 files changed, 168 insertions(+), 28 deletions(-) diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp index 096f4bfadd..2f201c48ad 100644 --- a/src/cpp/src/cache_manager.hpp +++ b/src/cpp/src/cache_manager.hpp @@ -26,8 +26,6 @@ class CacheManager { m_core(core) { m_key_cache.reserve(m_device_config.get_num_layers()); m_value_cache.reserve(m_device_config.get_num_layers()); - - //allocate_cache_if_needed(1); } ov::Shape set_first_dim_and_make_static(const ov::PartialShape& shape, size_t dim) { diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp index 80b6c19286..afef91ab5b 100644 --- a/src/cpp/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -66,6 +66,7 @@ class Scheduler { size_t new_cache_size = (size_t)(m_block_manager.get_total_number_of_kv_blocks() * m_cache_increase_rate); m_block_manager.increase_kv_blocks_number(new_cache_size); } + OPENVINO_ASSERT(m_cache_manager != nullptr, "Cache manager needs to be set in the Scheduler constructor."); m_cache_manager->allocate_cache_if_needed(m_block_manager.get_total_number_of_kv_blocks()); if (m_config.dynamic_split_fuse) { diff --git a/tests/cpp/cache_manager.cpp b/tests/cpp/cache_manager.cpp index d902a37254..9fee7d423a 100644 --- a/tests/cpp/cache_manager.cpp +++ b/tests/cpp/cache_manager.cpp @@ -7,8 +7,43 @@ #include "scheduler.hpp" #include "device_config.hpp" #include "cache_manager.hpp" +#include "openvino/op/concat.hpp" -TEST(TestCacheManager, general_test) { +using namespace ov::genai; + +std::shared_ptr get_dummy_model(size_t num_layers) { + ov::NodeVector keys; + ov::NodeVector values; + ov::ParameterVector params; + auto 
shape = ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()}); + for (size_t i = 0; i < num_layers; i++) { + auto key = std::make_shared(ov::element::f16, shape); + auto value = std::make_shared(ov::element::f16, shape); + key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)}); + value->get_output_tensor(0).set_names({"value_cache." + std::to_string(i)}); + keys.push_back(key); + values.push_back(value); + params.push_back(key); + params.push_back(value); + } + const auto& concat1 = std::make_shared(keys, 1); + const auto& concat2 = std::make_shared(values, 1); + auto model = std::make_shared(ov::NodeVector{concat1, concat2}, params); + return std::make_shared(ov::NodeVector{concat1, concat2}, params); +} + +size_t get_total_accocated_bytes(std::shared_ptr cache_manager, size_t num_decoder_layers) { + size_t allocated_bytes = 0; + for (size_t i = 0; i < num_decoder_layers; i++) { + auto key_cache = cache_manager->get_key_cache(i); + auto value_cache = cache_manager->get_value_cache(i); + allocated_bytes += key_cache.get_byte_size() + value_cache.get_byte_size(); + } + return allocated_bytes; +} + + +TEST(TestCacheManager, test_cache_size_param) { ov::Core core; ov::genai::SchedulerConfig scheduler_config; scheduler_config.max_num_batched_tokens = 32; @@ -21,14 +56,78 @@ TEST(TestCacheManager, general_test) { size_t num_decoder_layers = 12; device_config.set_model_params(12, 64, num_decoder_layers); - // auto cache_manager = std::make_shared(device_config, core); - - // size_t allocated_bytes = 0; - // for (size_t i = 0; i < num_decoder_layers; i++) { - // auto key_cache = cache_manager->get_key_cache(i); - // auto value_cache = cache_manager->get_value_cache(i); - // allocated_bytes += key_cache.get_byte_size() + value_cache.get_byte_size(); - // } + ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request(); + auto cache_manager = std::make_shared(device_config, request, core); + auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers()); + OPENVINO_ASSERT(block_manager.block_allocator_initialized()); + cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); - // ASSERT_EQ(allocated_bytes, 2146959360); + ASSERT_EQ(get_total_accocated_bytes(cache_manager, num_decoder_layers), 2146959360); +} + + +TEST(TestCacheManager, test_kv_blocks_param) { + ov::Core core; + ov::genai::SchedulerConfig scheduler_config; + scheduler_config.max_num_batched_tokens = 32; + scheduler_config.num_kv_blocks = 150; + scheduler_config.cache_size = 0; + scheduler_config.max_num_seqs = 2; + + const std::string device = "CPU"; + ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); + size_t num_decoder_layers = 12; + device_config.set_model_params(12, 64, num_decoder_layers); + + ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request(); + auto cache_manager = std::make_shared(device_config, request, core); + auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers()); + OPENVINO_ASSERT(block_manager.block_allocator_initialized()); + OPENVINO_ASSERT(block_manager.get_total_number_of_kv_blocks(), scheduler_config.num_kv_blocks); } + + +TEST(TestCacheManager, test_dynamic_cache_increase) { + ov::Core core; + ov::genai::SchedulerConfig 
scheduler_config; + scheduler_config.max_num_batched_tokens = 32; + scheduler_config.num_kv_blocks = 0; + scheduler_config.cache_size = 0; + scheduler_config.max_num_seqs = 2; + + const std::string device = "CPU"; + ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); + size_t num_decoder_layers = 12; + size_t head_size = 64; + size_t num_kv_heads = 12; + device_config.set_model_params(num_kv_heads, head_size, num_decoder_layers); + size_t block_size_in_bytes = num_decoder_layers * 2 * num_kv_heads * device_config.get_block_size() * head_size * device_config.get_cache_precision().size(); + + + ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request(); + auto cache_manager = std::make_shared(device_config, request, core); + auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers()); + OPENVINO_ASSERT(!block_manager.block_allocator_initialized()); + + // check initial cache allocation + block_manager.increase_kv_blocks_number(100); + OPENVINO_ASSERT(block_manager.block_allocator_initialized()); + OPENVINO_ASSERT(block_manager.get_total_number_of_kv_blocks(), 100); + + cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); + OPENVINO_ASSERT(get_total_accocated_bytes(cache_manager, num_decoder_layers), 100 * block_size_in_bytes); + + + // check cache increase + block_manager.increase_kv_blocks_number(200); + OPENVINO_ASSERT(block_manager.block_allocator_initialized()); + OPENVINO_ASSERT(block_manager.get_total_number_of_kv_blocks(), 200); + + cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); + OPENVINO_ASSERT(get_total_accocated_bytes(cache_manager, num_decoder_layers), 200 * block_size_in_bytes); + + + // check that cache does not increase if new blocks were not allocated + cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); + OPENVINO_ASSERT(get_total_accocated_bytes(cache_manager, num_decoder_layers), 200 * block_size_in_bytes); +} \ No newline at end of file diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp index af4fcfc10e..79c5d5a433 100644 --- a/tests/cpp/scheduler.cpp +++ b/tests/cpp/scheduler.cpp @@ -4,6 +4,7 @@ #include #include "openvino/runtime/core.hpp" +#include "openvino/op/concat.hpp" #include "openvino/genai/continuous_batching_pipeline.hpp" #include "openvino/genai/generation_config.hpp" #include "sequence_group.hpp" @@ -18,6 +19,38 @@ void clear_finished_sequences(std::vector& requests) { requests.erase(new_end, requests.end()); } +std::shared_ptr get_model(size_t num_layers) { + ov::NodeVector keys; + ov::NodeVector values; + ov::ParameterVector params; + auto shape = ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()}); + for (size_t i = 0; i < num_layers; i++) { + auto key = std::make_shared(ov::element::f16, shape); + auto value = std::make_shared(ov::element::f16, shape); + key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)}); + value->get_output_tensor(0).set_names({"value_cache." 
+ std::to_string(i)}); + keys.push_back(key); + values.push_back(value); + params.push_back(key); + params.push_back(value); + } + const auto& concat1 = std::make_shared(keys, 1); + const auto& concat2 = std::make_shared(values, 1); + auto model = std::make_shared(ov::NodeVector{concat1, concat2}, params); + return std::make_shared(ov::NodeVector{concat1, concat2}, params); +} + +std::shared_ptr init_cache_manager(SchedulerConfig scheduler_config) { + ov::Core core = ov::Core(); + size_t num_decoder_layers = 12; + ov::InferRequest request = core.compile_model(get_model(num_decoder_layers)).create_infer_request(); + size_t head_size = 64, head_size_u8 = head_size + 8; + size_t num_kv_heads = 12; + ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); + device_config.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers); + return std::make_shared(device_config, request, core); +} + TEST(TestScheduler, general_test) { std::array configs = {SchedulerConfig(), SchedulerConfig()}; configs.at(0).max_num_batched_tokens = 32; @@ -40,10 +73,9 @@ TEST(TestScheduler, general_test) { ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching); auto idx2 = (*sequence_group3)[0]->get_id(); std::vector requests = {sequence_group1, sequence_group2, sequence_group3}; - // schedule 3 sequence groups that use 6 kv blocks - Scheduler scheduler = Scheduler(4, nullptr, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); auto out1 = scheduler.schedule(requests); std::vector ref_ids = {0, 1, 2}; @@ -144,7 +176,7 @@ TEST_P(AppendSlotsSchedulerTest, test_append_slots_considers_all_sequences) { auto idx1 = (*sequence_group2)[0]->get_id(); std::vector requests = {sequence_group1, sequence_group2}; - Scheduler scheduler = Scheduler(4, nullptr, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); auto out1 = scheduler.schedule(requests); std::vector ref_ids = {0, 1}; @@ -212,7 +244,7 @@ TEST_P(PartialPreemptionSchedulerTest, test_partial_preemption) { // schedule 2 sequence groups that use 5 kv blocks - Scheduler scheduler = Scheduler(4, nullptr, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); auto out0 = scheduler.schedule(requests); for (auto seq: requests) { @@ -297,7 +329,7 @@ TEST(TestScheduler, test_partial_preemption_beam_search) { sequence_group->set_sequence_group_ptr(sequence_group); std::vector requests = {sequence_group}; - Scheduler scheduler = Scheduler(4, nullptr, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); auto out = scheduler.schedule(requests); for (auto sequence: sequence_group->get_not_finished_sequences()) { sequence->append_token(token, 0.7); @@ -405,11 +437,10 @@ TEST(TestScheduler, test_partially_preempted_prompt) { SequenceGroup::Ptr sequence_group2 = std::make_shared(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching); auto idx1 = (*sequence_group2)[0]->get_id(); - std::vector requests = {sequence_group1, sequence_group2}; - + std::vector requests = {sequence_group1, sequence_group2}; // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks. 
- Scheduler scheduler = Scheduler(4, nullptr, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); auto out1 = scheduler.schedule(requests); for (auto seq: requests) { @@ -503,7 +534,7 @@ TEST(TestScheduler, prefix_caching_test) { std::vector prompt_tokens = {0,1,2,3,4,5,6,7}; std::vector histrory_tokens = {}; // schedule prompt - Scheduler scheduler = Scheduler(4, nullptr, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); size_t chat_iterations = 10; @@ -566,7 +597,7 @@ TEST(TestScheduler, prefix_caching_test_two_identical_sequences) { std::vector prompt_tokens = {0,1,2,3,4,5,6,7}; std::vector histrory_tokens = {}; // schedule prompt - Scheduler scheduler = Scheduler(4, nullptr, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); size_t chat_iterations = 10; @@ -640,7 +671,7 @@ TEST(TestScheduler, prefix_caching_with_max_new_tokens_equal_1) { for (auto scheduler_config: configs) { std::vector prompt_tokens = {0,1,2,3,4,5,6,7}; // schedule prompt - Scheduler scheduler = Scheduler(32, nullptr, scheduler_config); + Scheduler scheduler = Scheduler(32, init_cache_manager(scheduler_config), scheduler_config); size_t chat_iterations = 2; @@ -701,7 +732,7 @@ TEST(TestScheduler, test_partially_preempted_prompt_not_allowed) { // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks. const bool can_use_partial_preemption = false; - Scheduler scheduler = Scheduler(4, nullptr, scheduler_config, 1, can_use_partial_preemption); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config, 1, can_use_partial_preemption); auto out1 = scheduler.schedule(requests); for (auto req : requests) @@ -775,7 +806,7 @@ TEST(TestScheduler, test_partially_preempted_prompt_not_allowed2) { // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks. 
const bool can_use_partial_preemption = false; - Scheduler scheduler = Scheduler(4, nullptr, scheduler_config, 1, can_use_partial_preemption); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config, 1, can_use_partial_preemption); scheduler.schedule(requests); for (auto req: requests) req->finish_iteration(); @@ -874,7 +905,6 @@ TEST(TestScheduler, FullyPreemptsCacheEvictedSequences) { scheduler_config.use_cache_eviction = true; scheduler_config.cache_eviction_config = ov::genai::CacheEvictionConfig(2, 2, 6, ov::genai::AggregationMode::NORM_SUM); - std::vector tokens1 = {0, 1}; // 1 full block SequenceGroup::Ptr sequence_group1 = std::make_shared(0, ov::Tensor(ov::element::i64, {tokens1.size()}, @@ -890,7 +920,7 @@ TEST(TestScheduler, FullyPreemptsCacheEvictedSequences) { std::vector requests = {sequence_group1, sequence_group2}; - Scheduler scheduler = Scheduler(2, nullptr, scheduler_config); + Scheduler scheduler = Scheduler(2, init_cache_manager(scheduler_config), scheduler_config); // prompt phase - schedules 1 block for seq 1, 5 blocks for seq 2 auto out = scheduler.schedule(requests); diff --git a/tests/python_tests/test_cache_optimizations.py b/tests/python_tests/test_cache_optimizations.py index a34e604382..6052b45f25 100644 --- a/tests/python_tests/test_cache_optimizations.py +++ b/tests/python_tests/test_cache_optimizations.py @@ -16,7 +16,7 @@ from openvino import serialize from transformers import AutoTokenizer -from common import TESTS_ROOT +from common import TESTS_ROOT, run_test_pipeline, get_beam_search, get_greedy def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]: @@ -144,3 +144,15 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t del model_cb_noopt +scheduler_params_list = [ + ({"num_kv_blocks": 0, "cache_size": 5, "dynamic_split_fuse": True, "enable_prefix_caching": True, "max_num_seqs": 500}, get_beam_search()), + ({"num_kv_blocks": 0, "cache_size": 5, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": True, "max_num_seqs": 500}, get_beam_search()), + ({"num_kv_blocks": 0, "cache_size": 5, "dynamic_split_fuse": True, "enable_prefix_caching": False, "max_num_seqs": 500}, get_greedy()), + ({"num_kv_blocks": 0, "cache_size": 5, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": False, "max_num_seqs": 500}, get_greedy()), + ({"num_kv_blocks": 0, "cache_size": 5, "dynamic_split_fuse": True, "use_cache_eviction": True, "cache_eviction_config": SHORT_CACHE_EVICTION_CONFIG}, get_beam_search()), + ({"num_kv_blocks": 0, "cache_size": 5, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "use_cache_eviction": False, "cache_eviction_config": SHORT_CACHE_EVICTION_CONFIG}, get_greedy())] +@pytest.mark.parametrize("params", scheduler_params_list) +@pytest.mark.precommit +def test_dynamic_memory_allocation(tmp_path, params): + run_test_pipeline(tmp_path, "facebook/opt-125m", params[0], params[1]) + From 0f32f1f61bf993904437e5f8039f95ebfded296d Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 12 Dec 2024 15:46:16 +0100 Subject: [PATCH 04/31] Fixed typo. 
--- tests/cpp/cache_manager.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/cpp/cache_manager.cpp b/tests/cpp/cache_manager.cpp index 9fee7d423a..a3be00b226 100644 --- a/tests/cpp/cache_manager.cpp +++ b/tests/cpp/cache_manager.cpp @@ -32,7 +32,7 @@ std::shared_ptr get_dummy_model(size_t num_layers) { return std::make_shared(ov::NodeVector{concat1, concat2}, params); } -size_t get_total_accocated_bytes(std::shared_ptr cache_manager, size_t num_decoder_layers) { +size_t get_total_allocated_bytes(std::shared_ptr cache_manager, size_t num_decoder_layers) { size_t allocated_bytes = 0; for (size_t i = 0; i < num_decoder_layers; i++) { auto key_cache = cache_manager->get_key_cache(i); @@ -62,7 +62,7 @@ TEST(TestCacheManager, test_cache_size_param) { OPENVINO_ASSERT(block_manager.block_allocator_initialized()); cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); - ASSERT_EQ(get_total_accocated_bytes(cache_manager, num_decoder_layers), 2146959360); + ASSERT_EQ(get_total_allocated_bytes(cache_manager, num_decoder_layers), 2146959360); } @@ -115,7 +115,7 @@ TEST(TestCacheManager, test_dynamic_cache_increase) { OPENVINO_ASSERT(block_manager.get_total_number_of_kv_blocks(), 100); cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); - OPENVINO_ASSERT(get_total_accocated_bytes(cache_manager, num_decoder_layers), 100 * block_size_in_bytes); + OPENVINO_ASSERT(get_total_allocated_bytes(cache_manager, num_decoder_layers), 100 * block_size_in_bytes); // check cache increase @@ -124,10 +124,10 @@ TEST(TestCacheManager, test_dynamic_cache_increase) { OPENVINO_ASSERT(block_manager.get_total_number_of_kv_blocks(), 200); cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); - OPENVINO_ASSERT(get_total_accocated_bytes(cache_manager, num_decoder_layers), 200 * block_size_in_bytes); + OPENVINO_ASSERT(get_total_allocated_bytes(cache_manager, num_decoder_layers), 200 * block_size_in_bytes); // check that cache does not increase if new blocks were not allocated cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); - OPENVINO_ASSERT(get_total_accocated_bytes(cache_manager, num_decoder_layers), 200 * block_size_in_bytes); + OPENVINO_ASSERT(get_total_allocated_bytes(cache_manager, num_decoder_layers), 200 * block_size_in_bytes); } \ No newline at end of file From d3f15fa2a0026b5f4d813d400f8db24a84d0ef9c Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 12 Dec 2024 15:55:35 +0100 Subject: [PATCH 05/31] Test corrected. 
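With cache_size set to 0 alongside num_kv_blocks, the parametrized tests no longer
reserve a fixed 5 GB cache and instead exercise the dynamic KV-cache allocation path
end to end.

For illustration, a rough sketch of a pipeline configured to rely on dynamic allocation
("/path/to/model" is a placeholder; the constructor and generate() overload used here
are assumed from the existing continuous batching API, not introduced by this patch):

    #include <string>
    #include <vector>
    #include "openvino/genai/continuous_batching_pipeline.hpp"

    int main() {
        ov::genai::SchedulerConfig scheduler_config;
        scheduler_config.num_kv_blocks = 0; // no fixed number of KV blocks
        scheduler_config.cache_size = 0;    // no fixed cache size in GB
        // With both values left at zero, the scheduler grows the KV cache on demand.

        ov::genai::ContinuousBatchingPipeline pipe("/path/to/model", scheduler_config, "CPU");
        std::vector<std::string> prompts{"The Sun is yellow because"};
        std::vector<ov::genai::GenerationConfig> configs{ov::genai::greedy()};
        auto results = pipe.generate(prompts, configs);
        return 0;
    }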
--- tests/python_tests/test_cache_optimizations.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/python_tests/test_cache_optimizations.py b/tests/python_tests/test_cache_optimizations.py index b808c1a13f..1a57cc90a6 100644 --- a/tests/python_tests/test_cache_optimizations.py +++ b/tests/python_tests/test_cache_optimizations.py @@ -146,12 +146,12 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t scheduler_params_list = [ - ({"num_kv_blocks": 0, "cache_size": 5, "dynamic_split_fuse": True, "enable_prefix_caching": True, "max_num_seqs": 500}, get_beam_search()), - ({"num_kv_blocks": 0, "cache_size": 5, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": True, "max_num_seqs": 500}, get_beam_search()), - ({"num_kv_blocks": 0, "cache_size": 5, "dynamic_split_fuse": True, "enable_prefix_caching": False, "max_num_seqs": 500}, get_greedy()), - ({"num_kv_blocks": 0, "cache_size": 5, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": False, "max_num_seqs": 500}, get_greedy()), - ({"num_kv_blocks": 0, "cache_size": 5, "dynamic_split_fuse": True, "use_cache_eviction": True, "cache_eviction_config": SHORT_CACHE_EVICTION_CONFIG}, get_beam_search()), - ({"num_kv_blocks": 0, "cache_size": 5, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "use_cache_eviction": False, "cache_eviction_config": SHORT_CACHE_EVICTION_CONFIG}, get_greedy())] + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": True, "max_num_seqs": 500}, get_beam_search()), + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": True, "max_num_seqs": 500}, get_beam_search()), + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": False, "max_num_seqs": 500}, get_greedy()), + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": False, "max_num_seqs": 500}, get_greedy()), + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "use_cache_eviction": True, "cache_eviction_config": SHORT_CACHE_EVICTION_CONFIG}, get_beam_search()), + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "use_cache_eviction": False, "cache_eviction_config": SHORT_CACHE_EVICTION_CONFIG}, get_greedy())] @pytest.mark.parametrize("params", scheduler_params_list) @pytest.mark.precommit def test_dynamic_memory_allocation(tmp_path, params): From ec7ca26f6767572b22588c55e2531e4a02aa1cff Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 12 Dec 2024 17:37:00 +0100 Subject: [PATCH 06/31] Minor corrections. 
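Besides renaming the growth constant to m_cache_growth_factor, the dynamic-allocation
tests now use generation configs with max_new_tokens = 300, so the initially allocated
KV cache typically has to grow during a run.

As a toy illustration of the growth rule the scheduler applies once usage crosses the
threshold (the helper name is made up for this example; the default factor of 2 mirrors
scheduler.hpp at this point of the series):

    #include <cstddef>

    // Doubles the current block count, as the scheduler does when cache usage
    // crosses the configured threshold.
    std::size_t grow_kv_blocks(std::size_t current_blocks, float growth_factor = 2.0f) {
        return static_cast<std::size_t>(current_blocks * growth_factor);
    }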
--- src/cpp/src/scheduler.hpp | 4 +-- .../python_tests/test_cache_optimizations.py | 28 ++++++++++++++----- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp index afef91ab5b..b8f7a1b6f3 100644 --- a/src/cpp/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -22,7 +22,7 @@ class Scheduler { friend class CacheStateDumper; std::shared_ptr m_cache_manager; const size_t m_kv_blocks_initial_multiplier = 4; - const float m_cache_increase_rate = 2; + const float m_cache_growth_factor = 2; // commmon values 1.5 or 2 const float m_precentage_threshold_for_cache_increase = 80; bool m_dynamic_memory_allocation = false; @@ -63,7 +63,7 @@ class Scheduler { m_dynamic_memory_allocation = true; } else if (m_dynamic_memory_allocation && m_block_manager.get_used_percentage() > m_precentage_threshold_for_cache_increase) { - size_t new_cache_size = (size_t)(m_block_manager.get_total_number_of_kv_blocks() * m_cache_increase_rate); + size_t new_cache_size = (size_t)(m_block_manager.get_total_number_of_kv_blocks() * m_cache_growth_factor); m_block_manager.increase_kv_blocks_number(new_cache_size); } OPENVINO_ASSERT(m_cache_manager != nullptr, "Cache manager needs to be set in the Scheduler constructor."); diff --git a/tests/python_tests/test_cache_optimizations.py b/tests/python_tests/test_cache_optimizations.py index 1a57cc90a6..327b91c5c4 100644 --- a/tests/python_tests/test_cache_optimizations.py +++ b/tests/python_tests/test_cache_optimizations.py @@ -15,7 +15,7 @@ from openvino import serialize from transformers import AutoTokenizer -from common import TESTS_ROOT, run_test_pipeline, get_beam_search, get_greedy +from common import TESTS_ROOT, run_test_pipeline def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]: @@ -145,13 +145,27 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t del model_cb_noopt +def get_greedy_seq_len_300() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_return_sequences = 3 + generation_config.max_new_tokens = 300 + return generation_config + +def get_beam_search_seq_len_300() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_beam_groups = 3 + generation_config.num_beams = 6 + generation_config.max_new_tokens = 300 + generation_config.num_return_sequences = generation_config.num_beams + return generation_config + scheduler_params_list = [ - ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": True, "max_num_seqs": 500}, get_beam_search()), - ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": True, "max_num_seqs": 500}, get_beam_search()), - ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": False, "max_num_seqs": 500}, get_greedy()), - ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": False, "max_num_seqs": 500}, get_greedy()), - ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "use_cache_eviction": True, "cache_eviction_config": SHORT_CACHE_EVICTION_CONFIG}, get_beam_search()), - ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "use_cache_eviction": False, "cache_eviction_config": SHORT_CACHE_EVICTION_CONFIG}, get_greedy())] + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, 
"enable_prefix_caching": True}, get_greedy_seq_len_300()), + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": True}, get_beam_search_seq_len_300()), + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": False}, get_greedy_seq_len_300()), + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": False}, get_greedy_seq_len_300()), + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "use_cache_eviction": True, "cache_eviction_config": SHORT_CACHE_EVICTION_CONFIG}, get_beam_search_seq_len_300()), + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "use_cache_eviction": False, "cache_eviction_config": SHORT_CACHE_EVICTION_CONFIG}, get_greedy_seq_len_300())] @pytest.mark.parametrize("params", scheduler_params_list) @pytest.mark.precommit def test_dynamic_memory_allocation(tmp_path, params): From 175f2416b27814eb14e6e77f09cb95c992716893 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 13 Dec 2024 11:51:59 +0100 Subject: [PATCH 07/31] Minor corrections. --- src/cpp/src/scheduler.hpp | 7 ++++--- tests/python_tests/test_cache_optimizations.py | 5 ++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp index b8f7a1b6f3..c32b3746dd 100644 --- a/src/cpp/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -21,9 +21,9 @@ class Scheduler { BlockManager m_block_manager; friend class CacheStateDumper; std::shared_ptr m_cache_manager; - const size_t m_kv_blocks_initial_multiplier = 4; + const size_t m_kv_blocks_initial_multiplier = 2; const float m_cache_growth_factor = 2; // commmon values 1.5 or 2 - const float m_precentage_threshold_for_cache_increase = 80; + const float m_precentage_threshold_for_cache_increase = 100; bool m_dynamic_memory_allocation = false; public: @@ -52,6 +52,7 @@ class Scheduler { Output schedule(std::vector& sequence_groups) { Output scheduler_output; + float eps = 1e-5; if (!m_block_manager.block_allocator_initialized()) { size_t prompt_sum_size = 0; @@ -62,7 +63,7 @@ class Scheduler { m_block_manager.increase_kv_blocks_number(initial_kv_cache_size); m_dynamic_memory_allocation = true; } - else if (m_dynamic_memory_allocation && m_block_manager.get_used_percentage() > m_precentage_threshold_for_cache_increase) { + else if (m_dynamic_memory_allocation && (m_block_manager.get_used_percentage() + eps) > m_precentage_threshold_for_cache_increase) { size_t new_cache_size = (size_t)(m_block_manager.get_total_number_of_kv_blocks() * m_cache_growth_factor); m_block_manager.increase_kv_blocks_number(new_cache_size); } diff --git a/tests/python_tests/test_cache_optimizations.py b/tests/python_tests/test_cache_optimizations.py index 327b91c5c4..3c09d34756 100644 --- a/tests/python_tests/test_cache_optimizations.py +++ b/tests/python_tests/test_cache_optimizations.py @@ -163,9 +163,8 @@ def get_beam_search_seq_len_300() -> GenerationConfig: ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": True}, get_greedy_seq_len_300()), ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": True}, get_beam_search_seq_len_300()), ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": False}, get_greedy_seq_len_300()), - ({"num_kv_blocks": 
0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": False}, get_greedy_seq_len_300()), - ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "use_cache_eviction": True, "cache_eviction_config": SHORT_CACHE_EVICTION_CONFIG}, get_beam_search_seq_len_300()), - ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "use_cache_eviction": False, "cache_eviction_config": SHORT_CACHE_EVICTION_CONFIG}, get_greedy_seq_len_300())] + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": False}, get_beam_search_seq_len_300()), + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "use_cache_eviction": True, "cache_eviction_config": SHORT_CACHE_EVICTION_CONFIG}, get_greedy_seq_len_300())] @pytest.mark.parametrize("params", scheduler_params_list) @pytest.mark.precommit def test_dynamic_memory_allocation(tmp_path, params): From a6facc50081f2592eb5a5a24b1f2f4e4c7e0fee8 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 13 Dec 2024 13:50:19 +0100 Subject: [PATCH 08/31] Minor correction. --- src/cpp/src/block_manager.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/block_manager.hpp b/src/cpp/src/block_manager.hpp index a36133fbac..d964032db0 100644 --- a/src/cpp/src/block_manager.hpp +++ b/src/cpp/src/block_manager.hpp @@ -233,7 +233,7 @@ class BlockAllocator { } size_t added_blocks = new_kv_blocks_count - m_total_num_blocks; for (auto idx = 0; idx < m_free_blocks_num.size(); idx++) { - m_free_blocks_num[idx] = added_blocks; + m_free_blocks_num[idx] += added_blocks; } for (auto& per_layer_block_list : m_free_blocks) { for (int block_id = m_total_num_blocks; block_id < new_kv_blocks_count; ++block_id) { From 34f6d27c3f35eaf792860148eb4faf37253aa659 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 13 Dec 2024 14:07:55 +0100 Subject: [PATCH 09/31] Removed not used code. --- src/cpp/src/cache_manager.hpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp index 2f201c48ad..0ae97a868d 100644 --- a/src/cpp/src/cache_manager.hpp +++ b/src/cpp/src/cache_manager.hpp @@ -94,9 +94,6 @@ class CacheManager { if (device_name.find("GPU") == std::string::npos) { for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { - - auto old_byte_size_key = m_key_cache[decoder_layer_id].get_byte_size(); - auto old_byte_size_value = m_value_cache[decoder_layer_id].get_byte_size(); ov::Coordinate end_key(m_key_cache[decoder_layer_id].get_shape()); ov::Coordinate end_value(m_value_cache[decoder_layer_id].get_shape()); @@ -120,9 +117,6 @@ class CacheManager { } else { auto remote_context = m_core.get_default_context(device_name); for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { - - auto old_byte_size_key = m_key_cache[decoder_layer_id].get_byte_size(); - auto old_byte_size_value = m_value_cache[decoder_layer_id].get_byte_size(); ov::Coordinate end_key(m_key_cache[decoder_layer_id].get_shape()); ov::Coordinate end_value(m_value_cache[decoder_layer_id].get_shape()); From 0c3bb2822a0611ad6e6539830a32554b2da05e3e Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 17 Dec 2024 11:59:14 +0100 Subject: [PATCH 10/31] Code optimizations. 
--- src/cpp/src/cache_manager.hpp | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp index 0ae97a868d..b088e5c221 100644 --- a/src/cpp/src/cache_manager.hpp +++ b/src/cpp/src/cache_manager.hpp @@ -60,6 +60,8 @@ class CacheManager { m_key_cache.emplace_back(key_cache); m_value_cache.emplace_back(value_cache); + + update_request_tensor(decoder_layer_id); } } else { auto remote_context = m_core.get_default_context(device_name); @@ -71,16 +73,15 @@ class CacheManager { m_key_cache.emplace_back(key_cache); m_value_cache.emplace_back(value_cache); + + update_request_tensor(decoder_layer_id); } } - update_request_tensor(); } - void update_request_tensor() { - for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { - m_request.set_tensor(std::string("key_cache.") + std::to_string(decoder_layer_id), m_key_cache[decoder_layer_id]); - m_request.set_tensor(std::string("value_cache.") + std::to_string(decoder_layer_id), m_value_cache[decoder_layer_id]); - } + void update_request_tensor(size_t decoder_layer_id) { + m_request.set_tensor(std::string("key_cache.") + std::to_string(decoder_layer_id), m_key_cache[decoder_layer_id]); + m_request.set_tensor(std::string("value_cache.") + std::to_string(decoder_layer_id), m_value_cache[decoder_layer_id]); } void increase_cache(size_t num_kv_blocks) { @@ -100,19 +101,23 @@ class CacheManager { ov::Tensor key_cache(m_device_config.get_cache_precision(), new_value_cache_shape); ov::Tensor value_cache(m_device_config.get_cache_precision(), new_key_cache_shape); - // force allocation - std::memset(key_cache.data(), 0, key_cache.get_byte_size()); - std::memset(value_cache.data(), 0, value_cache.get_byte_size()); - // copy current cache data ov::Tensor dst_key_roi(key_cache, start_key, end_key); ov::Tensor dst_value_roi(value_cache, start_value, end_value); m_key_cache[decoder_layer_id].copy_to(dst_key_roi); m_value_cache[decoder_layer_id].copy_to(dst_value_roi); + // force allocation on the added cache data + auto key_cache_roi_end = static_cast(key_cache.data()) + dst_key_roi.get_byte_size(); + auto value_cache_roi_end = static_cast(value_cache.data()) + dst_value_roi.get_byte_size(); + std::memset(key_cache_roi_end, 0, key_cache.get_byte_size() - dst_key_roi.get_byte_size()); + std::memset(value_cache_roi_end, 0, value_cache.get_byte_size() - dst_value_roi.get_byte_size()); + // set new cache tensors m_key_cache[decoder_layer_id] = key_cache; m_value_cache[decoder_layer_id] = value_cache; + + update_request_tensor(decoder_layer_id); } } else { auto remote_context = m_core.get_default_context(device_name); @@ -132,10 +137,10 @@ class CacheManager { // set new cache tensors m_key_cache[decoder_layer_id] = key_cache; m_value_cache[decoder_layer_id] = value_cache; + + update_request_tensor(decoder_layer_id); } } - update_request_tensor(); - m_num_allocated_kv_blocks = num_kv_blocks; } From a105a9fad58d2d8deefbb4482c57a58aedaa0ca5 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 17 Dec 2024 17:52:32 +0100 Subject: [PATCH 11/31] Code corrections. 
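With the KV cache now sized and grown on demand, the speculative-decoding samples and the Python test utilities no longer need to pre-set SchedulerConfig::cache_size. A minimal usage sketch mirroring the updated C++ sample (model directories are placeholders):

    #include <iostream>
    #include "openvino/genai/llm_pipeline.hpp"

    int main() {
        // No SchedulerConfig is passed: the pipeline allocates and grows the KV cache itself.
        ov::genai::LLMPipeline pipe(
            "main_model_dir",                                   // placeholder path
            "CPU",
            ov::genai::draft_model("draft_model_dir", "CPU"));  // placeholder path
        std::cout << pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(64)) << std::endl;
        return 0;
    }
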
--- .../speculative_decoding_lm.cpp | 6 +-- .../speculative_decoding_lm.py | 6 +-- src/cpp/src/cache_manager.hpp | 24 ++++----- src/cpp/src/continuous_batching_impl.cpp | 21 +++++++- src/cpp/src/continuous_batching_impl.hpp | 8 +++ src/cpp/src/scheduler.hpp | 30 ++--------- .../utils/paged_attention_transformations.cpp | 7 --- tests/cpp/scheduler.cpp | 54 ++++--------------- tests/python_tests/common.py | 1 - tests/python_tests/ov_genai_test_utils.py | 4 +- 10 files changed, 59 insertions(+), 102 deletions(-) diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index dc6761879c..ca5a60ec93 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -26,15 +26,11 @@ int main(int argc, char* argv[]) try { // Please, set device for main model in `LLMPipeline` constructor and in in `ov::genai::draft_model` for draft. std::string main_device = "CPU", draft_device = "CPU"; - ov::genai::SchedulerConfig scheduler_config; - scheduler_config.cache_size = 5; - // Different devices require different block sizes, so different scheduler configs need to be set. ov::genai::LLMPipeline pipe( main_model_path, main_device, - ov::genai::draft_model(draft_model_path, draft_device), - ov::genai::scheduler_config(scheduler_config)); + ov::genai::draft_model(draft_model_path, draft_device)); auto streamer = [](std::string subword) { std::cout << subword << std::flush; diff --git a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py index 612e59474e..217b8a2730 100755 --- a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py +++ b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py @@ -25,13 +25,9 @@ def main(): main_device = 'CPU' # GPU can be used as well draft_device = 'CPU' - scheduler_config = openvino_genai.SchedulerConfig() - # cache params - scheduler_config.cache_size = 2 - draft_model = openvino_genai.draft_model(args.draft_model_dir, draft_device) - pipe = openvino_genai.LLMPipeline(args.model_dir, main_device, scheduler_config=scheduler_config, draft_model=draft_model) + pipe = openvino_genai.LLMPipeline(args.model_dir, main_device, draft_model=draft_model) config = openvino_genai.GenerationConfig() config.max_new_tokens = 100 diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp index b088e5c221..525e7f7653 100644 --- a/src/cpp/src/cache_manager.hpp +++ b/src/cpp/src/cache_manager.hpp @@ -19,6 +19,18 @@ class CacheManager { ov::Core m_core; ov::InferRequest m_request; + ov::Shape set_first_dim_and_make_static(const ov::PartialShape& shape, size_t dim) { + ov::PartialShape res_shape = shape; + res_shape[0] = dim; + OPENVINO_ASSERT(res_shape.is_static()); + return res_shape.to_shape(); + } + + void update_request_tensor(size_t decoder_layer_id) { + m_request.set_tensor(std::string("key_cache.") + std::to_string(decoder_layer_id), m_key_cache[decoder_layer_id]); + m_request.set_tensor(std::string("value_cache.") + std::to_string(decoder_layer_id), m_value_cache[decoder_layer_id]); + } + public: explicit CacheManager(const DeviceConfig &device_config, ov::InferRequest request, ov::Core core) : m_device_config(device_config), @@ -28,13 +40,6 @@ class CacheManager { m_value_cache.reserve(m_device_config.get_num_layers()); } - ov::Shape set_first_dim_and_make_static(const ov::PartialShape& shape, size_t 
dim) { - ov::PartialShape res_shape = shape; - res_shape[0] = dim; - OPENVINO_ASSERT(res_shape.is_static()); - return res_shape.to_shape(); - } - void allocate_cache_if_needed(size_t num_kv_blocks) { if (m_num_allocated_kv_blocks >= num_kv_blocks) { return; @@ -79,11 +84,6 @@ class CacheManager { } } - void update_request_tensor(size_t decoder_layer_id) { - m_request.set_tensor(std::string("key_cache.") + std::to_string(decoder_layer_id), m_key_cache[decoder_layer_id]); - m_request.set_tensor(std::string("value_cache.") + std::to_string(decoder_layer_id), m_value_cache[decoder_layer_id]); - } - void increase_cache(size_t num_kv_blocks) { OPENVINO_ASSERT(num_kv_blocks > m_num_allocated_kv_blocks); ov::Shape new_value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), num_kv_blocks); diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index e99b4dd488..07fb6d5871 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -40,6 +40,24 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_pull_awaiting_requests m_awaiting_requests.clear(); } +void ContinuousBatchingPipeline::ContinuousBatchingImpl::_reallocate_kv_cache_if_needed(std::vector& sequence_groups) { + float eps = 1e-5; + if (!m_scheduler->get_block_manager().block_allocator_initialized()) { + size_t prompt_sum_size = 0; + for (auto idx = 0; idx < sequence_groups.size(); idx++) { + prompt_sum_size += sequence_groups[idx]->get_prompt_len(); + } + size_t initial_kv_cache_size = prompt_sum_size * m_kv_blocks_initial_multiplier; + m_scheduler->get_block_manager().increase_kv_blocks_number(initial_kv_cache_size); + m_dynamic_memory_allocation = true; + } + else if (m_dynamic_memory_allocation && (m_scheduler->get_block_manager().get_used_percentage() + eps) > m_precentage_threshold_for_cache_increase) { + size_t new_cache_size = (size_t)(m_scheduler->get_block_manager().get_total_number_of_kv_blocks() * m_cache_growth_factor); + m_scheduler->get_block_manager().increase_kv_blocks_number(new_cache_size); + } + m_cache_manager->allocate_cache_if_needed(m_scheduler->get_block_manager().get_total_number_of_kv_blocks()); +} + void ContinuousBatchingPipeline::ContinuousBatchingImpl::init( std::shared_ptr model, const SchedulerConfig& scheduler_config, @@ -64,7 +82,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init( can_use_partial_preemption = false; } - m_scheduler = std::make_shared(device_config.get_block_size(), m_cache_manager, updated_config, device_config.get_num_layers(), can_use_partial_preemption); + m_scheduler = std::make_shared(device_config.get_block_size(), updated_config, device_config.get_num_layers(), can_use_partial_preemption); // and finally create model runner bool is_use_cache_eviction = m_scheduler->get_config().use_cache_eviction; m_model_runner = std::make_shared(infer_request, m_scheduler->get_block_size(), device_config.get_num_layers(), is_use_cache_eviction); @@ -131,6 +149,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() { static ManualTimer timer("scheduling"); timer.start(); m_scheduler->clean_empty_blocks(m_requests); + _reallocate_kv_cache_if_needed(m_requests); scheduler_output = m_scheduler->schedule(m_requests); m_pipeline_metrics.scheduled_requests = scheduler_output.m_scheduled_sequence_groups_ids.size(); m_pipeline_metrics.cache_usage = scheduler_output.m_cache_usage; diff --git a/src/cpp/src/continuous_batching_impl.hpp 
b/src/cpp/src/continuous_batching_impl.hpp index 780bff6a31..f9c786af4e 100644 --- a/src/cpp/src/continuous_batching_impl.hpp +++ b/src/cpp/src/continuous_batching_impl.hpp @@ -30,6 +30,13 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc // flag to enable validation mode for sampler bool m_is_validation_mode_enabled = false; + // dynamic kv-cache allocation params + const size_t m_kv_blocks_initial_multiplier = 2; + const float m_cache_growth_factor = 2; // commmon values 1.5 or 2 + const float m_precentage_threshold_for_cache_increase = 100; + + bool m_dynamic_memory_allocation = false; + #ifdef DEBUG_CACHE_STATE_DUMP size_t step_count = 0; #endif @@ -41,6 +48,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc void _notify_requests_dropped_by_handle(); void _register_step_cache_usage(float step_cache_usage); float _get_current_running_average_cache_usage() const; + void _reallocate_kv_cache_if_needed(std::vector& sequence_groups); void maybe_evict_cache_blocks(const SchedulerConfig& sched_config); void init(std::shared_ptr model, diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp index c32b3746dd..3fe3b08380 100644 --- a/src/cpp/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -11,7 +11,6 @@ #include "device_config.hpp" #include "block_manager.hpp" #include "sequence_group.hpp" -#include "cache_manager.hpp" namespace ov::genai { class Scheduler { @@ -20,11 +19,6 @@ class Scheduler { SchedulerConfig m_config; BlockManager m_block_manager; friend class CacheStateDumper; - std::shared_ptr m_cache_manager; - const size_t m_kv_blocks_initial_multiplier = 2; - const float m_cache_growth_factor = 2; // commmon values 1.5 or 2 - const float m_precentage_threshold_for_cache_increase = 100; - bool m_dynamic_memory_allocation = false; public: struct Output { @@ -42,8 +36,7 @@ class Scheduler { float m_cache_usage = 0.0; }; - explicit Scheduler(size_t block_size, std::shared_ptr cache_manager, const SchedulerConfig & config = {}, size_t num_layers = 1, bool can_use_partial_preemption = true) : - m_cache_manager(cache_manager), + explicit Scheduler(size_t block_size, const SchedulerConfig & config = {}, size_t num_layers = 1, bool can_use_partial_preemption = true) : m_can_use_partial_preemption(can_use_partial_preemption), m_config(config), m_block_manager(m_config.num_kv_blocks, m_config.enable_prefix_caching, block_size, num_layers) { @@ -52,23 +45,6 @@ class Scheduler { Output schedule(std::vector& sequence_groups) { Output scheduler_output; - float eps = 1e-5; - - if (!m_block_manager.block_allocator_initialized()) { - size_t prompt_sum_size = 0; - for (auto idx = 0; idx < sequence_groups.size(); idx++) { - prompt_sum_size += sequence_groups[idx]->get_prompt_len(); - } - size_t initial_kv_cache_size = prompt_sum_size * m_kv_blocks_initial_multiplier; - m_block_manager.increase_kv_blocks_number(initial_kv_cache_size); - m_dynamic_memory_allocation = true; - } - else if (m_dynamic_memory_allocation && (m_block_manager.get_used_percentage() + eps) > m_precentage_threshold_for_cache_increase) { - size_t new_cache_size = (size_t)(m_block_manager.get_total_number_of_kv_blocks() * m_cache_growth_factor); - m_block_manager.increase_kv_blocks_number(new_cache_size); - } - OPENVINO_ASSERT(m_cache_manager != nullptr, "Cache manager needs to be set in the Scheduler constructor."); - m_cache_manager->allocate_cache_if_needed(m_block_manager.get_total_number_of_kv_blocks()); if (m_config.dynamic_split_fuse) { // 
deepspeed-mii case @@ -131,6 +107,10 @@ class Scheduler { m_block_manager.free_blocks_from_sequence(seq_id, per_layer_logical_block_indices_to_free); } + BlockManager& get_block_manager() { + return m_block_manager; + } + private: static size_t _num_running_sequence_groups(const std::vector& sequence_groups) { size_t num_running = 0; diff --git a/src/cpp/src/utils/paged_attention_transformations.cpp b/src/cpp/src/utils/paged_attention_transformations.cpp index dfb9c0cc5d..464f20702e 100644 --- a/src/cpp/src/utils/paged_attention_transformations.cpp +++ b/src/cpp/src/utils/paged_attention_transformations.cpp @@ -10,12 +10,6 @@ namespace ov { namespace genai { namespace utils { -inline ov::PartialShape to_partial_with_dyn_0_dim(const ov::Shape& static_shape) { - ov::PartialShape partial_shape = static_shape; - partial_shape[0] = ov::Dimension::dynamic(); - return partial_shape; -} - size_t get_kv_cache_size(const std::shared_ptr model) { const auto& parameters = model->get_parameters(); // extract num_kv_heads and head_size @@ -65,7 +59,6 @@ void set_kv_cache_type_and_shape(std::shared_ptr model, DeviceConfig& for (auto it_k = key_cache_params.begin(), it_v = value_cache_params.begin(); it_k != key_cache_params.end();++it_k, ++it_v) { it_k->second->set_element_type(device_config.get_cache_precision()); it_v->second->set_element_type(device_config.get_cache_precision()); - // TODO: CVS-145270 it_k->second->set_partial_shape(device_config.get_key_cache_shape()); it_v->second->set_partial_shape(device_config.get_value_cache_shape()); } diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp index 79c5d5a433..0fb3ea38be 100644 --- a/tests/cpp/scheduler.cpp +++ b/tests/cpp/scheduler.cpp @@ -19,38 +19,6 @@ void clear_finished_sequences(std::vector& requests) { requests.erase(new_end, requests.end()); } -std::shared_ptr get_model(size_t num_layers) { - ov::NodeVector keys; - ov::NodeVector values; - ov::ParameterVector params; - auto shape = ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()}); - for (size_t i = 0; i < num_layers; i++) { - auto key = std::make_shared(ov::element::f16, shape); - auto value = std::make_shared(ov::element::f16, shape); - key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)}); - value->get_output_tensor(0).set_names({"value_cache." 
+ std::to_string(i)}); - keys.push_back(key); - values.push_back(value); - params.push_back(key); - params.push_back(value); - } - const auto& concat1 = std::make_shared(keys, 1); - const auto& concat2 = std::make_shared(values, 1); - auto model = std::make_shared(ov::NodeVector{concat1, concat2}, params); - return std::make_shared(ov::NodeVector{concat1, concat2}, params); -} - -std::shared_ptr init_cache_manager(SchedulerConfig scheduler_config) { - ov::Core core = ov::Core(); - size_t num_decoder_layers = 12; - ov::InferRequest request = core.compile_model(get_model(num_decoder_layers)).create_infer_request(); - size_t head_size = 64, head_size_u8 = head_size + 8; - size_t num_kv_heads = 12; - ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); - device_config.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers); - return std::make_shared(device_config, request, core); -} - TEST(TestScheduler, general_test) { std::array configs = {SchedulerConfig(), SchedulerConfig()}; configs.at(0).max_num_batched_tokens = 32; @@ -75,7 +43,7 @@ TEST(TestScheduler, general_test) { std::vector requests = {sequence_group1, sequence_group2, sequence_group3}; // schedule 3 sequence groups that use 6 kv blocks - Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); + Scheduler scheduler = Scheduler(4, scheduler_config); auto out1 = scheduler.schedule(requests); std::vector ref_ids = {0, 1, 2}; @@ -176,7 +144,7 @@ TEST_P(AppendSlotsSchedulerTest, test_append_slots_considers_all_sequences) { auto idx1 = (*sequence_group2)[0]->get_id(); std::vector requests = {sequence_group1, sequence_group2}; - Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); + Scheduler scheduler = Scheduler(4, scheduler_config); auto out1 = scheduler.schedule(requests); std::vector ref_ids = {0, 1}; @@ -244,7 +212,7 @@ TEST_P(PartialPreemptionSchedulerTest, test_partial_preemption) { // schedule 2 sequence groups that use 5 kv blocks - Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); + Scheduler scheduler = Scheduler(4, scheduler_config); auto out0 = scheduler.schedule(requests); for (auto seq: requests) { @@ -329,7 +297,7 @@ TEST(TestScheduler, test_partial_preemption_beam_search) { sequence_group->set_sequence_group_ptr(sequence_group); std::vector requests = {sequence_group}; - Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); + Scheduler scheduler = Scheduler(4, scheduler_config); auto out = scheduler.schedule(requests); for (auto sequence: sequence_group->get_not_finished_sequences()) { sequence->append_token(token, 0.7); @@ -440,7 +408,7 @@ TEST(TestScheduler, test_partially_preempted_prompt) { std::vector requests = {sequence_group1, sequence_group2}; // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks. 
- Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); + Scheduler scheduler = Scheduler(4, scheduler_config); auto out1 = scheduler.schedule(requests); for (auto seq: requests) { @@ -534,7 +502,7 @@ TEST(TestScheduler, prefix_caching_test) { std::vector prompt_tokens = {0,1,2,3,4,5,6,7}; std::vector histrory_tokens = {}; // schedule prompt - Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); + Scheduler scheduler = Scheduler(4, scheduler_config); size_t chat_iterations = 10; @@ -597,7 +565,7 @@ TEST(TestScheduler, prefix_caching_test_two_identical_sequences) { std::vector prompt_tokens = {0,1,2,3,4,5,6,7}; std::vector histrory_tokens = {}; // schedule prompt - Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); + Scheduler scheduler = Scheduler(4, scheduler_config); size_t chat_iterations = 10; @@ -671,7 +639,7 @@ TEST(TestScheduler, prefix_caching_with_max_new_tokens_equal_1) { for (auto scheduler_config: configs) { std::vector prompt_tokens = {0,1,2,3,4,5,6,7}; // schedule prompt - Scheduler scheduler = Scheduler(32, init_cache_manager(scheduler_config), scheduler_config); + Scheduler scheduler = Scheduler(32, scheduler_config); size_t chat_iterations = 2; @@ -732,7 +700,7 @@ TEST(TestScheduler, test_partially_preempted_prompt_not_allowed) { // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks. const bool can_use_partial_preemption = false; - Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config, 1, can_use_partial_preemption); + Scheduler scheduler = Scheduler(4, scheduler_config, 1, can_use_partial_preemption); auto out1 = scheduler.schedule(requests); for (auto req : requests) @@ -806,7 +774,7 @@ TEST(TestScheduler, test_partially_preempted_prompt_not_allowed2) { // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks. 
const bool can_use_partial_preemption = false; - Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config, 1, can_use_partial_preemption); + Scheduler scheduler = Scheduler(4, scheduler_config, 1, can_use_partial_preemption); scheduler.schedule(requests); for (auto req: requests) req->finish_iteration(); @@ -920,7 +888,7 @@ TEST(TestScheduler, FullyPreemptsCacheEvictedSequences) { std::vector requests = {sequence_group1, sequence_group2}; - Scheduler scheduler = Scheduler(2, init_cache_manager(scheduler_config), scheduler_config); + Scheduler scheduler = Scheduler(2, scheduler_config); // prompt phase - schedules 1 block for seq 1, 5 blocks for seq 2 auto out = scheduler.schedule(requests); diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index 50ee452f5c..9c025c1812 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -238,7 +238,6 @@ def get_test_dataset() -> Tuple[List[str], List[GenerationConfig]]: def get_scheduler_config(scheduler_params: dict = None) -> SchedulerConfig: scheduler_config = SchedulerConfig() - scheduler_config.cache_size = 1 if scheduler_params is None: scheduler_config.dynamic_split_fuse = True # vLLM specific diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index b633497d32..77d068f788 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -282,6 +282,4 @@ def load_pipe(configs: List[Tuple], temp_path): @functools.lru_cache(1) def get_continuous_batching(path): - scheduler_config = ov_genai.SchedulerConfig() - scheduler_config.cache_size = 1 - return ov_genai.LLMPipeline(path, ov_genai.Tokenizer(path), 'CPU', **{"scheduler_config": scheduler_config}) + return ov_genai.LLMPipeline(path, ov_genai.Tokenizer(path), 'CPU') From a8531a5303bc6aafed49a1b01dd2c4fa14cf97c4 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 19 Dec 2024 10:38:22 +0100 Subject: [PATCH 12/31] Added available memory check for GPU. --- src/cpp/src/cache_manager.hpp | 10 ++++- src/cpp/src/continuous_batching_impl.cpp | 51 ++++++++++++++++++++--- src/cpp/src/continuous_batching_impl.hpp | 2 + src/cpp/src/llm_pipeline.cpp | 10 +++-- tests/python_tests/ov_genai_test_utils.py | 3 +- 5 files changed, 65 insertions(+), 11 deletions(-) diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp index 525e7f7653..68a88d5e40 100644 --- a/src/cpp/src/cache_manager.hpp +++ b/src/cpp/src/cache_manager.hpp @@ -59,7 +59,10 @@ class CacheManager { ov::Tensor key_cache(m_device_config.get_cache_precision(), key_cache_shape); ov::Tensor value_cache(m_device_config.get_cache_precision(), value_cache_shape); - // force allocation + // Some optimizations like AVX2, AVX512, AMX require a minimal shape and + // perform multiplying by zero on the excess data. Uninitialized tensor data contain NAN's, + // so NAN * 0 returns non-zero invalid data. + // So we need to set zeros to all newly allocated tensors data. std::memset(key_cache.data(), 0, key_cache.get_byte_size()); std::memset(value_cache.data(), 0, value_cache.get_byte_size()); @@ -107,7 +110,10 @@ class CacheManager { m_key_cache[decoder_layer_id].copy_to(dst_key_roi); m_value_cache[decoder_layer_id].copy_to(dst_value_roi); - // force allocation on the added cache data + // Some optimizations like AVX2, AVX512, AMX require a minimal shape and + // perform multiplying by zero on the excess data. 
Uninitialized tensor data contain NAN's, + // so NAN * 0 returns non-zero invalid data. + // So we need to set zeros to all newly allocated tensors data. auto key_cache_roi_end = static_cast(key_cache.data()) + dst_key_roi.get_byte_size(); auto value_cache_roi_end = static_cast(value_cache.data()) + dst_value_roi.get_byte_size(); std::memset(key_cache_roi_end, 0, key_cache.get_byte_size() - dst_key_roi.get_byte_size()); diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 458d7e96b3..dc007aa6d0 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -5,6 +5,7 @@ #include "continuous_batching_impl.hpp" #include "utils.hpp" #include "utils/paged_attention_transformations.hpp" +#include "openvino/runtime/intel_gpu/properties.hpp" namespace ov::genai { template struct overloaded : Ts... {using Ts::operator()...;}; @@ -42,20 +43,58 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_pull_awaiting_requests m_awaiting_requests.clear(); } +size_t ContinuousBatchingPipeline::ContinuousBatchingImpl::_get_available_gpu_memory() { + ov::Core core = utils::singleton_core(); + auto device = m_device_config->get_device(); + OPENVINO_ASSERT(device.find("GPU") != std::string::npos, "_get_available_gpu_memory() is applicable for GPU only."); + auto memory_statistics = core.get_property(device, ov::intel_gpu::memory_statistics); + auto device_type = core.get_property(device, ov::device::type.name(), {}).as(); + + // sum up all used device memory + std::vector device_memory_types = {"cl_mem", "usm_device"}; + size_t used_device_mem = 0; + for (auto mem_type: device_memory_types) { + used_device_mem += memory_statistics[mem_type]; + } + + if (device_type == "INTEGRATED") { + used_device_mem += memory_statistics["usm_host"]; + } + + // there could be unaccounted extra memory reserved by kernels, kept + // in memory pools, etc + // therefore, add a threshold to account for this + float used_memory_threshold = 1.1; + used_device_mem *= used_memory_threshold; + + auto total_device_memory = core.get_property(device, ov::intel_gpu::device_total_mem_size); + + return total_device_memory - used_device_mem; +} + void ContinuousBatchingPipeline::ContinuousBatchingImpl::_reallocate_kv_cache_if_needed(std::vector& sequence_groups) { float eps = 1e-5; + auto device = m_device_config->get_device(); + if (!m_scheduler->get_block_manager().block_allocator_initialized()) { - size_t prompt_sum_size = 0; + size_t seq_length_sum = 0; for (auto idx = 0; idx < sequence_groups.size(); idx++) { - prompt_sum_size += sequence_groups[idx]->get_prompt_len(); + seq_length_sum += sequence_groups[idx]->get_prompt_len() + m_generation_config.get_max_new_tokens(); } - size_t initial_kv_cache_size = prompt_sum_size * m_kv_blocks_initial_multiplier; - m_scheduler->get_block_manager().increase_kv_blocks_number(initial_kv_cache_size); + m_scheduler->get_block_manager().increase_kv_blocks_number(seq_length_sum); m_dynamic_memory_allocation = true; } else if (m_dynamic_memory_allocation && (m_scheduler->get_block_manager().get_used_percentage() + eps) > m_precentage_threshold_for_cache_increase) { size_t new_cache_size = (size_t)(m_scheduler->get_block_manager().get_total_number_of_kv_blocks() * m_cache_growth_factor); - m_scheduler->get_block_manager().increase_kv_blocks_number(new_cache_size); + if (device.find("GPU") == std::string::npos) { + m_scheduler->get_block_manager().increase_kv_blocks_number(new_cache_size); + } + else { + size_t 
available_gpu_memory_size = _get_available_gpu_memory(); + if (new_cache_size <= available_gpu_memory_size) { + m_scheduler->get_block_manager().increase_kv_blocks_number(new_cache_size); + } + } } m_cache_manager->allocate_cache_if_needed(m_scheduler->get_block_manager().get_total_number_of_kv_blocks()); } @@ -96,6 +135,8 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init( // If eos_token_id was not provided, take value if (m_generation_config.eos_token_id == -1) m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id()); + + m_device_config = std::make_shared(device_config); }; diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp index af922bed54..187c9c1ee7 100644 --- a/src/cpp/src/continuous_batching_impl.hpp +++ b/src/cpp/src/continuous_batching_impl.hpp @@ -14,6 +14,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc std::shared_ptr m_cache_manager; std::shared_ptr m_model_runner; std::shared_ptr m_sampler; + std::shared_ptr m_device_config; // current requests to process std::vector m_requests; @@ -49,6 +50,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc void _register_step_cache_usage(float step_cache_usage); float _get_current_running_average_cache_usage() const; void _reallocate_kv_cache_if_needed(std::vector& sequence_groups); + void _get_available_gpu_memory(std::vector& sequence_groups); void maybe_evict_cache_blocks(const SchedulerConfig& sched_config); void init(std::shared_ptr model, diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 6d9aae30fa..7533d59c69 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -699,7 +699,8 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::AnyMap& properties ){ auto start_time = std::chrono::steady_clock::now(); - if (properties.find(ov::genai::scheduler_config.name()) != properties.end()) { + if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || + properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end()) { auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties); m_pimpl = std::make_unique(models_path, tokenizer, scheduler_config, device, plugin_config); } else if (device == "NPU") { @@ -718,7 +719,8 @@ ov::genai::LLMPipeline::LLMPipeline( ){ auto start_time = std::chrono::steady_clock::now(); - if (config.find(ov::genai::scheduler_config.name()) != config.end()) { + if (config.find(ov::genai::scheduler_config.name()) != config.end() || + config.find(utils::DRAFT_MODEL_ARG_NAME) != config.end()) { auto [plugin_config, scheduler_config] = utils::split_scheduler_config(config); m_pimpl = std::make_unique(models_path, scheduler_config, device, plugin_config); } else if (device == "NPU") { @@ -741,7 +743,9 @@ ov::genai::LLMPipeline::LLMPipeline( auto [core_properties, plugin_config] = ov::genai::utils::split_core_compile_config(config); auto start_time = std::chrono::steady_clock::now(); - if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end()) { + if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end() || + plugin_config.find(utils::DRAFT_MODEL_ARG_NAME) != plugin_config.end()) { + auto [plugin_config_, scheduler_config] = utils::split_scheduler_config(plugin_config); m_pimpl = std::make_unique(model_str, weights_tensor, tokenizer, scheduler_config, device, plugin_config_, generation_config); diff --git 
a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index 77d068f788..5f2702a774 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -282,4 +282,5 @@ def load_pipe(configs: List[Tuple], temp_path): @functools.lru_cache(1) def get_continuous_batching(path): - return ov_genai.LLMPipeline(path, ov_genai.Tokenizer(path), 'CPU') + scheduler_config = ov_genai.SchedulerConfig() + return ov_genai.LLMPipeline(path, ov_genai.Tokenizer(path), 'CPU', **{"scheduler_config": scheduler_config}) From 9043ba36e427abce3718dbd9566965e7c2990816 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 19 Dec 2024 10:42:29 +0100 Subject: [PATCH 13/31] Minor correction. --- src/cpp/src/continuous_batching_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp index 187c9c1ee7..1d14ea8b5b 100644 --- a/src/cpp/src/continuous_batching_impl.hpp +++ b/src/cpp/src/continuous_batching_impl.hpp @@ -50,7 +50,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc void _register_step_cache_usage(float step_cache_usage); float _get_current_running_average_cache_usage() const; void _reallocate_kv_cache_if_needed(std::vector& sequence_groups); - void _get_available_gpu_memory(std::vector& sequence_groups); + size_t _get_available_gpu_memory(); void maybe_evict_cache_blocks(const SchedulerConfig& sched_config); void init(std::shared_ptr model, From 9256f15e753d6113c2b838addb72982ab3ea6ea8 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 19 Dec 2024 15:37:39 +0100 Subject: [PATCH 14/31] Code corrections. --- src/cpp/src/continuous_batching_impl.cpp | 44 +++++++++++++++++++----- src/cpp/src/continuous_batching_impl.hpp | 2 +- src/cpp/src/device_config.hpp | 4 +++ 3 files changed, 40 insertions(+), 10 deletions(-) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index dc007aa6d0..9893c8ca03 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -48,7 +48,7 @@ size_t ContinuousBatchingPipeline::ContinuousBatchingImpl::_get_available_gpu_me auto device = m_device_config->get_device(); OPENVINO_ASSERT(device.find("GPU") != std::string::npos, "_get_available_gpu_memory() is applicable for GPU only."); auto memory_statistics = core.get_property(device, ov::intel_gpu::memory_statistics); - auto device_type = core.get_property(device, ov::device::type.name(), {}).as(); + auto device_type = core.get_property(device, ov::device::type); // sum up all used device memory std::vector device_memory_types = {"cl_mem", "usm_device"}; @@ -57,7 +57,7 @@ size_t ContinuousBatchingPipeline::ContinuousBatchingImpl::_get_available_gpu_me used_device_mem += memory_statistics[mem_type]; } - if (device_type == "INTEGRATED") { + if (device_type == ov::device::Type::INTEGRATED) { used_device_mem += memory_statistics["usm_host"]; } @@ -67,6 +67,7 @@ size_t ContinuousBatchingPipeline::ContinuousBatchingImpl::_get_available_gpu_me float used_memory_threshold = 1.1; used_device_mem *= used_memory_threshold; + // total device memory in bytes auto total_device_memory = core.get_property(device, ov::intel_gpu::device_total_mem_size); return total_device_memory - used_device_mem; @@ -75,24 +76,49 @@ size_t ContinuousBatchingPipeline::ContinuousBatchingImpl::_get_available_gpu_me void 
ContinuousBatchingPipeline::ContinuousBatchingImpl::_reallocate_kv_cache_if_needed(std::vector& sequence_groups) { float eps = 1e-5; auto device = m_device_config->get_device(); + size_t block_size = m_device_config->get_block_size(); + size_t current_num_of_kv_blocks = m_scheduler->get_block_manager().get_total_number_of_kv_blocks(); if (!m_scheduler->get_block_manager().block_allocator_initialized()) { + // initial kv-blocks allocation size_t seq_length_sum = 0; for (auto idx = 0; idx < sequence_groups.size(); idx++) { - seq_length_sum += sequence_groups[idx]->get_prompt_len() + m_generation_config.get_max_new_tokens(); + auto seq_length = sequence_groups[idx]->get_prompt_len() * m_kv_blocks_initial_multiplier; + seq_length_sum += std::min(seq_length, m_generation_config.get_max_new_tokens(sequence_groups[idx]->get_prompt_len()));; } m_scheduler->get_block_manager().increase_kv_blocks_number(seq_length_sum); m_dynamic_memory_allocation = true; } - else if (m_dynamic_memory_allocation && (m_scheduler->get_block_manager().get_used_percentage() + eps) > m_precentage_threshold_for_cache_increase) { - size_t new_cache_size = (size_t)(m_scheduler->get_block_manager().get_total_number_of_kv_blocks() * m_cache_growth_factor); + else if (m_dynamic_memory_allocation && (m_scheduler->get_block_manager().get_used_percentage() + eps) > m_percentage_threshold_for_cache_increase) { + // get the expected number of kv blocks, considering that generated length will increase by m_cache_growth_factor + size_t expected_logical_kv_blocks_num = 0; + for (auto idx = 0; idx < sequence_groups.size(); idx++) { + auto num_blocks = sequence_groups[idx]->get_prompt_len() / block_size; + for (auto seq: sequence_groups[idx]->get_sequences()) { + num_blocks += std::min((size_t)(seq->get_generated_len() * m_cache_growth_factor), m_generation_config.get_max_new_tokens(sequence_groups[idx]->get_prompt_len())) / block_size; + } + expected_logical_kv_blocks_num += num_blocks; + } + + // get the expected number of physical kv-blocks + size_t expected_physical_blocks_num = (size_t)(current_num_of_kv_blocks * m_cache_growth_factor); + + size_t new_blocks_num = std::min(expected_logical_kv_blocks_num, expected_physical_blocks_num); + + // increase kv-cache if (device.find("GPU") == std::string::npos) { - m_scheduler->get_block_manager().increase_kv_blocks_number(new_cache_size); + m_scheduler->get_block_manager().increase_kv_blocks_number(new_blocks_num); } else { - size_t available_gpu_memory_size = _get_available_gpu_memory(); - if (new_cache_size <= available_gpu_memory_size) { - m_scheduler->get_block_manager().increase_kv_blocks_number(new_cache_size); + size_t available_gpu_memory = _get_available_gpu_memory(); + size_t required_memory = (new_blocks_num - current_num_of_kv_blocks) * m_device_config->get_block_size_in_bytes(); + if (required_memory <= available_gpu_memory) { + m_scheduler->get_block_manager().increase_kv_blocks_number(new_blocks_num); + } else { + size_t possible_blocks_to_add = available_gpu_memory / m_device_config->get_block_size_in_bytes(); + if (possible_blocks_to_add > 0) { + m_scheduler->get_block_manager().increase_kv_blocks_number(current_num_of_kv_blocks + possible_blocks_to_add); + } } } } diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp index 1d14ea8b5b..8d1a96797e 100644 --- a/src/cpp/src/continuous_batching_impl.hpp +++ b/src/cpp/src/continuous_batching_impl.hpp @@ -34,7 +34,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public 
ContinuousBatc // dynamic kv-cache allocation params const size_t m_kv_blocks_initial_multiplier = 2; const float m_cache_growth_factor = 2; // commmon values 1.5 or 2 - const float m_precentage_threshold_for_cache_increase = 100; + const float m_percentage_threshold_for_cache_increase = 100; bool m_dynamic_memory_allocation = false; diff --git a/src/cpp/src/device_config.hpp b/src/cpp/src/device_config.hpp index 62cfea02ee..371142701c 100644 --- a/src/cpp/src/device_config.hpp +++ b/src/cpp/src/device_config.hpp @@ -151,5 +151,9 @@ class DeviceConfig { size_t get_block_size() const { return m_block_size; } + + size_t get_block_size_in_bytes() const { + return m_num_decoder_layers * 2 * m_num_kv_heads * m_block_size * m_head_size * get_cache_precision().size(); + } }; } From d926303f50bbd8d2ce88ae3078f0c06e47495975 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 19 Dec 2024 15:44:20 +0100 Subject: [PATCH 15/31] Minor correction. --- src/cpp/src/continuous_batching_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 9893c8ca03..21d7b1da8f 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -84,7 +84,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_reallocate_kv_cache_if size_t seq_length_sum = 0; for (auto idx = 0; idx < sequence_groups.size(); idx++) { auto seq_length = sequence_groups[idx]->get_prompt_len() * m_kv_blocks_initial_multiplier; - seq_length_sum += std::min(seq_length, m_generation_config.get_max_new_tokens(sequence_groups[idx]->get_prompt_len()));; + seq_length_sum += std::min(seq_length, sequence_groups[idx]->get_prompt_len() + m_generation_config.get_max_new_tokens(sequence_groups[idx]->get_prompt_len()));; } m_scheduler->get_block_manager().increase_kv_blocks_number(seq_length_sum); m_dynamic_memory_allocation = true; From eb4d11042aadcf22006119ee801817ce30d20b3b Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 19 Dec 2024 16:58:25 +0100 Subject: [PATCH 16/31] Used correct core instance. 
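The GPU free-memory check now reads ov::intel_gpu::memory_statistics and device properties through the ov::Core instance kept in m_core (the one the pipeline was constructed and compiled with) instead of fetching a separate core, so the statistics are reported by the same core instance. For reference, a condensed restatement of the estimate performed by _get_available_gpu_memory(); the standalone function and its name are mine, while the 1.1 headroom factor and the memory-type keys are the ones used in the code:

    // Estimated free GPU memory in bytes, given plugin memory statistics.
    // available ~= total - 1.1 * (cl_mem + usm_device [+ usm_host on integrated GPUs])
    size_t estimate_available_gpu_memory(size_t total_mem, size_t cl_mem, size_t usm_device,
                                         size_t usm_host, bool integrated_gpu) {
        size_t used = cl_mem + usm_device + (integrated_gpu ? usm_host : 0);
        used = static_cast<size_t>(used * 1.1f);          // headroom for pools, kernels, etc.
        return used < total_mem ? total_mem - used : 0;   // clamp added here for safety
    }
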
--- src/cpp/src/continuous_batching_impl.cpp | 8 ++++---- src/cpp/src/continuous_batching_impl.hpp | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 21d7b1da8f..17bdfdf906 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -33,6 +33,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl( bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction; utils::apply_paged_attention_transformations(model, device_config, is_need_per_layer_cache_control); + m_core = std::make_shared(core); init(model, scheduler_config, compile_properties, device_config, core); } @@ -44,11 +45,10 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_pull_awaiting_requests } size_t ContinuousBatchingPipeline::ContinuousBatchingImpl::_get_available_gpu_memory() { - ov::Core core = utils::singleton_core(); auto device = m_device_config->get_device(); OPENVINO_ASSERT(device.find("GPU") != std::string::npos, "_get_available_gpu_memory() is applicable for GPU only."); - auto memory_statistics = core.get_property(device, ov::intel_gpu::memory_statistics); - auto device_type = core.get_property(device, ov::device::type); + auto memory_statistics = m_core->get_property(device, ov::intel_gpu::memory_statistics); + auto device_type = m_core->get_property(device, ov::device::type); // sum up all used device memory std::vector device_memory_types = {"cl_mem", "usm_device"}; @@ -68,7 +68,7 @@ size_t ContinuousBatchingPipeline::ContinuousBatchingImpl::_get_available_gpu_me used_device_mem *= used_memory_threshold; // total device memory in bytes - auto total_device_memory = core.get_property(device, ov::intel_gpu::device_total_mem_size); + auto total_device_memory = m_core->get_property(device, ov::intel_gpu::device_total_mem_size); return total_device_memory - used_device_mem; } diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp index 8d1a96797e..5847f8282c 100644 --- a/src/cpp/src/continuous_batching_impl.hpp +++ b/src/cpp/src/continuous_batching_impl.hpp @@ -37,6 +37,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc const float m_percentage_threshold_for_cache_increase = 100; bool m_dynamic_memory_allocation = false; + std::shared_ptr m_core; #ifdef DEBUG_CACHE_STATE_DUMP size_t step_count = 0; From f94929c2fdc04b7bdee9ffc599bba2ca13213f18 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 20 Dec 2024 15:47:40 +0100 Subject: [PATCH 17/31] Moved increasing of cache logic to scheduler. --- src/cpp/src/block_manager.hpp | 6 ++ src/cpp/src/cache_manager.hpp | 8 ++ src/cpp/src/continuous_batching_impl.cpp | 87 +---------------- src/cpp/src/continuous_batching_impl.hpp | 2 - src/cpp/src/scheduler.hpp | 119 +++++++++++++++++++++-- tests/cpp/scheduler.cpp | 53 +++++++--- 6 files changed, 166 insertions(+), 109 deletions(-) diff --git a/src/cpp/src/block_manager.hpp b/src/cpp/src/block_manager.hpp index d964032db0..ed2bdc9142 100644 --- a/src/cpp/src/block_manager.hpp +++ b/src/cpp/src/block_manager.hpp @@ -915,6 +915,12 @@ class BlockManager { * @return Whether enough KV cache blocks are available to host the sequences in the group. 
*/ bool can_append_slots(SequenceGroup::CPtr seq_group) { + if (required_blocks_count(seq_group) > m_allocator.num_free_blocks(0)) { + std::cout << "required blocks: " << required_blocks_count(seq_group) << std::endl; + std::cout << "free blocks: " << m_allocator.num_free_blocks(0) << std::endl; + std::cout << seq_group->get_prompt_len() << " " << seq_group->get_sequences()[0]->get_generated_len() << std::endl; + } + return required_blocks_count(std::move(seq_group)) <= m_allocator.num_free_blocks(0); } diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp index 68a88d5e40..dee32b7793 100644 --- a/src/cpp/src/cache_manager.hpp +++ b/src/cpp/src/cache_manager.hpp @@ -197,5 +197,13 @@ class CacheManager { } } } + + std::shared_ptr get_core() { + return std::make_shared(m_core); + } + + std::shared_ptr get_device_config() { + return std::make_shared(m_device_config); + } }; } diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 17bdfdf906..42ff542a6d 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -5,7 +5,6 @@ #include "continuous_batching_impl.hpp" #include "utils.hpp" #include "utils/paged_attention_transformations.hpp" -#include "openvino/runtime/intel_gpu/properties.hpp" namespace ov::genai { template struct overloaded : Ts... {using Ts::operator()...;}; @@ -44,87 +43,6 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_pull_awaiting_requests m_awaiting_requests.clear(); } -size_t ContinuousBatchingPipeline::ContinuousBatchingImpl::_get_available_gpu_memory() { - auto device = m_device_config->get_device(); - OPENVINO_ASSERT(device.find("GPU") != std::string::npos, "_get_available_gpu_memory() is applicable for GPU only."); - auto memory_statistics = m_core->get_property(device, ov::intel_gpu::memory_statistics); - auto device_type = m_core->get_property(device, ov::device::type); - - // sum up all used device memory - std::vector device_memory_types = {"cl_mem", "usm_device"}; - size_t used_device_mem = 0; - for (auto mem_type: device_memory_types) { - used_device_mem += memory_statistics[mem_type]; - } - - if (device_type == ov::device::Type::INTEGRATED) { - used_device_mem += memory_statistics["usm_host"]; - } - - // there could be unaccounted extra memory reserved by kernels, kept - // in memory pools, etc - // therefore, add a threshold to account for this - float used_memory_threshold = 1.1; - used_device_mem *= used_memory_threshold; - - // total device memory in bytes - auto total_device_memory = m_core->get_property(device, ov::intel_gpu::device_total_mem_size); - - return total_device_memory - used_device_mem; -} - -void ContinuousBatchingPipeline::ContinuousBatchingImpl::_reallocate_kv_cache_if_needed(std::vector& sequence_groups) { - float eps = 1e-5; - auto device = m_device_config->get_device(); - size_t block_size = m_device_config->get_block_size(); - size_t current_num_of_kv_blocks = m_scheduler->get_block_manager().get_total_number_of_kv_blocks(); - - if (!m_scheduler->get_block_manager().block_allocator_initialized()) { - // initial kv-blocks allocation - size_t seq_length_sum = 0; - for (auto idx = 0; idx < sequence_groups.size(); idx++) { - auto seq_length = sequence_groups[idx]->get_prompt_len() * m_kv_blocks_initial_multiplier; - seq_length_sum += std::min(seq_length, sequence_groups[idx]->get_prompt_len() + m_generation_config.get_max_new_tokens(sequence_groups[idx]->get_prompt_len()));; - } - 
m_scheduler->get_block_manager().increase_kv_blocks_number(seq_length_sum); - m_dynamic_memory_allocation = true; - } - else if (m_dynamic_memory_allocation && (m_scheduler->get_block_manager().get_used_percentage() + eps) > m_percentage_threshold_for_cache_increase) { - // get the expected number of kv blocks, considering that generated length will increase by m_cache_growth_factor - size_t expected_logical_kv_blocks_num = 0; - for (auto idx = 0; idx < sequence_groups.size(); idx++) { - auto num_blocks = sequence_groups[idx]->get_prompt_len() / block_size; - for (auto seq: sequence_groups[idx]->get_sequences()) { - num_blocks += std::min((size_t)(seq->get_generated_len() * m_cache_growth_factor), m_generation_config.get_max_new_tokens(sequence_groups[idx]->get_prompt_len())) / block_size; - } - expected_logical_kv_blocks_num += num_blocks; - } - - // get the expected number of physical kv-blocks - size_t expected_physical_blocks_num = (size_t)(current_num_of_kv_blocks * m_cache_growth_factor); - - size_t new_blocks_num = std::min(expected_logical_kv_blocks_num, expected_physical_blocks_num); - - // increase kv-cache - if (device.find("GPU") == std::string::npos) { - m_scheduler->get_block_manager().increase_kv_blocks_number(new_blocks_num); - } - else { - size_t available_gpu_memory = _get_available_gpu_memory(); - size_t required_memory = (new_blocks_num - current_num_of_kv_blocks) * m_device_config->get_block_size_in_bytes(); - if (required_memory <= available_gpu_memory) { - m_scheduler->get_block_manager().increase_kv_blocks_number(new_blocks_num); - } else { - size_t possible_blocks_to_add = available_gpu_memory / m_device_config->get_block_size_in_bytes(); - if (possible_blocks_to_add > 0) { - m_scheduler->get_block_manager().increase_kv_blocks_number(current_num_of_kv_blocks + possible_blocks_to_add); - } - } - } - } - m_cache_manager->allocate_cache_if_needed(m_scheduler->get_block_manager().get_total_number_of_kv_blocks()); -} - void ContinuousBatchingPipeline::ContinuousBatchingImpl::init( std::shared_ptr model, const SchedulerConfig& scheduler_config, @@ -150,8 +68,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init( // as it may lead to performance slowdown can_use_partial_preemption = false; } - - m_scheduler = std::make_shared(device_config.get_block_size(), updated_config, device_config.get_num_layers(), can_use_partial_preemption); + m_scheduler = std::make_shared(device_config.get_block_size(), m_cache_manager, updated_config, device_config.get_num_layers(), can_use_partial_preemption); // and finally create model runner bool is_use_cache_eviction = m_scheduler->get_config().use_cache_eviction; m_model_runner = std::make_shared(infer_request, m_scheduler->get_block_size(), device_config.get_num_layers(), is_use_cache_eviction); @@ -214,13 +131,11 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() { _pull_awaiting_requests(); m_pipeline_metrics.requests = m_requests.size(); - Scheduler::Output scheduler_output; { static ManualTimer timer("scheduling"); timer.start(); m_scheduler->clean_empty_blocks(m_requests); - _reallocate_kv_cache_if_needed(m_requests); scheduler_output = m_scheduler->schedule(m_requests); m_pipeline_metrics.scheduled_requests = scheduler_output.m_scheduled_sequence_groups_ids.size(); m_pipeline_metrics.cache_usage = scheduler_output.m_cache_usage; diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp index 5847f8282c..ad0000ee68 100644 --- 
a/src/cpp/src/continuous_batching_impl.hpp +++ b/src/cpp/src/continuous_batching_impl.hpp @@ -50,8 +50,6 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc void _notify_requests_dropped_by_handle(); void _register_step_cache_usage(float step_cache_usage); float _get_current_running_average_cache_usage() const; - void _reallocate_kv_cache_if_needed(std::vector& sequence_groups); - size_t _get_available_gpu_memory(); void maybe_evict_cache_blocks(const SchedulerConfig& sched_config); void init(std::shared_ptr model, diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp index 3fe3b08380..cbd8ef64a2 100644 --- a/src/cpp/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -7,10 +7,12 @@ #include #include +#include "openvino/runtime/intel_gpu/properties.hpp" #include "openvino/genai/scheduler_config.hpp" #include "device_config.hpp" #include "block_manager.hpp" #include "sequence_group.hpp" +#include "cache_manager.hpp" namespace ov::genai { class Scheduler { @@ -20,6 +22,13 @@ class Scheduler { BlockManager m_block_manager; friend class CacheStateDumper; + bool m_dynamic_memory_allocation = false; + + // Dynamic KV-cache allocation params + size_t m_kv_blocks_initial_multiplier = 2; + const float m_cache_growth_factor = 2; // commmon values 1.5 or 2 + + std::shared_ptr m_cache_manager; public: struct Output { // IDs of scheduled groups @@ -36,15 +45,25 @@ class Scheduler { float m_cache_usage = 0.0; }; - explicit Scheduler(size_t block_size, const SchedulerConfig & config = {}, size_t num_layers = 1, bool can_use_partial_preemption = true) : + explicit Scheduler(size_t block_size, std::shared_ptr cache_manager, const SchedulerConfig & config = {}, size_t num_layers = 1, bool can_use_partial_preemption = true) : + m_cache_manager(cache_manager), m_can_use_partial_preemption(can_use_partial_preemption), m_config(config), m_block_manager(m_config.num_kv_blocks, m_config.enable_prefix_caching, block_size, num_layers) { + + // allocate kv-cache if the number of kv blocks is determined, + // otherwise cache will be allocated dynamically + if (m_block_manager.block_allocator_initialized()) { + m_cache_manager->allocate_cache_if_needed(m_block_manager.get_total_number_of_kv_blocks()); + } OPENVINO_ASSERT(num_layers != 0, "num_layers must be non-zero"); } Output schedule(std::vector& sequence_groups) { Output scheduler_output; + if (!m_block_manager.block_allocator_initialized()) { + _initialize_cache(sequence_groups); + } if (m_config.dynamic_split_fuse) { // deepspeed-mii case @@ -66,7 +85,6 @@ class Scheduler { _clear_waiting_sequences(sequence_groups); scheduler_output.m_cache_usage = m_block_manager.get_used_percentage(); - return scheduler_output; } @@ -107,10 +125,6 @@ class Scheduler { m_block_manager.free_blocks_from_sequence(seq_id, per_layer_logical_block_indices_to_free); } - BlockManager& get_block_manager() { - return m_block_manager; - } - private: static size_t _num_running_sequence_groups(const std::vector& sequence_groups) { size_t num_running = 0; @@ -241,6 +255,9 @@ class Scheduler { size_t available_slots = currently_allocated_token_slots - occupied_token_slots, required_slots = num_scheduled_tokens > available_slots ? 
num_scheduled_tokens - available_slots : 0; size_t num_required_blocks = (required_slots + block_size - 1) / block_size, num_free_blocks = m_block_manager.num_free_blocks(); + if (num_free_blocks == 0) { + _try_increase_cache(); + } size_t num_scheduled_blocks = std::min(num_required_blocks, num_free_blocks); // some scheduled blocks can be no fully occupied, so we need to take min between num_scheduled_blocks // and total "scheduled capacity" @@ -297,8 +314,11 @@ class Scheduler { // if we can't preemt any more sequences, clear scheduled tokens and move to next sequence if (!m_block_manager.can_append_slots(sequence_group)){ - sequence_group->clear_scheduled_tokens(); - continue; + _try_increase_cache(); + if (!m_block_manager.can_append_slots(sequence_group)) { + sequence_group->clear_scheduled_tokens(); + continue; + } } // allocate new slots @@ -374,8 +394,11 @@ class Scheduler { // apply KV cache limitations size_t block_size = get_block_size(); const size_t num_required_blocks = (sequence_len + block_size - 1) / block_size; - if (!m_block_manager.can_allocate_blocks(num_required_blocks)) - break; + if (!m_block_manager.can_allocate_blocks(num_required_blocks)) { + _try_increase_cache(); + if (!m_block_manager.can_allocate_blocks(num_required_blocks)) + break; + } // add scheduling information { @@ -409,6 +432,82 @@ class Scheduler { sequence_groups[sequence_group_id]->clear_waiting_sequences(); } } + + size_t _get_available_gpu_memory() { + auto device_config = m_cache_manager->get_device_config(); + auto core = m_cache_manager->get_core(); + auto device = device_config->get_device(); + OPENVINO_ASSERT(device.find("GPU") != std::string::npos, "_get_available_gpu_memory() is applicable for GPU only."); + auto memory_statistics = core->get_property(device, ov::intel_gpu::memory_statistics); + auto device_type = core->get_property(device, ov::device::type); + + // sum up all used device memory + std::vector device_memory_types = {"cl_mem", "usm_device"}; + size_t used_device_mem = 0; + for (auto mem_type: device_memory_types) { + used_device_mem += memory_statistics[mem_type]; + } + + if (device_type == ov::device::Type::INTEGRATED) { + used_device_mem += memory_statistics["usm_host"]; + } + + // there could be unaccounted extra memory reserved by kernels, kept + // in memory pools, etc + // therefore, add a threshold to account for this + float used_memory_threshold = 1.1; + used_device_mem *= used_memory_threshold; + + // total device memory in bytes + auto total_device_memory = core->get_property(device, ov::intel_gpu::device_total_mem_size); + + return total_device_memory - used_device_mem; + } + + void _initialize_cache(const std::vector& sequence_groups) { + size_t seq_length_sum = 0; + for (auto idx = 0; idx < sequence_groups.size(); idx++) { + auto seq_length = sequence_groups[idx]->get_prompt_len() * m_kv_blocks_initial_multiplier; + auto gen_config = sequence_groups[idx]->get_sampling_parameters(); + seq_length = std::min(seq_length, sequence_groups[idx]->get_prompt_len() + gen_config.get_max_new_tokens(sequence_groups[idx]->get_prompt_len())); + if (sequence_groups[idx]->get_sampling_parameters().is_beam_search()) { + + seq_length *= sequence_groups[idx]->get_sampling_parameters().num_beams; + } + seq_length_sum += seq_length; + } + m_block_manager.increase_kv_blocks_number(seq_length_sum); + m_cache_manager->allocate_cache_if_needed(m_block_manager.get_total_number_of_kv_blocks()); + m_dynamic_memory_allocation = true; + } + + void _try_increase_cache() { + if 
(!m_dynamic_memory_allocation) { + return; + } + auto device_config = m_cache_manager->get_device_config(); + auto device = device_config->get_device(); + size_t current_num_of_kv_blocks = m_block_manager.get_total_number_of_kv_blocks(); + size_t new_blocks_num = current_num_of_kv_blocks * m_cache_growth_factor; + + if (device.find("GPU") == std::string::npos) { + m_block_manager.increase_kv_blocks_number(new_blocks_num); + } + else { + size_t available_gpu_memory = _get_available_gpu_memory(); + size_t required_memory = (new_blocks_num - current_num_of_kv_blocks) * device_config->get_block_size_in_bytes(); + if (required_memory <= available_gpu_memory) { + m_block_manager.increase_kv_blocks_number(new_blocks_num); + } else { + size_t possible_blocks_to_add = available_gpu_memory / device_config->get_block_size_in_bytes(); + if (possible_blocks_to_add > 0) { + m_block_manager.increase_kv_blocks_number(current_num_of_kv_blocks + possible_blocks_to_add); + } + } + } + m_cache_manager->allocate_cache_if_needed(m_block_manager.get_total_number_of_kv_blocks()); + } + }; } diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp index 0fb3ea38be..ea1720faa2 100644 --- a/tests/cpp/scheduler.cpp +++ b/tests/cpp/scheduler.cpp @@ -18,6 +18,37 @@ void clear_finished_sequences(std::vector& requests) { }); requests.erase(new_end, requests.end()); } +std::shared_ptr get_model(size_t num_layers) { + ov::NodeVector keys; + ov::NodeVector values; + ov::ParameterVector params; + auto shape = ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()}); + for (size_t i = 0; i < num_layers; i++) { + auto key = std::make_shared(ov::element::f16, shape); + auto value = std::make_shared(ov::element::f16, shape); + key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)}); + value->get_output_tensor(0).set_names({"value_cache." 
+ std::to_string(i)}); + keys.push_back(key); + values.push_back(value); + params.push_back(key); + params.push_back(value); + } + const auto& concat1 = std::make_shared(keys, 1); + const auto& concat2 = std::make_shared(values, 1); + auto model = std::make_shared(ov::NodeVector{concat1, concat2}, params); + return std::make_shared(ov::NodeVector{concat1, concat2}, params); +} + +std::shared_ptr init_cache_manager(SchedulerConfig scheduler_config) { + ov::Core core = ov::Core(); + size_t num_decoder_layers = 12; + ov::InferRequest request = core.compile_model(get_model(num_decoder_layers)).create_infer_request(); + size_t head_size = 64, head_size_u8 = head_size + 8; + size_t num_kv_heads = 12; + ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); + device_config.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers); + return std::make_shared(device_config, request, core); +} TEST(TestScheduler, general_test) { std::array configs = {SchedulerConfig(), SchedulerConfig()}; @@ -43,7 +74,7 @@ TEST(TestScheduler, general_test) { std::vector requests = {sequence_group1, sequence_group2, sequence_group3}; // schedule 3 sequence groups that use 6 kv blocks - Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); auto out1 = scheduler.schedule(requests); std::vector ref_ids = {0, 1, 2}; @@ -144,7 +175,7 @@ TEST_P(AppendSlotsSchedulerTest, test_append_slots_considers_all_sequences) { auto idx1 = (*sequence_group2)[0]->get_id(); std::vector requests = {sequence_group1, sequence_group2}; - Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); auto out1 = scheduler.schedule(requests); std::vector ref_ids = {0, 1}; @@ -212,7 +243,7 @@ TEST_P(PartialPreemptionSchedulerTest, test_partial_preemption) { // schedule 2 sequence groups that use 5 kv blocks - Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); auto out0 = scheduler.schedule(requests); for (auto seq: requests) { @@ -297,7 +328,7 @@ TEST(TestScheduler, test_partial_preemption_beam_search) { sequence_group->set_sequence_group_ptr(sequence_group); std::vector requests = {sequence_group}; - Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); auto out = scheduler.schedule(requests); for (auto sequence: sequence_group->get_not_finished_sequences()) { sequence->append_token(token, 0.7); @@ -408,7 +439,7 @@ TEST(TestScheduler, test_partially_preempted_prompt) { std::vector requests = {sequence_group1, sequence_group2}; // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks. 
- Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); auto out1 = scheduler.schedule(requests); for (auto seq: requests) { @@ -502,7 +533,7 @@ TEST(TestScheduler, prefix_caching_test) { std::vector prompt_tokens = {0,1,2,3,4,5,6,7}; std::vector histrory_tokens = {}; // schedule prompt - Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); size_t chat_iterations = 10; @@ -565,7 +596,7 @@ TEST(TestScheduler, prefix_caching_test_two_identical_sequences) { std::vector prompt_tokens = {0,1,2,3,4,5,6,7}; std::vector histrory_tokens = {}; // schedule prompt - Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); size_t chat_iterations = 10; @@ -639,7 +670,7 @@ TEST(TestScheduler, prefix_caching_with_max_new_tokens_equal_1) { for (auto scheduler_config: configs) { std::vector prompt_tokens = {0,1,2,3,4,5,6,7}; // schedule prompt - Scheduler scheduler = Scheduler(32, scheduler_config); + Scheduler scheduler = Scheduler(32, init_cache_manager(scheduler_config), scheduler_config); size_t chat_iterations = 2; @@ -700,7 +731,7 @@ TEST(TestScheduler, test_partially_preempted_prompt_not_allowed) { // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks. const bool can_use_partial_preemption = false; - Scheduler scheduler = Scheduler(4, scheduler_config, 1, can_use_partial_preemption); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config, 1, can_use_partial_preemption); auto out1 = scheduler.schedule(requests); for (auto req : requests) @@ -774,7 +805,7 @@ TEST(TestScheduler, test_partially_preempted_prompt_not_allowed2) { // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks. const bool can_use_partial_preemption = false; - Scheduler scheduler = Scheduler(4, scheduler_config, 1, can_use_partial_preemption); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config, 1, can_use_partial_preemption); scheduler.schedule(requests); for (auto req: requests) req->finish_iteration(); @@ -888,7 +919,7 @@ TEST(TestScheduler, FullyPreemptsCacheEvictedSequences) { std::vector requests = {sequence_group1, sequence_group2}; - Scheduler scheduler = Scheduler(2, scheduler_config); + Scheduler scheduler = Scheduler(2, init_cache_manager(scheduler_config), scheduler_config); // prompt phase - schedules 1 block for seq 1, 5 blocks for seq 2 auto out = scheduler.schedule(requests); From 38a42d62135bb613d4f8ae993fbaefa77e2faf5d Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 20 Dec 2024 16:57:24 +0100 Subject: [PATCH 18/31] Made sheduler config not needed for prompt lookup. 
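With dynamic KV-cache allocation in place, the prompt-lookup path no longer needs an explicit SchedulerConfig: the continuous-batching backend starts with zero pre-allocated blocks and grows the cache on demand. Below is a hedged sketch of the resulting minimal C++ usage, reusing the constructor call shown in the sample diff; `num_assistant_tokens` is an assumption carried over from the existing sample and the exact GenerationConfig fields may differ.

    // Minimal sketch, assuming the ov::genai API as used in the samples touched by
    // this series; note that no cache_size / num_kv_blocks is specified anywhere.
    #include <iostream>
    #include <string>
    #include "openvino/genai/llm_pipeline.hpp"

    int main(int argc, char* argv[]) {
        if (argc < 3) return 1;
        std::string model_path = argv[1];
        std::string prompt = argv[2];

        // prompt_lookup(true) routes to the continuous-batching backend,
        // which now sizes the KV-cache dynamically.
        ov::genai::LLMPipeline pipe(model_path, "CPU", ov::genai::prompt_lookup(true));

        ov::genai::GenerationConfig config;
        config.max_new_tokens = 100;
        config.num_assistant_tokens = 5;  // assumption: same knob as in the existing sample

        std::string result = pipe.generate(prompt, config);
        std::cout << result << std::endl;
        return 0;
    }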
--- .../prompt_lookup_decoding_lm.cpp | 3 +-- .../prompt_lookup_decoding_lm.py | 5 +---- src/cpp/src/llm_pipeline.cpp | 9 ++++++--- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp index e692110027..451a11b6f7 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp +++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp @@ -28,8 +28,7 @@ int main(int argc, char* argv[]) try { ov::genai::LLMPipeline pipe( model_path, device, - ov::genai::prompt_lookup(true), - ov::genai::scheduler_config(scheduler_config)); + ov::genai::prompt_lookup(true)); auto streamer = [](std::string subword) { std::cout << subword << std::flush; diff --git a/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py b/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py index 557897b6b1..726391ba9b 100755 --- a/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py +++ b/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py @@ -18,11 +18,8 @@ def main(): args = parser.parse_args() device = 'CPU' - scheduler_config = openvino_genai.SchedulerConfig() - # cache params - scheduler_config.cache_size = 2 - pipe = openvino_genai.LLMPipeline(args.model_dir, device, scheduler_config=scheduler_config, prompt_lookup=True) + pipe = openvino_genai.LLMPipeline(args.model_dir, device, prompt_lookup=True) config = openvino_genai.GenerationConfig() config.max_new_tokens = 100 diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index cd3ecf8ae3..06b58bee49 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -701,7 +701,8 @@ ov::genai::LLMPipeline::LLMPipeline( ){ auto start_time = std::chrono::steady_clock::now(); if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || - properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end()) { + properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || + properties.find(ov::genai::prompt_lookup.name()) != properties.end()) { auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties); m_pimpl = std::make_unique(models_path, tokenizer, scheduler_config, device, plugin_config); } else if (device == "NPU") { @@ -721,7 +722,8 @@ ov::genai::LLMPipeline::LLMPipeline( auto start_time = std::chrono::steady_clock::now(); if (config.find(ov::genai::scheduler_config.name()) != config.end() || - config.find(utils::DRAFT_MODEL_ARG_NAME) != config.end()) { + config.find(utils::DRAFT_MODEL_ARG_NAME) != config.end() || + config.find(ov::genai::prompt_lookup.name()) != config.end()) { auto [plugin_config, scheduler_config] = utils::split_scheduler_config(config); m_pimpl = std::make_unique(models_path, scheduler_config, device, plugin_config); } else if (device == "NPU") { @@ -745,7 +747,8 @@ ov::genai::LLMPipeline::LLMPipeline( auto start_time = std::chrono::steady_clock::now(); if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end() || - plugin_config.find(utils::DRAFT_MODEL_ARG_NAME) != plugin_config.end()) { + plugin_config.find(utils::DRAFT_MODEL_ARG_NAME) != plugin_config.end() || + plugin_config.find(ov::genai::prompt_lookup.name()) != plugin_config.end()){ auto [plugin_config_, scheduler_config] = utils::split_scheduler_config(plugin_config); m_pimpl = std::make_unique(model_str, weights_tensor, From 
c7d54dda4e8b6264f07c596ab3169c7b8920c597 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 20 Dec 2024 16:58:53 +0100 Subject: [PATCH 19/31] Minor correction. --- .../prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp index 451a11b6f7..8b48dbade0 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp +++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp @@ -22,9 +22,6 @@ int main(int argc, char* argv[]) try { std::string device = "CPU"; - ov::genai::SchedulerConfig scheduler_config; - scheduler_config.cache_size = 5; - ov::genai::LLMPipeline pipe( model_path, device, From 51cb0a89cfed4212d5d91972ad9fd772af9f17d0 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 20 Dec 2024 17:25:11 +0100 Subject: [PATCH 20/31] Fixed error. --- .../speculative_decoding_lm.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index ca5a60ec93..aa4ce0eed0 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -6,9 +6,9 @@ #include "openvino/genai/llm_pipeline.hpp" int main(int argc, char* argv[]) try { - if (4 != argc) { - throw std::runtime_error(std::string{"Usage: "} + argv[0] + " ''"); - } + // if (4 != argc) { + // throw std::runtime_error(std::string{"Usage: "} + argv[0] + " ''"); + // } ov::genai::GenerationConfig config; config.max_new_tokens = 100; @@ -18,9 +18,9 @@ int main(int argc, char* argv[]) try { // add parameter to enable speculative decoding to generate candidates by draft_model while candidate probability is higher than `assistant_confidence_threshold` // config.assistant_confidence_threshold = 0.4; - std::string main_model_path = argv[1]; - std::string draft_model_path = argv[2]; - std::string prompt = argv[3]; + std::string main_model_path = "/home/panas/test_models/spec_dec/Llama-2-7b-chat-hf/"; + std::string draft_model_path = "/home/panas/test_models/spec_dec/tiny-llama-1.1b-chat/"; + std::string prompt = "What is openvino?"; // User can run main and draft model on different devices. // Please, set device for main model in `LLMPipeline` constructor and in in `ov::genai::draft_model` for draft. From c4c8c259f28520ac9eddf12e319622ad638b8e45 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 20 Dec 2024 17:27:11 +0100 Subject: [PATCH 21/31] Removed wrong changes. 
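For context on why dropping the scheduler config from the samples is safe: the llm_pipeline.cpp hunks earlier in this series extend the backend selection so that `prompt_lookup` alone is enough to pick the continuous-batching adapter. The helper below is only a sketch condensing that condition; the function name is illustrative and the `"draft_model"` literal is an assumption about the value of `DRAFT_MODEL_ARG_NAME`.

    // Sketch only: condenses the backend-selection check from the llm_pipeline.cpp hunks above.
    #include "openvino/genai/llm_pipeline.hpp"

    static bool needs_continuous_batching(const ov::AnyMap& properties) {
        return properties.count(ov::genai::scheduler_config.name()) != 0 ||
               properties.count("draft_model") != 0 ||   // assumption: value of utils::DRAFT_MODEL_ARG_NAME
               properties.count(ov::genai::prompt_lookup.name()) != 0;
    }

Any one of these keys makes the LLMPipeline constructor instantiate the continuous-batching implementation; everything else stays on the stateful (or NPU) path.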
--- .../speculative_decoding_lm.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index aa4ce0eed0..ca5a60ec93 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -6,9 +6,9 @@ #include "openvino/genai/llm_pipeline.hpp" int main(int argc, char* argv[]) try { - // if (4 != argc) { - // throw std::runtime_error(std::string{"Usage: "} + argv[0] + " ''"); - // } + if (4 != argc) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " ''"); + } ov::genai::GenerationConfig config; config.max_new_tokens = 100; @@ -18,9 +18,9 @@ int main(int argc, char* argv[]) try { // add parameter to enable speculative decoding to generate candidates by draft_model while candidate probability is higher than `assistant_confidence_threshold` // config.assistant_confidence_threshold = 0.4; - std::string main_model_path = "/home/panas/test_models/spec_dec/Llama-2-7b-chat-hf/"; - std::string draft_model_path = "/home/panas/test_models/spec_dec/tiny-llama-1.1b-chat/"; - std::string prompt = "What is openvino?"; + std::string main_model_path = argv[1]; + std::string draft_model_path = argv[2]; + std::string prompt = argv[3]; // User can run main and draft model on different devices. // Please, set device for main model in `LLMPipeline` constructor and in in `ov::genai::draft_model` for draft. From bfcf9ffcd8f56638eb32ee971c35b42103715b6b Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 20 Dec 2024 17:27:33 +0100 Subject: [PATCH 22/31] Fixed error. --- src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp index 46b7b106a6..257c20bf01 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp @@ -52,8 +52,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con size_t main_cache_size = std::ceil(main_scheduler_config.cache_size * (1.f - k)), draft_cache_size = main_scheduler_config.cache_size - main_cache_size; - OPENVINO_ASSERT(main_cache_size > 0, "KV cache model cache size should be > 0"); - if (draft_cache_size == 0) { + if (draft_cache_size == 0 && main_cache_size > 0) { main_cache_size -= (main_cache_size > 1 ? 1 : 0); draft_cache_size = 1; } From 11b5e33db9ec85dd2a351c7d9f9f29a252e378dd Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 20 Dec 2024 17:30:03 +0100 Subject: [PATCH 23/31] Minor correction. 
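The speculative_decoding_impl.cpp hunk above relaxes the cache-split guard: when `cache_size` is left at 0 (the dynamic-allocation default) neither the main nor the draft cache has to be forced to a positive size, so the old assert is gone and the "borrow one unit for the draft" correction only fires when there is cache to split. A hedged sketch of that arithmetic follows; the function name is illustrative.

    // Illustrative sketch of the main/draft KV-cache split from the hunk above.
    // total_cache_size is the user-provided cache size and k is the draft model's share.
    #include <cmath>
    #include <cstddef>

    static void split_kv_cache(std::size_t total_cache_size, float k,
                               std::size_t& main_cache_size, std::size_t& draft_cache_size) {
        main_cache_size  = static_cast<std::size_t>(std::ceil(total_cache_size * (1.f - k)));
        draft_cache_size = total_cache_size - main_cache_size;
        if (draft_cache_size == 0 && main_cache_size > 0) {
            // give the draft model at least one unit, taken from the main model when possible
            main_cache_size -= (main_cache_size > 1 ? 1 : 0);
            draft_cache_size = 1;
        }
    }

For example, a total of 5 with k = 0.2 gives main 4 / draft 1, while a total of 0 leaves both at 0, which is exactly the case that now falls through to dynamic allocation instead of tripping an assert.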
--- .../speculative_decoding_lm.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py index 217b8a2730..919aa50124 100755 --- a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py +++ b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py @@ -14,20 +14,20 @@ def streamer(subword): return False def main(): - parser = argparse.ArgumentParser() - parser.add_argument('model_dir') - parser.add_argument('draft_model_dir') - parser.add_argument('prompt') - args = parser.parse_args() + # parser = argparse.ArgumentParser() + # parser.add_argument('model_dir') + # parser.add_argument('draft_model_dir') + # parser.add_argument('prompt') + # args = parser.parse_args() # User can run main and draft model on different devices. # Please, set device for main model in `openvino_genai.LLMPipeline` constructor and in openvino_genai.draft_model` for draft. main_device = 'CPU' # GPU can be used as well draft_device = 'CPU' - draft_model = openvino_genai.draft_model(args.draft_model_dir, draft_device) + draft_model = openvino_genai.draft_model("/home/panas/test_models/spec_dec/tiny-llama-1.1b-chat/", draft_device) - pipe = openvino_genai.LLMPipeline(args.model_dir, main_device, draft_model=draft_model) + pipe = openvino_genai.LLMPipeline("/home/panas/test_models/spec_dec/Llama-2-7b-chat-hf/", main_device, draft_model=draft_model) config = openvino_genai.GenerationConfig() config.max_new_tokens = 100 @@ -39,7 +39,7 @@ def main(): # Since the streamer is set, the results will be printed # every time a new token is generated and put into the streamer queue. - pipe.generate(args.prompt, config, streamer) + pipe.generate("What is openvino?", config, streamer) if '__main__' == __name__: main() From 64dab76e29d542feec2b5f012bb29ef9494887c7 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 20 Dec 2024 17:34:32 +0100 Subject: [PATCH 24/31] Removed wrong changes. --- .../speculative_decoding_lm.cpp | 1 - .../speculative_decoding_lm.py | 16 ++++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index ca5a60ec93..e10228863f 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -26,7 +26,6 @@ int main(int argc, char* argv[]) try { // Please, set device for main model in `LLMPipeline` constructor and in in `ov::genai::draft_model` for draft. std::string main_device = "CPU", draft_device = "CPU"; - // Different devices require different block sizes, so different scheduler configs need to be set. 
ov::genai::LLMPipeline pipe( main_model_path, main_device, diff --git a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py index 919aa50124..217b8a2730 100755 --- a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py +++ b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py @@ -14,20 +14,20 @@ def streamer(subword): return False def main(): - # parser = argparse.ArgumentParser() - # parser.add_argument('model_dir') - # parser.add_argument('draft_model_dir') - # parser.add_argument('prompt') - # args = parser.parse_args() + parser = argparse.ArgumentParser() + parser.add_argument('model_dir') + parser.add_argument('draft_model_dir') + parser.add_argument('prompt') + args = parser.parse_args() # User can run main and draft model on different devices. # Please, set device for main model in `openvino_genai.LLMPipeline` constructor and in openvino_genai.draft_model` for draft. main_device = 'CPU' # GPU can be used as well draft_device = 'CPU' - draft_model = openvino_genai.draft_model("/home/panas/test_models/spec_dec/tiny-llama-1.1b-chat/", draft_device) + draft_model = openvino_genai.draft_model(args.draft_model_dir, draft_device) - pipe = openvino_genai.LLMPipeline("/home/panas/test_models/spec_dec/Llama-2-7b-chat-hf/", main_device, draft_model=draft_model) + pipe = openvino_genai.LLMPipeline(args.model_dir, main_device, draft_model=draft_model) config = openvino_genai.GenerationConfig() config.max_new_tokens = 100 @@ -39,7 +39,7 @@ def main(): # Since the streamer is set, the results will be printed # every time a new token is generated and put into the streamer queue. - pipe.generate("What is openvino?", config, streamer) + pipe.generate(args.prompt, config, streamer) if '__main__' == __name__: main() From bb24a36066de19a6c0219ce85ebe3fcac017eb47 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Mon, 23 Dec 2024 10:13:53 +0100 Subject: [PATCH 25/31] Fix. 
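The hunks that follow fix a subtle swap in `increase_cache()`: the enlarged key tensor was being created with the value-cache shape and vice versa. As a reference for what the grow step is meant to do, here is a hedged sketch of growing a single cache tensor on CPU, assuming (as in these patches) that only the leading block-count dimension changes; the function name is illustrative.

    // Sketch of the grow-and-copy step corrected below, assuming only the first
    // (block count) dimension grows so the old contents form a contiguous prefix.
    #include <cstdint>
    #include <cstring>
    #include "openvino/runtime/tensor.hpp"

    static ov::Tensor grow_cache_tensor(const ov::Tensor& old_cache, const ov::Shape& new_shape) {
        ov::Tensor new_cache(old_cache.get_element_type(), new_shape);

        // copy the existing blocks into the leading region of the new tensor
        ov::Coordinate start{0, 0, 0, 0};
        ov::Coordinate end(old_cache.get_shape());
        ov::Tensor dst_roi(new_cache, start, end);
        old_cache.copy_to(dst_roi);

        // zero the newly added tail so uninitialized data (NaNs) cannot reach the kernels
        auto* tail = static_cast<uint8_t*>(new_cache.data()) + old_cache.get_byte_size();
        std::memset(tail, 0, new_cache.get_byte_size() - old_cache.get_byte_size());
        return new_cache;
    }

In the real CacheManager the same pattern is applied per decoder layer to both key and value tensors, which is where the shape mix-up slipped in.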
--- src/cpp/src/cache_manager.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp index dee32b7793..0d34dc645e 100644 --- a/src/cpp/src/cache_manager.hpp +++ b/src/cpp/src/cache_manager.hpp @@ -101,8 +101,8 @@ class CacheManager { ov::Coordinate end_key(m_key_cache[decoder_layer_id].get_shape()); ov::Coordinate end_value(m_value_cache[decoder_layer_id].get_shape()); - ov::Tensor key_cache(m_device_config.get_cache_precision(), new_value_cache_shape); - ov::Tensor value_cache(m_device_config.get_cache_precision(), new_key_cache_shape); + ov::Tensor key_cache(m_device_config.get_cache_precision(), new_key_cache_shape); + ov::Tensor value_cache(m_device_config.get_cache_precision(), new_value_cache_shape); // copy current cache data ov::Tensor dst_key_roi(key_cache, start_key, end_key); @@ -131,8 +131,8 @@ class CacheManager { ov::Coordinate end_key(m_key_cache[decoder_layer_id].get_shape()); ov::Coordinate end_value(m_value_cache[decoder_layer_id].get_shape()); - ov::Tensor key_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), new_value_cache_shape); - ov::Tensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), new_key_cache_shape); + ov::Tensor key_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), new_key_cache_shape); + ov::Tensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), new_value_cache_shape); // copy current cache data ov::Tensor dst_key_roi(key_cache, start_key, end_key); From 13f9f087b1533e44b40bb60bd436a57110189409 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Mon, 23 Dec 2024 17:11:29 +0100 Subject: [PATCH 26/31] Fix of cache increasing for gpu. --- src/cpp/src/cache_manager.hpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp index 0d34dc645e..7358a4574d 100644 --- a/src/cpp/src/cache_manager.hpp +++ b/src/cpp/src/cache_manager.hpp @@ -131,14 +131,15 @@ class CacheManager { ov::Coordinate end_key(m_key_cache[decoder_layer_id].get_shape()); ov::Coordinate end_value(m_value_cache[decoder_layer_id].get_shape()); - ov::Tensor key_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), new_key_cache_shape); - ov::Tensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), new_value_cache_shape); + ov::RemoteTensor key_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), new_key_cache_shape); + ov::RemoteTensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), new_value_cache_shape); // copy current cache data - ov::Tensor dst_key_roi(key_cache, start_key, end_key); - ov::Tensor dst_value_roi(value_cache, start_value, end_value); - m_key_cache[decoder_layer_id].copy_to(dst_key_roi); - m_value_cache[decoder_layer_id].copy_to(dst_value_roi); + ov::RemoteTensor dst_key_roi(key_cache, start_key, end_key); + ov::RemoteTensor dst_value_roi(value_cache, start_value, end_value); + + dst_key_roi.copy_from(m_key_cache[decoder_layer_id]); + dst_value_roi.copy_from(m_value_cache[decoder_layer_id]); // set new cache tensors m_key_cache[decoder_layer_id] = key_cache; From eebac1feb1f09901f95ec152cd384082cf0d8e0e Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 24 Dec 2024 13:57:49 +0100 Subject: [PATCH 27/31] Applied comments. 
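The GPU counterpart (fixed in the previous patch and folded into `allocate_cache_if_needed` below) cannot memset or host-copy device memory, so the enlarged cache is created through the device's remote context and the old contents are pulled in with `copy_from` on a RemoteTensor ROI. The sketch below reuses only the calls that appear in these hunks; the function name is illustrative and only the leading block-count dimension is expected to change.

    // Hedged sketch of the GPU grow step, assuming the remote-context API used in the hunks above/below.
    #include <string>
    #include "openvino/runtime/core.hpp"
    #include "openvino/runtime/remote_tensor.hpp"

    static ov::Tensor grow_gpu_cache_tensor(ov::Core& core, const std::string& device,
                                            const ov::Tensor& old_cache, const ov::Shape& new_shape) {
        auto remote_context = core.get_default_context(device);
        ov::RemoteTensor new_cache = remote_context.create_tensor(old_cache.get_element_type(), new_shape);

        // device-side copy of the existing blocks into the leading region of the new tensor
        ov::Coordinate start{0, 0, 0, 0};
        ov::Coordinate end(old_cache.get_shape());
        ov::RemoteTensor dst_roi(new_cache, start, end);
        dst_roi.copy_from(old_cache);

        return new_cache;
    }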
--- src/cpp/src/block_manager.hpp | 16 +-- src/cpp/src/cache_manager.hpp | 126 ++++++++++------------- src/cpp/src/continuous_batching_impl.cpp | 3 - src/cpp/src/continuous_batching_impl.hpp | 9 -- src/cpp/src/scheduler.hpp | 61 ++++++----- tests/cpp/cache_manager.cpp | 5 - 6 files changed, 90 insertions(+), 130 deletions(-) diff --git a/src/cpp/src/block_manager.hpp b/src/cpp/src/block_manager.hpp index d964032db0..4ca263777b 100644 --- a/src/cpp/src/block_manager.hpp +++ b/src/cpp/src/block_manager.hpp @@ -195,7 +195,6 @@ class BlockAllocator { size_t m_num_layers; bool m_enable_prefix_caching; ov::genai::OverwritableBlocksHashStore m_overwriteable_blocks; - bool m_initialized = false; public: /** * Constructs the BlockAllocator. @@ -216,7 +215,9 @@ class BlockAllocator { per_layer_block_list.push_back(std::make_shared(block_id)); } } - m_initialized = true; + } + else { + m_free_blocks_num = std::vector(m_num_layers, 0); } } @@ -227,10 +228,6 @@ class BlockAllocator { void increase_kv_blocks_number(size_t new_kv_blocks_count) { OPENVINO_ASSERT(new_kv_blocks_count > m_total_num_blocks, "New blocks number should be more than previous blocks number."); - if (!m_initialized) { - m_free_blocks_num = std::vector(m_num_layers, 0); - m_initialized = true; - } size_t added_blocks = new_kv_blocks_count - m_total_num_blocks; for (auto idx = 0; idx < m_free_blocks_num.size(); idx++) { m_free_blocks_num[idx] += added_blocks; @@ -243,9 +240,6 @@ class BlockAllocator { m_total_num_blocks = new_kv_blocks_count; } - bool is_inilialized() const { - return m_initialized; - } /** * Returns the number of free blocks for a given layer. @@ -665,10 +659,6 @@ class BlockManager { return m_allocator.num_free_blocks(0); // relying on the invariant that all layers have identical number of blocks } - bool block_allocator_initialized() const { - return m_allocator.is_inilialized(); - } - /** * @param num_blocks A number of KV cache blocks * @return Whether this number of KV cache blocks may be assigned to new sequences. diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp index 7358a4574d..0c04823f4f 100644 --- a/src/cpp/src/cache_manager.hpp +++ b/src/cpp/src/cache_manager.hpp @@ -44,111 +44,91 @@ class CacheManager { if (m_num_allocated_kv_blocks >= num_kv_blocks) { return; } - if (m_num_allocated_kv_blocks > 0) { - increase_cache(num_kv_blocks); - return; - } + OPENVINO_ASSERT(m_key_cache.size() == m_value_cache.size()); m_num_allocated_kv_blocks = num_kv_blocks; ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), num_kv_blocks); ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), num_kv_blocks); const std::string device_name = m_device_config.get_device(); + ov::Coordinate start_key{0,0,0,0}; + ov::Coordinate start_value{0,0,0,0}; + if (device_name.find("GPU") == std::string::npos) {// Allocate KV caches for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { ov::Tensor key_cache(m_device_config.get_cache_precision(), key_cache_shape); ov::Tensor value_cache(m_device_config.get_cache_precision(), value_cache_shape); - // Some optimizations like AVX2, AVX512, AMX require a minimal shape and - // perform multiplying by zero on the excess data. Uninitialized tensor data contain NAN's, - // so NAN * 0 returns non-zero invalid data. - // So we need to set zeros to all newly allocated tensors data. 
- std::memset(key_cache.data(), 0, key_cache.get_byte_size()); - std::memset(value_cache.data(), 0, value_cache.get_byte_size()); - - m_key_cache.emplace_back(key_cache); - m_value_cache.emplace_back(value_cache); - - update_request_tensor(decoder_layer_id); - } - } else { - auto remote_context = m_core.get_default_context(device_name); - for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { - ov::Tensor key_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), - key_cache_shape); - ov::Tensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), - value_cache_shape); - - m_key_cache.emplace_back(key_cache); - m_value_cache.emplace_back(value_cache); - - update_request_tensor(decoder_layer_id); - } - } - } + auto key_cache_roi_end = static_cast(key_cache.data()); + auto value_cache_roi_end = static_cast(value_cache.data()); + size_t key_roi_size_byte = 0; + size_t value_roi_size_byte = 0; - void increase_cache(size_t num_kv_blocks) { - OPENVINO_ASSERT(num_kv_blocks > m_num_allocated_kv_blocks); - ov::Shape new_value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), num_kv_blocks); - ov::Shape new_key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), num_kv_blocks); + if (m_key_cache.size() > decoder_layer_id) { + ov::Coordinate end_key = m_key_cache[decoder_layer_id].get_shape(); + ov::Coordinate end_value = m_value_cache[decoder_layer_id].get_shape(); - const std::string device_name = m_device_config.get_device(); - ov::Coordinate start_key{0,0,0,0}; - ov::Coordinate start_value{0,0,0,0}; + key_roi_size_byte = m_key_cache[decoder_layer_id].get_byte_size(); + value_roi_size_byte = m_value_cache[decoder_layer_id].get_byte_size(); + key_cache_roi_end = static_cast(key_cache.data()) + key_roi_size_byte; + value_cache_roi_end = static_cast(value_cache.data()) + value_roi_size_byte; + + // copy current cache data + ov::Tensor dst_key_roi(key_cache, start_key, end_key); + ov::Tensor dst_value_roi(value_cache, start_value, end_value); - if (device_name.find("GPU") == std::string::npos) { - for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { - ov::Coordinate end_key(m_key_cache[decoder_layer_id].get_shape()); - ov::Coordinate end_value(m_value_cache[decoder_layer_id].get_shape()); + m_key_cache[decoder_layer_id].copy_to(dst_key_roi); + m_value_cache[decoder_layer_id].copy_to(dst_value_roi); - ov::Tensor key_cache(m_device_config.get_cache_precision(), new_key_cache_shape); - ov::Tensor value_cache(m_device_config.get_cache_precision(), new_value_cache_shape); - - // copy current cache data - ov::Tensor dst_key_roi(key_cache, start_key, end_key); - ov::Tensor dst_value_roi(value_cache, start_value, end_value); - m_key_cache[decoder_layer_id].copy_to(dst_key_roi); - m_value_cache[decoder_layer_id].copy_to(dst_value_roi); + } // Some optimizations like AVX2, AVX512, AMX require a minimal shape and // perform multiplying by zero on the excess data. Uninitialized tensor data contain NAN's, // so NAN * 0 returns non-zero invalid data. // So we need to set zeros to all newly allocated tensors data. 
- auto key_cache_roi_end = static_cast(key_cache.data()) + dst_key_roi.get_byte_size(); - auto value_cache_roi_end = static_cast(value_cache.data()) + dst_value_roi.get_byte_size(); - std::memset(key_cache_roi_end, 0, key_cache.get_byte_size() - dst_key_roi.get_byte_size()); - std::memset(value_cache_roi_end, 0, value_cache.get_byte_size() - dst_value_roi.get_byte_size()); + std::memset(key_cache_roi_end, 0, key_cache.get_byte_size() - key_roi_size_byte); + std::memset(value_cache_roi_end, 0, value_cache.get_byte_size() - value_roi_size_byte); // set new cache tensors - m_key_cache[decoder_layer_id] = key_cache; - m_value_cache[decoder_layer_id] = value_cache; + if (m_key_cache.size() > decoder_layer_id) { + m_key_cache[decoder_layer_id] = key_cache; + m_value_cache[decoder_layer_id] = value_cache; + } + else { + m_key_cache.emplace_back(key_cache); + m_value_cache.emplace_back(value_cache); + } update_request_tensor(decoder_layer_id); } } else { auto remote_context = m_core.get_default_context(device_name); for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { - ov::Coordinate end_key(m_key_cache[decoder_layer_id].get_shape()); - ov::Coordinate end_value(m_value_cache[decoder_layer_id].get_shape()); - - ov::RemoteTensor key_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), new_key_cache_shape); - ov::RemoteTensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), new_value_cache_shape); - - // copy current cache data - ov::RemoteTensor dst_key_roi(key_cache, start_key, end_key); - ov::RemoteTensor dst_value_roi(value_cache, start_value, end_value); - - dst_key_roi.copy_from(m_key_cache[decoder_layer_id]); - dst_value_roi.copy_from(m_value_cache[decoder_layer_id]); - - // set new cache tensors - m_key_cache[decoder_layer_id] = key_cache; - m_value_cache[decoder_layer_id] = value_cache; + ov::Tensor key_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), + key_cache_shape); + ov::Tensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), + value_cache_shape); + if (m_key_cache.size() > decoder_layer_id) { + ov::Coordinate end_key = m_key_cache[decoder_layer_id].get_shape(); + ov::Coordinate end_value = m_value_cache[decoder_layer_id].get_shape(); + + // copy current cache data + ov::RemoteTensor dst_key_roi(key_cache, start_key, end_key); + ov::RemoteTensor dst_value_roi(value_cache, start_value, end_value); + dst_key_roi.copy_from(m_key_cache[decoder_layer_id]); + dst_value_roi.copy_from(m_value_cache[decoder_layer_id]); + + m_key_cache[decoder_layer_id] = key_cache; + m_value_cache[decoder_layer_id] = value_cache; + } + else { + m_key_cache.emplace_back(key_cache); + m_value_cache.emplace_back(value_cache); + } update_request_tensor(decoder_layer_id); } } - m_num_allocated_kv_blocks = num_kv_blocks; } ov::Tensor get_key_cache(size_t decoder_layer_id) const { diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index a5eaa6e49b..52ec6a8302 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -32,7 +32,6 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl( bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction; utils::apply_paged_attention_transformations(model, device_config, is_need_per_layer_cache_control); - m_core = std::make_shared(core); init(model, scheduler_config, 
compile_properties, device_config, core); } @@ -78,8 +77,6 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init( // If eos_token_id was not provided, take value if (m_generation_config.eos_token_id == -1) m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id()); - - m_device_config = std::make_shared(device_config); }; diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp index ad0000ee68..8da05c6dfa 100644 --- a/src/cpp/src/continuous_batching_impl.hpp +++ b/src/cpp/src/continuous_batching_impl.hpp @@ -14,7 +14,6 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc std::shared_ptr m_cache_manager; std::shared_ptr m_model_runner; std::shared_ptr m_sampler; - std::shared_ptr m_device_config; // current requests to process std::vector m_requests; @@ -31,14 +30,6 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc // flag to enable validation mode for sampler bool m_is_validation_mode_enabled = false; - // dynamic kv-cache allocation params - const size_t m_kv_blocks_initial_multiplier = 2; - const float m_cache_growth_factor = 2; // commmon values 1.5 or 2 - const float m_percentage_threshold_for_cache_increase = 100; - - bool m_dynamic_memory_allocation = false; - std::shared_ptr m_core; - #ifdef DEBUG_CACHE_STATE_DUMP size_t step_count = 0; #endif diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp index cbd8ef64a2..7ae7a40a51 100644 --- a/src/cpp/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -51,17 +51,12 @@ class Scheduler { m_config(config), m_block_manager(m_config.num_kv_blocks, m_config.enable_prefix_caching, block_size, num_layers) { - // allocate kv-cache if the number of kv blocks is determined, - // otherwise cache will be allocated dynamically - if (m_block_manager.block_allocator_initialized()) { - m_cache_manager->allocate_cache_if_needed(m_block_manager.get_total_number_of_kv_blocks()); - } OPENVINO_ASSERT(num_layers != 0, "num_layers must be non-zero"); } Output schedule(std::vector& sequence_groups) { Output scheduler_output; - if (!m_block_manager.block_allocator_initialized()) { + if (m_block_manager.get_total_number_of_kv_blocks() == 0) { _initialize_cache(sequence_groups); } @@ -83,6 +78,7 @@ class Scheduler { } } + m_cache_manager->allocate_cache_if_needed(m_block_manager.get_total_number_of_kv_blocks()); _clear_waiting_sequences(sequence_groups); scheduler_output.m_cache_usage = m_block_manager.get_used_percentage(); return scheduler_output; @@ -255,8 +251,10 @@ class Scheduler { size_t available_slots = currently_allocated_token_slots - occupied_token_slots, required_slots = num_scheduled_tokens > available_slots ? 
num_scheduled_tokens - available_slots : 0; size_t num_required_blocks = (required_slots + block_size - 1) / block_size, num_free_blocks = m_block_manager.num_free_blocks(); - if (num_free_blocks == 0) { - _try_increase_cache(); + while (num_required_blocks > num_free_blocks) { + if (!_try_increase_cache()) { + break; + } } size_t num_scheduled_blocks = std::min(num_required_blocks, num_free_blocks); // some scheduled blocks can be no fully occupied, so we need to take min between num_scheduled_blocks @@ -310,15 +308,18 @@ class Scheduler { size_t num_scheduled_tokens_per_seq = std::min(available_tokens_per_seq_in_megabatch, num_available_tokens_per_seq); sequence_group->schedule_tokens(num_scheduled_tokens_per_seq); + while (!m_block_manager.can_append_slots(sequence_group)){ + if (!_try_increase_cache()) { + break; + } + } + _apply_preemption(sequence_group_id, sequence_groups); // if we can't preemt any more sequences, clear scheduled tokens and move to next sequence - if (!m_block_manager.can_append_slots(sequence_group)){ - _try_increase_cache(); - if (!m_block_manager.can_append_slots(sequence_group)) { - sequence_group->clear_scheduled_tokens(); - continue; - } + if (!m_block_manager.can_append_slots(sequence_group)) { + sequence_group->clear_scheduled_tokens(); + continue; } // allocate new slots @@ -394,11 +395,13 @@ class Scheduler { // apply KV cache limitations size_t block_size = get_block_size(); const size_t num_required_blocks = (sequence_len + block_size - 1) / block_size; - if (!m_block_manager.can_allocate_blocks(num_required_blocks)) { - _try_increase_cache(); - if (!m_block_manager.can_allocate_blocks(num_required_blocks)) + while (!m_block_manager.can_allocate_blocks(num_required_blocks)){ + if (!_try_increase_cache()) { break; + } } + if (!m_block_manager.can_allocate_blocks(num_required_blocks)) + break; // add scheduling information { @@ -465,25 +468,26 @@ class Scheduler { } void _initialize_cache(const std::vector& sequence_groups) { - size_t seq_length_sum = 0; + size_t blocks_sum = 0; for (auto idx = 0; idx < sequence_groups.size(); idx++) { auto seq_length = sequence_groups[idx]->get_prompt_len() * m_kv_blocks_initial_multiplier; auto gen_config = sequence_groups[idx]->get_sampling_parameters(); seq_length = std::min(seq_length, sequence_groups[idx]->get_prompt_len() + gen_config.get_max_new_tokens(sequence_groups[idx]->get_prompt_len())); - if (sequence_groups[idx]->get_sampling_parameters().is_beam_search()) { - - seq_length *= sequence_groups[idx]->get_sampling_parameters().num_beams; + size_t blocks_num = std::ceil((float)seq_length / m_block_manager.get_block_size()); + if (gen_config.do_sample && gen_config.is_beam_search()) { + blocks_num *= gen_config.num_beams; + } else if (gen_config.do_sample && gen_config.is_multinomial()) { + blocks_num *= gen_config.num_return_sequences; } - seq_length_sum += seq_length; + blocks_sum += blocks_num; } - m_block_manager.increase_kv_blocks_number(seq_length_sum); - m_cache_manager->allocate_cache_if_needed(m_block_manager.get_total_number_of_kv_blocks()); + m_block_manager.increase_kv_blocks_number(blocks_sum); m_dynamic_memory_allocation = true; } - void _try_increase_cache() { + bool _try_increase_cache() { if (!m_dynamic_memory_allocation) { - return; + return false; } auto device_config = m_cache_manager->get_device_config(); auto device = device_config->get_device(); @@ -503,9 +507,12 @@ class Scheduler { if (possible_blocks_to_add > 0) { m_block_manager.increase_kv_blocks_number(current_num_of_kv_blocks + 
possible_blocks_to_add); } + else { + return false; + } } } - m_cache_manager->allocate_cache_if_needed(m_block_manager.get_total_number_of_kv_blocks()); + return true; } }; diff --git a/tests/cpp/cache_manager.cpp b/tests/cpp/cache_manager.cpp index a3be00b226..7f07980389 100644 --- a/tests/cpp/cache_manager.cpp +++ b/tests/cpp/cache_manager.cpp @@ -59,7 +59,6 @@ TEST(TestCacheManager, test_cache_size_param) { ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request(); auto cache_manager = std::make_shared(device_config, request, core); auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers()); - OPENVINO_ASSERT(block_manager.block_allocator_initialized()); cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); ASSERT_EQ(get_total_allocated_bytes(cache_manager, num_decoder_layers), 2146959360); @@ -82,7 +81,6 @@ TEST(TestCacheManager, test_kv_blocks_param) { ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request(); auto cache_manager = std::make_shared(device_config, request, core); auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers()); - OPENVINO_ASSERT(block_manager.block_allocator_initialized()); OPENVINO_ASSERT(block_manager.get_total_number_of_kv_blocks(), scheduler_config.num_kv_blocks); } @@ -107,11 +105,9 @@ TEST(TestCacheManager, test_dynamic_cache_increase) { ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request(); auto cache_manager = std::make_shared(device_config, request, core); auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers()); - OPENVINO_ASSERT(!block_manager.block_allocator_initialized()); // check initial cache allocation block_manager.increase_kv_blocks_number(100); - OPENVINO_ASSERT(block_manager.block_allocator_initialized()); OPENVINO_ASSERT(block_manager.get_total_number_of_kv_blocks(), 100); cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); @@ -120,7 +116,6 @@ TEST(TestCacheManager, test_dynamic_cache_increase) { // check cache increase block_manager.increase_kv_blocks_number(200); - OPENVINO_ASSERT(block_manager.block_allocator_initialized()); OPENVINO_ASSERT(block_manager.get_total_number_of_kv_blocks(), 200); cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); From 2715110c9c7238849cf50f4923d2652a806f8eec Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 24 Dec 2024 14:30:01 +0100 Subject: [PATCH 28/31] Update src/cpp/src/scheduler.hpp Co-authored-by: Ilya Lavrenov --- src/cpp/src/scheduler.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp index 7ae7a40a51..c5ee8a6250 100644 --- a/src/cpp/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -474,7 +474,7 @@ class Scheduler { auto gen_config = sequence_groups[idx]->get_sampling_parameters(); seq_length = std::min(seq_length, sequence_groups[idx]->get_prompt_len() + gen_config.get_max_new_tokens(sequence_groups[idx]->get_prompt_len())); size_t blocks_num = std::ceil((float)seq_length / m_block_manager.get_block_size()); - if (gen_config.do_sample && gen_config.is_beam_search()) { + if (gen_config.is_beam_search()) { blocks_num 
*= gen_config.num_beams; } else if (gen_config.do_sample && gen_config.is_multinomial()) { blocks_num *= gen_config.num_return_sequences; From 1d3f85b2021d51a2cd862bec0d0092230202b649 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 24 Dec 2024 14:30:20 +0100 Subject: [PATCH 29/31] Update src/cpp/src/scheduler.hpp Co-authored-by: Ilya Lavrenov --- src/cpp/src/scheduler.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp index c5ee8a6250..3e7b07c346 100644 --- a/src/cpp/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -476,7 +476,7 @@ class Scheduler { size_t blocks_num = std::ceil((float)seq_length / m_block_manager.get_block_size()); if (gen_config.is_beam_search()) { blocks_num *= gen_config.num_beams; - } else if (gen_config.do_sample && gen_config.is_multinomial()) { + } else if (gen_config.is_multinomial()) { blocks_num *= gen_config.num_return_sequences; } blocks_sum += blocks_num; From 3fa02d0506e644ba920cd5d9c97b960e27140636 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 24 Dec 2024 15:22:20 +0100 Subject: [PATCH 30/31] Update src/cpp/src/scheduler.hpp Co-authored-by: Ilya Lavrenov --- src/cpp/src/scheduler.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp index 3e7b07c346..ba0880b58a 100644 --- a/src/cpp/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -251,7 +251,7 @@ class Scheduler { size_t available_slots = currently_allocated_token_slots - occupied_token_slots, required_slots = num_scheduled_tokens > available_slots ? num_scheduled_tokens - available_slots : 0; size_t num_required_blocks = (required_slots + block_size - 1) / block_size, num_free_blocks = m_block_manager.num_free_blocks(); - while (num_required_blocks > num_free_blocks) { + while (num_required_blocks > m_block_manager.num_free_blocks()) { if (!_try_increase_cache()) { break; } From a0456d83ad683bc2520f9e846d30bea045859e7f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 24 Dec 2024 19:21:36 +0400 Subject: [PATCH 31/31] Update scheduler.hpp --- src/cpp/src/scheduler.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp index ba0880b58a..da65c68bec 100644 --- a/src/cpp/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -250,13 +250,13 @@ class Scheduler { OPENVINO_ASSERT(currently_allocated_token_slots >= occupied_token_slots, "internal error"); size_t available_slots = currently_allocated_token_slots - occupied_token_slots, required_slots = num_scheduled_tokens > available_slots ? num_scheduled_tokens - available_slots : 0; - size_t num_required_blocks = (required_slots + block_size - 1) / block_size, num_free_blocks = m_block_manager.num_free_blocks(); + size_t num_required_blocks = (required_slots + block_size - 1) / block_size; while (num_required_blocks > m_block_manager.num_free_blocks()) { if (!_try_increase_cache()) { break; } } - size_t num_scheduled_blocks = std::min(num_required_blocks, num_free_blocks); + size_t num_scheduled_blocks = std::min(num_required_blocks, m_block_manager.num_free_blocks()); // some scheduled blocks can be no fully occupied, so we need to take min between num_scheduled_blocks // and total "scheduled capacity" num_scheduled_tokens = std::min(num_scheduled_tokens, available_slots + num_scheduled_blocks * block_size);
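Taken together, the scheduler-side changes implement a simple policy: start from a per-request estimate (roughly twice the prompt length, capped by prompt length plus max_new_tokens, scaled by num_beams or num_return_sequences), then keep multiplying the block count by m_cache_growth_factor whenever a step runs out of free blocks, bounded on GPU by the memory still available. Below is a hedged sketch of that growth decision; the names are illustrative, not the exact sources.

    // Illustrative sketch of the growth decision made by _try_increase_cache() above.
    // Returns the new total block count; returning the current count means "could not grow",
    // which makes the scheduler fall back to preemption for this step.
    #include <cstddef>

    static std::size_t next_kv_block_count(std::size_t current_blocks, float growth_factor,
                                           std::size_t block_size_in_bytes,
                                           std::size_t available_device_memory_bytes,
                                           bool is_gpu) {
        std::size_t wanted = static_cast<std::size_t>(current_blocks * growth_factor);
        if (!is_gpu)
            return wanted;  // CPU: grow optimistically; tensors are re-allocated once per schedule() step

        std::size_t extra_bytes = (wanted - current_blocks) * block_size_in_bytes;
        if (extra_bytes <= available_device_memory_bytes)
            return wanted;  // the full growth step fits into the remaining device memory

        // otherwise add only as many whole blocks as still fit (possibly zero)
        std::size_t affordable = available_device_memory_bytes / block_size_in_bytes;
        return current_blocks + affordable;
    }

For example, with the default growth factor of 2 an 800-block cache asks for 1600; if only 500 more blocks fit into device memory it settles for 1300, and if none fit the caller reports failure and scheduling proceeds with preemption instead.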