diff --git a/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark.cpp b/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark.cpp
index 9043f80895..055e2805a6 100644
--- a/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark.cpp
+++ b/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark.cpp
@@ -428,7 +428,7 @@ int main(int argc, char* argv[]) try {
     options.add_options()
     ("n,num_prompts", "A number of prompts", cxxopts::value<size_t>()->default_value("1000"))
     ("b,max_batch_size", "A maximum number of batched tokens", cxxopts::value<size_t>()->default_value("256"))
-    ("dynamic_split_fuse", "Whether to use dynamic split-fuse or vLLM scheduling", cxxopts::value<bool>()->default_value("true"))
+    ("dynamic_split_fuse", "Whether to use dynamic split-fuse or vLLM scheduling. Use --dynamic_split_fuse=false to disable", cxxopts::value<bool>()->default_value("true"))
     ("m,model", "Path to model and tokenizers base directory", cxxopts::value<std::string>()->default_value("."))
     ("dataset", "Path to dataset .json file", cxxopts::value<std::string>()->default_value("./ShareGPT_V3_unfiltered_cleaned_split.json"))
     ("max_input_len", "Max input length take from dataset", cxxopts::value<size_t>()->default_value("1024"))
@@ -437,6 +437,7 @@ int main(int argc, char* argv[]) try {
     ("cache_size", "Size of memory used for KV cache in GB. Default: 16", cxxopts::value<size_t>()->default_value("16"))
     ("device", "Target device to run the model. Default: CPU", cxxopts::value<std::string>()->default_value("CPU"))
     ("device_config", "Plugin configuration JSON. Example: '{\"MODEL_DISTRIBUTION_POLICY\":\"TENSOR_PARALLEL\",\"PERF_COUNT\":true}' Default: {\"PERF_COUNT\":true}", cxxopts::value<std::string>()->default_value("{\"PERF_COUNT\":true}"))
+    ("full_log", "Whether to enable logging of additional information, like model configuration. Use --full_log=false to disable", cxxopts::value<bool>()->default_value("true"))
     ("h,help", "Print usage");
 
     cxxopts::ParseResult result;
@@ -464,6 +465,7 @@ int main(int argc, char* argv[]) try {
     const std::string device = result["device"].as<std::string>();
     const std::string device_config = result["device_config"].as<std::string>();
     const size_t cache_size = result["cache_size"].as<size_t>();
+    const bool full_log = result["full_log"].as<bool>();
 
     // Create requests for generation
     Dataset dataset = filtered_dataset(models_path, dataset_path, num_prompts, max_input_len, max_output_len);
@@ -488,6 +490,7 @@ int main(int argc, char* argv[]) try {
     std::cout << "\tMax output length: " << max_output_len << std::endl;
     std::cout << "\tTarget device: " << device << std::endl;
     std::cout << "\tPlugin configuration JSON: " << device_config << std::endl;
+    std::cout << "\tFull logging set to: " << full_log << std::endl;
 
     ov::AnyMap device_config_map = {};
     if (!parse_plugin_config_string(device_config, device_config_map)) {
@@ -497,7 +500,7 @@ int main(int argc, char* argv[]) try {
     // Benchmarking
     std::cout << "Loading models, creating pipelines, preparing environment..."
               << std::endl;
-    ov::genai::ContinuousBatchingPipeline pipe(models_path, scheduler_config, device, device_config_map);
+    ov::genai::ContinuousBatchingPipeline pipe(models_path, scheduler_config, device, device_config_map, {}, full_log);
 
     std::cout << "Model configuration: " << std::endl << pipe.get_model_configuration_string();
 
diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
index 232270a673..c65dd7eab3 100644
--- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
@@ -33,7 +33,8 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
                                const SchedulerConfig& scheduler_config,
                                const std::string& device = "CPU",
                                const ov::AnyMap& llm_plugin_config = {},
-                               const ov::AnyMap& tokenizer_plugin_config = {});
+                               const ov::AnyMap& tokenizer_plugin_config = {},
+                               const bool full_log = false);
 
     /**
     * @brief Constructs a ContinuousBatchingPipeline when ov::genai::Tokenizer is initialized manually using file from the different dirs.
@@ -49,7 +50,8 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
         const ov::genai::Tokenizer& tokenizer,
         const SchedulerConfig& scheduler_config,
         const std::string& device="CPU",
-        const ov::AnyMap& plugin_config={}
+        const ov::AnyMap& plugin_config={},
+        const bool full_log = false
     );
 
     ov::genai::Tokenizer get_tokenizer();
diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp
index dd36f6976e..2e587832e5 100644
--- a/src/cpp/src/continuous_batching_pipeline.cpp
+++ b/src/cpp/src/continuous_batching_pipeline.cpp
@@ -88,7 +88,7 @@ class ContinuousBatchingPipeline::Impl {
     }
 
 public:
-    Impl(const std::string& models_path, const Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& plugin_config) :
+    Impl(const std::string& models_path, const Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& plugin_config, const bool full_log) :
         m_tokenizer{tokenizer} {
         ov::Core core;
 
@@ -100,9 +100,12 @@ class ContinuousBatchingPipeline::Impl {
         apply_paged_attention_transformations(model, device_config);
 
         auto compiled_model = core.compile_model(model, device_config.get_device(), plugin_config);
-        ov::genai::utils::read_properties([compiled_model](const std::string& key) {
-            return compiled_model.get_property(key); },
-            m_model_config_namevalues);
+
+        if (full_log) {
+            ov::genai::utils::read_properties([compiled_model](const std::string& key) {
+                return compiled_model.get_property(key); },
+                m_model_config_namevalues);
+        }
 
         ov::InferRequest infer_request = compiled_model.create_infer_request();
 
@@ -128,8 +131,8 @@ class ContinuousBatchingPipeline::Impl {
         // read default generation config
     }
 
-    Impl(const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& llm_plugin_config, const ov::AnyMap& tokenizer_plugin_config)
-        : Impl{models_path, Tokenizer(models_path, tokenizer_plugin_config), scheduler_config, device, llm_plugin_config} {}
+    Impl(const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& llm_plugin_config, const ov::AnyMap& tokenizer_plugin_config, const bool full_log)
+        : Impl{models_path, Tokenizer(models_path, tokenizer_plugin_config), scheduler_config, device, llm_plugin_config, full_log} {}
 
     ov::genai::GenerationConfig
     get_config() const {
         return m_generation_config;
@@ -403,8 +406,10 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::string& models_path,
                                                         const SchedulerConfig& scheduler_config,
                                                         const std::string& device,
                                                         const ov::AnyMap& llm_plugin_config,
-                                                        const ov::AnyMap& tokenizer_plugin_config) {
-    m_impl = std::make_shared<Impl>(models_path, scheduler_config, device, llm_plugin_config, tokenizer_plugin_config);
+                                                        const ov::AnyMap& tokenizer_plugin_config,
+                                                        const bool full_log
+                                                        ) {
+    m_impl = std::make_shared<Impl>(models_path, scheduler_config, device, llm_plugin_config, tokenizer_plugin_config, full_log);
 }
 
 ContinuousBatchingPipeline::ContinuousBatchingPipeline(
@@ -412,8 +417,9 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
     const Tokenizer& tokenizer,
     const SchedulerConfig& scheduler_config,
     const std::string& device,
-    const ov::AnyMap& plugin_config
-) : m_impl{std::make_shared<Impl>(model_path, tokenizer, scheduler_config, device, plugin_config)} {}
+    const ov::AnyMap& plugin_config,
+    const bool full_log
+) : m_impl{std::make_shared<Impl>(model_path, tokenizer, scheduler_config, device, plugin_config, full_log)} {}
 
 ov::genai::Tokenizer ContinuousBatchingPipeline::get_tokenizer() {
     return m_impl->get_tokenizer();
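
For reference, a minimal caller-side sketch of the extended constructor introduced by this patch (not part of the diff). It assumes `ov::genai::SchedulerConfig` can be default-constructed and is pulled in by the pipeline header; the model directory path is a placeholder. The trailing `full_log` argument controls whether compiled-model properties are collected at construction and exposed via `get_model_configuration_string()`.

```cpp
#include <iostream>

#include "openvino/genai/continuous_batching_pipeline.hpp"

int main() {
    // Default scheduler settings; tune cache size / batching as needed.
    ov::genai::SchedulerConfig scheduler_config;

    // The trailing `full_log` argument (added by this patch) toggles reading
    // and storing the compiled model's properties at construction time.
    ov::genai::ContinuousBatchingPipeline pipe(
        "./model_dir",      // placeholder path to model and tokenizer files
        scheduler_config,
        "CPU",
        {},                 // llm_plugin_config
        {},                 // tokenizer_plugin_config
        true);              // full_log

    // With full_log enabled, the collected properties can be printed,
    // as the benchmark sample above does.
    std::cout << pipe.get_model_configuration_string();
    return 0;
}
```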