Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CB: drop profiling as it drops performance #1280

Merged
merged 1 commit into from
Nov 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 2 additions & 20 deletions src/cpp/src/continuous_batching_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(

ov::Core core;

auto [core_properties, compile_properties] = utils::split_core_complile_config(properties);
auto [core_properties, compile_properties] = utils::split_core_compile_config(properties);
core.set_property(core_properties);

// The model can be compiled for GPU as well
Expand Down Expand Up @@ -57,7 +57,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
}

SchedulerConfig updated_config = scheduler_config;
// update KV number in scheduler config
// update KV blocks number in scheduler config
if (scheduler_config.num_kv_blocks != device_config.get_num_kv_blocks()) {
updated_config.num_kv_blocks = device_config.get_num_kv_blocks();
}
Expand Down Expand Up @@ -166,24 +166,6 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
timer.start();
logits = m_model_runner->forward(m_requests, scheduler_output);
timer.end();

ov::InferRequest infer_request = m_model_runner->get_infer_request();
ov::CompiledModel compiled_model = infer_request.get_compiled_model();
const bool is_profiling_enabled = compiled_model.get_property(ov::enable_profiling);

// collect detailed statistic
if (is_profiling_enabled) {
std::vector<ov::ProfilingInfo> profiling_info = m_model_runner->get_infer_request().get_profiling_info();
for (const ov::ProfilingInfo& info : profiling_info) {
double current_time = info.real_time.count();
if (info.node_type == "PagedAttentionExtension") {
m_perf.m_paged_attention_time_ms += current_time;
} else if (info.node_type == "FullyConnected") {
m_perf.m_matmul_time_ms += current_time;
}
m_perf.m_infer_total_ms += current_time;
}
}
}

#ifdef DEBUG_CACHE_STATE_DUMP
Expand Down
4 changes: 2 additions & 2 deletions src/cpp/src/llm_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,15 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
{
ov::Core core;
if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) {
auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(*filtered_plugin_config);
auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_compile_config(*filtered_plugin_config);
core.set_property(core_plugin_config);
auto model = core.read_model(models_path / "openvino_model.xml");
m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model.");
m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device); // TODO: Make the prefix name configurable
utils::slice_matmul_statefull_model(model);
m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request();
} else {
auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(plugin_config);
auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_compile_config(plugin_config);
core.set_property(core_plugin_config);
auto model = core.read_model(models_path / "openvino_model.xml");
utils::slice_matmul_statefull_model(model);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(
const ov::genai::ModelDesc draft_model_desc,
const ov::AnyMap& tokenizer_properties) {
ov::Core core;
auto [core_properties, compile_properties] = ov::genai::utils::split_core_complile_config(main_properties);
auto [core_properties, compile_properties] = ov::genai::utils::split_core_compile_config(main_properties);
core.set_property(core_properties);

std::filesystem::path openvino_model_name = "openvino_model.xml",
Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ ProcessorConfig from_any_map(
 * Some plugin options, such as `ENABLE_MMAP`, are not supported by the `core.compile` function.
 * Move these options to the `core.set_property` config instead.
*/
std::pair<ov::AnyMap, ov::AnyMap> split_core_complile_config(const ov::AnyMap& properties) {
std::pair<ov::AnyMap, ov::AnyMap> split_core_compile_config(const ov::AnyMap& properties) {
const std::vector<std::string> unsupported_by_compile_properties{"ENABLE_MMAP"};
ov::AnyMap core_properties;
ov::AnyMap compile_properties{properties};
Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ ProcessorConfig from_any_map(
const ProcessorConfig& initial
);

std::pair<ov::AnyMap, ov::AnyMap> split_core_complile_config(const ov::AnyMap& properties);
std::pair<ov::AnyMap, ov::AnyMap> split_core_compile_config(const ov::AnyMap& properties);

ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend);

Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/whisper_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi
const ov::AnyMap& properties)
: WhisperPipelineImplBase{models_path} {
ov::Core core = utils::singleton_core();
auto [core_properties, compile_properties] = ov::genai::utils::split_core_complile_config(properties);
auto [core_properties, compile_properties] = ov::genai::utils::split_core_compile_config(properties);
core.set_property(core_properties);

m_models.encoder =
Expand Down
Loading