diff --git a/src/c++/perf_analyzer/inference_profiler.cc b/src/c++/perf_analyzer/inference_profiler.cc
index 46e2bcb52..38dad1da6 100644
--- a/src/c++/perf_analyzer/inference_profiler.cc
+++ b/src/c++/perf_analyzer/inference_profiler.cc
@@ -107,6 +107,14 @@ EnsembleDurations
 GetTotalEnsembleDurations(const ServerSideStats& stats)
 {
   EnsembleDurations result;
+  // Calculate the average cache hit and cache miss latency for the ensemble
+  // model when top level response caching is enabled.
+  const uint64_t ensemble_cache_hit_cnt = stats.cache_hit_count;
+  const uint64_t ensemble_cache_miss_cnt = stats.cache_miss_count;
+  result.total_cache_hit_time_avg_us +=
+      AverageDurationInUs(stats.cache_hit_time_ns, ensemble_cache_hit_cnt);
+  result.total_cache_miss_time_avg_us +=
+      AverageDurationInUs(stats.cache_miss_time_ns, ensemble_cache_miss_cnt);
   for (const auto& model_stats : stats.composing_models_stat) {
     if (model_stats.second.composing_models_stat.empty()) {
       // Cache hit count covers cache hits, not related to compute times
@@ -238,7 +246,6 @@ ReportServerSideStats(
   if (parser->ResponseCacheEnabled()) {
     const uint64_t overhead_avg_us = GetOverheadDuration(
         cumm_avg_us, queue_avg_us, combined_cache_compute_avg_us);
-
     std::cout << " (overhead " << overhead_avg_us << " usec + "
               << "queue " << queue_avg_us << " usec + "
               << "cache hit/miss " << combined_cache_compute_avg_us
@@ -283,12 +290,18 @@ ReportServerSideStats(
     const uint64_t overhead_avg_us = GetOverheadDuration(
         cumm_avg_us, ensemble_times.total_queue_time_avg_us,
         ensemble_times.total_combined_cache_compute_time_avg_us);
-    std::cout << " (overhead " << overhead_avg_us << " usec + "
-              << "queue " << ensemble_times.total_queue_time_avg_us
-              << " usec + "
-              << "cache hit/miss "
-              << ensemble_times.total_combined_cache_compute_time_avg_us
-              << " usec)" << std::endl;
+    // FIXME - Refactor these calculations once top level response caching is
+    // enabled for ensembles.
+    if (!parser->TopLevelResponseCachingEnabled()) {
+      std::cout << " (overhead " << overhead_avg_us << " usec + "
+                << "queue " << ensemble_times.total_queue_time_avg_us
+                << " usec + "
+                << "cache hit/miss "
+                << ensemble_times.total_combined_cache_compute_time_avg_us
+                << " usec)" << std::endl;
+    } else {
+      std::cout << std::endl;
+    }
     std::cout << ident << ident << " Average Cache Hit Latency: "
               << ensemble_times.total_cache_hit_time_avg_us << " usec"
               << std::endl;
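Both new accumulations in GetTotalEnsembleDurations() divide by the ensemble-level hit and miss counts, either of which can be zero for a given measurement window. Below is a minimal, standalone sketch of the guard this relies on; it assumes AverageDurationInUs (defined elsewhere in inference_profiler.cc, not shown in this patch) converts a total in nanoseconds to an average in microseconds and returns 0 for a zero count.

#include <cassert>
#include <cstdint>

// Assumed behavior of the existing AverageDurationInUs() helper: average a
// total duration (ns) over `cnt` occurrences and convert to microseconds,
// yielding 0 when the count is 0 so an all-hit or all-miss window is safe.
static uint64_t AverageDurationInUsSketch(uint64_t total_time_ns, uint64_t cnt)
{
  if (cnt == 0) {
    return 0;
  }
  return total_time_ns / (cnt * 1000);
}

int main()
{
  assert(AverageDurationInUsSketch(5000000, 10) == 500);  // 5 ms over 10 hits
  assert(AverageDurationInUsSketch(0, 0) == 0);           // no misses recorded
  return 0;
}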
@@ -1516,8 +1529,16 @@ InferenceProfiler::DetermineStatsModelVersion(
       *status_model_version = std::stoll(model_identifier.second);
     }
   }
-
-  if (*status_model_version == -1) {
+  // FIXME - Investigate why the composing model version is -1 in case of an
+  // ensemble cache hit.
+  //
+  // For ensemble models with top level response caching enabled, the composing
+  // model versions are unavailable on a cache hit: the scheduler returns the
+  // cached response and the composing models are never executed. This is a
+  // valid scenario and should not raise an error.
+  bool model_version_unspecified_and_invalid =
+      *status_model_version == -1 && !parser_->TopLevelResponseCachingEnabled();
+  if (model_version_unspecified_and_invalid) {
     return cb::Error(
         "failed to find the requested model version", pa::GENERIC_ERROR);
   }
@@ -1533,6 +1554,21 @@ InferenceProfiler::DetermineStatsModelVersion(
   return cb::Error::Success;
 }
 
+// Only for unit-testing
+#ifndef DOCTEST_CONFIG_DISABLE
+cb::Error
+InferenceProfiler::SetTopLevelResponseCaching(
+    bool enable_top_level_response_caching)
+{
+  parser_ = std::make_shared<ModelParser>(cb::BackendKind::TRITON);
+  if (parser_ == nullptr) {
+    return cb::Error("Failed to initialize ModelParser");
+  }
+  parser_->SetTopLevelResponseCaching(enable_top_level_response_caching);
+  return cb::Error::Success;
+}
+#endif
+
 cb::Error
 InferenceProfiler::SummarizeServerStats(
     const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
@@ -1588,8 +1624,20 @@ InferenceProfiler::SummarizeServerStatsHelper(
 
   const auto& end_itr = end_status.find(this_id);
   if (end_itr == end_status.end()) {
-    return cb::Error(
-        "missing statistics for requested model", pa::GENERIC_ERROR);
+    // For ensemble models with top level response caching enabled, composing
+    // model statistics are unavailable on a cache hit: the scheduler returns
+    // the cached response and the composing models are never executed. This is
+    // a valid scenario and should not raise an error.
+    bool stats_not_found_and_invalid =
+        model_version == -1 && !parser_->TopLevelResponseCachingEnabled();
+    if (stats_not_found_and_invalid) {
+      return cb::Error(
+          "missing statistics for requested model", pa::GENERIC_ERROR);
+    } else {
+      // Set the server stats to 0 for the composing model on an ensemble
+      // request cache hit, since the composing model was not executed.
+      server_stats->Reset();
+    }
   } else {
     uint64_t start_infer_cnt = 0;
     uint64_t start_exec_cnt = 0;
diff --git a/src/c++/perf_analyzer/inference_profiler.h b/src/c++/perf_analyzer/inference_profiler.h
index 913b23ded..5faf4f6f8 100644
--- a/src/c++/perf_analyzer/inference_profiler.h
+++ b/src/c++/perf_analyzer/inference_profiler.h
@@ -52,6 +52,7 @@ namespace triton { namespace perfanalyzer {
 #ifndef DOCTEST_CONFIG_DISABLE
 class NaggyMockInferenceProfiler;
 class TestInferenceProfiler;
+class ModelParser;
 #endif
 
 /// Constant parameters that determine the whether stopping criteria has met
@@ -119,6 +120,28 @@ struct ServerSideStats {
   uint64_t cache_miss_time_ns;
 
   std::map<cb::ModelIdentifier, ServerSideStats> composing_models_stat;
+  // Sets the composing model server stats to 0 on a cache hit when top level
+  // response caching is enabled, since the composing models are not executed
+  // and therefore have no stats.
+  void Reset()
+  {
+    inference_count = 0;
+    execution_count = 0;
+    success_count = 0;
+    queue_count = 0;
+    compute_input_count = 0;
+    compute_infer_count = 0;
+    compute_output_count = 0;
+    cumm_time_ns = 0;
+    queue_time_ns = 0;
+    compute_input_time_ns = 0;
+    compute_infer_time_ns = 0;
+    compute_output_time_ns = 0;
+    cache_hit_count = 0;
+    cache_hit_time_ns = 0;
+    cache_miss_count = 0;
+    cache_miss_time_ns = 0;
+  }
 };
 
 /// Holds the statistics recorded at the client side.
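Reset() gives SummarizeServerStatsHelper() a way to report a composing model that was skipped on a cache hit as all-zero stats rather than as an error. A trimmed-down sketch of how that reads at a call site, using a stand-in struct with only a few of the real fields (not code from this patch):

#include <cassert>
#include <cstdint>

// Stand-in for ServerSideStats; the real struct zeroes every counter and timer.
struct ServerSideStatsSketch {
  uint64_t inference_count;
  uint64_t execution_count;
  uint64_t cumm_time_ns;
  uint64_t cache_hit_count;

  void Reset()
  {
    inference_count = 0;
    execution_count = 0;
    cumm_time_ns = 0;
    cache_hit_count = 0;
  }
};

int main()
{
  // Stats gathered for a composing model on an earlier, non-cached request.
  ServerSideStatsSketch stats{10, 10, 5000000, 2};

  // On an ensemble cache hit the composing model never runs, so its summary
  // is reset to all zeros instead of being reported as missing.
  stats.Reset();
  assert(stats.inference_count == 0 && stats.cumm_time_ns == 0);
  return 0;
}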
@@ -530,12 +553,17 @@ class InferenceProfiler {
   /// measurement
   /// \param end_stats The stats for all models at the end of the measurement
   /// \param model_version The determined model version
+
   cb::Error DetermineStatsModelVersion(
       const cb::ModelIdentifier& model_identifier,
       const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_stats,
       const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_stats,
       int64_t* model_version);
 
+#ifndef DOCTEST_CONFIG_DISABLE
+  cb::Error SetTopLevelResponseCaching(bool enable_top_level_response_caching);
+#endif
+
   /// \param start_status The model status at the start of the measurement.
   /// \param end_status The model status at the end of the measurement.
   /// \param server_stats Returns the summary that the fields recorded by server
@@ -738,6 +766,7 @@ class InferenceProfiler {
 #ifndef DOCTEST_CONFIG_DISABLE
   friend NaggyMockInferenceProfiler;
   friend TestInferenceProfiler;
+  friend ModelParser;
 
  public:
   InferenceProfiler() = default;
diff --git a/src/c++/perf_analyzer/model_parser.cc b/src/c++/perf_analyzer/model_parser.cc
index 1ab9f7a6d..8ffea56da 100644
--- a/src/c++/perf_analyzer/model_parser.cc
+++ b/src/c++/perf_analyzer/model_parser.cc
@@ -169,6 +169,10 @@ ModelParser::InitTriton(
     response_cache_enabled_ = cache_itr->value["enable"].GetBool();
   }
 
+  if (cache_itr != config.MemberEnd()) {
+    top_level_response_caching_enabled_ = cache_itr->value["enable"].GetBool();
+  }
+
   return cb::Error::Success;
 }
diff --git a/src/c++/perf_analyzer/model_parser.h b/src/c++/perf_analyzer/model_parser.h
index c1400d079..ac76b3e22 100644
--- a/src/c++/perf_analyzer/model_parser.h
+++ b/src/c++/perf_analyzer/model_parser.h
@@ -35,6 +35,7 @@ namespace triton { namespace perfanalyzer {
 #ifndef DOCTEST_CONFIG_DISABLE
 class TestModelParser;
 class MockModelParser;
+class InferenceProfiler;
 #endif
 
 struct ModelTensor {
@@ -73,7 +74,8 @@ class ModelParser {
         outputs_(std::make_shared<ModelTensorMap>()),
         composing_models_map_(std::make_shared<ComposingModelMap>()),
         scheduler_type_(NONE), max_batch_size_(0), is_decoupled_(false),
-        response_cache_enabled_(false)
+        response_cache_enabled_(false),
+        top_level_response_caching_enabled_(false)
   {
   }
 
@@ -151,6 +153,22 @@ class ModelParser {
   /// model
   bool ResponseCacheEnabled() const { return response_cache_enabled_; }
 
+  /// Returns whether or not top level response caching is enabled for this model
+  /// \return the truth value of whether top level response caching is enabled
+  /// for this model
+  bool TopLevelResponseCachingEnabled() const
+  {
+    return top_level_response_caching_enabled_;
+  }
+
+/// Only for testing
+#ifndef DOCTEST_CONFIG_DISABLE
+  void SetTopLevelResponseCaching(bool enable_top_level_response_caching)
+  {
+    top_level_response_caching_enabled_ = enable_top_level_response_caching;
+  }
+#endif
+
   /// Get the details about the model inputs.
   /// \return The map with tensor_name and the tensor details
   /// stored as key-value pair.
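The model_parser.cc hunk above keys the new flag off the same "response_cache" member of the model config that already drives response_cache_enabled_. A minimal standalone sketch of that lookup pattern, using rapidjson and a hypothetical, abridged config JSON (not taken from this patch):

#include <cassert>
#include <rapidjson/document.h>

int main()
{
  // Hypothetical, abridged model config JSON; a real Triton config has many
  // more fields. Only the "response_cache" member matters for this lookup.
  const char* config_json = R"({
    "name": "ensemble_model",
    "platform": "ensemble",
    "response_cache": { "enable": true }
  })";

  rapidjson::Document config;
  config.Parse(config_json);

  // Same pattern as ModelParser::InitTriton() above: the flag is only read
  // when the "response_cache" member is present.
  bool top_level_response_caching_enabled = false;
  const auto cache_itr = config.FindMember("response_cache");
  if (cache_itr != config.MemberEnd()) {
    top_level_response_caching_enabled = cache_itr->value["enable"].GetBool();
  }
  assert(top_level_response_caching_enabled);
  return 0;
}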
@@ -169,6 +187,7 @@ class ModelParser {
     return composing_models_map_;
   }
 
+
  protected:
   ModelSchedulerType scheduler_type_;
   bool is_decoupled_;
@@ -220,10 +239,12 @@ class ModelParser {
   std::string model_signature_name_;
   size_t max_batch_size_;
   bool response_cache_enabled_;
+  bool top_level_response_caching_enabled_;
 
 #ifndef DOCTEST_CONFIG_DISABLE
   friend TestModelParser;
   friend MockModelParser;
+  friend InferenceProfiler;
 
  public:
   ModelParser() = default;
diff --git a/src/c++/perf_analyzer/test_inference_profiler.cc b/src/c++/perf_analyzer/test_inference_profiler.cc
index 683219f15..8ff39605b 100644
--- a/src/c++/perf_analyzer/test_inference_profiler.cc
+++ b/src/c++/perf_analyzer/test_inference_profiler.cc
@@ -160,8 +160,15 @@ class TestInferenceProfiler : public InferenceProfiler {
     return InferenceProfiler::DetermineStatsModelVersion(
         model_identifier, start_stats, end_stats, model_version);
   }
+
+  cb::Error SetTopLevelResponseCaching(bool enable_top_level_response_caching)
+  {
+    return InferenceProfiler::SetTopLevelResponseCaching(
+        enable_top_level_response_caching);
+  }
 };
 
+
 TEST_CASE("testing the ValidLatencyMeasurement function")
 {
   size_t valid_sequence_count{};
@@ -850,6 +857,25 @@ TEST_CASE("determine_stats_model_version: testing DetermineStatsModelVersion()")
     expect_exception = true;
   }
 
+  SUBCASE("One entry - version -1 - valid and in start")
+  {
+    model_identifier = {"ModelA", "-1"};
+    start_stats_map.insert({{"ModelA", "3"}, old_stats});
+    end_stats_map.insert({{"ModelA", "3"}, new_stats});
+    cb::Error status = tip.SetTopLevelResponseCaching(true);
+    CHECK(status.IsOk());
+    expected_model_version = -1;
+  }
+
+  SUBCASE("One entry - version -1 - not valid")
+  {
+    model_identifier = {"ModelA", "-1"};
+    end_stats_map.insert({{"ModelA", "3"}, old_stats});
+    cb::Error status = tip.SetTopLevelResponseCaching(false);
+    CHECK(status.IsOk());
+    expected_model_version = -1;
+    expect_exception = true;
+  }
 
   std::stringstream captured_cerr;
   std::streambuf* old = std::cerr.rdbuf(captured_cerr.rdbuf());
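The two new subcases pin down the condition introduced in DetermineStatsModelVersion(). A standalone sketch of that condition, as a simplified stand-in rather than code from this patch, showing the outcomes the subcases expect:

#include <cassert>
#include <cstdint>

// Simplified stand-in for the check added to DetermineStatsModelVersion():
// a composing model version of -1 is only an error when top level response
// caching is disabled; with caching enabled it is an expected cache-hit case.
static bool ModelVersionUnspecifiedAndInvalid(
    int64_t status_model_version, bool top_level_response_caching_enabled)
{
  return status_model_version == -1 && !top_level_response_caching_enabled;
}

int main()
{
  // Mirrors "One entry - version -1 - valid and in start": no error expected.
  assert(!ModelVersionUnspecifiedAndInvalid(-1, true));
  // Mirrors "One entry - version -1 - not valid": the error path is taken.
  assert(ModelVersionUnspecifiedAndInvalid(-1, false));
  // A resolved version never trips the check.
  assert(!ModelVersionUnspecifiedAndInvalid(3, true));
  return 0;
}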