Changes to support Ensemble Top Level Response Caching (#560) #642

Merged · 1 commit on May 10, 2024
70 changes: 59 additions & 11 deletions src/c++/perf_analyzer/inference_profiler.cc
@@ -107,6 +107,14 @@ EnsembleDurations
GetTotalEnsembleDurations(const ServerSideStats& stats)
{
EnsembleDurations result;
// Calculate the average cache hit and cache miss latency for the ensemble
// model when top level response caching is enabled.
const uint64_t ensemble_cache_hit_cnt = stats.cache_hit_count;
const uint64_t ensemble_cache_miss_cnt = stats.cache_miss_count;
result.total_cache_hit_time_avg_us +=
AverageDurationInUs(stats.cache_hit_time_ns, ensemble_cache_hit_cnt);
result.total_cache_miss_time_avg_us +=
AverageDurationInUs(stats.cache_miss_time_ns, ensemble_cache_miss_cnt);
for (const auto& model_stats : stats.composing_models_stat) {
if (model_stats.second.composing_models_stat.empty()) {
// Cache hit count covers cache hits, not related to compute times
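The helper used by the new lines above is not shown in this diff. A minimal sketch of what AverageDurationInUs is assumed to compute (a total nanosecond duration averaged over a count and converted to microseconds, guarding against a zero count); the signature is an assumption, not taken from this PR:

// Assumed helper, not part of this diff.
uint64_t
AverageDurationInUs(const uint64_t total_time_in_ns, const uint64_t cnt)
{
  // Guard against models that saw no cache hits or misses in this interval.
  if (cnt == 0) {
    return 0;
  }
  // Average in nanoseconds, then convert to microseconds (integer division).
  return total_time_in_ns / (cnt * 1000);
}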
@@ -238,7 +246,6 @@ ReportServerSideStats(
if (parser->ResponseCacheEnabled()) {
const uint64_t overhead_avg_us = GetOverheadDuration(
cumm_avg_us, queue_avg_us, combined_cache_compute_avg_us);

std::cout << " (overhead " << overhead_avg_us << " usec + "
<< "queue " << queue_avg_us << " usec + "
<< "cache hit/miss " << combined_cache_compute_avg_us
@@ -283,12 +290,18 @@ ReportServerSideStats(
const uint64_t overhead_avg_us = GetOverheadDuration(
cumm_avg_us, ensemble_times.total_queue_time_avg_us,
ensemble_times.total_combined_cache_compute_time_avg_us);
std::cout << " (overhead " << overhead_avg_us << " usec + "
<< "queue " << ensemble_times.total_queue_time_avg_us
<< " usec + "
<< "cache hit/miss "
<< ensemble_times.total_combined_cache_compute_time_avg_us
<< " usec)" << std::endl;
// FIXME - Refactor these calculations for the case where ensemble top level
// response caching is enabled
if (!parser->TopLevelResponseCachingEnabled()) {
std::cout << " (overhead " << overhead_avg_us << " usec + "
<< "queue " << ensemble_times.total_queue_time_avg_us
<< " usec + "
<< "cache hit/miss "
<< ensemble_times.total_combined_cache_compute_time_avg_us
<< " usec)" << std::endl;
} else {
std::cout << std::endl;
}
std::cout << ident << ident << " Average Cache Hit Latency: "
<< ensemble_times.total_cache_hit_time_avg_us << " usec"
<< std::endl;
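To make the effect of the new branch concrete (all numbers below are hypothetical), the ensemble summary printed by this block changes roughly as follows:

  top level response caching disabled:
    (overhead 120 usec + queue 450 usec + cache hit/miss 900 usec)
     Average Cache Hit Latency: 300 usec
  top level response caching enabled:
    the parenthesized overhead/queue/cache breakdown is replaced by a bare
    newline, and only the Average Cache Hit/Miss Latency lines follow.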
@@ -1516,8 +1529,16 @@ InferenceProfiler::DetermineStatsModelVersion(
*status_model_version = std::stoll(model_identifier.second);
}
}

if (*status_model_version == -1) {
// FIXME - Investigate why the composing model version is -1 in case of an
// ensemble cache hit.
//
// For ensemble models with top level response caching enabled, the
// composing model versions are unavailable on a cache hit because the
// scheduler returns the cached response and the composing models are
// never executed. This is a valid scenario and should not raise an error.
bool model_version_unspecified_and_invalid =
*status_model_version == -1 && !parser_->TopLevelResponseCachingEnabled();
if (model_version_unspecified_and_invalid) {
return cb::Error(
"failed to find the requested model version", pa::GENERIC_ERROR);
}
@@ -1533,6 +1554,21 @@ InferenceProfiler::DetermineStatsModelVersion(
return cb::Error::Success;
}

// Only for unit-testing
#ifndef DOCTEST_CONFIG_DISABLE
cb::Error
InferenceProfiler::SetTopLevelResponseCaching(
bool enable_top_level_response_caching)
{
parser_ = std::make_shared<ModelParser>(cb::BackendKind::TRITON);
if (parser_ == nullptr) {
return cb::Error("Failed to initialize ModelParser");
}
parser_->SetTopLevelResponseCaching(enable_top_level_response_caching);
return cb::Error::Success;
}
#endif

cb::Error
InferenceProfiler::SummarizeServerStats(
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
@@ -1588,8 +1624,20 @@ InferenceProfiler::SummarizeServerStatsHelper(

const auto& end_itr = end_status.find(this_id);
if (end_itr == end_status.end()) {
return cb::Error(
"missing statistics for requested model", pa::GENERIC_ERROR);
// For ensemble models with top level response caching enabled, the
// composing model statistics are unavailable on a cache hit because the
// scheduler returns the cached response and the composing models are
// never executed. This is a valid scenario and should not raise an error.
bool stats_not_found_and_invalid =
model_version == -1 && !parser_->TopLevelResponseCachingEnabled();
if (stats_not_found_and_invalid) {
return cb::Error(
"missing statistics for requested model", pa::GENERIC_ERROR);
} else {
// Set server stats to 0 for the composing model on an ensemble request
// cache hit, since the composing model is not executed
server_stats->Reset();
}
} else {
uint64_t start_infer_cnt = 0;
uint64_t start_exec_cnt = 0;
29 changes: 29 additions & 0 deletions src/c++/perf_analyzer/inference_profiler.h
@@ -52,6 +52,7 @@ namespace triton { namespace perfanalyzer {
#ifndef DOCTEST_CONFIG_DISABLE
class NaggyMockInferenceProfiler;
class TestInferenceProfiler;
class ModelParser;
#endif

/// Constant parameters that determine whether the stopping criteria has met
@@ -119,6 +120,28 @@ struct ServerSideStats {
uint64_t cache_miss_time_ns;

std::map<cb::ModelIdentifier, ServerSideStats> composing_models_stat;
// Resets all server-side stats to 0. Used for composing models on a cache hit
// when top level response caching is enabled, since composing models are not
// executed and have no stats of their own.
void Reset()
{
inference_count = 0;
execution_count = 0;
success_count = 0;
queue_count = 0;
compute_input_count = 0;
compute_infer_count = 0;
compute_output_count = 0;
cumm_time_ns = 0;
queue_time_ns = 0;
compute_input_time_ns = 0;
compute_infer_time_ns = 0;
compute_output_time_ns = 0;
cache_hit_count = 0;
cache_hit_time_ns = 0;
cache_miss_count = 0;
cache_miss_time_ns = 0;
}
};
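A minimal usage sketch of the new Reset() member, mirroring the call added in SummarizeServerStatsHelper above; note that it zeroes only the scalar counters and leaves composing_models_stat untouched:

ServerSideStats stats;
stats.cache_hit_count = 7;       // stale values from a previous measurement
stats.cache_hit_time_ns = 1200;
stats.Reset();                   // all counters back to 0; the composing model
                                 // map keeps whatever entries it already had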

/// Holds the statistics recorded at the client side.
@@ -530,12 +553,17 @@ class InferenceProfiler {
/// measurement
/// \param end_stats The stats for all models at the end of the measurement
/// \param model_version The determined model version

cb::Error DetermineStatsModelVersion(
const cb::ModelIdentifier& model_identifier,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_stats,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_stats,
int64_t* model_version);

#ifndef DOCTEST_CONFIG_DISABLE
cb::Error SetTopLevelResponseCaching(bool enable_top_level_response_caching);
#endif

/// \param start_status The model status at the start of the measurement.
/// \param end_status The model status at the end of the measurement.
/// \param server_stats Returns the summary that the fields recorded by server
Expand Down Expand Up @@ -738,6 +766,7 @@ class InferenceProfiler {
#ifndef DOCTEST_CONFIG_DISABLE
friend NaggyMockInferenceProfiler;
friend TestInferenceProfiler;
friend ModelParser;

public:
InferenceProfiler() = default;
4 changes: 4 additions & 0 deletions src/c++/perf_analyzer/model_parser.cc
@@ -169,6 +169,10 @@ ModelParser::InitTriton(
response_cache_enabled_ = cache_itr->value["enable"].GetBool();
}

if (cache_itr != config.MemberEnd()) {
top_level_response_caching_enabled_ = cache_itr->value["enable"].GetBool();
}

return cb::Error::Success;
}
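For context, a self-contained sketch of the flag derivation above, assuming (the hunk does not show it) that cache_itr is an iterator over the model config JSON's "response_cache" member; as the hunk shows, both response_cache_enabled_ and top_level_response_caching_enabled_ are currently read from the same "enable" field:

#include <rapidjson/document.h>
#include <string>

// Hypothetical standalone illustration; the "response_cache" member name is an
// assumption based on where cache_itr appears to point in InitTriton.
bool
ParseTopLevelResponseCaching(const std::string& config_json)
{
  rapidjson::Document config;
  config.Parse(config_json.c_str());
  const auto cache_itr = config.FindMember("response_cache");
  bool enabled = false;
  if (cache_itr != config.MemberEnd()) {
    enabled = cache_itr->value["enable"].GetBool();
  }
  return enabled;
}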

23 changes: 22 additions & 1 deletion src/c++/perf_analyzer/model_parser.h
@@ -35,6 +35,7 @@ namespace triton { namespace perfanalyzer {
#ifndef DOCTEST_CONFIG_DISABLE
class TestModelParser;
class MockModelParser;
class InferenceProfiler;
#endif

struct ModelTensor {
@@ -73,7 +74,8 @@ class ModelParser {
outputs_(std::make_shared<ModelTensorMap>()),
composing_models_map_(std::make_shared<ComposingModelMap>()),
scheduler_type_(NONE), max_batch_size_(0), is_decoupled_(false),
response_cache_enabled_(false)
response_cache_enabled_(false),
top_level_response_caching_enabled_(false)
{
}

@@ -151,6 +153,22 @@ class ModelParser {
/// model
bool ResponseCacheEnabled() const { return response_cache_enabled_; }

/// Returns whether or not top level response caching is enabled for this
/// model
/// \return the truth value of whether top level response caching is enabled
/// for this model
bool TopLevelResponseCachingEnabled() const
{
return top_level_response_caching_enabled_;
}

/// Only for testing
#ifndef DOCTEST_CONFIG_DISABLE
void SetTopLevelResponseCaching(bool enable_top_level_response_caching)
{
top_level_response_caching_enabled_ = enable_top_level_response_caching;
}
#endif

/// Get the details about the model inputs.
/// \return The map with tensor_name and the tensor details
/// stored as key-value pair.
Expand All @@ -169,6 +187,7 @@ class ModelParser {
return composing_models_map_;
}


protected:
ModelSchedulerType scheduler_type_;
bool is_decoupled_;
@@ -220,10 +239,12 @@ class ModelParser {
std::string model_signature_name_;
size_t max_batch_size_;
bool response_cache_enabled_;
bool top_level_response_caching_enabled_;

#ifndef DOCTEST_CONFIG_DISABLE
friend TestModelParser;
friend MockModelParser;
friend InferenceProfiler;

public:
ModelParser() = default;
26 changes: 26 additions & 0 deletions src/c++/perf_analyzer/test_inference_profiler.cc
@@ -160,8 +160,15 @@ class TestInferenceProfiler : public InferenceProfiler {
return InferenceProfiler::DetermineStatsModelVersion(
model_identifier, start_stats, end_stats, model_version);
}

cb::Error SetTopLevelResponseCaching(bool enable_top_level_response_caching)
{
return InferenceProfiler::SetTopLevelResponseCaching(
enable_top_level_response_caching);
}
};


TEST_CASE("testing the ValidLatencyMeasurement function")
{
size_t valid_sequence_count{};
@@ -850,6 +857,25 @@ TEST_CASE("determine_stats_model_version: testing DetermineStatsModelVersion()")
expect_exception = true;
}

SUBCASE("One entry - version -1 - valid and in start")
{
model_identifier = {"ModelA", "-1"};
start_stats_map.insert({{"ModelA", "3"}, old_stats});
end_stats_map.insert({{"ModelA", "3"}, new_stats});
cb::Error status = tip.SetTopLevelResponseCaching(true);
CHECK(status.IsOk());
expected_model_version = -1;
}

SUBCASE("One entry - version -1 - not valid")
{
model_identifier = {"ModelA", "-1"};
end_stats_map.insert({{"ModelA", "3"}, old_stats});
cb::Error status = tip.SetTopLevelResponseCaching(false);
CHECK(status.IsOk());
expected_model_version = -1;
expect_exception = true;
}

std::stringstream captured_cerr;
std::streambuf* old = std::cerr.rdbuf(captured_cerr.rdbuf());