Support for fixed number of requests #633

Merged: 17 commits, May 9, 2024
1 change: 1 addition & 0 deletions src/c++/perf_analyzer/CMakeLists.txt
@@ -112,6 +112,7 @@ set(
profile_data_exporter.h
periodic_concurrency_manager.h
periodic_concurrency_worker.h
+ thread_config.h
)

add_executable(
2 changes: 2 additions & 0 deletions src/c++/perf_analyzer/client_backend/client_backend.h
@@ -138,6 +138,8 @@ enum BackendKind {
TRITON_C_API = 3,
OPENAI = 4
};
+ std::string BackendKindToString(const BackendKind kind);

enum ProtocolType { HTTP = 0, GRPC = 1, UNKNOWN = 2 };
enum GrpcCompressionAlgorithm {
COMPRESS_NONE = 0,
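The declaration added above maps a BackendKind enumerator to a printable name. Its definition is not part of this hunk; a minimal sketch of the usual pattern, assuming the enum above is in scope and that the returned strings simply echo the enumerator names (an assumption):

#include <string>

// Hypothetical sketch; the real definition lives elsewhere in this PR.
std::string
BackendKindToString(const BackendKind kind)
{
  switch (kind) {
    case TRITON_C_API:
      return "TRITON_C_API";
    case OPENAI:
      return "OPENAI";
    // ...one case per remaining BackendKind enumerator...
    default:
      return "UNKNOWN";
  }
}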
49 changes: 49 additions & 0 deletions src/c++/perf_analyzer/command_line_parser.cc
@@ -137,6 +137,7 @@ CLParser::Usage(const std::string& msg)
"profiling>"
<< std::endl;
std::cerr << "\t--percentile <percentile>" << std::endl;
+ std::cerr << "\t--request-count <number of requests>" << std::endl;
std::cerr << "\tDEPRECATED OPTIONS" << std::endl;
std::cerr << "\t-t <number of concurrent requests>" << std::endl;
std::cerr << "\t-c <maximum concurrency>" << std::endl;
@@ -463,6 +464,14 @@ CLParser::Usage(const std::string& msg)
"that the average latency is used to determine stability",
18)
<< std::endl;
+ std::cerr
+ << FormatMessage(
+ " --request-count: Specifies a total number of requests to "
+ "use for measurement. The default is 0, which means that there is "
+ "no request count and the measurement will proceed using windows "
+ "until stabilization is detected.",
+ 18)
+ << std::endl;
std::cerr << FormatMessage(
" --serial-sequences: Enables serial sequence mode "
"where a maximum of one request is outstanding at a time "
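With this option, a run such as perf_analyzer -m <model> --request-count 100 (an illustrative invocation; only --request-count is new here) issues exactly 100 requests and then reports, rather than measuring over repeated time windows until latency stabilizes.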
@@ -879,6 +888,7 @@ CLParser::ParseCommandLine(int argc, char** argv)
{"request-period", required_argument, 0, 59},
{"request-parameter", required_argument, 0, 60},
{"endpoint", required_argument, 0, 61},
{"request-count", required_argument, 0, 62},
{0, 0, 0, 0}};

// Parse commandline...
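The 62 in the new long_options entry is the value getopt_long returns when --request-count is encountered (its flag pointer is null), which is why the handler added below is written as case 62.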
@@ -1614,6 +1624,13 @@ CLParser::ParseCommandLine(int argc, char** argv)
params_->endpoint = optarg;
break;
}
+ case 62: {
+ if (std::stoi(optarg) < 0) {
+ Usage("Failed to parse --request-count. The value must be >= 0.");
+ }
+ params_->request_count = std::stoi(optarg);
+ break;
+ }
case 'v':
params_->extra_verbose = params_->verbose;
params_->verbose = true;
@@ -1705,6 +1722,13 @@ CLParser::ParseCommandLine(int argc, char** argv)
// Will be using user-provided time intervals, hence no control variable.
params_->search_mode = SearchMode::NONE;
}

+ // When the request-count feature is enabled, override the measurement mode to
+ // be count windows with a window size of the requested count
+ if (params_->request_count) {
+ params_->measurement_mode = MeasurementMode::COUNT_WINDOWS;
+ params_->measurement_request_count = params_->request_count;
+ }
}

void
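In other words, --request-count N is implemented on top of the existing count-windows machinery: the run switches to MeasurementMode::COUNT_WINDOWS with the window size set to the requested count, so measurement is bounded by request count rather than by elapsed time.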
@@ -1874,6 +1898,31 @@ CLParser::VerifyOptions()
"binary search mode.");
}

+ if (params_->request_count != 0) {
+ if (params_->using_concurrency_range) {
+ if (params_->request_count < params_->concurrency_range.start) {
+ Usage("request-count can not be less than concurrency");
+ }
+ if (params_->concurrency_range.start < params_->concurrency_range.end) {
+ Usage(
+ "request-count not supported with multiple concurrency values in "
+ "one run");
+ }
+ }
+ if (params_->using_request_rate_range) {
+ if (params_->request_count <
+ static_cast<int>(params_->request_rate_range[0])) {
+ Usage("request-count can not be less than request-rate");
+ }
+ if (params_->request_rate_range[SEARCH_RANGE::kSTART] <
+ params_->request_rate_range[SEARCH_RANGE::kEND]) {
+ Usage(
+ "request-count not supported with multiple request-rate values in "
+ "one run");
+ }
+ }
+ }

if (params_->kind == cb::TENSORFLOW_SERVING) {
if (params_->protocol != cb::ProtocolType::GRPC) {
Usage(
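Taken together, these checks restrict --request-count to runs with a single concurrency or request-rate value, and require the count to be at least that value. For example, --request-count 10 with a starting concurrency of 16 is rejected, as is combining a request count with a concurrency sweep from 2 to 8.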
1 change: 1 addition & 0 deletions src/c++/perf_analyzer/command_line_parser.h
@@ -62,6 +62,7 @@ struct PerfAnalyzerParameters {
uint64_t latency_threshold_ms = NO_LIMIT;
double stability_threshold = 0.1;
size_t max_trials = 10;
+ size_t request_count = 0;
bool zero_input = false;
size_t string_length = 128;
std::string string_data;
22 changes: 15 additions & 7 deletions src/c++/perf_analyzer/concurrency_manager.cc
@@ -1,4 +1,4 @@
- // Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ // Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -84,10 +84,10 @@ ConcurrencyManager::InitManagerFinalize()

cb::Error
ConcurrencyManager::ChangeConcurrencyLevel(
- const size_t concurrent_request_count)
+ const size_t concurrent_request_count, const size_t request_count)
{
PauseSequenceWorkers();
- ReconfigThreads(concurrent_request_count);
+ ReconfigThreads(concurrent_request_count, request_count);
ResumeSequenceWorkers();

std::cout << "Request concurrency: " << concurrent_request_count << std::endl;
@@ -109,7 +109,8 @@ ConcurrencyManager::PauseSequenceWorkers()
}

void
- ConcurrencyManager::ReconfigThreads(const size_t concurrent_request_count)
+ ConcurrencyManager::ReconfigThreads(
+ size_t concurrent_request_count, size_t request_count)
{
// Always prefer to create new threads if the maximum limit has not been met
//
@@ -121,8 +122,7 @@ ConcurrencyManager::ReconfigThreads(const size_t concurrent_request_count)
(threads_.size() < max_threads_)) {
// Launch new thread for inferencing
threads_stat_.emplace_back(new ThreadStat());
- threads_config_.emplace_back(
- new ConcurrencyWorker::ThreadConfig(threads_config_.size()));
+ threads_config_.emplace_back(new ThreadConfig(threads_config_.size()));

workers_.push_back(
MakeWorker(threads_stat_.back(), threads_config_.back()));
@@ -138,13 +138,21 @@ ConcurrencyManager::ReconfigThreads(const size_t concurrent_request_count)
// and spread the remaining value
size_t avg_concurrency = concurrent_request_count / threads_.size();
size_t threads_add_one = concurrent_request_count % threads_.size();

+ size_t avg_req_count = request_count / threads_.size();
+ size_t req_count_add_one = request_count % threads_.size();

size_t seq_stat_index_offset = 0;
active_threads_ = 0;
for (size_t i = 0; i < threads_stat_.size(); i++) {
size_t concurrency = avg_concurrency + (i < threads_add_one ? 1 : 0);

threads_config_[i]->concurrency_ = concurrency;
threads_config_[i]->seq_stat_index_offset_ = seq_stat_index_offset;

+ size_t thread_num_reqs = avg_req_count + (i < req_count_add_one ? 1 : 0);
+ threads_config_[i]->num_requests_ = thread_num_reqs;

seq_stat_index_offset += concurrency;

if (concurrency) {
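The request count is split across workers with the same average-plus-remainder scheme already used for concurrency: every worker gets the integer average, and the first request_count % threads_.size() workers get one extra. A standalone sketch of the arithmetic with illustrative values:

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
  // Illustrative values: 10 total requests spread over 4 worker threads.
  const std::size_t request_count = 10;
  const std::size_t num_threads = 4;

  const std::size_t avg_req_count = request_count / num_threads;      // 2
  const std::size_t req_count_add_one = request_count % num_threads;  // 2

  std::vector<std::size_t> per_thread;
  for (std::size_t i = 0; i < num_threads; i++) {
    per_thread.push_back(avg_req_count + (i < req_count_add_one ? 1 : 0));
  }

  // Prints "3 3 2 2"; the per-thread counts always sum to request_count.
  for (const auto n : per_thread) {
    std::cout << n << " ";
  }
  std::cout << std::endl;
  return 0;
}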
@@ -171,7 +179,7 @@ ConcurrencyManager::ResumeSequenceWorkers()
std::shared_ptr<IWorker>
ConcurrencyManager::MakeWorker(
std::shared_ptr<ThreadStat> thread_stat,
- std::shared_ptr<ConcurrencyWorker::ThreadConfig> thread_config)
+ std::shared_ptr<ThreadConfig> thread_config)
{
uint32_t id = workers_.size();

14 changes: 8 additions & 6 deletions src/c++/perf_analyzer/concurrency_manager.h
@@ -1,4 +1,4 @@
- // Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ // Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -89,14 +89,16 @@ class ConcurrencyManager : public LoadManager {
/// Adjusts the number of concurrent requests to be the same as
/// 'concurrent_request_count' (by creating or pausing threads)
/// \param concurrent_request_count The number of concurrent requests.
+ /// \param request_count The number of requests to generate. If 0, then
+ /// there is no limit, and it will generate until told to stop.
/// \return cb::Error object indicating success or failure.
- cb::Error ChangeConcurrencyLevel(const size_t concurrent_request_count);
+ cb::Error ChangeConcurrencyLevel(
+ const size_t concurrent_request_count, const size_t request_count = 0);

protected:
// Makes a new worker
virtual std::shared_ptr<IWorker> MakeWorker(
- std::shared_ptr<ThreadStat>,
- std::shared_ptr<ConcurrencyWorker::ThreadConfig>);
+ std::shared_ptr<ThreadStat>, std::shared_ptr<ThreadConfig>);

ConcurrencyManager(
const bool async, const bool streaming, const int32_t batch_size,
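Callers opt in to a bounded run through the new defaulted parameter, so existing call sites compile unchanged. A hedged sketch of the two call shapes, given a ConcurrencyManager instance named manager (construction and error checking elided):

// Issue requests at concurrency 4 until 100 requests have been sent.
cb::Error err = manager.ChangeConcurrencyLevel(4, 100);

// Pre-existing behavior: request_count defaults to 0, meaning no limit.
err = manager.ChangeConcurrencyLevel(4);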
@@ -114,7 +116,7 @@ class ConcurrencyManager : public LoadManager {

size_t max_concurrency_;

- std::vector<std::shared_ptr<ConcurrencyWorker::ThreadConfig>> threads_config_;
+ std::vector<std::shared_ptr<ThreadConfig>> threads_config_;

private:
void InitManagerFinalize() override;
@@ -126,7 +128,7 @@ class ConcurrencyManager : public LoadManager {
// Create new threads (if necessary), and then reconfigure all worker threads
// to handle the new concurrent request count
//
- void ReconfigThreads(size_t concurrent_request_count);
+ void ReconfigThreads(size_t concurrent_request_count, size_t request_count);

// Restart all worker threads that were working on sequences
//
37 changes: 7 additions & 30 deletions src/c++/perf_analyzer/concurrency_worker.h
@@ -1,4 +1,4 @@
- // Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ // Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -29,6 +29,7 @@

#include "load_worker.h"
#include "sequence_manager.h"
#include "thread_config.h"

namespace triton { namespace perfanalyzer {

@@ -49,28 +50,6 @@ class NaggyMockConcurrencyWorker;
///
class ConcurrencyWorker : public LoadWorker {
public:
- struct ThreadConfig {
- ThreadConfig(
- size_t thread_id, size_t concurrency = 0,
- size_t seq_stat_index_offset = 0)
- : thread_id_(thread_id), concurrency_(concurrency),
- seq_stat_index_offset_(seq_stat_index_offset), is_paused_(false)
- {
- }
-
- // ID of corresponding worker thread
- size_t thread_id_;
-
- // The concurrency level that the worker should produce
- size_t concurrency_;
-
- // The starting sequence stat index for this worker
- size_t seq_stat_index_offset_;
-
- // Whether or not the thread is issuing new inference requests
- bool is_paused_;
- };

ConcurrencyWorker(
uint32_t id, std::shared_ptr<ThreadStat> thread_stat,
std::shared_ptr<ThreadConfig> thread_config,
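The ThreadConfig struct removed here is hoisted into the new shared thread_config.h (added to the CMake source list above) so it is no longer tied to ConcurrencyWorker. The new header itself is not shown in this diff; a plausible reconstruction based on the fields used elsewhere in the PR, including the new num_requests_ assigned in ReconfigThreads (member defaults are assumptions):

#include <cstddef>

struct ThreadConfig {
  explicit ThreadConfig(std::size_t thread_id) : thread_id_(thread_id) {}

  // ID of corresponding worker thread
  std::size_t thread_id_{0};

  // The concurrency level that the worker should produce
  std::size_t concurrency_{0};

  // The starting sequence stat index for this worker
  std::size_t seq_stat_index_offset_{0};

  // Number of requests this worker should issue; 0 means no limit
  std::size_t num_requests_{0};

  // Whether or not the thread is issuing new inference requests
  bool is_paused_{false};
};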
@@ -85,11 +64,11 @@
const std::shared_ptr<IInferDataManager>& infer_data_manager,
std::shared_ptr<SequenceManager> sequence_manager)
: LoadWorker(
- id, thread_stat, parser, data_loader, factory, on_sequence_model,
- async, streaming, batch_size, using_json_data, wake_signal,
- wake_mutex, execute, infer_data_manager, sequence_manager),
- thread_config_(thread_config), max_concurrency_(max_concurrency),
- active_threads_(active_threads)
+ id, thread_stat, thread_config, parser, data_loader, factory,
+ on_sequence_model, async, streaming, batch_size, using_json_data,
+ wake_signal, wake_mutex, execute, infer_data_manager,
+ sequence_manager),
+ max_concurrency_(max_concurrency), active_threads_(active_threads)
{
}

@@ -109,8 +88,6 @@
// threads?
size_t& active_threads_;

- std::shared_ptr<ThreadConfig> thread_config_;

// Handle the case where execute_ is false
void HandleExecuteOff();

6 changes: 3 additions & 3 deletions src/c++/perf_analyzer/custom_load_manager.cc
@@ -1,4 +1,4 @@
- // Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ // Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -76,10 +76,10 @@ CustomLoadManager::CustomLoadManager(
}

cb::Error
- CustomLoadManager::InitCustomIntervals()
+ CustomLoadManager::InitCustomIntervals(const size_t request_count)
{
PauseWorkers();
- ConfigureThreads();
+ ConfigureThreads(request_count);
auto status = GenerateSchedule();
ResumeWorkers();
return status;
6 changes: 4 additions & 2 deletions src/c++/perf_analyzer/custom_load_manager.h
@@ -1,4 +1,4 @@
- // Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ // Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -88,8 +88,10 @@ class CustomLoadManager : public RequestRateManager {

/// Initializes the load manager with the provided file containing request
/// intervals
+ /// \param request_count The number of requests to generate. If 0, then
+ /// there is no limit, and it will generate until told to stop.
/// \return cb::Error object indicating success or failure.
- cb::Error InitCustomIntervals();
+ cb::Error InitCustomIntervals(const size_t request_count);

/// Computes the request rate from the time interval file. Fails with an error
/// if the file is not present or is empty.