Support for fixed number of requests #633

Merged: 17 commits, May 9, 2024
1 change: 1 addition & 0 deletions src/c++/perf_analyzer/CMakeLists.txt
@@ -112,6 +112,7 @@ set(
profile_data_exporter.h
periodic_concurrency_manager.h
periodic_concurrency_worker.h
+ thread_config.h
)

add_executable(
2 changes: 2 additions & 0 deletions src/c++/perf_analyzer/client_backend/client_backend.h
@@ -138,6 +138,8 @@ enum BackendKind {
TRITON_C_API = 3,
OPENAI = 4
};
+ std::string BackendKindToString(const BackendKind kind);

enum ProtocolType { HTTP = 0, GRPC = 1, UNKNOWN = 2 };
enum GrpcCompressionAlgorithm {
COMPRESS_NONE = 0,
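The declaration added above maps a BackendKind enumerator to a printable name. Its definition is not part of this hunk; a minimal sketch of the usual pattern, assuming the enum above is in scope and that the returned strings simply echo the enumerator names (an assumption):

#include <string>

// Hypothetical sketch; the real definition lives elsewhere in this PR.
std::string
BackendKindToString(const BackendKind kind)
{
  switch (kind) {
    case TRITON_C_API:
      return "TRITON_C_API";
    case OPENAI:
      return "OPENAI";
    // ...one case per remaining BackendKind enumerator...
    default:
      return "UNKNOWN";
  }
}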
49 changes: 49 additions & 0 deletions src/c++/perf_analyzer/command_line_parser.cc
@@ -137,6 +137,7 @@ CLParser::Usage(const std::string& msg)
"profiling>"
<< std::endl;
std::cerr << "\t--percentile <percentile>" << std::endl;
+ std::cerr << "\t--request-count <number of requests>" << std::endl;
std::cerr << "\tDEPRECATED OPTIONS" << std::endl;
std::cerr << "\t-t <number of concurrent requests>" << std::endl;
std::cerr << "\t-c <maximum concurrency>" << std::endl;
@@ -463,6 +464,14 @@ CLParser::Usage(const std::string& msg)
"that the average latency is used to determine stability",
18)
<< std::endl;
+ std::cerr
+ << FormatMessage(
+ " --request-count: Specifies a total number of requests to "
+ "use for measurement. The default is 0, which means that there is "
+ "no request count and the measurement will proceed using windows "
+ "until stabilization is detected.",
+ 18)
+ << std::endl;
std::cerr << FormatMessage(
" --serial-sequences: Enables serial sequence mode "
"where a maximum of one request is outstanding at a time "
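With this option, a run such as perf_analyzer -m <model> --request-count 100 (an illustrative invocation; only --request-count is new here) issues exactly 100 requests and then reports, rather than measuring over repeated time windows until latency stabilizes.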
@@ -879,6 +888,7 @@ CLParser::ParseCommandLine(int argc, char** argv)
{"request-period", required_argument, 0, 59},
{"request-parameter", required_argument, 0, 60},
{"endpoint", required_argument, 0, 61},
{"request-count", required_argument, 0, 62},
{0, 0, 0, 0}};

// Parse commandline...
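The 62 in the new long_options entry is the value getopt_long returns when --request-count is encountered (its flag pointer is null), which is why the handler added below is written as case 62.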
@@ -1614,6 +1624,13 @@ CLParser::ParseCommandLine(int argc, char** argv)
params_->endpoint = optarg;
break;
}
+ case 62: {
+ if (std::stoi(optarg) < 0) {
+ Usage("Failed to parse --request-count. The value must be >= 0.");
+ }
+ params_->request_count = std::stoi(optarg);
+ break;
+ }
case 'v':
params_->extra_verbose = params_->verbose;
params_->verbose = true;
@@ -1705,6 +1722,13 @@ CLParser::ParseCommandLine(int argc, char** argv)
// Will be using user-provided time intervals, hence no control variable.
params_->search_mode = SearchMode::NONE;
}

+ // When the request-count feature is enabled, override the measurement mode to
+ // be count windows with a window size of the requested count
+ if (params_->request_count) {
+ params_->measurement_mode = MeasurementMode::COUNT_WINDOWS;
+ params_->measurement_request_count = params_->request_count;
+ }
}

void
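In other words, --request-count N is implemented on top of the existing count-windows machinery: the run switches to MeasurementMode::COUNT_WINDOWS with the window size set to the requested count, so measurement is bounded by request count rather than by elapsed time.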
@@ -1874,6 +1898,31 @@ CLParser::VerifyOptions()
"binary search mode.");
}

+ if (params_->request_count != 0) {
+ if (params_->using_concurrency_range) {
+ if (params_->request_count < params_->concurrency_range.start) {
+ Usage("request-count can not be less than concurrency");
+ }
+ if (params_->concurrency_range.start < params_->concurrency_range.end) {
+ Usage(
+ "request-count not supported with multiple concurrency values in "
+ "one run");
+ }
+ }
+ if (params_->using_request_rate_range) {
+ if (params_->request_count <
+ static_cast<int>(params_->request_rate_range[0])) {
+ Usage("request-count can not be less than request-rate");
+ }
+ if (params_->request_rate_range[SEARCH_RANGE::kSTART] <
+ params_->request_rate_range[SEARCH_RANGE::kEND]) {
+ Usage(
+ "request-count not supported with multiple request-rate values in "
+ "one run");
+ }
+ }
+ }

if (params_->kind == cb::TENSORFLOW_SERVING) {
if (params_->protocol != cb::ProtocolType::GRPC) {
Usage(
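Taken together, these checks restrict --request-count to runs with a single concurrency or request-rate value, and require the count to be at least that value. For example, --request-count 10 with a starting concurrency of 16 is rejected, as is combining a request count with a concurrency sweep from 2 to 8.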
1 change: 1 addition & 0 deletions src/c++/perf_analyzer/command_line_parser.h
@@ -62,6 +62,7 @@ struct PerfAnalyzerParameters {
uint64_t latency_threshold_ms = NO_LIMIT;
double stability_threshold = 0.1;
size_t max_trials = 10;
+ size_t request_count = 0;
bool zero_input = false;
size_t string_length = 128;
std::string string_data;
22 changes: 15 additions & 7 deletions src/c++/perf_analyzer/concurrency_manager.cc
@@ -1,4 +1,4 @@
- // Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ // Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -84,10 +84,10 @@ ConcurrencyManager::InitManagerFinalize()

cb::Error
ConcurrencyManager::ChangeConcurrencyLevel(
- const size_t concurrent_request_count)
+ const size_t concurrent_request_count, const size_t request_count)
{
PauseSequenceWorkers();
- ReconfigThreads(concurrent_request_count);
+ ReconfigThreads(concurrent_request_count, request_count);
ResumeSequenceWorkers();

std::cout << "Request concurrency: " << concurrent_request_count << std::endl;
@@ -109,7 +109,8 @@ ConcurrencyManager::PauseSequenceWorkers()
}

void
- ConcurrencyManager::ReconfigThreads(const size_t concurrent_request_count)
+ ConcurrencyManager::ReconfigThreads(
+ size_t concurrent_request_count, size_t request_count)
{
// Always prefer to create new threads if the maximum limit has not been met
//
@@ -121,8 +122,7 @@ ConcurrencyManager::ReconfigThreads(const size_t concurrent_request_count)
(threads_.size() < max_threads_)) {
// Launch new thread for inferencing
threads_stat_.emplace_back(new ThreadStat());
- threads_config_.emplace_back(
- new ConcurrencyWorker::ThreadConfig(threads_config_.size()));
+ threads_config_.emplace_back(new ThreadConfig(threads_config_.size()));

workers_.push_back(
MakeWorker(threads_stat_.back(), threads_config_.back()));
@@ -138,13 +138,21 @@ ConcurrencyManager::ReconfigThreads(const size_t concurrent_request_count)
// and spread the remaining value
size_t avg_concurrency = concurrent_request_count / threads_.size();
size_t threads_add_one = concurrent_request_count % threads_.size();

+ size_t avg_req_count = request_count / threads_.size();
+ size_t req_count_add_one = request_count % threads_.size();

size_t seq_stat_index_offset = 0;
active_threads_ = 0;
for (size_t i = 0; i < threads_stat_.size(); i++) {
size_t concurrency = avg_concurrency + (i < threads_add_one ? 1 : 0);

threads_config_[i]->concurrency_ = concurrency;
threads_config_[i]->seq_stat_index_offset_ = seq_stat_index_offset;

+ size_t thread_num_reqs = avg_req_count + (i < req_count_add_one ? 1 : 0);
+ threads_config_[i]->num_requests_ = thread_num_reqs;

seq_stat_index_offset += concurrency;

if (concurrency) {
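The request count is split across workers with the same average-plus-remainder scheme already used for concurrency: every worker gets the integer average, and the first request_count % threads_.size() workers get one extra. A standalone sketch of the arithmetic with illustrative values:

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
  // Illustrative values: 10 total requests spread over 4 worker threads.
  const std::size_t request_count = 10;
  const std::size_t num_threads = 4;

  const std::size_t avg_req_count = request_count / num_threads;      // 2
  const std::size_t req_count_add_one = request_count % num_threads;  // 2

  std::vector<std::size_t> per_thread;
  for (std::size_t i = 0; i < num_threads; i++) {
    per_thread.push_back(avg_req_count + (i < req_count_add_one ? 1 : 0));
  }

  // Prints "3 3 2 2"; the per-thread counts always sum to request_count.
  for (const auto n : per_thread) {
    std::cout << n << " ";
  }
  std::cout << std::endl;
  return 0;
}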
@@ -171,7 +179,7 @@ ConcurrencyManager::ResumeSequenceWorkers()
std::shared_ptr<IWorker>
ConcurrencyManager::MakeWorker(
std::shared_ptr<ThreadStat> thread_stat,
- std::shared_ptr<ConcurrencyWorker::ThreadConfig> thread_config)
+ std::shared_ptr<ThreadConfig> thread_config)
{
uint32_t id = workers_.size();

14 changes: 8 additions & 6 deletions src/c++/perf_analyzer/concurrency_manager.h
@@ -1,4 +1,4 @@
- // Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ // Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -89,14 +89,16 @@ class ConcurrencyManager : public LoadManager {
/// Adjusts the number of concurrent requests to be the same as
/// 'concurrent_request_count' (by creating or pausing threads)
/// \param concurrent_request_count The number of concurrent requests.
+ /// \param request_count The number of requests to generate. If 0, then
+ /// there is no limit, and it will generate until told to stop.
/// \return cb::Error object indicating success or failure.
- cb::Error ChangeConcurrencyLevel(const size_t concurrent_request_count);
+ cb::Error ChangeConcurrencyLevel(
+ const size_t concurrent_request_count, const size_t request_count = 0);

protected:
// Makes a new worker
virtual std::shared_ptr<IWorker> MakeWorker(
- std::shared_ptr<ThreadStat>,
- std::shared_ptr<ConcurrencyWorker::ThreadConfig>);
+ std::shared_ptr<ThreadStat>, std::shared_ptr<ThreadConfig>);

ConcurrencyManager(
const bool async, const bool streaming, const int32_t batch_size,
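Callers opt in to a bounded run through the new defaulted parameter, so existing call sites compile unchanged. A hedged sketch of the two call shapes, given a ConcurrencyManager instance named manager (construction and error checking elided):

// Issue requests at concurrency 4 until 100 requests have been sent.
cb::Error err = manager.ChangeConcurrencyLevel(4, 100);

// Pre-existing behavior: request_count defaults to 0, meaning no limit.
err = manager.ChangeConcurrencyLevel(4);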
@@ -114,7 +116,7 @@ class ConcurrencyManager : public LoadManager {

size_t max_concurrency_;

- std::vector<std::shared_ptr<ConcurrencyWorker::ThreadConfig>> threads_config_;
+ std::vector<std::shared_ptr<ThreadConfig>> threads_config_;

private:
void InitManagerFinalize() override;
@@ -126,7 +128,7 @@ class ConcurrencyManager : public LoadManager {
// Create new threads (if necessary), and then reconfigure all worker threads
// to handle the new concurrent request count
//
- void ReconfigThreads(size_t concurrent_request_count);
+ void ReconfigThreads(size_t concurrent_request_count, size_t request_count);

// Restart all worker threads that were working on sequences
//
37 changes: 7 additions & 30 deletions src/c++/perf_analyzer/concurrency_worker.h
@@ -1,4 +1,4 @@
- // Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ // Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -29,6 +29,7 @@

#include "load_worker.h"
#include "sequence_manager.h"
#include "thread_config.h"

namespace triton { namespace perfanalyzer {

@@ -49,28 +50,6 @@ class NaggyMockConcurrencyWorker;
///
class ConcurrencyWorker : public LoadWorker {
public:
- struct ThreadConfig {
- ThreadConfig(
- size_t thread_id, size_t concurrency = 0,
- size_t seq_stat_index_offset = 0)
- : thread_id_(thread_id), concurrency_(concurrency),
- seq_stat_index_offset_(seq_stat_index_offset), is_paused_(false)
- {
- }
-
- // ID of corresponding worker thread
- size_t thread_id_;
-
- // The concurrency level that the worker should produce
- size_t concurrency_;
-
- // The starting sequence stat index for this worker
- size_t seq_stat_index_offset_;
-
- // Whether or not the thread is issuing new inference requests
- bool is_paused_;
- };

ConcurrencyWorker(
uint32_t id, std::shared_ptr<ThreadStat> thread_stat,
std::shared_ptr<ThreadConfig> thread_config,
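The ThreadConfig struct removed here is hoisted into the new shared thread_config.h (added to the CMake source list above) so it is no longer tied to ConcurrencyWorker. The new header itself is not shown in this diff; a plausible reconstruction based on the fields used elsewhere in the PR, including the new num_requests_ assigned in ReconfigThreads (member defaults are assumptions):

#include <cstddef>

struct ThreadConfig {
  explicit ThreadConfig(std::size_t thread_id) : thread_id_(thread_id) {}

  // ID of corresponding worker thread
  std::size_t thread_id_{0};

  // The concurrency level that the worker should produce
  std::size_t concurrency_{0};

  // The starting sequence stat index for this worker
  std::size_t seq_stat_index_offset_{0};

  // Number of requests this worker should issue; 0 means no limit
  std::size_t num_requests_{0};

  // Whether or not the thread is issuing new inference requests
  bool is_paused_{false};
};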
@@ -85,11 +64,11 @@
const std::shared_ptr<IInferDataManager>& infer_data_manager,
std::shared_ptr<SequenceManager> sequence_manager)
: LoadWorker(
- id, thread_stat, parser, data_loader, factory, on_sequence_model,
- async, streaming, batch_size, using_json_data, wake_signal,
- wake_mutex, execute, infer_data_manager, sequence_manager),
- thread_config_(thread_config), max_concurrency_(max_concurrency),
- active_threads_(active_threads)
+ id, thread_stat, thread_config, parser, data_loader, factory,
+ on_sequence_model, async, streaming, batch_size, using_json_data,
+ wake_signal, wake_mutex, execute, infer_data_manager,
+ sequence_manager),
+ max_concurrency_(max_concurrency), active_threads_(active_threads)
{
}

@@ -109,8 +88,6 @@
// threads?
size_t& active_threads_;

- std::shared_ptr<ThreadConfig> thread_config_;

// Handle the case where execute_ is false
void HandleExecuteOff();

6 changes: 3 additions & 3 deletions src/c++/perf_analyzer/custom_load_manager.cc
@@ -1,4 +1,4 @@
- // Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ // Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -76,10 +76,10 @@ CustomLoadManager::CustomLoadManager(
}

cb::Error
- CustomLoadManager::InitCustomIntervals()
+ CustomLoadManager::InitCustomIntervals(const size_t request_count)
{
PauseWorkers();
- ConfigureThreads();
+ ConfigureThreads(request_count);
auto status = GenerateSchedule();
ResumeWorkers();
return status;
6 changes: 4 additions & 2 deletions src/c++/perf_analyzer/custom_load_manager.h
@@ -1,4 +1,4 @@
- // Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ // Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -88,8 +88,10 @@ class CustomLoadManager : public RequestRateManager {

/// Initializes the load manager with the provided file containing request
/// intervals
+ /// \param request_count The number of requests to generate. If 0, then
+ /// there is no limit, and it will generate until told to stop.
/// \return cb::Error object indicating success or failure.
- cb::Error InitCustomIntervals();
+ cb::Error InitCustomIntervals(const size_t request_count);

/// Computes the request rate from the time interval file. Fails with an error
/// if the file is not present or is empty.