feat: Adding profiling support to the runtime
Signed-off-by: Naren Dasan <[email protected]>
narendasan committed Oct 13, 2022
1 parent b70c913 commit 8a48100
Showing 24 changed files with 238 additions and 142 deletions.
8 changes: 4 additions & 4 deletions core/compiler.cpp
@@ -31,7 +31,7 @@ void AddEngineToGraph(
torch::jit::script::Module mod,
std::shared_ptr<torch::jit::Graph>& g,
const std::string& serialized_engine,
runtime::CudaDevice& device_info,
runtime::CUDADevice& device_info,
std::string engine_id = "",
bool fallback = false) {
auto engine_ptr = c10::make_intrusive<runtime::TRTEngine>(
@@ -166,7 +166,7 @@ partitioning::GraphAndMapping BuildHybridGraph(
auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_info, static_params);
auto temp_g = std::make_shared<torch::jit::Graph>();
auto device_spec = convert_info.engine_settings.device;
auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
auto cuda_device = runtime::CUDADevice(device_spec.gpu_id, device_spec.device_type);
AddEngineToGraph(new_mod, temp_g, engine, cuda_device, trt_engine_id.str(), true);

seg_block.update_graph(temp_g);
@@ -283,7 +283,7 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg)
torch::jit::Module new_mod(mod._ivalue()->name() + "_trt");

auto device_spec = cfg.convert_info.engine_settings.device;
auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
auto cuda_device = runtime::CUDADevice(device_spec.gpu_id, device_spec.device_type);

for (const torch::jit::Method& method : mod.get_methods()) {
if (method.name().compare("forward") == 0) {
@@ -342,7 +342,7 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg)
return new_mod;
}

torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, runtime::CudaDevice cuda_device) {
torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, runtime::CUDADevice cuda_device) {
std::ostringstream engine_id;
engine_id << reinterpret_cast<const int*>(&engine);
torch::jit::script::Module new_mod("tensorrt_engine_mod_" + engine_id.str());
2 changes: 1 addition & 1 deletion core/compiler.h
@@ -28,7 +28,7 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::

torch::jit::script::Module CompileGraph(const torch::jit::script::Module& module, CompileSpec cfg);

torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, runtime::CudaDevice cuda_device);
torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, runtime::CUDADevice cuda_device);

void set_device(const int gpu_id);

5 changes: 3 additions & 2 deletions core/conversion/converters/impl/expand.cpp
@@ -374,12 +374,13 @@ auto expand_registrations TORCHTRT_UNUSED =

// Collapse repeated dimension back into desired dimension
std::vector<int64_t> collapse_shape_vec;
for (int k = 0; k < repeat_shape_dims.nbDims; k++) {
for (int64_t k = 0; k < repeat_shape_dims.nbDims; k++) {
if (k == dim) {
int64_t collapse_dim = repeat_shape_dims.d[k] * repeat_shape_dims.d[++k];
int64_t collapse_dim = repeat_shape_dims.d[k] * repeat_shape_dims.d[k+1];
// Set dim size to -1 if repeat is being done on dynamic dim
collapse_dim = std::max(collapse_dim, (int64_t)-1);
collapse_shape_vec.push_back(collapse_dim);
k++;
} else {
collapse_shape_vec.push_back(repeat_shape_dims.d[k]);
}
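
For context on the change above: the old line computed repeat_shape_dims.d[k] * repeat_shape_dims.d[++k], which reads and increments k within a single expression (unsequenced in C++, and therefore formally undefined behavior); the new code reads d[k+1] and advances k in a separate statement. A standalone sketch of the collapse logic using plain vectors instead of nvinfer1::Dims (the shape values and the repeated dimension are made up for illustration, not taken from the commit):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical shape after an explicit repeat step; dim 1 holds the repeat factor.
  std::vector<int64_t> repeat_shape = {2, 2, 3, 4, 5};
  const int64_t dim = 1;

  std::vector<int64_t> collapse_shape;
  for (int64_t k = 0; k < static_cast<int64_t>(repeat_shape.size()); k++) {
    if (k == dim) {
      // Fold the repeat factor back into the neighboring dimension.
      int64_t collapse_dim = repeat_shape[k] * repeat_shape[k + 1];
      // Clamp to -1 so a dynamic dimension stays dynamic.
      collapse_dim = std::max(collapse_dim, (int64_t)-1);
      collapse_shape.push_back(collapse_dim);
      k++; // skip the dimension that was folded in
    } else {
      collapse_shape.push_back(repeat_shape[k]);
    }
  }

  for (auto d : collapse_shape) {
    std::cout << d << " "; // prints: 2 6 4 5
  }
  std::cout << std::endl;
  return 0;
}
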
2 changes: 1 addition & 1 deletion core/conversion/converters/impl/select.cpp
@@ -280,7 +280,7 @@ auto select_registrations TORCHTRT_UNUSED =

std::vector<nvinfer1::ITensor*> tensors;
std::vector<int32_t> adv_idx_indices;
for (auto i = 0; i < ts.size(); i++) {
for (size_t i = 0; i < ts.size(); i++) {
auto t = ts[i];
if (t.isTensor()) {
auto torch_tensor = t.toTensor().to(torch::kInt32);
10 changes: 8 additions & 2 deletions core/runtime/BUILD
@@ -13,7 +13,7 @@ config_setting(
cc_library(
name = "runtime",
srcs = [
"CudaDevice.cpp",
"CUDADevice.cpp",
"DeviceList.cpp",
"TRTEngine.cpp",
"execute_engine.cpp",
@@ -22,6 +22,8 @@ cc_library(
],
hdrs = [
"runtime.h",
"CUDADevice.h",
"TRTEngine.h"
],
deps = [
"@tensorrt//:nvinfer",
@@ -36,6 +38,10 @@ cc_library(

pkg_tar(
name = "include",
srcs = ["runtime.h"],
srcs = [
"runtime.h",
"CUDADevice.h",
"TRTEngine.h"
],
package_dir = "core/runtime/",
)
4 changes: 3 additions & 1 deletion core/runtime/CMakeLists.txt
@@ -2,7 +2,7 @@ set(lib_name "core_runtime")
add_library(${lib_name} OBJECT)

set(CXX_SRCS
"${CMAKE_CURRENT_SOURCE_DIR}/CudaDevice.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/CUDADevice.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/DeviceList.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/execute_engine.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/TRTEngine.cpp"
@@ -12,6 +12,8 @@ set(CXX_SRCS

set(HEADER_FILES
"${CMAKE_CURRENT_SOURCE_DIR}/runtime.h"
"${CMAKE_CURRENT_SOURCE_DIR}/CUDADevice.h"
"${CMAKE_CURRENT_SOURCE_DIR}/TRTEngine.h"
)

target_sources(${lib_name}
16 changes: 8 additions & 8 deletions core/runtime/CudaDevice.cpp → core/runtime/CUDADevice.cpp
@@ -11,10 +11,10 @@ const std::string DEVICE_INFO_DELIM = "%";

typedef enum { ID_IDX = 0, SM_MAJOR_IDX, SM_MINOR_IDX, DEVICE_TYPE_IDX, DEVICE_NAME_IDX } SerializedDeviceInfoIndex;

CudaDevice::CudaDevice() : id{-1}, major{-1}, minor{-1}, device_type{nvinfer1::DeviceType::kGPU} {}
CUDADevice::CUDADevice() : id{-1}, major{-1}, minor{-1}, device_type{nvinfer1::DeviceType::kGPU} {}

CudaDevice::CudaDevice(int64_t gpu_id, nvinfer1::DeviceType device_type) {
CudaDevice cuda_device;
CUDADevice::CUDADevice(int64_t gpu_id, nvinfer1::DeviceType device_type) {
CUDADevice cuda_device;
cudaDeviceProp device_prop;

// Device ID
@@ -41,7 +41,7 @@ CudaDevice::CudaDevice(int64_t gpu_id, nvinfer1::DeviceType device_type) {
// NOTE: Serialization Format for Device Info:
// id%major%minor%(enum)device_type%device_name

CudaDevice::CudaDevice(std::string device_info) {
CUDADevice::CUDADevice(std::string device_info) {
LOG_DEBUG("Deserializing Device Info: " << device_info);

std::vector<std::string> tokens;
@@ -66,7 +66,7 @@ CudaDevice::CudaDevice(std::string device_info) {
LOG_DEBUG("Deserialized Device Info: " << *this);
}

CudaDevice& CudaDevice::operator=(const CudaDevice& other) {
CUDADevice& CUDADevice::operator=(const CUDADevice& other) {
id = other.id;
major = other.major;
minor = other.minor;
@@ -75,7 +75,7 @@ CudaDevice& CudaDevice::operator=(const CudaDevice& other) {
return (*this);
}

std::string CudaDevice::serialize() {
std::string CUDADevice::serialize() {
std::vector<std::string> content;
content.resize(DEVICE_NAME_IDX + 1);

@@ -98,13 +98,13 @@ std::string CudaDevice::serialize() {
return serialized_device_info;
}

std::string CudaDevice::getSMCapability() const {
std::string CUDADevice::getSMCapability() const {
std::stringstream ss;
ss << major << "." << minor;
return ss.str();
}

std::ostream& operator<<(std::ostream& os, const CudaDevice& device) {
std::ostream& operator<<(std::ostream& os, const CUDADevice& device) {
os << "Device(ID: " << device.id << ", Name: " << device.device_name << ", SM Capability: " << device.major << '.'
<< device.minor << ", Type: " << device.device_type << ')';
return os;
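
The serialization format noted in the diff is id%major%minor%(enum)device_type%device_name, split on the "%" delimiter and indexed via the SerializedDeviceInfoIndex enum. A minimal, self-contained sketch of that tokenize-then-index pattern, independent of the TensorRT types (the device string below is a made-up example, and the tokenize helper is illustrative rather than the commit's code):

#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Split a serialized device string on a single-character delimiter.
std::vector<std::string> tokenize(const std::string& s, char delim) {
  std::vector<std::string> tokens;
  std::stringstream ss(s);
  std::string item;
  while (std::getline(ss, item, delim)) {
    tokens.push_back(item);
  }
  return tokens;
}

int main() {
  // Field order from the diff: id%major%minor%(enum)device_type%device_name
  std::string serialized = "0%8%6%0%NVIDIA A100";
  auto tokens = tokenize(serialized, '%');

  int64_t id = std::stoll(tokens[0]);
  int64_t major = std::stoll(tokens[1]);
  int64_t minor = std::stoll(tokens[2]);
  int64_t device_type = std::stoll(tokens[3]); // 0 corresponds to nvinfer1::DeviceType::kGPU
  std::string device_name = tokens[4];

  std::cout << "Device(ID: " << id << ", Name: " << device_name
            << ", SM Capability: " << major << "." << minor
            << ", Type: " << device_type << ")" << std::endl;
  return 0;
}
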
33 changes: 33 additions & 0 deletions core/runtime/CUDADevice.h
@@ -0,0 +1,33 @@
#pragma once
#include <string>
#include "NvInfer.h"

namespace torch_tensorrt {
namespace core {
namespace runtime {

struct CUDADevice {
int64_t id; // CUDA device id
int64_t major; // CUDA compute major version
int64_t minor; // CUDA compute minor version
nvinfer1::DeviceType device_type;
std::string device_name;

CUDADevice();
CUDADevice(int64_t gpu_id, nvinfer1::DeviceType device_type);
CUDADevice(std::string serialized_device_info);
~CUDADevice() = default;
CUDADevice(const CUDADevice& other) = default;
CUDADevice& operator=(const CUDADevice& other);
std::string serialize();
std::string getSMCapability() const;
friend std::ostream& operator<<(std::ostream& os, const CUDADevice& device);
};

void set_cuda_device(CUDADevice& cuda_device);
// Gets the current active GPU (DLA will not show up through this)
CUDADevice get_current_device();

} // namespace runtime
} // namespace core
} // namespace torch_tensorrt
6 changes: 3 additions & 3 deletions core/runtime/DeviceList.cpp
@@ -15,19 +15,19 @@ DeviceList::DeviceList() {
}

for (int i = 0; i < num_devices; i++) {
device_list[i] = CudaDevice(i, nvinfer1::DeviceType::kGPU);
device_list[i] = CUDADevice(i, nvinfer1::DeviceType::kGPU);
}

// REVIEW: DO WE CARE ABOUT DLA?

LOG_DEBUG("Runtime:\n Available CUDA Devices: \n" << this->dump_list());
}

void DeviceList::insert(int device_id, CudaDevice cuda_device) {
void DeviceList::insert(int device_id, CUDADevice cuda_device) {
device_list[device_id] = cuda_device;
}

CudaDevice DeviceList::find(int device_id) {
CUDADevice DeviceList::find(int device_id) {
return device_list[device_id];
}

14 changes: 11 additions & 3 deletions core/runtime/TRTEngine.cpp
@@ -16,7 +16,7 @@ std::string slugify(std::string s) {
return s;
}

TRTEngine::TRTEngine(std::string serialized_engine, CudaDevice cuda_device) {
TRTEngine::TRTEngine(std::string serialized_engine, CUDADevice cuda_device) {
std::string _name = "deserialized_trt";
new (this) TRTEngine(_name, serialized_engine, cuda_device);
}
@@ -33,11 +33,11 @@ TRTEngine::TRTEngine(std::vector<std::string> serialized_info) {
std::string _name = serialized_info[NAME_IDX];
std::string engine_info = serialized_info[ENGINE_IDX];

CudaDevice cuda_device(serialized_info[DEVICE_IDX]);
CUDADevice cuda_device(serialized_info[DEVICE_IDX]);
new (this) TRTEngine(_name, engine_info, cuda_device);
}

TRTEngine::TRTEngine(std::string mod_name, std::string serialized_engine, CudaDevice cuda_device) {
TRTEngine::TRTEngine(std::string mod_name, std::string serialized_engine, CUDADevice cuda_device) {
auto most_compatible_device = get_most_compatible_device(cuda_device);
TORCHTRT_CHECK(most_compatible_device, "No compatible device was found for instantiating TensorRT engine");
device_info = most_compatible_device.value();
@@ -85,6 +85,14 @@ TRTEngine::TRTEngine(std::string mod_name, std::string serialized_engine, CudaDe
LOG_DEBUG(*this);
}

void TRTEngine::set_paths() {
execution_profile_path = profile_path + "/" + name + "_execution_profile.trace";
device_profile_path = profile_path + "/" + name + "_device_config_profile.trace";
input_profile_path = profile_path + "/" + name + "_input_profile.trace";
output_profile_path = profile_path + "/" + name + "_output_profile.trace";
enqueue_profile_path = profile_path + "/" + name + "_enqueue_profile.trace";
}

TRTEngine& TRTEngine::operator=(const TRTEngine& other) {
rt = other.rt;
cuda_engine = other.cuda_engine;
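
The new set_paths() member shown above joins the profile directory (profile_path, which defaults to /tmp in TRTEngine.h below) with the engine name and a per-stage suffix. A standalone sketch of the same path construction, using a hypothetical engine name (the name and output directory are assumptions for illustration, not values from the commit):

#include <iostream>
#include <string>
#include <vector>

int main() {
  std::string profile_path = "/tmp";   // default profile directory from TRTEngine.h
  std::string name = "my_trt_engine";  // hypothetical engine name

  // Same per-stage suffixes as TRTEngine::set_paths() in the diff.
  std::vector<std::string> suffixes = {
      "_execution_profile.trace",
      "_device_config_profile.trace",
      "_input_profile.trace",
      "_output_profile.trace",
      "_enqueue_profile.trace"};

  for (const auto& suffix : suffixes) {
    std::cout << profile_path + "/" + name + suffix << std::endl;
  }
  return 0;
}
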
55 changes: 55 additions & 0 deletions core/runtime/TRTEngine.h
@@ -0,0 +1,55 @@
#pragma once
#include <map>
#include <memory>
#include <mutex>
#include <utility>
#include "ATen/core/function_schema.h"
#include "NvInfer.h"
#include "core/util/prelude.h"
#include "torch/custom_class.h"

namespace torch_tensorrt {
namespace core {
namespace runtime {

struct TRTEngine : torch::CustomClassHolder {
// Each engine needs its own runtime object
std::shared_ptr<nvinfer1::IRuntime> rt;
std::shared_ptr<nvinfer1::ICudaEngine> cuda_engine;
std::shared_ptr<nvinfer1::IExecutionContext> exec_ctx;
std::pair<uint64_t, uint64_t> num_io;
std::string name;
std::mutex mu;
CUDADevice device_info;

std::string execution_profile_path;
std::string device_profile_path;
std::string input_profile_path;
std::string output_profile_path;
std::string enqueue_profile_path;
std::string profile_path = "/tmp";

std::unordered_map<uint64_t, uint64_t> in_binding_map;
std::unordered_map<uint64_t, uint64_t> out_binding_map;

#ifndef NDEBUG
bool debug = true;
#else
bool debug = false;
#endif

~TRTEngine() = default;
TRTEngine(std::string serialized_engine, CUDADevice cuda_device);
TRTEngine(std::vector<std::string> serialized_info);
TRTEngine(std::string mod_name, std::string serialized_engine, CUDADevice cuda_device);
TRTEngine& operator=(const TRTEngine& other);
std::string to_str() const;
void set_paths();
friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine);
// TODO: Implement a call method
// c10::List<at::Tensor> Run(c10::List<at::Tensor> inputs);
};

} // namespace runtime
} // namespace core
} // namespace torch_tensorrt
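
The debug member declared above defaults to true only in debug builds via the standard NDEBUG convention: release builds define NDEBUG (e.g. with -DNDEBUG), which flips the default to false. A minimal sketch of that idiom, separate from the TRTEngine class:

#include <iostream>

struct Example {
#ifndef NDEBUG
  bool debug = true;   // NDEBUG not defined: debug build
#else
  bool debug = false;  // NDEBUG defined (e.g. -DNDEBUG): release build
#endif
};

int main() {
  Example e;
  std::cout << std::boolalpha << "debug default: " << e.debug << std::endl;
  return 0;
}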