From e84c8aad77369e90824d76702b26e459de9a7bb0 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Thu, 13 Oct 2022 15:11:34 -0700 Subject: [PATCH 01/22] feat: Adding profiling support to the runtime Signed-off-by: Naren Dasan --- core/compiler.cpp | 8 +- core/compiler.h | 2 +- core/conversion/converters/impl/expand.cpp | 5 +- core/conversion/converters/impl/select.cpp | 2 +- core/runtime/BUILD | 10 +- core/runtime/CMakeLists.txt | 4 +- .../{CudaDevice.cpp => CUDADevice.cpp} | 16 +-- core/runtime/CUDADevice.h | 33 +++++ core/runtime/DeviceList.cpp | 6 +- core/runtime/TRTEngine.cpp | 14 ++- core/runtime/TRTEngine.h | 55 ++++++++ core/runtime/execute_engine.cpp | 118 +++++++++++------- core/runtime/register_jit_hooks.cpp | 3 + core/runtime/runtime.cpp | 14 +-- core/runtime/runtime.h | 58 ++------- cpp/bin/torchtrtc/BUILD | 2 +- cpp/bin/torchtrtc/main.cpp | 8 +- cpp/src/compile_spec.cpp | 2 +- cpp/src/torch_tensorrt.cpp | 2 +- cpp/src/types.cpp | 4 +- py/torch_tensorrt/csrc/tensorrt_backend.cpp | 2 +- py/torch_tensorrt/csrc/tensorrt_classes.cpp | 6 +- py/torch_tensorrt/csrc/tensorrt_classes.h | 4 +- tests/util/run_graph_engine.cpp | 2 +- 24 files changed, 238 insertions(+), 142 deletions(-) rename core/runtime/{CudaDevice.cpp => CUDADevice.cpp} (87%) create mode 100644 core/runtime/CUDADevice.h create mode 100644 core/runtime/TRTEngine.h diff --git a/core/compiler.cpp b/core/compiler.cpp index 118ca7aa1c..29ffa5008e 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -31,7 +31,7 @@ void AddEngineToGraph( torch::jit::script::Module mod, std::shared_ptr& g, const std::string& serialized_engine, - runtime::CudaDevice& device_info, + runtime::CUDADevice& device_info, std::string engine_id = "", bool fallback = false) { auto engine_ptr = c10::make_intrusive( @@ -166,7 +166,7 @@ partitioning::GraphAndMapping BuildHybridGraph( auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_info, static_params); auto temp_g = std::make_shared(); auto device_spec = convert_info.engine_settings.device; - auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type); + auto cuda_device = runtime::CUDADevice(device_spec.gpu_id, device_spec.device_type); AddEngineToGraph(new_mod, temp_g, engine, cuda_device, trt_engine_id.str(), true); seg_block.update_graph(temp_g); @@ -283,7 +283,7 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) torch::jit::Module new_mod(mod._ivalue()->name() + "_trt"); auto device_spec = cfg.convert_info.engine_settings.device; - auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type); + auto cuda_device = runtime::CUDADevice(device_spec.gpu_id, device_spec.device_type); for (const torch::jit::Method& method : mod.get_methods()) { if (method.name().compare("forward") == 0) { @@ -342,7 +342,7 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) return new_mod; } -torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, runtime::CudaDevice cuda_device) { +torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, runtime::CUDADevice cuda_device) { std::ostringstream engine_id; engine_id << reinterpret_cast(&engine); torch::jit::script::Module new_mod("tensorrt_engine_mod_" + engine_id.str()); diff --git a/core/compiler.h b/core/compiler.h index 1b7b3defe8..d35fae352d 100644 --- a/core/compiler.h +++ b/core/compiler.h @@ -28,7 +28,7 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std:: 
torch::jit::script::Module CompileGraph(const torch::jit::script::Module& module, CompileSpec cfg); -torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, runtime::CudaDevice cuda_device); +torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, runtime::CUDADevice cuda_device); void set_device(const int gpu_id); diff --git a/core/conversion/converters/impl/expand.cpp b/core/conversion/converters/impl/expand.cpp index e379614ad3..e6a6c0130c 100644 --- a/core/conversion/converters/impl/expand.cpp +++ b/core/conversion/converters/impl/expand.cpp @@ -374,12 +374,13 @@ auto expand_registrations TORCHTRT_UNUSED = // Collapse repeated dimension back into desired dimension std::vector collapse_shape_vec; - for (int k = 0; k < repeat_shape_dims.nbDims; k++) { + for (int64_t k = 0; k < repeat_shape_dims.nbDims; k++) { if (k == dim) { - int64_t collapse_dim = repeat_shape_dims.d[k] * repeat_shape_dims.d[++k]; + int64_t collapse_dim = repeat_shape_dims.d[k] * repeat_shape_dims.d[k+1]; // Set dim size to -1 if repeat is being done on dynamic dim collapse_dim = std::max(collapse_dim, (int64_t)-1); collapse_shape_vec.push_back(collapse_dim); + k++; } else { collapse_shape_vec.push_back(repeat_shape_dims.d[k]); } diff --git a/core/conversion/converters/impl/select.cpp b/core/conversion/converters/impl/select.cpp index 102f0d294f..25a5683833 100644 --- a/core/conversion/converters/impl/select.cpp +++ b/core/conversion/converters/impl/select.cpp @@ -280,7 +280,7 @@ auto select_registrations TORCHTRT_UNUSED = std::vector tensors; std::vector adv_idx_indices; - for (auto i = 0; i < ts.size(); i++) { + for (size_t i = 0; i < ts.size(); i++) { auto t = ts[i]; if (t.isTensor()) { auto torch_tensor = t.toTensor().to(torch::kInt32); diff --git a/core/runtime/BUILD b/core/runtime/BUILD index 0d9513fc74..a2a2cfa3b1 100644 --- a/core/runtime/BUILD +++ b/core/runtime/BUILD @@ -13,7 +13,7 @@ config_setting( cc_library( name = "runtime", srcs = [ - "CudaDevice.cpp", + "CUDADevice.cpp", "DeviceList.cpp", "TRTEngine.cpp", "execute_engine.cpp", @@ -22,6 +22,8 @@ cc_library( ], hdrs = [ "runtime.h", + "CUDADevice.h", + "TRTEngine.h" ], deps = [ "@tensorrt//:nvinfer", @@ -36,6 +38,10 @@ cc_library( pkg_tar( name = "include", - srcs = ["runtime.h"], + srcs = [ + "runtime.h", + "CUDADevice.h", + "TRTEngine.h" + ], package_dir = "core/runtime/", ) diff --git a/core/runtime/CMakeLists.txt b/core/runtime/CMakeLists.txt index 4481014f54..a4a8919808 100644 --- a/core/runtime/CMakeLists.txt +++ b/core/runtime/CMakeLists.txt @@ -2,7 +2,7 @@ set(lib_name "core_runtime") add_library(${lib_name} OBJECT) set(CXX_SRCS - "${CMAKE_CURRENT_SOURCE_DIR}/CudaDevice.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/CUDADevice.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/DeviceList.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/execute_engine.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/TRTEngine.cpp" @@ -12,6 +12,8 @@ set(CXX_SRCS set(HEADER_FILES "${CMAKE_CURRENT_SOURCE_DIR}/runtime.h" + "${CMAKE_CURRENT_SOURCE_DIR}/CUDADevice.h" + "${CMAKE_CURRENT_SOURCE_DIR}/TRTEngine.h" ) target_sources(${lib_name} diff --git a/core/runtime/CudaDevice.cpp b/core/runtime/CUDADevice.cpp similarity index 87% rename from core/runtime/CudaDevice.cpp rename to core/runtime/CUDADevice.cpp index b9281bcc83..b803e73482 100644 --- a/core/runtime/CudaDevice.cpp +++ b/core/runtime/CUDADevice.cpp @@ -11,10 +11,10 @@ const std::string DEVICE_INFO_DELIM = "%"; typedef enum { ID_IDX = 0, SM_MAJOR_IDX, SM_MINOR_IDX, DEVICE_TYPE_IDX, DEVICE_NAME_IDX } SerializedDeviceInfoIndex; 
-CudaDevice::CudaDevice() : id{-1}, major{-1}, minor{-1}, device_type{nvinfer1::DeviceType::kGPU} {} +CUDADevice::CUDADevice() : id{-1}, major{-1}, minor{-1}, device_type{nvinfer1::DeviceType::kGPU} {} -CudaDevice::CudaDevice(int64_t gpu_id, nvinfer1::DeviceType device_type) { - CudaDevice cuda_device; +CUDADevice::CUDADevice(int64_t gpu_id, nvinfer1::DeviceType device_type) { + CUDADevice cuda_device; cudaDeviceProp device_prop; // Device ID @@ -41,7 +41,7 @@ CudaDevice::CudaDevice(int64_t gpu_id, nvinfer1::DeviceType device_type) { // NOTE: Serialization Format for Device Info: // id%major%minor%(enum)device_type%device_name -CudaDevice::CudaDevice(std::string device_info) { +CUDADevice::CUDADevice(std::string device_info) { LOG_DEBUG("Deserializing Device Info: " << device_info); std::vector tokens; @@ -66,7 +66,7 @@ CudaDevice::CudaDevice(std::string device_info) { LOG_DEBUG("Deserialized Device Info: " << *this); } -CudaDevice& CudaDevice::operator=(const CudaDevice& other) { +CUDADevice& CUDADevice::operator=(const CUDADevice& other) { id = other.id; major = other.major; minor = other.minor; @@ -75,7 +75,7 @@ CudaDevice& CudaDevice::operator=(const CudaDevice& other) { return (*this); } -std::string CudaDevice::serialize() { +std::string CUDADevice::serialize() { std::vector content; content.resize(DEVICE_NAME_IDX + 1); @@ -98,13 +98,13 @@ std::string CudaDevice::serialize() { return serialized_device_info; } -std::string CudaDevice::getSMCapability() const { +std::string CUDADevice::getSMCapability() const { std::stringstream ss; ss << major << "." << minor; return ss.str(); } -std::ostream& operator<<(std::ostream& os, const CudaDevice& device) { +std::ostream& operator<<(std::ostream& os, const CUDADevice& device) { os << "Device(ID: " << device.id << ", Name: " << device.device_name << ", SM Capability: " << device.major << '.' << device.minor << ", Type: " << device.device_type << ')'; return os; diff --git a/core/runtime/CUDADevice.h b/core/runtime/CUDADevice.h new file mode 100644 index 0000000000..695978074c --- /dev/null +++ b/core/runtime/CUDADevice.h @@ -0,0 +1,33 @@ +#pragma once +#include +#include "NvInfer.h" + +namespace torch_tensorrt { +namespace core { +namespace runtime { + +struct CUDADevice { + int64_t id; // CUDA device id + int64_t major; // CUDA compute major version + int64_t minor; // CUDA compute minor version + nvinfer1::DeviceType device_type; + std::string device_name; + + CUDADevice(); + CUDADevice(int64_t gpu_id, nvinfer1::DeviceType device_type); + CUDADevice(std::string serialized_device_info); + ~CUDADevice() = default; + CUDADevice(const CUDADevice& other) = default; + CUDADevice& operator=(const CUDADevice& other); + std::string serialize(); + std::string getSMCapability() const; + friend std::ostream& operator<<(std::ostream& os, const CUDADevice& device); +}; + +void set_cuda_device(CUDADevice& cuda_device); +// Gets the current active GPU (DLA will not show up through this) +CUDADevice get_current_device(); + +} // namespace torch_tensorrt +} // namespace core +} // namespace runtime diff --git a/core/runtime/DeviceList.cpp b/core/runtime/DeviceList.cpp index 9c9cff2a90..c095b08009 100644 --- a/core/runtime/DeviceList.cpp +++ b/core/runtime/DeviceList.cpp @@ -15,7 +15,7 @@ DeviceList::DeviceList() { } for (int i = 0; i < num_devices; i++) { - device_list[i] = CudaDevice(i, nvinfer1::DeviceType::kGPU); + device_list[i] = CUDADevice(i, nvinfer1::DeviceType::kGPU); } // REVIEW: DO WE CARE ABOUT DLA? 
@@ -23,11 +23,11 @@ DeviceList::DeviceList() { LOG_DEBUG("Runtime:\n Available CUDA Devices: \n" << this->dump_list()); } -void DeviceList::insert(int device_id, CudaDevice cuda_device) { +void DeviceList::insert(int device_id, CUDADevice cuda_device) { device_list[device_id] = cuda_device; } -CudaDevice DeviceList::find(int device_id) { +CUDADevice DeviceList::find(int device_id) { return device_list[device_id]; } diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp index 1059324eda..9f7dd16827 100644 --- a/core/runtime/TRTEngine.cpp +++ b/core/runtime/TRTEngine.cpp @@ -16,7 +16,7 @@ std::string slugify(std::string s) { return s; } -TRTEngine::TRTEngine(std::string serialized_engine, CudaDevice cuda_device) { +TRTEngine::TRTEngine(std::string serialized_engine, CUDADevice cuda_device) { std::string _name = "deserialized_trt"; new (this) TRTEngine(_name, serialized_engine, cuda_device); } @@ -33,11 +33,11 @@ TRTEngine::TRTEngine(std::vector serialized_info) { std::string _name = serialized_info[NAME_IDX]; std::string engine_info = serialized_info[ENGINE_IDX]; - CudaDevice cuda_device(serialized_info[DEVICE_IDX]); + CUDADevice cuda_device(serialized_info[DEVICE_IDX]); new (this) TRTEngine(_name, engine_info, cuda_device); } -TRTEngine::TRTEngine(std::string mod_name, std::string serialized_engine, CudaDevice cuda_device) { +TRTEngine::TRTEngine(std::string mod_name, std::string serialized_engine, CUDADevice cuda_device) { auto most_compatible_device = get_most_compatible_device(cuda_device); TORCHTRT_CHECK(most_compatible_device, "No compatible device was found for instantiating TensorRT engine"); device_info = most_compatible_device.value(); @@ -85,6 +85,14 @@ TRTEngine::TRTEngine(std::string mod_name, std::string serialized_engine, CudaDe LOG_DEBUG(*this); } +void TRTEngine::set_paths() { + execution_profile_path = profile_path + "/" + name + "_execution_profile.trace"; + device_profile_path = profile_path + "/" + name + "_device_config_profile.trace"; + input_profile_path = profile_path + "/" + name + "_input_profile.trace"; + output_profile_path = profile_path + "/" + name + "_output_profile.trace"; + enqueue_profile_path = profile_path + "/" + name + "_enqueue_profile.trace"; +} + TRTEngine& TRTEngine::operator=(const TRTEngine& other) { rt = other.rt; cuda_engine = other.cuda_engine; diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h new file mode 100644 index 0000000000..f76880bf76 --- /dev/null +++ b/core/runtime/TRTEngine.h @@ -0,0 +1,55 @@ +#pragma once +#include +#include +#include +#include +#include "ATen/core/function_schema.h" +#include "NvInfer.h" +#include "core/util/prelude.h" +#include "torch/custom_class.h" + +namespace torch_tensorrt { +namespace core { +namespace runtime { + +struct TRTEngine : torch::CustomClassHolder { + // Each engine needs it's own runtime object + std::shared_ptr rt; + std::shared_ptr cuda_engine; + std::shared_ptr exec_ctx; + std::pair num_io; + std::string name; + std::mutex mu; + CUDADevice device_info; + + std::string execution_profile_path; + std::string device_profile_path; + std::string input_profile_path; + std::string output_profile_path; + std::string enqueue_profile_path; + std::string profile_path = "/tmp"; + + std::unordered_map in_binding_map; + std::unordered_map out_binding_map; + +#ifndef NDEBUG + bool debug = true; +#else + bool debug = false; +#endif + + ~TRTEngine() = default; + TRTEngine(std::string serialized_engine, CUDADevice cuda_device); + TRTEngine(std::vector serialized_info); + 
TRTEngine(std::string mod_name, std::string serialized_engine, CUDADevice cuda_device); + TRTEngine& operator=(const TRTEngine& other); + std::string to_str() const; + void set_paths(); + friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine); + // TODO: Implement a call method + // c10::List Run(c10::List inputs); +}; + +} // namespace torch_tensorrt +} // namespace core +} // namespace runtime diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp index 18924cccd6..727c36f541 100644 --- a/core/runtime/execute_engine.cpp +++ b/core/runtime/execute_engine.cpp @@ -11,7 +11,7 @@ namespace core { namespace runtime { // Checks if the context switch requred for device ID -bool is_switch_required(const CudaDevice& curr_device, const CudaDevice& engine_device) { +bool is_switch_required(const CUDADevice& curr_device, const CUDADevice& engine_device) { // If SM capability is not the same as configured then switch if ((curr_device.major != engine_device.major) || (curr_device.minor != engine_device.minor)) { LOG_WARNING( @@ -42,7 +42,7 @@ bool is_switch_required(const CudaDevice& curr_device, const CudaDevice& engine_ return false; } -CudaDevice select_cuda_device(const CudaDevice& engine_device) { +CUDADevice select_cuda_device(const CUDADevice& engine_device) { auto new_target_device_opt = get_most_compatible_device(engine_device); // REVIEW: THIS DOES NOT LIST DLA PROBABLY, WHICH WE SHOULD @@ -60,62 +60,94 @@ CudaDevice select_cuda_device(const CudaDevice& engine_device) { std::vector execute_engine(std::vector inputs, c10::intrusive_ptr compiled_engine) { LOG_DEBUG("Attempting to run engine (ID: " << compiled_engine->name << ")"); - CudaDevice curr_device = get_current_device(); - LOG_DEBUG("Current Device: " << curr_device); + std::unique_ptr execution_profiler_guard; + if (compiled_engine->debug) { + execution_profiler_guard.reset(new torch::autograd::profiler::RecordProfile(compiled_engine->execution_profile_path)); + } + + { + std::unique_ptr device_profiler_guard; + if (compiled_engine->debug) { + device_profiler_guard.reset(new torch::autograd::profiler::RecordProfile(compiled_engine->device_profile_path)); + } + + CUDADevice curr_device = get_current_device(); + LOG_DEBUG("Current Device: " << curr_device); - if (is_switch_required(curr_device, compiled_engine->device_info)) { - // Scan through available CUDA devices and set the CUDA device context correctly - CudaDevice device = select_cuda_device(compiled_engine->device_info); - set_cuda_device(device); + if (is_switch_required(curr_device, compiled_engine->device_info)) { + // Scan through available CUDA devices and set the CUDA device context correctly + CUDADevice device = select_cuda_device(compiled_engine->device_info); + set_cuda_device(device); - std::string target_device = "cuda:" + std::to_string(device.id); + std::string target_device = "cuda:" + std::to_string(device.id); - for (auto& in : inputs) { - in = in.to(torch::Device(target_device)); + for (auto& in : inputs) { + in = in.to(torch::Device(target_device)); + } } } std::vector gpu_handles; - std::vector contig_inputs{}; - contig_inputs.reserve(inputs.size()); - - for (size_t i = 0; i < inputs.size(); i++) { - uint64_t pyt_idx = compiled_engine->in_binding_map[i]; - TORCHTRT_CHECK( - inputs[pyt_idx].is_cuda(), - "Expected input tensors to have device cuda, found device " << inputs[pyt_idx].device()); - auto expected_type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getBindingDataType(i)); - 
TORCHTRT_CHECK( - inputs[pyt_idx].dtype() == expected_type, - "Expected input tensors to have type " << expected_type << ", found type " << inputs[pyt_idx].dtype()); - auto dims = core::util::toDimsPad(inputs[pyt_idx].sizes(), 1); - auto shape = core::util::toVec(dims); - contig_inputs.push_back(inputs[pyt_idx].view(shape).contiguous()); - LOG_DEBUG("Input shape: " << dims); - compiled_engine->exec_ctx->setBindingDimensions(i, dims); - gpu_handles.push_back(contig_inputs.back().data_ptr()); - } + { + std::unique_ptr input_profiler_guard; + if (compiled_engine->debug) { + input_profiler_guard.reset(new torch::autograd::profiler::RecordProfile(compiled_engine->input_profile_path)); + } + contig_inputs.reserve(inputs.size()); + + for (size_t i = 0; i < inputs.size(); i++) { + uint64_t pyt_idx = compiled_engine->in_binding_map[i]; + TORCHTRT_CHECK( + inputs[pyt_idx].is_cuda(), + "Expected input tensors to have device cuda, found device " << inputs[pyt_idx].device()); + auto expected_type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getBindingDataType(i)); + TORCHTRT_CHECK( + inputs[pyt_idx].dtype() == expected_type, + "Expected input tensors to have type " << expected_type << ", found type " << inputs[pyt_idx].dtype()); + auto dims = core::util::toDimsPad(inputs[pyt_idx].sizes(), 1); + auto shape = core::util::toVec(dims); + contig_inputs.push_back(inputs[pyt_idx].view(shape).contiguous()); + LOG_DEBUG("Input shape: " << dims); + compiled_engine->exec_ctx->setBindingDimensions(i, dims); + gpu_handles.push_back(contig_inputs.back().data_ptr()); + } TORCHTRT_CHECK( - compiled_engine->exec_ctx->allInputDimensionsSpecified(), "Not enough inputs provided (runtime.RunCudaEngine)"); + compiled_engine->exec_ctx->allInputDimensionsSpecified(), "Not enough inputs provided (torch.ops.tensorrt.execute_engine)"); + } + std::vector outputs(compiled_engine->num_io.second); - for (size_t o = inputs.size(); o < (compiled_engine->num_io.first + compiled_engine->num_io.second); o++) { - uint64_t pyt_idx = compiled_engine->out_binding_map[o]; - auto out_shape = compiled_engine->exec_ctx->getBindingDimensions(o); - LOG_DEBUG("Output shape: " << out_shape); - auto dims = core::util::toVec(out_shape); - auto type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getBindingDataType(o)); - outputs[pyt_idx] = std::move(at::empty(dims, {at::kCUDA}).to(type).contiguous()); - gpu_handles.push_back(outputs[pyt_idx].data_ptr()); + { + std::unique_ptr output_profiler_guard; + if (compiled_engine->debug) { + output_profiler_guard.reset(new torch::autograd::profiler::RecordProfile(compiled_engine->output_profile_path)); + } + + for (size_t o = inputs.size(); o < (compiled_engine->num_io.first + compiled_engine->num_io.second); o++) { + uint64_t pyt_idx = compiled_engine->out_binding_map[o]; + auto out_shape = compiled_engine->exec_ctx->getBindingDimensions(o); + LOG_DEBUG("Output shape: " << out_shape); + auto dims = core::util::toVec(out_shape); + auto type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getBindingDataType(o)); + outputs[pyt_idx] = std::move(at::empty(dims, {at::kCUDA}).to(type).contiguous()); + gpu_handles.push_back(outputs[pyt_idx].data_ptr()); + } } - c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream(inputs[0].device().index()); + { + std::unique_ptr enqueue_profiler_guard; + if (compiled_engine->debug) { + enqueue_profiler_guard.reset(new torch::autograd::profiler::RecordProfile(compiled_engine->enqueue_profile_path)); + } 
+ + c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream(inputs[0].device().index()); - // nvinfer1::IExecutionContext::enqueue is not thread safe and we need a mutex for it. - std::unique_lock lock(compiled_engine->mu); - compiled_engine->exec_ctx->enqueueV2(gpu_handles.data(), stream, nullptr); + // nvinfer1::IExecutionContext::enqueue is not thread safe and we need a mutex for it. + std::unique_lock lock(compiled_engine->mu); + compiled_engine->exec_ctx->enqueueV2(gpu_handles.data(), stream, nullptr); + } return outputs; } diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp index 04eb371c0f..373b574a02 100644 --- a/core/runtime/register_jit_hooks.cpp +++ b/core/runtime/register_jit_hooks.cpp @@ -18,6 +18,9 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion = // TODO: .def("__call__", &TRTEngine::Run) // TODO: .def("run", &TRTEngine::Run) .def("__str__", &TRTEngine::to_str) + .def("__repr__", &TRTEngine::to_str) + .def_readwrite("debug", &TRTEngine::debug) + .def_readwrite("profile_path", &TRTEngine::profile_path) .def_pickle( [](const c10::intrusive_ptr& self) -> std::vector { // Serialize TensorRT engine diff --git a/core/runtime/runtime.cpp b/core/runtime/runtime.cpp index 9c1d2c9d2c..49406e95b2 100644 --- a/core/runtime/runtime.cpp +++ b/core/runtime/runtime.cpp @@ -7,7 +7,7 @@ namespace torch_tensorrt { namespace core { namespace runtime { -c10::optional get_most_compatible_device(const CudaDevice& target_device) { +c10::optional get_most_compatible_device(const CUDADevice& target_device) { LOG_DEBUG("Target Device: " << target_device); auto device_options = find_compatible_devices(target_device); if (device_options.size() == 0) { @@ -16,7 +16,7 @@ c10::optional get_most_compatible_device(const CudaDevice& target_de return {device_options[0]}; } - CudaDevice best_match; + CUDADevice best_match; std::stringstream dev_list; dev_list << "[" << std::endl; for (auto device : device_options) { @@ -41,11 +41,11 @@ c10::optional get_most_compatible_device(const CudaDevice& target_de } } -std::vector find_compatible_devices(const CudaDevice& target_device) { +std::vector find_compatible_devices(const CUDADevice& target_device) { auto dla_supported = get_dla_supported_SMs(); auto device_list = get_available_device_list().get_devices(); - std::vector compatible_devices; + std::vector compatible_devices; for (auto device : device_list) { auto poss_dev_cc = device.second.getSMCapability(); @@ -69,13 +69,13 @@ std::vector find_compatible_devices(const CudaDevice& target_device) return compatible_devices; } -void set_cuda_device(CudaDevice& cuda_device) { +void set_cuda_device(CUDADevice& cuda_device) { TORCHTRT_CHECK( (cudaSetDevice(cuda_device.id) == cudaSuccess), "Unable to set device: " << cuda_device << "as active device"); LOG_DEBUG("Setting " << cuda_device << " as active device"); } -CudaDevice get_current_device() { +CUDADevice get_current_device() { int device = -1; TORCHTRT_CHECK( (cudaGetDevice(reinterpret_cast(&device)) == cudaSuccess), @@ -83,7 +83,7 @@ CudaDevice get_current_device() { int64_t device_id = static_cast(device); - return CudaDevice(device_id, nvinfer1::DeviceType::kGPU); + return CUDADevice(device_id, nvinfer1::DeviceType::kGPU); } namespace { diff --git a/core/runtime/runtime.h b/core/runtime/runtime.h index 22393954d4..420e37366a 100644 --- a/core/runtime/runtime.h +++ b/core/runtime/runtime.h @@ -7,6 +7,8 @@ #include "NvInfer.h" #include "core/util/prelude.h" #include "torch/custom_class.h" +#include 
"core/runtime/CUDADevice.h" +#include "core/runtime/TRTEngine.h" namespace torch_tensorrt { namespace core { @@ -16,59 +18,13 @@ using EngineID = int64_t; const std::string ABI_VERSION = "3"; typedef enum { ABI_TARGET_IDX = 0, NAME_IDX, DEVICE_IDX, ENGINE_IDX } SerializedInfoIndex; -struct CudaDevice { - int64_t id; // CUDA device id - int64_t major; // CUDA compute major version - int64_t minor; // CUDA compute minor version - nvinfer1::DeviceType device_type; - std::string device_name; - - CudaDevice(); - CudaDevice(int64_t gpu_id, nvinfer1::DeviceType device_type); - CudaDevice(std::string serialized_device_info); - ~CudaDevice() = default; - CudaDevice(const CudaDevice& other) = default; - CudaDevice& operator=(const CudaDevice& other); - std::string serialize(); - std::string getSMCapability() const; - friend std::ostream& operator<<(std::ostream& os, const CudaDevice& device); -}; - -void set_cuda_device(CudaDevice& cuda_device); -// Gets the current active GPU (DLA will not show up through this) -CudaDevice get_current_device(); - -c10::optional get_most_compatible_device(const CudaDevice& target_device); -std::vector find_compatible_devices(const CudaDevice& target_device); - -struct TRTEngine : torch::CustomClassHolder { - // Each engine needs it's own runtime object - std::shared_ptr rt; - std::shared_ptr cuda_engine; - std::shared_ptr exec_ctx; - std::pair num_io; - std::string name; - std::mutex mu; - CudaDevice device_info; - - std::unordered_map in_binding_map; - std::unordered_map out_binding_map; - - ~TRTEngine() = default; - TRTEngine(std::string serialized_engine, CudaDevice cuda_device); - TRTEngine(std::vector serialized_info); - TRTEngine(std::string mod_name, std::string serialized_engine, CudaDevice cuda_device); - TRTEngine& operator=(const TRTEngine& other); - std::string to_str() const; - friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine); - // TODO: Implement a call method - // c10::List Run(c10::List inputs); -}; +c10::optional get_most_compatible_device(const CUDADevice& target_device); +std::vector find_compatible_devices(const CUDADevice& target_device); std::vector execute_engine(std::vector inputs, c10::intrusive_ptr compiled_engine); class DeviceList { - using DeviceMap = std::unordered_map; + using DeviceMap = std::unordered_map; DeviceMap device_list; public: @@ -76,8 +32,8 @@ class DeviceList { DeviceList(); public: - void insert(int device_id, CudaDevice cuda_device); - CudaDevice find(int device_id); + void insert(int device_id, CUDADevice cuda_device); + CUDADevice find(int device_id); DeviceMap get_devices(); std::string dump_list(); }; diff --git a/cpp/bin/torchtrtc/BUILD b/cpp/bin/torchtrtc/BUILD index 737a8481f1..b95809dc20 100644 --- a/cpp/bin/torchtrtc/BUILD +++ b/cpp/bin/torchtrtc/BUILD @@ -22,7 +22,7 @@ cc_binary( "parser_util.h", ], linkopts = [ - "-l:libdl.so", + "-ldl", ], deps = [ "//third_party/args", diff --git a/cpp/bin/torchtrtc/main.cpp b/cpp/bin/torchtrtc/main.cpp index bc3d5d4af0..b5f30080b9 100644 --- a/cpp/bin/torchtrtc/main.cpp +++ b/cpp/bin/torchtrtc/main.cpp @@ -299,7 +299,7 @@ int main(int argc, char** argv) { } std::vector ranges; - for (const auto spec : args::get(input_shapes)) { + for (const auto& spec : args::get(input_shapes)) { ranges.push_back(torchtrtc::parserutil::parse_input(spec)); std::stringstream ss; ss << "Parsed Input: " << ranges.back(); @@ -343,17 +343,17 @@ int main(int argc, char** argv) { compile_settings.min_block_size = min_block_size; - for (const auto _op : 
args::get(torch_executed_ops)) { + for (const auto& _op : args::get(torch_executed_ops)) { compile_settings.torch_executed_ops.push_back(_op); } - for (const auto _mod : args::get(torch_executed_mods)) { + for (const auto& _mod : args::get(torch_executed_mods)) { compile_settings.torch_executed_modules.push_back(_mod); } } if (enabled_precisions) { - for (const auto precision : args::get(enabled_precisions)) { + for (const auto& precision : args::get(enabled_precisions)) { auto dtype = torchtrtc::parserutil::parse_dtype(precision); if (dtype == torchtrt::DataType::kFloat) { compile_settings.enabled_precisions.insert(torch::kF32); diff --git a/cpp/src/compile_spec.cpp b/cpp/src/compile_spec.cpp index 3d7d9b15d3..8fc22b7861 100644 --- a/cpp/src/compile_spec.cpp +++ b/cpp/src/compile_spec.cpp @@ -13,7 +13,7 @@ nvinfer1::DataType toTRTDataType(DataType value); nvinfer1::TensorFormat toTRTTensorFormat(TensorFormat value); torchtrt::core::ir::Input to_internal_input(Input& i); std::vector to_vec_internal_inputs(std::vector& external); -torchtrt::core::runtime::CudaDevice to_internal_cuda_device(Device device); +torchtrt::core::runtime::CUDADevice to_internal_cuda_device(Device device); namespace torchscript { CompileSpec::CompileSpec(std::vector> fixed_sizes) { diff --git a/cpp/src/torch_tensorrt.cpp b/cpp/src/torch_tensorrt.cpp index 22855aeb03..3aca25003a 100644 --- a/cpp/src/torch_tensorrt.cpp +++ b/cpp/src/torch_tensorrt.cpp @@ -7,7 +7,7 @@ namespace torch_tensorrt { // Defined in types.cpp -torch_tensorrt::core::runtime::CudaDevice to_internal_cuda_device(Device device); +torch_tensorrt::core::runtime::CUDADevice to_internal_cuda_device(Device device); namespace torchscript { // Defined in compile_spec.cpp torch_tensorrt::core::CompileSpec to_internal_compile_spec(CompileSpec external); diff --git a/cpp/src/types.cpp b/cpp/src/types.cpp index 992147cc84..7083461350 100644 --- a/cpp/src/types.cpp +++ b/cpp/src/types.cpp @@ -280,7 +280,7 @@ std::vector to_vec_internal_inputs(std::vector< return internal; } -torch_tensorrt::core::runtime::CudaDevice to_internal_cuda_device(Device device) { +torch_tensorrt::core::runtime::CUDADevice to_internal_cuda_device(Device device) { auto device_type = nvinfer1::DeviceType::kGPU; switch (device.device_type) { case Device::DeviceType::kDLA: @@ -290,6 +290,6 @@ torch_tensorrt::core::runtime::CudaDevice to_internal_cuda_device(Device device) default: device_type = nvinfer1::DeviceType::kGPU; } - return torch_tensorrt::core::runtime::CudaDevice(device.gpu_id, device_type); + return torch_tensorrt::core::runtime::CUDADevice(device.gpu_id, device_type); } } // namespace torch_tensorrt diff --git a/py/torch_tensorrt/csrc/tensorrt_backend.cpp b/py/torch_tensorrt/csrc/tensorrt_backend.cpp index e88e55cca0..671385319c 100644 --- a/py/torch_tensorrt/csrc/tensorrt_backend.cpp +++ b/py/torch_tensorrt/csrc/tensorrt_backend.cpp @@ -28,7 +28,7 @@ c10::impl::GenericDict TensorRTBackend::compile(c10::IValue mod_val, c10::impl:: auto cfg = raw_spec->toInternalCompileSpec(); auto convert_cfg = std::move(cfg.convert_info); auto device_spec = convert_cfg.engine_settings.device; - auto device = core::runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type); + auto device = core::runtime::CUDADevice(device_spec.gpu_id, device_spec.device_type); auto serialized_engine = core::ConvertGraphToTRTEngine(mod_, method_name, cfg); auto engine_handle = c10::make_intrusive(it->key(), serialized_engine, device); handles.insert(method_name, at::IValue(engine_handle)); diff --git 
a/py/torch_tensorrt/csrc/tensorrt_classes.cpp b/py/torch_tensorrt/csrc/tensorrt_classes.cpp index 1721ffd6c9..01519458e5 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.cpp +++ b/py/torch_tensorrt/csrc/tensorrt_classes.cpp @@ -40,7 +40,7 @@ nvinfer1::DataType toTRTDataType(DataType value) { } } -Device::Device(const core::runtime::CudaDevice& internal_dev) { +Device::Device(const core::runtime::CUDADevice& internal_dev) { device_type = DeviceType::kGPU; gpu_id = internal_dev.id; dla_core = -1; @@ -174,8 +174,8 @@ nvinfer1::DeviceType toTRTDeviceType(DeviceType value) { } } -core::runtime::CudaDevice Device::toInternalRuntimeDevice() { - return core::runtime::CudaDevice(gpu_id, toTRTDeviceType(device_type)); +core::runtime::CUDADevice Device::toInternalRuntimeDevice() { + return core::runtime::CUDADevice(gpu_id, toTRTDeviceType(device_type)); } std::string Device::to_str() { diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.h b/py/torch_tensorrt/csrc/tensorrt_classes.h index be2fab3b8e..cdef3ffbf5 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.h +++ b/py/torch_tensorrt/csrc/tensorrt_classes.h @@ -80,14 +80,14 @@ struct Device : torch::CustomClassHolder { allow_gpu_fallback(false) // allow_gpu_fallback {} - Device(const core::runtime::CudaDevice& internal_dev); + Device(const core::runtime::CUDADevice& internal_dev); ADD_ENUM_GET_SET(device_type, DeviceType, static_cast(DeviceType::kDLA)); ADD_FIELD_GET_SET(gpu_id, int64_t); ADD_FIELD_GET_SET(dla_core, int64_t); ADD_FIELD_GET_SET(allow_gpu_fallback, bool); - core::runtime::CudaDevice toInternalRuntimeDevice(); + core::runtime::CUDADevice toInternalRuntimeDevice(); std::string to_str(); }; diff --git a/tests/util/run_graph_engine.cpp b/tests/util/run_graph_engine.cpp index 1d77550d1d..7df2db7d01 100644 --- a/tests/util/run_graph_engine.cpp +++ b/tests/util/run_graph_engine.cpp @@ -56,7 +56,7 @@ std::vector toInputsDynamic(std::vector ten, bool d std::vector RunEngine(std::string& eng, std::vector inputs) { LOG_DEBUG("Running TRT version"); - auto cuda_device = core::runtime::CudaDevice(0, nvinfer1::DeviceType::kGPU); + auto cuda_device = core::runtime::CUDADevice(0, nvinfer1::DeviceType::kGPU); auto engine_ptr = c10::make_intrusive("test_engine", eng, cuda_device); auto outputs = torch_tensorrt::core::runtime::execute_engine(inputs, engine_ptr); return outputs; From 65a7c13585a0663098d507b75ca7be3c6606124f Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Thu, 13 Oct 2022 16:30:55 -0700 Subject: [PATCH 02/22] refactor: A new TRTModule implementation using the internal runtime which should give TS for free Signed-off-by: Naren Dasan --- py/torch_tensorrt/_Device.py | 4 ++ py/torch_tensorrt/csrc/torch_tensorrt_py.cpp | 3 ++ py/torch_tensorrt/fx/trt_module_next.py | 45 ++++++++++++++++++++ 3 files changed, 52 insertions(+) create mode 100644 py/torch_tensorrt/fx/trt_module_next.py diff --git a/py/torch_tensorrt/_Device.py b/py/torch_tensorrt/_Device.py index 16c9b8ea98..70017e2c55 100644 --- a/py/torch_tensorrt/_Device.py +++ b/py/torch_tensorrt/_Device.py @@ -111,6 +111,10 @@ def _to_internal(self) -> _C.Device: internal_dev.allow_gpu_fallback = self.allow_gpu_fallback return internal_dev + def _to_internal_cuda_device_str(self) -> str: + internal_dev = self._to_internal() + return internal_dev.to_str() + @classmethod def _from_torch_device(cls, torch_dev: torch.device): if torch_dev.type != "cuda": diff --git a/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp b/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp index 4b87dea604..4c1b773c97 
100644
--- a/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp
+++ b/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp
@@ -339,6 +339,9 @@ PYBIND11_MODULE(_C, m) {
       .value("GRAPH", core::util::logging::LogLevel::kGRAPH)
       .export_values();
 
+  py::module rt_sub_mod = m.def_submodule("rt");
+  rt_sub_mod.attr("ABI_VERSION") = py::string_(core::runtime::ABI_VERSION);
+
   py::module ts_sub_mod = m.def_submodule("ts");
   py::class_(ts_sub_mod, "CompileSpec")
       .def(py::init<>())
diff --git a/py/torch_tensorrt/fx/trt_module_next.py b/py/torch_tensorrt/fx/trt_module_next.py
new file mode 100644
index 0000000000..1d305d1fc3
--- /dev/null
+++ b/py/torch_tensorrt/fx/trt_module_next.py
@@ -0,0 +1,45 @@
+from operator import truediv
+from typing import Any, List, Sequence
+
+import torch
+
+from torch.classes.tensorrt import Engine
+from torch.ops.tensorrt import execute_engine
+
+from torch_tensorrt import (_C, Device)
+
+class TRTModule(torch.nn.module):
+    def __init__(
+        self,
+        engine_name: str,
+        device_info: Device,
+        serialized_engine: bytearray,
+    ):
+        super(TRTModule, self).__init__()
+        self.engine = Engine([
+            _C.rt.ABI_VERSION,
+            engine_name,
+            device_info._to_internal_cuda_device_str(),
+            serialized_engine
+        ])
+
+    def forward(self, *inputs):
+        try:
+            assert all([i.issubclass(torch.Tensor) for i in inputs])
+        except:
+            raise RuntimeError("TRTModule expects a flattened list of tensors as input")
+        outputs = execute_engine(list(inputs), self.engine)
+        return tuple(outputs)
+
+    def enable_profiling(self, profiler: None):
+        #TODO: CHANGE THIS SO IT MAKE MORE SENSE
+        self.engine.debug = True
+
+    def disable_profiling(self):
+        #TODO: HERE TOO
+        self.engine.debug = False
+
+    def get_layer_info(self) -> str:
+        raise RuntimeError("Engine Inspector needs to be implemented")
+        #assert TRT VERSION > 8.2
+        return self.engine.get_engine_information(_C.LayerInformationFormat.JSON)
\ No newline at end of file
From 808f3e24b90488bc6be0df683a7a821c8f3787d2 Mon Sep 17 00:00:00 2001
From: Naren Dasan
Date: Thu, 13 Oct 2022 16:48:07 -0700
Subject: [PATCH 03/22] feat: let Input generate random tensors following the spec

Signed-off-by: Naren Dasan
---
 py/torch_tensorrt/_Input.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/py/torch_tensorrt/_Input.py b/py/torch_tensorrt/_Input.py
index c66d0ec788..06728fb095 100644
--- a/py/torch_tensorrt/_Input.py
+++ b/py/torch_tensorrt/_Input.py
@@ -279,3 +279,18 @@ def _from_tensor(cls, t: torch.Tensor):
             else torch.channels_last
         )
         return cls(shape=t.shape, dtype=t.dtype, format=frmt)
+
+    def example_tensor(self, optimization_profile_field: str = None):
+        if optimization_profile_field is not None:
+            try:
+                assert any([optimization_profile_field == field_name for field_name in ["min_shape", "opt_shape", "max_shape"]])
+            except:
+                raise ValueError("Invalid field name, expected one of min_shape, opt_shape, max_shape")
+
+        if optimization_profile_field is not None and self.shape_mode == Input._ShapeMode.STATIC:
+            raise ValueError("Specified a optimization profile field but the input is static")
+
+        if self.shape_mode == Input._ShapeMode.STATIC:
+            return torch.randn(self.shape).to(dtype=self.dtype)
+        else:
+            return torch.randn(self.shape[optimization_profile_field]).to(dtype=self.dtype)
From 0f003b8e4a7396fd0abe95d201f38e824a0da35e Mon Sep 17 00:00:00 2001
From: Naren Dasan
Date: Thu, 3 Nov 2022 15:33:47 -0700
Subject: [PATCH 04/22] feat!(//core/runtime): Allow the Runtime to use binding names to align I/O

BREAKING CHANGE: This commit contains an ABI version upgrade meaning that
existing compiled modules will not work with this runtime. Recompilation with a
newer version of Torch-TensorRT will fix this.

This also amends the C++ to allow users to explicitly set binding names in the
order they will be passed in and are expected to be returned. This change is
backwards compatible with the current API.

Signed-off-by: Naren Dasan
Signed-off-by: Naren Dasan
---
 core/compiler.cpp                           |  28 +++-
 core/compiler.h                             |   6 +-
 core/runtime/TRTEngine.cpp                  | 163 +++++++++++++++-----
 core/runtime/TRTEngine.h                    |  24 ++-
 core/runtime/execute_engine.cpp             |  10 +-
 core/runtime/register_jit_hooks.cpp         |  19 +++
 core/runtime/runtime.h                      |  15 +-
 cpp/include/torch_tensorrt/torch_tensorrt.h |  12 +-
 cpp/src/torch_tensorrt.cpp                  |   9 +-
 tests/util/run_graph_engine.cpp             |   3 +-
 10 files changed, 222 insertions(+), 67 deletions(-)

diff --git a/core/compiler.cpp b/core/compiler.cpp
index 29ffa5008e..b412d8a3db 100644
--- a/core/compiler.cpp
+++ b/core/compiler.cpp
@@ -32,10 +32,16 @@ void AddEngineToGraph(
     std::shared_ptr<torch::jit::Graph>& g,
     const std::string& serialized_engine,
     runtime::CUDADevice& device_info,
+    const std::vector<std::string>& input_binding_names,
+    const std::vector<std::string>& output_binding_names,
     std::string engine_id = "",
     bool fallback = false) {
   auto engine_ptr = c10::make_intrusive<runtime::TRTEngine>(
-      mod._ivalue()->name() + "_engine_" + engine_id, serialized_engine, device_info);
+      mod._ivalue()->name() + "_engine_" + engine_id,
+      serialized_engine,
+      device_info,
+      input_binding_names,
+      output_binding_names);
   // Get required metadata about the engine out
   auto num_io = engine_ptr->num_io;
   auto name = engine_ptr->name;
@@ -167,7 +173,15 @@ partitioning::GraphAndMapping BuildHybridGraph(
         auto temp_g = std::make_shared<torch::jit::Graph>();
         auto device_spec = convert_info.engine_settings.device;
         auto cuda_device = runtime::CUDADevice(device_spec.gpu_id, device_spec.device_type);
-        AddEngineToGraph(new_mod, temp_g, engine, cuda_device, trt_engine_id.str(), true);
+        AddEngineToGraph(
+            new_mod,
+            temp_g,
+            engine,
+            cuda_device,
+            std::vector<std::string>(),
+            std::vector<std::string>(),
+            trt_engine_id.str(),
+            true);
 
         seg_block.update_graph(temp_g);
       }
@@ -331,7 +345,7 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg)
           "Not all operations in graph are supported by the compiler");
       // TODO find the right
       auto engine = conversion::ConvertBlockToEngine(g->block(), cfg.convert_info, static_params);
-      AddEngineToGraph(new_mod, new_g, engine, cuda_device);
+      AddEngineToGraph(new_mod, new_g, engine, cuda_device, std::vector<std::string>(), std::vector<std::string>());
     }
     auto new_method = new_mod._ivalue()->compilation_unit()->create_function(method.name(), new_g);
     auto schema = util::GenerateGraphSchema(new_method->name(), new_g);
@@ -342,12 +356,16 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg)
   return new_mod;
 }
 
-torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, runtime::CUDADevice cuda_device) {
+torch::jit::script::Module EmbedEngineInNewModule(
+    const std::string& engine,
+    runtime::CUDADevice cuda_device,
+    const std::vector<std::string>& input_binding_names,
+    const std::vector<std::string>& output_binding_names) {
   std::ostringstream engine_id;
   engine_id << reinterpret_cast(&engine);
   torch::jit::script::Module new_mod("tensorrt_engine_mod_" + engine_id.str());
   auto new_g = std::make_shared<torch::jit::Graph>();
-  AddEngineToGraph(new_mod, new_g, engine, cuda_device);
+  AddEngineToGraph(new_mod, new_g, engine, cuda_device, input_binding_names, output_binding_names);
   auto new_method = new_mod._ivalue()->compilation_unit()->create_function("forward", new_g);
   auto schema =
util::GenerateGraphSchema(new_method->name(), new_g); new_mod.type()->addMethod(new_method); diff --git a/core/compiler.h b/core/compiler.h index d35fae352d..524f762325 100644 --- a/core/compiler.h +++ b/core/compiler.h @@ -28,7 +28,11 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std:: torch::jit::script::Module CompileGraph(const torch::jit::script::Module& module, CompileSpec cfg); -torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, runtime::CUDADevice cuda_device); +torch::jit::script::Module EmbedEngineInNewModule( + const std::string& engine, + runtime::CUDADevice cuda_device, + const std::vector& input_binding_names, + const std::vector& output_binding_names); void set_device(const int gpu_id); diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp index 9f7dd16827..159bd6130d 100644 --- a/core/runtime/TRTEngine.cpp +++ b/core/runtime/TRTEngine.cpp @@ -16,9 +16,24 @@ std::string slugify(std::string s) { return s; } -TRTEngine::TRTEngine(std::string serialized_engine, CUDADevice cuda_device) { +std::vector split(const std::string& str, char delim) { + std::vector strings; + size_t start; + size_t end = 0; + while ((start = str.find_first_not_of(delim, end)) != std::string::npos) { + end = str.find(delim, start); + strings.push_back(str.substr(start, end - start)); + } + return strings; +} + +TRTEngine::TRTEngine( + std::string serialized_engine, + CUDADevice cuda_device, + const std::vector& _in_binding_names, + const std::vector& _out_binding_names) { std::string _name = "deserialized_trt"; - new (this) TRTEngine(_name, serialized_engine, cuda_device); + new (this) TRTEngine(_name, serialized_engine, cuda_device, _in_binding_names, _out_binding_names); } TRTEngine::TRTEngine(std::vector serialized_info) { @@ -32,12 +47,20 @@ TRTEngine::TRTEngine(std::vector serialized_info) { << ")"); std::string _name = serialized_info[NAME_IDX]; std::string engine_info = serialized_info[ENGINE_IDX]; + std::vector in_bindings = split(serialized_info[INPUT_BINDING_NAMES_IDX], '%'); + std::vector out_bindings = split(serialized_info[INPUT_BINDING_NAMES_IDX], '%'); CUDADevice cuda_device(serialized_info[DEVICE_IDX]); - new (this) TRTEngine(_name, engine_info, cuda_device); + + new (this) TRTEngine(_name, engine_info, cuda_device, in_bindings, out_bindings); } -TRTEngine::TRTEngine(std::string mod_name, std::string serialized_engine, CUDADevice cuda_device) { +TRTEngine::TRTEngine( + std::string mod_name, + std::string serialized_engine, + CUDADevice cuda_device, + const std::vector& _in_binding_names, + const std::vector& _out_binding_names) { auto most_compatible_device = get_most_compatible_device(cuda_device); TORCHTRT_CHECK(most_compatible_device, "No compatible device was found for instantiating TensorRT engine"); device_info = most_compatible_device.value(); @@ -53,34 +76,84 @@ TRTEngine::TRTEngine(std::string mod_name, std::string serialized_engine, CUDADe exec_ctx = make_trt(cuda_engine->createExecutionContext()); TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to create TensorRT execution context"); - uint64_t inputs = 0; - uint64_t outputs = 0; + if (_in_binding_names.size() == 0 && _out_binding_names.size() == 0) { + uint64_t inputs = 0; + uint64_t outputs = 0; + + for (int64_t x = 0; x < cuda_engine->getNbBindings(); x++) { + std::string bind_name = cuda_engine->getBindingName(x); + LOG_DEBUG("Binding name: " << bind_name); + auto delim = bind_name.find("."); + if (delim == std::string::npos) { + delim = 
bind_name.find("_"); + TORCHTRT_CHECK( + delim != std::string::npos, + "Unable to determine binding index for input " + << bind_name + << "\nEnsure module was compiled with Torch-TensorRT.ts or follows Torch-TensorRT Runtime conventions"); + } + + std::string idx_s = bind_name.substr(delim + 1); + uint64_t idx = static_cast(std::stoi(idx_s)); + + if (cuda_engine->bindingIsInput(x)) { + inputs++; + in_binding_map[x] = idx; + LOG_DEBUG("TRT Binding: " << x << ": PYT Input: " << idx); + } else { + outputs++; + out_binding_map[x] = idx; + LOG_DEBUG("TRT Binding: " << x << ": PYT Output: " << idx); + } + } - for (int64_t x = 0; x < cuda_engine->getNbBindings(); x++) { - std::string bind_name = cuda_engine->getBindingName(x); - LOG_DEBUG("Binding name: " << bind_name); - auto delim = bind_name.find("."); - if (delim == std::string::npos) { - delim = bind_name.find("_"); + num_io = std::make_pair(inputs, outputs); + in_binding_names.resize(inputs); + out_binding_names.resize(outputs); + + for (int64_t x = 0; x < cuda_engine->getNbBindings(); x++) { + std::cout << x << std::endl; + std::string bind_name = cuda_engine->getBindingName(x); + if (cuda_engine->bindingIsInput(x)) { + in_binding_names[in_binding_map.at(x)] = bind_name; + } else { + out_binding_names[out_binding_map.at(x)] = bind_name; + } + } + } else { + uint64_t inputs = _in_binding_names.size(); + in_binding_names.reserve(inputs); + for (size_t pyt_idx = 0; pyt_idx < inputs; pyt_idx++) { + auto binding_name = _in_binding_names[pyt_idx]; + auto trt_idx = cuda_engine->getBindingIndex(binding_name.c_str()); + TORCHTRT_CHECK((trt_idx >= 0), "Could not find a TensorRT engine binding for input named " << binding_name); TORCHTRT_CHECK( - delim != std::string::npos, - "Unable to determine binding index for input " - << bind_name - << "\nEnsure module was compiled with Torch-TensorRT.ts or follows Torch-TensorRT Runtime conventions"); + cuda_engine->bindingIsInput(trt_idx), + "Binding " << binding_name << " specified as input but found as output in TensorRT engine"); + LOG_DEBUG( + "Input binding name: " << binding_name << "(trt: " << trt_idx << "," + << "pyt: " << pyt_idx << ")"); + in_binding_map[trt_idx] = pyt_idx; + in_binding_names[pyt_idx] = _in_binding_names[pyt_idx]; } - std::string idx_s = bind_name.substr(delim + 1); - uint64_t idx = static_cast(std::stoi(idx_s)); - - if (cuda_engine->bindingIsInput(x)) { - inputs++; - in_binding_map[x] = idx; - } else { - outputs++; - out_binding_map[x] = idx; + uint64_t outputs = _out_binding_names.size(); + out_binding_names.reserve(inputs); + for (size_t pyt_idx = 0; pyt_idx < outputs; pyt_idx++) { + auto binding_name = _out_binding_names[pyt_idx]; + auto trt_idx = cuda_engine->getBindingIndex(binding_name.c_str()); + TORCHTRT_CHECK((trt_idx >= 0), "Could not find a TensorRT engine binding for output named " << binding_name); + TORCHTRT_CHECK( + !cuda_engine->bindingIsInput(trt_idx), + "Binding " << binding_name << " specified as output but found as input in TensorRT engine"); + LOG_DEBUG( + "Output binding name: " << binding_name << "(trt: " << trt_idx << "," + << "pyt: " << pyt_idx << ")"); + out_binding_map[trt_idx] = pyt_idx; + out_binding_names[pyt_idx] = _out_binding_names[pyt_idx]; } + num_io = std::make_pair(inputs, outputs); } - num_io = std::make_pair(inputs, outputs); LOG_DEBUG(*this); } @@ -89,8 +162,8 @@ void TRTEngine::set_paths() { execution_profile_path = profile_path + "/" + name + "_execution_profile.trace"; device_profile_path = profile_path + "/" + name + 
"_device_config_profile.trace"; input_profile_path = profile_path + "/" + name + "_input_profile.trace"; - output_profile_path = profile_path + "/" + name + "_output_profile.trace"; - enqueue_profile_path = profile_path + "/" + name + "_enqueue_profile.trace"; + output_profile_path = profile_path + "/" + name + "_output_profile.trace"; + enqueue_profile_path = profile_path + "/" + name + "_enqueue_profile.trace"; } TRTEngine& TRTEngine::operator=(const TRTEngine& other) { @@ -103,25 +176,31 @@ TRTEngine& TRTEngine::operator=(const TRTEngine& other) { } std::string TRTEngine::to_str() const { + // clang-format off std::stringstream ss; ss << "Torch-TensorRT TensorRT Engine:" << std::endl; ss << " Name: " << name << std::endl; - ss << " Inputs: [" << std::endl; - for (uint64_t i = 0; i < num_io.first; i++) { - ss << " id: " << i << std::endl; - ss << " shape: " << exec_ctx->getBindingDimensions(i) << std::endl; - ss << " dtype: " << util::TRTDataTypeToScalarType(exec_ctx->getEngine().getBindingDataType(i)) << std::endl; - } - ss << " ]" << std::endl; - ss << " Outputs: [" << std::endl; - for (uint64_t o = 0; o < num_io.second; o++) { - ss << " id: " << o << std::endl; - ss << " shape: " << exec_ctx->getBindingDimensions(o) << std::endl; - ss << " dtype: " << util::TRTDataTypeToScalarType(exec_ctx->getEngine().getBindingDataType(o)) << std::endl; + ss << " Bindings: {" << std::endl; + for (int64_t x = 0; x < cuda_engine->getNbBindings(); x++) { + if (cuda_engine->bindingIsInput(x)) { + const uint64_t pyt_idx = in_binding_map.at(x); + ss << " (" << x << ": " << in_binding_names.at(pyt_idx) << ") Input: [" << std::endl; + ss << " pytorch arg idx: " << pyt_idx << std::endl; + ss << " shape: " << exec_ctx->getBindingDimensions(x) << std::endl; + ss << " dtype: " << util::TRTDataTypeToScalarType(exec_ctx->getEngine().getBindingDataType(x)) << std::endl; + ss << " ]" << std::endl; + } else { + const uint64_t pyt_idx = out_binding_map.at(x); + ss << " (" << x << ": " << out_binding_names.at(pyt_idx) << ") Output: [" << std::endl; + ss << " pytorch return idx: " << pyt_idx << std::endl; + ss << " shape: " << exec_ctx->getBindingDimensions(x) << std::endl; + ss << " dtype: " << util::TRTDataTypeToScalarType(exec_ctx->getEngine().getBindingDataType(x)) << std::endl; + ss << " ]" << std::endl; + } } - ss << " ]" << std::endl; + ss << " }" << std::endl; ss << " Device: " << device_info << std::endl; - + // clang-format on return ss.str(); } diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h index f76880bf76..4609802d01 100644 --- a/core/runtime/TRTEngine.h +++ b/core/runtime/TRTEngine.h @@ -29,8 +29,11 @@ struct TRTEngine : torch::CustomClassHolder { std::string enqueue_profile_path; std::string profile_path = "/tmp"; - std::unordered_map in_binding_map; - std::unordered_map out_binding_map; + std::unordered_map in_binding_map; // TRT IDX -> PYT IDX + std::unordered_map out_binding_map; // TRT IDX -> PYT IDX + + std::vector in_binding_names; // ITO: PYT IDX + std::vector out_binding_names; // ITO: PYT IDX #ifndef NDEBUG bool debug = true; @@ -39,9 +42,18 @@ struct TRTEngine : torch::CustomClassHolder { #endif ~TRTEngine() = default; - TRTEngine(std::string serialized_engine, CUDADevice cuda_device); + TRTEngine( + std::string serialized_engine, + CUDADevice cuda_device, + const std::vector& in_binding_names, + const std::vector& out_binding_names); TRTEngine(std::vector serialized_info); - TRTEngine(std::string mod_name, std::string serialized_engine, CUDADevice cuda_device); + TRTEngine( + 
std::string mod_name, + std::string serialized_engine, + CUDADevice cuda_device, + const std::vector& in_binding_names, + const std::vector& out_binding_names); TRTEngine& operator=(const TRTEngine& other); std::string to_str() const; void set_paths(); @@ -50,6 +62,6 @@ struct TRTEngine : torch::CustomClassHolder { // c10::List Run(c10::List inputs); }; -} // namespace torch_tensorrt -} // namespace core } // namespace runtime +} // namespace core +} // namespace torch_tensorrt diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp index 727c36f541..2f24b8f54e 100644 --- a/core/runtime/execute_engine.cpp +++ b/core/runtime/execute_engine.cpp @@ -59,10 +59,12 @@ CUDADevice select_cuda_device(const CUDADevice& engine_device) { std::vector execute_engine(std::vector inputs, c10::intrusive_ptr compiled_engine) { LOG_DEBUG("Attempting to run engine (ID: " << compiled_engine->name << ")"); + compiled_engine->debug = false; std::unique_ptr execution_profiler_guard; if (compiled_engine->debug) { - execution_profiler_guard.reset(new torch::autograd::profiler::RecordProfile(compiled_engine->execution_profile_path)); + execution_profiler_guard.reset( + new torch::autograd::profiler::RecordProfile(compiled_engine->execution_profile_path)); } { @@ -113,11 +115,11 @@ std::vector execute_engine(std::vector inputs, c10::intr compiled_engine->exec_ctx->setBindingDimensions(i, dims); gpu_handles.push_back(contig_inputs.back().data_ptr()); } - TORCHTRT_CHECK( - compiled_engine->exec_ctx->allInputDimensionsSpecified(), "Not enough inputs provided (torch.ops.tensorrt.execute_engine)"); + TORCHTRT_CHECK( + compiled_engine->exec_ctx->allInputDimensionsSpecified(), + "Not enough inputs provided (torch.ops.tensorrt.execute_engine)"); } - std::vector outputs(compiled_engine->num_io.second); { std::unique_ptr output_profiler_guard; diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp index 373b574a02..00f1895fc4 100644 --- a/core/runtime/register_jit_hooks.cpp +++ b/core/runtime/register_jit_hooks.cpp @@ -3,8 +3,24 @@ namespace torch_tensorrt { namespace core { namespace runtime { + +const std::string BINDING_DELIM = "%"; namespace { +std::string serialize_bindings(const std::vector& bindings) { + std::stringstream ss; + for (size_t i = 0; i < bindings.size() - 1; i++) { + ss << bindings[i] << BINDING_DELIM; + } + ss << bindings[bindings.size() - 1]; + + std::string serialized_binding_info = ss.str(); + + LOG_DEBUG("Serialized binding Info: " << serialized_binding_info); + + return serialized_binding_info; +} + // TODO: Implement a call method // c10::List TRTEngine::Run(c10::List inputs) { // auto input_vec = inputs.vec(); @@ -36,6 +52,9 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion = serialize_info[NAME_IDX] = self->name; serialize_info[DEVICE_IDX] = self->device_info.serialize(); serialize_info[ENGINE_IDX] = trt_engine; + serialize_info[INPUT_BINDING_NAMES_IDX] = serialize_bindings(self->in_binding_names); + serialize_info[OUTPUT_BINDING_NAMES_IDX] = serialize_bindings(self->out_binding_names); + return serialize_info; }, [](std::vector seralized_info) -> c10::intrusive_ptr { diff --git a/core/runtime/runtime.h b/core/runtime/runtime.h index 420e37366a..81347b70ba 100644 --- a/core/runtime/runtime.h +++ b/core/runtime/runtime.h @@ -5,18 +5,25 @@ #include #include "ATen/core/function_schema.h" #include "NvInfer.h" -#include "core/util/prelude.h" -#include "torch/custom_class.h" #include "core/runtime/CUDADevice.h" #include "core/runtime/TRTEngine.h" 
+#include "core/util/prelude.h" +#include "torch/custom_class.h" namespace torch_tensorrt { namespace core { namespace runtime { using EngineID = int64_t; -const std::string ABI_VERSION = "3"; -typedef enum { ABI_TARGET_IDX = 0, NAME_IDX, DEVICE_IDX, ENGINE_IDX } SerializedInfoIndex; +const std::string ABI_VERSION = "4"; +typedef enum { + ABI_TARGET_IDX = 0, + NAME_IDX, + DEVICE_IDX, + ENGINE_IDX, + INPUT_BINDING_NAMES_IDX, + OUTPUT_BINDING_NAMES_IDX +} SerializedInfoIndex; c10::optional get_most_compatible_device(const CUDADevice& target_device); std::vector find_compatible_devices(const CUDADevice& target_device); diff --git a/cpp/include/torch_tensorrt/torch_tensorrt.h b/cpp/include/torch_tensorrt/torch_tensorrt.h index 80db25e8f9..ddc29f8a07 100644 --- a/cpp/include/torch_tensorrt/torch_tensorrt.h +++ b/cpp/include/torch_tensorrt/torch_tensorrt.h @@ -761,18 +761,26 @@ TORCHTRT_API std::string convert_method_to_trt_engine( * * @param engine: std::string - Pre-built serialized TensorRT engine * @param device: CompileSepc::Device - Device information + * @param input_binding_names: std::vector - Name of TensorRT bindings in order passed in by original + * PyTorch function (defaults to assuming convention below) + * @param output_binding_names: std::vector - Name of TensorRT bindings in order returned by original + * PyTorch function (defaults to assuming convention below) * * Takes a pre-built serialized TensorRT engine and embeds it in a TorchScript * module. Registers execution of the engine as the forward method of the module * Forward is defined as: forward(Tensor[]) -> Tensor[] * - * TensorRT bindings must have names with the following format: + * If binding names not specified TensorRT bindings must have names with the following format: * - [symbol].[index in input / output array] * ex. 
* - [x.0, x.1, x.2] -> [y.0] * * @return: A new module targeting a TensorRT engine */ -TORCHTRT_API torch::jit::Module embed_engine_in_new_module(const std::string& engine, Device device); +TORCHTRT_API torch::jit::Module embed_engine_in_new_module( + const std::string& engine, + Device device, + const std::vector& input_binding_names = std::vector(), + const std::vector& output_binding_names = std::vector()); } // namespace torchscript } // namespace torch_tensorrt diff --git a/cpp/src/torch_tensorrt.cpp b/cpp/src/torch_tensorrt.cpp index 3aca25003a..527e6c5cbf 100644 --- a/cpp/src/torch_tensorrt.cpp +++ b/cpp/src/torch_tensorrt.cpp @@ -33,8 +33,13 @@ torch::jit::script::Module compile(const torch::jit::script::Module& module, Com return torch_tensorrt::core::CompileGraph(module, to_internal_compile_spec(info)); } -torch::jit::Module embed_engine_in_new_module(const std::string& engine, Device device) { - return torch_tensorrt::core::EmbedEngineInNewModule(engine, to_internal_cuda_device(device)); +torch::jit::Module embed_engine_in_new_module( + const std::string& engine, + Device device, + const std::vector& input_binding_names, + const std::vector& output_binding_names) { + return torch_tensorrt::core::EmbedEngineInNewModule( + engine, to_internal_cuda_device(device), input_binding_names, output_binding_names); } } // namespace torchscript diff --git a/tests/util/run_graph_engine.cpp b/tests/util/run_graph_engine.cpp index 7df2db7d01..2ee966346c 100644 --- a/tests/util/run_graph_engine.cpp +++ b/tests/util/run_graph_engine.cpp @@ -57,7 +57,8 @@ std::vector toInputsDynamic(std::vector ten, bool d std::vector RunEngine(std::string& eng, std::vector inputs) { LOG_DEBUG("Running TRT version"); auto cuda_device = core::runtime::CUDADevice(0, nvinfer1::DeviceType::kGPU); - auto engine_ptr = c10::make_intrusive("test_engine", eng, cuda_device); + auto engine_ptr = c10::make_intrusive( + "test_engine", eng, cuda_device, std::vector(), std::vector()); auto outputs = torch_tensorrt::core::runtime::execute_engine(inputs, engine_ptr); return outputs; } From df3ac77fe8aa1084b9c25c354e87c4396f5c3469 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Thu, 3 Nov 2022 18:23:50 -0700 Subject: [PATCH 05/22] fix(//core/runtime): Resolving some issues with the runtime ABI Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/runtime/TRTEngine.cpp | 18 +++++++++--------- core/runtime/register_jit_hooks.cpp | 4 ++-- core/runtime/runtime.h | 3 ++- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp index 159bd6130d..4b3361c33e 100644 --- a/core/runtime/TRTEngine.cpp +++ b/core/runtime/TRTEngine.cpp @@ -38,7 +38,7 @@ TRTEngine::TRTEngine( TRTEngine::TRTEngine(std::vector serialized_info) { TORCHTRT_CHECK( - serialized_info.size() == ENGINE_IDX + 1, + serialized_info.size() == SERIALIZATION_LEN, "Program to be deserialized targets an incompatible Torch-TensorRT ABI"); TORCHTRT_CHECK( serialized_info[ABI_TARGET_IDX] == ABI_VERSION, @@ -48,7 +48,7 @@ TRTEngine::TRTEngine(std::vector serialized_info) { std::string _name = serialized_info[NAME_IDX]; std::string engine_info = serialized_info[ENGINE_IDX]; std::vector in_bindings = split(serialized_info[INPUT_BINDING_NAMES_IDX], '%'); - std::vector out_bindings = split(serialized_info[INPUT_BINDING_NAMES_IDX], '%'); + std::vector out_bindings = split(serialized_info[OUTPUT_BINDING_NAMES_IDX], '%'); CUDADevice cuda_device(serialized_info[DEVICE_IDX]); @@ -122,7 +122,7 @@ TRTEngine::TRTEngine( } 
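// Editor's note (illustration only, not part of the patch): a hedged sketch of how a
// caller might use the extended embed_engine_in_new_module() overload documented in the
// header change above, supplying explicit TensorRT binding names instead of relying on
// the [symbol].[index] naming convention. The engine path and binding names below are
// placeholder assumptions for the example.
#include <fstream>
#include <sstream>
#include <string>
#include <vector>

#include "torch_tensorrt/torch_tensorrt.h"

torch::jit::Module load_engine_as_module(const std::string& engine_path) {
  std::ifstream engine_file(engine_path, std::ios::binary);
  std::stringstream buf;
  buf << engine_file.rdbuf();
  std::string serialized_engine = buf.str();

  torch_tensorrt::Device device;
  device.gpu_id = 0; // assumed target GPU

  // These must match the binding names baked into the TensorRT engine.
  std::vector<std::string> input_bindings = {"input_0"};
  std::vector<std::string> output_bindings = {"output_0"};

  return torch_tensorrt::torchscript::embed_engine_in_new_module(
      serialized_engine, device, input_bindings, output_bindings);
}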
} else { uint64_t inputs = _in_binding_names.size(); - in_binding_names.reserve(inputs); + in_binding_names.resize(inputs); for (size_t pyt_idx = 0; pyt_idx < inputs; pyt_idx++) { auto binding_name = _in_binding_names[pyt_idx]; auto trt_idx = cuda_engine->getBindingIndex(binding_name.c_str()); @@ -131,14 +131,14 @@ TRTEngine::TRTEngine( cuda_engine->bindingIsInput(trt_idx), "Binding " << binding_name << " specified as input but found as output in TensorRT engine"); LOG_DEBUG( - "Input binding name: " << binding_name << "(trt: " << trt_idx << "," - << "pyt: " << pyt_idx << ")"); + "Input binding name: " << binding_name << " (trt binding idx: " << trt_idx << ", " + << "pyt arg idx: " << pyt_idx << ")"); in_binding_map[trt_idx] = pyt_idx; in_binding_names[pyt_idx] = _in_binding_names[pyt_idx]; } uint64_t outputs = _out_binding_names.size(); - out_binding_names.reserve(inputs); + out_binding_names.resize(outputs); for (size_t pyt_idx = 0; pyt_idx < outputs; pyt_idx++) { auto binding_name = _out_binding_names[pyt_idx]; auto trt_idx = cuda_engine->getBindingIndex(binding_name.c_str()); @@ -147,10 +147,10 @@ TRTEngine::TRTEngine( !cuda_engine->bindingIsInput(trt_idx), "Binding " << binding_name << " specified as output but found as input in TensorRT engine"); LOG_DEBUG( - "Output binding name: " << binding_name << "(trt: " << trt_idx << "," - << "pyt: " << pyt_idx << ")"); + "Output binding name: " << binding_name << " (trt binding idx: " << trt_idx << ", " + << "pyt return idx: " << pyt_idx << ")"); out_binding_map[trt_idx] = pyt_idx; - out_binding_names[pyt_idx] = _out_binding_names[pyt_idx]; + out_binding_names[pyt_idx] = binding_name; } num_io = std::make_pair(inputs, outputs); } diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp index 00f1895fc4..f00957d191 100644 --- a/core/runtime/register_jit_hooks.cpp +++ b/core/runtime/register_jit_hooks.cpp @@ -16,7 +16,7 @@ std::string serialize_bindings(const std::vector& bindings) { std::string serialized_binding_info = ss.str(); - LOG_DEBUG("Serialized binding Info: " << serialized_binding_info); + LOG_DEBUG("Serialized Binding Info: " << serialized_binding_info); return serialized_binding_info; } @@ -46,7 +46,7 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion = auto trt_engine = std::string((const char*)serialized_trt_engine->data(), serialized_trt_engine->size()); std::vector serialize_info; - serialize_info.resize(ENGINE_IDX + 1); + serialize_info.resize(SERIALIZATION_LEN); serialize_info[ABI_TARGET_IDX] = ABI_VERSION; serialize_info[NAME_IDX] = self->name; diff --git a/core/runtime/runtime.h b/core/runtime/runtime.h index 81347b70ba..66ee3816d2 100644 --- a/core/runtime/runtime.h +++ b/core/runtime/runtime.h @@ -22,7 +22,8 @@ typedef enum { DEVICE_IDX, ENGINE_IDX, INPUT_BINDING_NAMES_IDX, - OUTPUT_BINDING_NAMES_IDX + OUTPUT_BINDING_NAMES_IDX, + SERIALIZATION_LEN, // NEVER USED FOR DATA, USED TO DETERMINE LENGTH OF SERIALIZED INFO } SerializedInfoIndex; c10::optional get_most_compatible_device(const CUDADevice& target_device); From e804455e65897d8f4e1e232c1c4a7388039047a7 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Fri, 4 Nov 2022 02:18:09 -0700 Subject: [PATCH 06/22] feat(//core/runtime): Adding a TRT layer profiler Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/runtime/BUILD | 14 ++- core/runtime/CMakeLists.txt | 2 + core/runtime/TRTEngine.cpp | 19 ++-- core/runtime/TRTEngine.h | 11 +- core/runtime/TRTEngineProfiler.cpp | 102 ++++++++++++++++++ core/runtime/TRTEngineProfiler.h | 34 
++++++ core/runtime/execute_engine.cpp | 48 ++++++--- core/runtime/register_jit_hooks.cpp | 4 +- tests/core/partitioning/partitioning_test.bzl | 2 +- tests/cpp/BUILD | 1 + 10 files changed, 204 insertions(+), 33 deletions(-) create mode 100644 core/runtime/TRTEngineProfiler.cpp create mode 100644 core/runtime/TRTEngineProfiler.h diff --git a/core/runtime/BUILD b/core/runtime/BUILD index a2a2cfa3b1..8d3bb28e2a 100644 --- a/core/runtime/BUILD +++ b/core/runtime/BUILD @@ -16,14 +16,19 @@ cc_library( "CUDADevice.cpp", "DeviceList.cpp", "TRTEngine.cpp", + "TRTEngineProfiler.cpp", "execute_engine.cpp", "register_jit_hooks.cpp", "runtime.cpp", ], hdrs = [ - "runtime.h", "CUDADevice.h", - "TRTEngine.h" + "TRTEngine.h", + "TRTEngineProfiler.h", + "runtime.h", + ], + linkopts = [ + "-lstdc++fs", ], deps = [ "@tensorrt//:nvinfer", @@ -39,9 +44,10 @@ cc_library( pkg_tar( name = "include", srcs = [ - "runtime.h", "CUDADevice.h", - "TRTEngine.h" + "TRTEngine.h", + "TRTEngineProfiler.h", + "runtime.h", ], package_dir = "core/runtime/", ) diff --git a/core/runtime/CMakeLists.txt b/core/runtime/CMakeLists.txt index a4a8919808..fed93f81ba 100644 --- a/core/runtime/CMakeLists.txt +++ b/core/runtime/CMakeLists.txt @@ -6,6 +6,7 @@ set(CXX_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/DeviceList.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/execute_engine.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/TRTEngine.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/TRTEngineProfiler.h" "${CMAKE_CURRENT_SOURCE_DIR}/register_jit_hooks.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/runtime.cpp" ) @@ -14,6 +15,7 @@ set(HEADER_FILES "${CMAKE_CURRENT_SOURCE_DIR}/runtime.h" "${CMAKE_CURRENT_SOURCE_DIR}/CUDADevice.h" "${CMAKE_CURRENT_SOURCE_DIR}/TRTEngine.h" + "${CMAKE_CURRENT_SOURCE_DIR}/TRTEngineProfiler.h" ) target_sources(${lib_name} diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp index 4b3361c33e..927e52dd8f 100644 --- a/core/runtime/TRTEngine.cpp +++ b/core/runtime/TRTEngine.cpp @@ -112,7 +112,6 @@ TRTEngine::TRTEngine( out_binding_names.resize(outputs); for (int64_t x = 0; x < cuda_engine->getNbBindings(); x++) { - std::cout << x << std::endl; std::string bind_name = cuda_engine->getBindingName(x); if (cuda_engine->bindingIsInput(x)) { in_binding_names[in_binding_map.at(x)] = bind_name; @@ -158,12 +157,18 @@ TRTEngine::TRTEngine( LOG_DEBUG(*this); } -void TRTEngine::set_paths() { - execution_profile_path = profile_path + "/" + name + "_execution_profile.trace"; - device_profile_path = profile_path + "/" + name + "_device_config_profile.trace"; - input_profile_path = profile_path + "/" + name + "_input_profile.trace"; - output_profile_path = profile_path + "/" + name + "_output_profile.trace"; - enqueue_profile_path = profile_path + "/" + name + "_enqueue_profile.trace"; +void TRTEngine::set_profiling_paths() { + device_profile_path = + std::experimental::filesystem::path{profile_path_prefix + "/" + name + "_device_config_profile.trace"}.string(); + input_profile_path = + std::experimental::filesystem::path{profile_path_prefix + "/" + name + "_input_profile.trace"}.string(); + output_profile_path = + std::experimental::filesystem::path{profile_path_prefix + "/" + name + "_output_profile.trace"}.string(); + enqueue_profile_path = + std::experimental::filesystem::path{profile_path_prefix + "/" + name + "_enqueue_profile.trace"}.string(); + trt_engine_profile_path = + std::experimental::filesystem::path{profile_path_prefix + "/" + name + "_engine_exectuion_profile.trace"} + .string(); } TRTEngine& TRTEngine::operator=(const TRTEngine& other) { diff --git 
a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h index 4609802d01..1b6234db03 100644 --- a/core/runtime/TRTEngine.h +++ b/core/runtime/TRTEngine.h @@ -1,4 +1,5 @@ #pragma once +#include #include #include #include @@ -22,12 +23,12 @@ struct TRTEngine : torch::CustomClassHolder { std::mutex mu; CUDADevice device_info; - std::string execution_profile_path; std::string device_profile_path; std::string input_profile_path; std::string output_profile_path; std::string enqueue_profile_path; - std::string profile_path = "/tmp"; + std::string trt_engine_profile_path; + std::string profile_path_prefix = std::experimental::filesystem::temp_directory_path(); std::unordered_map in_binding_map; // TRT IDX -> PYT IDX std::unordered_map out_binding_map; // TRT IDX -> PYT IDX @@ -36,9 +37,9 @@ struct TRTEngine : torch::CustomClassHolder { std::vector out_binding_names; // ITO: PYT IDX #ifndef NDEBUG - bool debug = true; + bool profile_execution = true; #else - bool debug = false; + bool profile_execution = false; #endif ~TRTEngine() = default; @@ -56,7 +57,7 @@ struct TRTEngine : torch::CustomClassHolder { const std::vector& out_binding_names); TRTEngine& operator=(const TRTEngine& other); std::string to_str() const; - void set_paths(); + void set_profiling_paths(); friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine); // TODO: Implement a call method // c10::List Run(c10::List inputs); diff --git a/core/runtime/TRTEngineProfiler.cpp b/core/runtime/TRTEngineProfiler.cpp new file mode 100644 index 0000000000..a115951610 --- /dev/null +++ b/core/runtime/TRTEngineProfiler.cpp @@ -0,0 +1,102 @@ +#include +#include +#include + +#include "core/runtime/TRTEngineProfiler.h" + +namespace torch_tensorrt { +namespace core { +namespace runtime { + +void TRTEngineProfiler::reportLayerTime(const char* layer_name, float ms) noexcept { + profile[layer_name].count++; + profile[layer_name].time += ms; + if (std::find(layer_names.begin(), layer_names.end(), layer_name) == layer_names.end()) { + layer_names.push_back(layer_name); + } +} + +TRTEngineProfiler::TRTEngineProfiler(const std::string& name, const std::vector& srcProfilers) + : name(name) { + for (const auto& srcProfiler : srcProfilers) { + for (const auto& rec : srcProfiler.profile) { + auto it = profile.find(rec.first); + if (it == profile.end()) { + profile.insert(rec); + } else { + it->second.time += rec.second.time; + it->second.count += rec.second.count; + } + } + } +} + +void dump_trace(const std::string& path, const TRTEngineProfiler& value) { + std::stringstream out; + out << "[" << std::endl; + double ts = 0.0; + for (size_t i = 0; i < value.layer_names.size(); i++) { + auto layer_name = value.layer_names[i]; + auto elem = value.profile.at(layer_name); + + out << " {" << std::endl; + out << " \"name\": \"" << layer_name << "\"," << std::endl; + out << " \"ph\": \"X\"," << std::endl; + out << " \"ts\": " << ts * 1000 << "," << std::endl; + out << " \"dur\": " << elem.time * 1000 << "," << std::endl; + out << " \"tid\": 1," << std::endl; + out << " \"pid\": \"" << value.name << " Engine Execution\"," << std::endl; + out << " \"args\": {}" << std::endl; + out << " }," << std::endl; + + ts += elem.time; + } + out.seekp(-2, out.cur); + out << "\n]" << std::endl; + std::ofstream f(path); + f << out.str(); + f.close(); + return; +} + +std::ostream& operator<<(std::ostream& out, const TRTEngineProfiler& value) { + out << "========== " << value.name << " profile ==========" << std::endl; + float totalTime = 0; + std::string layer_name = 
"TensorRT layer name"; + int max_layer_name_len = std::max(static_cast(layer_name.size()), 70); + for (const auto& elem : value.profile) { + totalTime += elem.second.time; + max_layer_name_len = std::max(max_layer_name_len, static_cast(elem.first.size())); + } + + auto old_settings = out.flags(); + auto old_precision = out.precision(); + // Output header + { + out << std::setfill(' ') << std::setw(max_layer_name_len) << layer_name << " "; + out << std::setw(12) << "Runtime, " + << "%" + << " "; + out << std::setw(12) << "Invocations" + << " "; + out << std::setw(12) << "Runtime, ms" << std::endl; + } + for (size_t i = 0; i < value.layer_names.size(); i++) { + layer_name = value.layer_names[i]; + auto elem = value.profile.at(layer_name); + out << std::setw(max_layer_name_len) << layer_name << " "; + out << std::setw(12) << std::fixed << std::setprecision(1) << (elem.time * 100.0F / totalTime) << "%" + << " "; + out << std::setw(12) << elem.count << " "; + out << std::setw(12) << std::fixed << std::setprecision(2) << elem.time << std::endl; + } + out.flags(old_settings); + out.precision(old_precision); + out << "========== " << value.name << " total runtime = " << totalTime << " ms ==========" << std::endl; + + return out; +} + +} // namespace runtime +} // namespace core +} // namespace torch_tensorrt \ No newline at end of file diff --git a/core/runtime/TRTEngineProfiler.h b/core/runtime/TRTEngineProfiler.h new file mode 100644 index 0000000000..34a901165b --- /dev/null +++ b/core/runtime/TRTEngineProfiler.h @@ -0,0 +1,34 @@ +#pragma once + +#include +#include +#include +#include +#include "NvInfer.h" + +namespace torch_tensorrt { +namespace core { +namespace runtime { + +struct TRTEngineProfiler : public nvinfer1::IProfiler { + struct Record { + float time{0}; + int count{0}; + }; + + virtual void reportLayerTime(const char* layerName, float ms) noexcept; + TRTEngineProfiler( + const std::string& name, + const std::vector& srcProfilers = std::vector()); + friend std::ostream& operator<<(std::ostream& out, const TRTEngineProfiler& value); + friend void dump_trace(const std::string& path, const TRTEngineProfiler& value); + + private: + std::string name; + std::vector layer_names; + std::map profile; +}; + +} // namespace runtime +} // namespace core +} // namespace torch_tensorrt \ No newline at end of file diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp index 2f24b8f54e..bb6d4431f2 100644 --- a/core/runtime/execute_engine.cpp +++ b/core/runtime/execute_engine.cpp @@ -3,6 +3,7 @@ #include "torch/csrc/jit/runtime/custom_operator.h" #include "torch/torch.h" +#include "core/runtime/TRTEngineProfiler.h" #include "core/runtime/runtime.h" #include "core/util/prelude.h" @@ -59,18 +60,25 @@ CUDADevice select_cuda_device(const CUDADevice& engine_device) { std::vector execute_engine(std::vector inputs, c10::intrusive_ptr compiled_engine) { LOG_DEBUG("Attempting to run engine (ID: " << compiled_engine->name << ")"); - compiled_engine->debug = false; - - std::unique_ptr execution_profiler_guard; - if (compiled_engine->debug) { - execution_profiler_guard.reset( - new torch::autograd::profiler::RecordProfile(compiled_engine->execution_profile_path)); + // compiled_engine->debug = false; + + if (compiled_engine->profile_execution) { + std::stringstream ss; + ss << "Execution profiling is enabled, find results here:" << std::endl; + compiled_engine->set_profiling_paths(); + ss << " Device selection profile: " << compiled_engine->device_profile_path << std::endl; + ss << " 
Input packing profile: " << compiled_engine->input_profile_path << std::endl; + ss << " Output packing profile: " << compiled_engine->output_profile_path << std::endl; + ss << " TRT enqueue profile: " << compiled_engine->enqueue_profile_path << std::endl; + // ss << " Engine execution profile (TensorRT format): " << compiled_engine->trt_engine_profile_path << std::endl; + LOG_DEBUG(ss.str()); } { std::unique_ptr device_profiler_guard; - if (compiled_engine->debug) { - device_profiler_guard.reset(new torch::autograd::profiler::RecordProfile(compiled_engine->device_profile_path)); + if (compiled_engine->profile_execution) { + device_profiler_guard = + std::make_unique(compiled_engine->device_profile_path); } CUDADevice curr_device = get_current_device(); @@ -93,8 +101,9 @@ std::vector execute_engine(std::vector inputs, c10::intr std::vector contig_inputs{}; { std::unique_ptr input_profiler_guard; - if (compiled_engine->debug) { - input_profiler_guard.reset(new torch::autograd::profiler::RecordProfile(compiled_engine->input_profile_path)); + if (compiled_engine->profile_execution) { + input_profiler_guard = + std::make_unique(compiled_engine->input_profile_path); } contig_inputs.reserve(inputs.size()); @@ -123,8 +132,9 @@ std::vector execute_engine(std::vector inputs, c10::intr std::vector outputs(compiled_engine->num_io.second); { std::unique_ptr output_profiler_guard; - if (compiled_engine->debug) { - output_profiler_guard.reset(new torch::autograd::profiler::RecordProfile(compiled_engine->output_profile_path)); + if (compiled_engine->profile_execution) { + output_profiler_guard = + std::make_unique(compiled_engine->output_profile_path); } for (size_t o = inputs.size(); o < (compiled_engine->num_io.first + compiled_engine->num_io.second); o++) { @@ -140,15 +150,25 @@ std::vector execute_engine(std::vector inputs, c10::intr { std::unique_ptr enqueue_profiler_guard; - if (compiled_engine->debug) { - enqueue_profiler_guard.reset(new torch::autograd::profiler::RecordProfile(compiled_engine->enqueue_profile_path)); + if (compiled_engine->profile_execution) { + enqueue_profiler_guard = + std::make_unique(compiled_engine->enqueue_profile_path); } c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream(inputs[0].device().index()); // nvinfer1::IExecutionContext::enqueue is not thread safe and we need a mutex for it. 
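// Editor's note (illustration only, not part of the patch): each stage of
// execute_engine() above (device selection, input packing, output allocation, enqueue)
// is wrapped in its own torch::autograd::profiler::RecordProfile guard, so every stage
// lands in its own trace file. The stand-in type below mimics that RAII shape with plain
// standard-library pieces so the pattern can be tried in isolation; it is not the real
// profiler class.
#include <chrono>
#include <fstream>
#include <memory>
#include <string>

struct ScopedStageTimer {
  // Starts timing on construction, writes the elapsed milliseconds to `path` on destruction.
  explicit ScopedStageTimer(const std::string& path)
      : path_(path), start_(std::chrono::steady_clock::now()) {}
  ~ScopedStageTimer() {
    auto ms =
        std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - start_).count();
    std::ofstream(path_) << ms << " ms\n";
  }
  std::string path_;
  std::chrono::steady_clock::time_point start_;
};

void run_stage(bool profile_execution, const std::string& trace_path) {
  std::unique_ptr<ScopedStageTimer> guard;
  if (profile_execution) {
    guard = std::make_unique<ScopedStageTimer>(trace_path); // only pay the cost when profiling
  }
  // ... stage work happens here ...
}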
std::unique_lock lock(compiled_engine->mu); + std::unique_ptr trt_engine_profiler; + if (compiled_engine->profile_execution) { + trt_engine_profiler = std::make_unique(compiled_engine->name); + compiled_engine->exec_ctx->setProfiler(trt_engine_profiler.get()); + } compiled_engine->exec_ctx->enqueueV2(gpu_handles.data(), stream, nullptr); + if (compiled_engine->profile_execution) { + LOG_INFO(std::endl << *trt_engine_profiler); + dump_trace(compiled_engine->trt_engine_profile_path, *trt_engine_profiler); + } } return outputs; diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp index f00957d191..3891b5473e 100644 --- a/core/runtime/register_jit_hooks.cpp +++ b/core/runtime/register_jit_hooks.cpp @@ -35,8 +35,8 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion = // TODO: .def("run", &TRTEngine::Run) .def("__str__", &TRTEngine::to_str) .def("__repr__", &TRTEngine::to_str) - .def_readwrite("debug", &TRTEngine::debug) - .def_readwrite("profile_path", &TRTEngine::profile_path) + .def_readwrite("profile_execution", &TRTEngine::profile_execution) + .def_readwrite("profile_path_prefix", &TRTEngine::profile_path_prefix) .def_pickle( [](const c10::intrusive_ptr& self) -> std::vector { // Serialize TensorRT engine diff --git a/tests/core/partitioning/partitioning_test.bzl b/tests/core/partitioning/partitioning_test.bzl index 87d5879ae8..fb6dd50991 100644 --- a/tests/core/partitioning/partitioning_test.bzl +++ b/tests/core/partitioning/partitioning_test.bzl @@ -22,5 +22,5 @@ def partitioning_test(name, visibility = None): ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"], "//conditions:default": ["@libtorch//:libtorch"], }), - timeout = "short", + #timeout = "short", ) diff --git a/tests/cpp/BUILD b/tests/cpp/BUILD index 3d56682189..78630f66a7 100644 --- a/tests/cpp/BUILD +++ b/tests/cpp/BUILD @@ -142,6 +142,7 @@ cc_test( cc_test( name = "test_compiled_modules", + timeout = "long", srcs = ["test_compiled_modules.cpp"], data = [ "//tests/modules:jit_models", From bbaf15215685b51215017e62302c2270268a97e8 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Fri, 4 Nov 2022 12:16:07 -0700 Subject: [PATCH 07/22] feat(//py): Exposed the new runtime in Python Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/runtime/TRTEngine.cpp | 36 ++++++++++++- core/runtime/TRTEngine.h | 38 +++++++++----- core/runtime/execute_engine.cpp | 4 +- core/runtime/register_jit_hooks.cpp | 7 ++- py/torch_tensorrt/csrc/tensorrt_backend.cpp | 3 +- py/torch_tensorrt/csrc/tensorrt_classes.cpp | 4 ++ py/torch_tensorrt/csrc/tensorrt_classes.h | 1 + py/torch_tensorrt/csrc/torch_tensorrt_py.cpp | 12 +++-- py/torch_tensorrt/fx/trt_module_next.py | 54 ++++++++++++-------- 9 files changed, 116 insertions(+), 43 deletions(-) diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp index 927e52dd8f..e4f04642a0 100644 --- a/core/runtime/TRTEngine.cpp +++ b/core/runtime/TRTEngine.cpp @@ -47,8 +47,8 @@ TRTEngine::TRTEngine(std::vector serialized_info) { << ")"); std::string _name = serialized_info[NAME_IDX]; std::string engine_info = serialized_info[ENGINE_IDX]; - std::vector in_bindings = split(serialized_info[INPUT_BINDING_NAMES_IDX], '%'); - std::vector out_bindings = split(serialized_info[OUTPUT_BINDING_NAMES_IDX], '%'); + std::vector in_bindings = split(serialized_info[INPUT_BINDING_NAMES_IDX], BINDING_DELIM[0]); + std::vector out_bindings = split(serialized_info[OUTPUT_BINDING_NAMES_IDX], BINDING_DELIM[0]); CUDADevice cuda_device(serialized_info[DEVICE_IDX]); @@ -171,6 
+171,38 @@ void TRTEngine::set_profiling_paths() { .string(); } +void TRTEngine::enable_profiling() { + profile_execution = true; + trt_engine_profiler = std::make_unique(name); + exec_ctx->setProfiler(trt_engine_profiler.get()); +} + +void TRTEngine::disable_profiling() { + profile_execution = false; + exec_ctx = make_trt(cuda_engine->createExecutionContext()); + TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to recreate TensorRT execution context"); +} + +std::string TRTEngine::get_engine_layer_info() { + auto inspector = cuda_engine->createEngineInspector(); + return inspector->getEngineInformation(nvinfer1::LayerInformationFormat::kJSON); +} + +void TRTEngine::dump_engine_layer_info_to_file(const std::string& path) { + auto inspector = cuda_engine->createEngineInspector(); + std::ofstream f(path); + f << std::string(inspector->getEngineInformation(nvinfer1::LayerInformationFormat::kJSON)); + f.close(); + return; +} + +void TRTEngine::dump_engine_layer_info() { + std::string layer_info_file = + std::experimental::filesystem::path{profile_path_prefix + "/" + name + "_engine_layer_information.json"}.string(); + dump_engine_layer_info_to_file(layer_info_file); + return; +} + TRTEngine& TRTEngine::operator=(const TRTEngine& other) { rt = other.rt; cuda_engine = other.cuda_engine; diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h index 1b6234db03..132d52ebb8 100644 --- a/core/runtime/TRTEngine.h +++ b/core/runtime/TRTEngine.h @@ -4,11 +4,14 @@ #include #include #include + #include "ATen/core/function_schema.h" #include "NvInfer.h" -#include "core/util/prelude.h" #include "torch/custom_class.h" +#include "core/runtime/TRTEngineProfiler.h" +#include "core/util/prelude.h" + namespace torch_tensorrt { namespace core { namespace runtime { @@ -20,14 +23,8 @@ struct TRTEngine : torch::CustomClassHolder { std::shared_ptr exec_ctx; std::pair num_io; std::string name; - std::mutex mu; CUDADevice device_info; - std::string device_profile_path; - std::string input_profile_path; - std::string output_profile_path; - std::string enqueue_profile_path; - std::string trt_engine_profile_path; std::string profile_path_prefix = std::experimental::filesystem::temp_directory_path(); std::unordered_map in_binding_map; // TRT IDX -> PYT IDX @@ -36,12 +33,6 @@ struct TRTEngine : torch::CustomClassHolder { std::vector in_binding_names; // ITO: PYT IDX std::vector out_binding_names; // ITO: PYT IDX -#ifndef NDEBUG - bool profile_execution = true; -#else - bool profile_execution = false; -#endif - ~TRTEngine() = default; TRTEngine( std::string serialized_engine, @@ -57,10 +48,29 @@ struct TRTEngine : torch::CustomClassHolder { const std::vector& out_binding_names); TRTEngine& operator=(const TRTEngine& other); std::string to_str() const; - void set_profiling_paths(); + void enable_profiling(); + void disable_profiling(); + std::string get_engine_layer_info(); + void dump_engine_layer_info_to_file(const std::string& path); + void dump_engine_layer_info(); friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine); + std::string BINDING_DELIM = "%"; // TODO: Implement a call method // c10::List Run(c10::List inputs); + + void set_profiling_paths(); +#ifndef NDEBUG + bool profile_execution = true; +#else + bool profile_execution = false; +#endif + std::string device_profile_path; + std::string input_profile_path; + std::string output_profile_path; + std::string enqueue_profile_path; + std::string trt_engine_profile_path; + std::mutex mu; + std::unique_ptr trt_engine_profiler; }; } // 
namespace runtime diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp index bb6d4431f2..e1b35419b6 100644 --- a/core/runtime/execute_engine.cpp +++ b/core/runtime/execute_engine.cpp @@ -70,8 +70,8 @@ std::vector execute_engine(std::vector inputs, c10::intr ss << " Input packing profile: " << compiled_engine->input_profile_path << std::endl; ss << " Output packing profile: " << compiled_engine->output_profile_path << std::endl; ss << " TRT enqueue profile: " << compiled_engine->enqueue_profile_path << std::endl; - // ss << " Engine execution profile (TensorRT format): " << compiled_engine->trt_engine_profile_path << std::endl; - LOG_DEBUG(ss.str()); + ss << " Engine execution profile: " << compiled_engine->trt_engine_profile_path << std::endl; + LOG_INFO(ss.str()); } { diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp index 3891b5473e..e62b23b768 100644 --- a/core/runtime/register_jit_hooks.cpp +++ b/core/runtime/register_jit_hooks.cpp @@ -35,8 +35,13 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion = // TODO: .def("run", &TRTEngine::Run) .def("__str__", &TRTEngine::to_str) .def("__repr__", &TRTEngine::to_str) - .def_readwrite("profile_execution", &TRTEngine::profile_execution) + .def_readonly("BINDING_DELIM", &TRTEngine::BINDING_DELIM) + .def("enable_profiling", &TRTEngine::enable_profiling) + .def("disable_profiling", &TRTEngine::disable_profiling) .def_readwrite("profile_path_prefix", &TRTEngine::profile_path_prefix) + .def("dump_engine_layer_info_to_file", &TRTEngine::dump_engine_layer_info_to_file) + .def("dump_engine_layer_info", &TRTEngine::dump_engine_layer_info) + .def("get_engine_layer_info", &TRTEngine::get_engine_layer_info) .def_pickle( [](const c10::intrusive_ptr& self) -> std::vector { // Serialize TensorRT engine diff --git a/py/torch_tensorrt/csrc/tensorrt_backend.cpp b/py/torch_tensorrt/csrc/tensorrt_backend.cpp index 671385319c..f06ef607b4 100644 --- a/py/torch_tensorrt/csrc/tensorrt_backend.cpp +++ b/py/torch_tensorrt/csrc/tensorrt_backend.cpp @@ -30,7 +30,8 @@ c10::impl::GenericDict TensorRTBackend::compile(c10::IValue mod_val, c10::impl:: auto device_spec = convert_cfg.engine_settings.device; auto device = core::runtime::CUDADevice(device_spec.gpu_id, device_spec.device_type); auto serialized_engine = core::ConvertGraphToTRTEngine(mod_, method_name, cfg); - auto engine_handle = c10::make_intrusive(it->key(), serialized_engine, device); + auto engine_handle = c10::make_intrusive( + it->key(), serialized_engine, device, std::vector(), std::vector()); handles.insert(method_name, at::IValue(engine_handle)); } diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.cpp b/py/torch_tensorrt/csrc/tensorrt_classes.cpp index 01519458e5..4d505b622d 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.cpp +++ b/py/torch_tensorrt/csrc/tensorrt_classes.cpp @@ -178,6 +178,10 @@ core::runtime::CUDADevice Device::toInternalRuntimeDevice() { return core::runtime::CUDADevice(gpu_id, toTRTDeviceType(device_type)); } +std::string Device::toSerializedRuntimeDevice() { + return this->toInternalRuntimeDevice().serialize(); +} + std::string Device::to_str() { std::stringstream ss; std::string fallback = allow_gpu_fallback ? 
"True" : "False"; diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.h b/py/torch_tensorrt/csrc/tensorrt_classes.h index cdef3ffbf5..4af9145275 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.h +++ b/py/torch_tensorrt/csrc/tensorrt_classes.h @@ -88,6 +88,7 @@ struct Device : torch::CustomClassHolder { ADD_FIELD_GET_SET(allow_gpu_fallback, bool); core::runtime::CUDADevice toInternalRuntimeDevice(); + std::string toSerializedRuntimeDevice(); std::string to_str(); }; diff --git a/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp b/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp index 4c1b773c97..2ed1d1e297 100644 --- a/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp +++ b/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp @@ -166,8 +166,13 @@ bool CheckMethodOperatorSupport(const torch::jit::Module& module, const std::str return core::CheckMethodOperatorSupport(module, method_name); } -torch::jit::Module EmbedEngineInNewModule(const py::bytes& engine, Device& device) { - return core::EmbedEngineInNewModule(engine, device.toInternalRuntimeDevice()); +torch::jit::Module EmbedEngineInNewModule( + const py::bytes& engine, + Device& device, + const std::vector& input_binding_names, + const std::vector& output_binding_names) { + return core::EmbedEngineInNewModule( + engine, device.toInternalRuntimeDevice(), input_binding_names, output_binding_names); } std::string get_build_info() { @@ -308,6 +313,7 @@ PYBIND11_MODULE(_C, m) { py::class_(m, "Device") .def(py::init<>()) .def("__str__", &torch_tensorrt::pyapi::Device::to_str) + .def("_to_serialized_runtime_device", &torch_tensorrt::pyapi::Device::toSerializedRuntimeDevice) .def_readwrite("device_type", &Device::device_type) .def_readwrite("gpu_id", &Device::gpu_id) .def_readwrite("dla_core", &Device::dla_core) @@ -340,7 +346,7 @@ PYBIND11_MODULE(_C, m) { .export_values(); py::module rt_sub_mod = m.def_submodule("rt"); - rt_sub_mod.attr("ABI_VERSION") = py::string_(core::runtime::ABI_VERSION); + rt_sub_mod.attr("ABI_VERSION") = std::string(core::runtime::ABI_VERSION); py::module ts_sub_mod = m.def_submodule("ts"); py::class_(ts_sub_mod, "CompileSpec") diff --git a/py/torch_tensorrt/fx/trt_module_next.py b/py/torch_tensorrt/fx/trt_module_next.py index 1d305d1fc3..689b33552f 100644 --- a/py/torch_tensorrt/fx/trt_module_next.py +++ b/py/torch_tensorrt/fx/trt_module_next.py @@ -2,44 +2,58 @@ from typing import Any, List, Sequence import torch +from torch_tensorrt import _C +from torch_tensorrt._Device import Device -from torch.classes.tensorrt import Engine -from torch.ops.tensorrt import execute_engine -from torch_tensorrt import (_C, Device) - -class TRTModule(torch.nn.module): +class TRTModule(torch.nn.Module): def __init__( self, engine_name: str, device_info: Device, serialized_engine: bytearray, + input_names: List[str], + output_names: List[str], ): super(TRTModule, self).__init__() - self.engine = Engine([ - _C.rt.ABI_VERSION, - engine_name, - device_info._to_internal_cuda_device_str(), - serialized_engine - ]) + self.engine = torch.classes.tensorrt.Engine( + [ + _C.rt.ABI_VERSION, + engine_name, + device_info._to_serialized_runtime_device(), + serialized_engine, + TRTModule._pack_binding_names(input_names), + TRTModule._pack_binding_names(output_names), + ] + ) def forward(self, *inputs): try: assert all([i.issubclass(torch.Tensor) for i in inputs]) except: raise RuntimeError("TRTModule expects a flattened list of tensors as input") - outputs = execute_engine(list(inputs), self.engine) + + outputs = torch.ops.tensorrt.execute_engine(list(inputs), 
self.engine) + + if len(outputs) == 1: + return outputs[0] + return tuple(outputs) - def enable_profiling(self, profiler: None): - #TODO: CHANGE THIS SO IT MAKE MORE SENSE - self.engine.debug = True + def enable_profiling(self, profiling_results_dir: str = None): + if profiling_results_dir is not None: + self.engine.profile_path_prefix = profiling_results_dir + self.engine.enable_profiling() def disable_profiling(self): - #TODO: HERE TOO - self.engine.debug = False + self.engine.disable_profiling() def get_layer_info(self) -> str: - raise RuntimeError("Engine Inspector needs to be implemented") - #assert TRT VERSION > 8.2 - return self.engine.get_engine_information(_C.LayerInformationFormat.JSON) \ No newline at end of file + return self.engine.get_engine_layer_info() + + def dump_layer_info(self): + return self.engine.dump_engine_layer_info() + + @staticmethod + def _pack_binding_names(binding_names: List[str]) -> str: + return torch.classes.tensorrt.Engine.BINDING_DELIM.join(binding_names) From 71872dfe1b4a083fd98459c114b6f0e57e53392c Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Thu, 10 Nov 2022 17:33:33 -0800 Subject: [PATCH 08/22] feat(//py/torch_tensorrt/fx): Compliant TRTModule implementation based on shared Torch-TensorRT runtime Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/runtime/TRTEngine.cpp | 4 ++-- core/runtime/TRTEngine.h | 2 +- core/runtime/register_jit_hooks.cpp | 7 +++---- py/torch_tensorrt/fx/trt_module_next.py | 27 ++++++++++++++++++++----- 4 files changed, 28 insertions(+), 12 deletions(-) diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp index e4f04642a0..21627fd6fe 100644 --- a/core/runtime/TRTEngine.cpp +++ b/core/runtime/TRTEngine.cpp @@ -47,8 +47,8 @@ TRTEngine::TRTEngine(std::vector serialized_info) { << ")"); std::string _name = serialized_info[NAME_IDX]; std::string engine_info = serialized_info[ENGINE_IDX]; - std::vector in_bindings = split(serialized_info[INPUT_BINDING_NAMES_IDX], BINDING_DELIM[0]); - std::vector out_bindings = split(serialized_info[OUTPUT_BINDING_NAMES_IDX], BINDING_DELIM[0]); + std::vector in_bindings = split(serialized_info[INPUT_BINDING_NAMES_IDX], BINDING_DELIM); + std::vector out_bindings = split(serialized_info[OUTPUT_BINDING_NAMES_IDX], BINDING_DELIM); CUDADevice cuda_device(serialized_info[DEVICE_IDX]); diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h index 132d52ebb8..9c85ca09eb 100644 --- a/core/runtime/TRTEngine.h +++ b/core/runtime/TRTEngine.h @@ -54,7 +54,7 @@ struct TRTEngine : torch::CustomClassHolder { void dump_engine_layer_info_to_file(const std::string& path); void dump_engine_layer_info(); friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine); - std::string BINDING_DELIM = "%"; + static const char BINDING_DELIM = '%'; // TODO: Implement a call method // c10::List Run(c10::List inputs); diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp index e62b23b768..8c5f610ed0 100644 --- a/core/runtime/register_jit_hooks.cpp +++ b/core/runtime/register_jit_hooks.cpp @@ -3,14 +3,12 @@ namespace torch_tensorrt { namespace core { namespace runtime { - -const std::string BINDING_DELIM = "%"; namespace { std::string serialize_bindings(const std::vector& bindings) { std::stringstream ss; for (size_t i = 0; i < bindings.size() - 1; i++) { - ss << bindings[i] << BINDING_DELIM; + ss << bindings[i] << TRTEngine::BINDING_DELIM; } ss << bindings[bindings.size() - 1]; @@ -35,7 +33,6 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion 
= // TODO: .def("run", &TRTEngine::Run) .def("__str__", &TRTEngine::to_str) .def("__repr__", &TRTEngine::to_str) - .def_readonly("BINDING_DELIM", &TRTEngine::BINDING_DELIM) .def("enable_profiling", &TRTEngine::enable_profiling) .def("disable_profiling", &TRTEngine::disable_profiling) .def_readwrite("profile_path_prefix", &TRTEngine::profile_path_prefix) @@ -68,6 +65,8 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion = TORCH_LIBRARY(tensorrt, m) { m.def("execute_engine", execute_engine); + m.def("SERIALIZED_ENGINE_BINDING_DELIM", []() -> std::string { return std::string(1, TRTEngine::BINDING_DELIM); }); + m.def("ABI_VERSION", []() -> std::string { return ABI_VERSION; }); } } // namespace diff --git a/py/torch_tensorrt/fx/trt_module_next.py b/py/torch_tensorrt/fx/trt_module_next.py index 689b33552f..35ab61406b 100644 --- a/py/torch_tensorrt/fx/trt_module_next.py +++ b/py/torch_tensorrt/fx/trt_module_next.py @@ -1,5 +1,5 @@ from operator import truediv -from typing import Any, List, Sequence +from typing import Any, List, Sequence, Tuple import torch from torch_tensorrt import _C @@ -16,9 +16,12 @@ def __init__( output_names: List[str], ): super(TRTModule, self).__init__() + self.input_names = input_names + self.output_names = output_names + self.engine_name = engine_name self.engine = torch.classes.tensorrt.Engine( [ - _C.rt.ABI_VERSION, + torch.ops.tensorrt.ABI_VERSION(), engine_name, device_info._to_serialized_runtime_device(), serialized_engine, @@ -28,10 +31,23 @@ def __init__( ) def forward(self, *inputs): + assert len(inputs) == len( + self.input_names + ), f"Wrong number of inputs, expect {len(self.input_names)} get {len(inputs)}." + + types = [issubclass(type(i), torch.Tensor) for i in inputs] + try: - assert all([i.issubclass(torch.Tensor) for i in inputs]) + assert all(types) except: - raise RuntimeError("TRTModule expects a flattened list of tensors as input") + + def is_non_tensor(i: Tuple[Any, bool]) -> bool: + return not i[1] + + non_tensors = [i[0] for i in filter(zip(inputs, types), is_non_tensor)] + raise RuntimeError( + f"TRTModule expects a flattened list of tensors as input, found non tensors: {non_tensors}" + ) outputs = torch.ops.tensorrt.execute_engine(list(inputs), self.engine) @@ -56,4 +72,5 @@ def dump_layer_info(self): @staticmethod def _pack_binding_names(binding_names: List[str]) -> str: - return torch.classes.tensorrt.Engine.BINDING_DELIM.join(binding_names) + delim = torch.ops.tensorrt.SERIALIZED_ENGINE_BINDING_DELIM()[0] + return delim.join(binding_names) From 10afcb2f51167285ab1afaa68cd49294263a9feb Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Thu, 10 Nov 2022 18:05:43 -0800 Subject: [PATCH 09/22] refactor: CUDADevice -> RTDevice for better distinction from compile time device Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/compiler.cpp | 8 +++---- core/compiler.h | 2 +- core/runtime/BUILD | 6 ++--- core/runtime/CMakeLists.txt | 4 ++-- core/runtime/DeviceList.cpp | 6 ++--- core/runtime/{CUDADevice.cpp => RTDevice.cpp} | 16 ++++++------- core/runtime/{CUDADevice.h => RTDevice.h} | 24 +++++++++---------- core/runtime/TRTEngine.cpp | 8 ++++--- core/runtime/TRTEngine.h | 6 ++--- core/runtime/execute_engine.cpp | 8 +++---- core/runtime/runtime.cpp | 14 +++++------ core/runtime/runtime.h | 12 +++++----- cpp/src/compile_spec.cpp | 2 +- cpp/src/torch_tensorrt.cpp | 4 ++-- cpp/src/types.cpp | 4 ++-- py/torch_tensorrt/csrc/tensorrt_backend.cpp | 2 +- py/torch_tensorrt/csrc/tensorrt_classes.cpp | 10 ++++---- 
py/torch_tensorrt/csrc/tensorrt_classes.h | 6 ++--- py/torch_tensorrt/csrc/torch_tensorrt_py.cpp | 5 ++-- 19 files changed, 74 insertions(+), 73 deletions(-) rename core/runtime/{CUDADevice.cpp => RTDevice.cpp} (85%) rename core/runtime/{CUDADevice.h => RTDevice.h} (54%) diff --git a/core/compiler.cpp b/core/compiler.cpp index b412d8a3db..e9aea7c84f 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -31,7 +31,7 @@ void AddEngineToGraph( torch::jit::script::Module mod, std::shared_ptr& g, const std::string& serialized_engine, - runtime::CUDADevice& device_info, + runtime::RTDevice& device_info, const std::vector& input_binding_names, const std::vector& output_binding_names, std::string engine_id = "", @@ -172,7 +172,7 @@ partitioning::GraphAndMapping BuildHybridGraph( auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_info, static_params); auto temp_g = std::make_shared(); auto device_spec = convert_info.engine_settings.device; - auto cuda_device = runtime::CUDADevice(device_spec.gpu_id, device_spec.device_type); + auto cuda_device = runtime::RTDevice(device_spec.gpu_id, device_spec.device_type); AddEngineToGraph( new_mod, temp_g, @@ -297,7 +297,7 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) torch::jit::Module new_mod(mod._ivalue()->name() + "_trt"); auto device_spec = cfg.convert_info.engine_settings.device; - auto cuda_device = runtime::CUDADevice(device_spec.gpu_id, device_spec.device_type); + auto cuda_device = runtime::RTDevice(device_spec.gpu_id, device_spec.device_type); for (const torch::jit::Method& method : mod.get_methods()) { if (method.name().compare("forward") == 0) { @@ -358,7 +358,7 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) torch::jit::script::Module EmbedEngineInNewModule( const std::string& engine, - runtime::CUDADevice cuda_device, + runtime::RTDevice cuda_device, const std::vector& input_binding_names, const std::vector& output_binding_names) { std::ostringstream engine_id; diff --git a/core/compiler.h b/core/compiler.h index 524f762325..fbed4eabe5 100644 --- a/core/compiler.h +++ b/core/compiler.h @@ -30,7 +30,7 @@ torch::jit::script::Module CompileGraph(const torch::jit::script::Module& module torch::jit::script::Module EmbedEngineInNewModule( const std::string& engine, - runtime::CUDADevice cuda_device, + runtime::RTDevice cuda_device, const std::vector& input_binding_names, const std::vector& output_binding_names); diff --git a/core/runtime/BUILD b/core/runtime/BUILD index 8d3bb28e2a..669feda90e 100644 --- a/core/runtime/BUILD +++ b/core/runtime/BUILD @@ -13,8 +13,8 @@ config_setting( cc_library( name = "runtime", srcs = [ - "CUDADevice.cpp", "DeviceList.cpp", + "RTDevice.cpp", "TRTEngine.cpp", "TRTEngineProfiler.cpp", "execute_engine.cpp", @@ -22,7 +22,7 @@ cc_library( "runtime.cpp", ], hdrs = [ - "CUDADevice.h", + "RTDevice.h", "TRTEngine.h", "TRTEngineProfiler.h", "runtime.h", @@ -44,7 +44,7 @@ cc_library( pkg_tar( name = "include", srcs = [ - "CUDADevice.h", + "RTDevice.h", "TRTEngine.h", "TRTEngineProfiler.h", "runtime.h", diff --git a/core/runtime/CMakeLists.txt b/core/runtime/CMakeLists.txt index fed93f81ba..1c1e136c78 100644 --- a/core/runtime/CMakeLists.txt +++ b/core/runtime/CMakeLists.txt @@ -2,7 +2,7 @@ set(lib_name "core_runtime") add_library(${lib_name} OBJECT) set(CXX_SRCS - "${CMAKE_CURRENT_SOURCE_DIR}/CUDADevice.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/RTDevice.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/DeviceList.cpp" 
"${CMAKE_CURRENT_SOURCE_DIR}/execute_engine.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/TRTEngine.cpp" @@ -13,7 +13,7 @@ set(CXX_SRCS set(HEADER_FILES "${CMAKE_CURRENT_SOURCE_DIR}/runtime.h" - "${CMAKE_CURRENT_SOURCE_DIR}/CUDADevice.h" + "${CMAKE_CURRENT_SOURCE_DIR}/RTDevice.h" "${CMAKE_CURRENT_SOURCE_DIR}/TRTEngine.h" "${CMAKE_CURRENT_SOURCE_DIR}/TRTEngineProfiler.h" ) diff --git a/core/runtime/DeviceList.cpp b/core/runtime/DeviceList.cpp index c095b08009..fd2b75357a 100644 --- a/core/runtime/DeviceList.cpp +++ b/core/runtime/DeviceList.cpp @@ -15,7 +15,7 @@ DeviceList::DeviceList() { } for (int i = 0; i < num_devices; i++) { - device_list[i] = CUDADevice(i, nvinfer1::DeviceType::kGPU); + device_list[i] = RTDevice(i, nvinfer1::DeviceType::kGPU); } // REVIEW: DO WE CARE ABOUT DLA? @@ -23,11 +23,11 @@ DeviceList::DeviceList() { LOG_DEBUG("Runtime:\n Available CUDA Devices: \n" << this->dump_list()); } -void DeviceList::insert(int device_id, CUDADevice cuda_device) { +void DeviceList::insert(int device_id, RTDevice cuda_device) { device_list[device_id] = cuda_device; } -CUDADevice DeviceList::find(int device_id) { +RTDevice DeviceList::find(int device_id) { return device_list[device_id]; } diff --git a/core/runtime/CUDADevice.cpp b/core/runtime/RTDevice.cpp similarity index 85% rename from core/runtime/CUDADevice.cpp rename to core/runtime/RTDevice.cpp index b803e73482..34ecc22e97 100644 --- a/core/runtime/CUDADevice.cpp +++ b/core/runtime/RTDevice.cpp @@ -11,10 +11,10 @@ const std::string DEVICE_INFO_DELIM = "%"; typedef enum { ID_IDX = 0, SM_MAJOR_IDX, SM_MINOR_IDX, DEVICE_TYPE_IDX, DEVICE_NAME_IDX } SerializedDeviceInfoIndex; -CUDADevice::CUDADevice() : id{-1}, major{-1}, minor{-1}, device_type{nvinfer1::DeviceType::kGPU} {} +RTDevice::RTDevice() : id{-1}, major{-1}, minor{-1}, device_type{nvinfer1::DeviceType::kGPU} {} -CUDADevice::CUDADevice(int64_t gpu_id, nvinfer1::DeviceType device_type) { - CUDADevice cuda_device; +RTDevice::RTDevice(int64_t gpu_id, nvinfer1::DeviceType device_type) { + RTDevice cuda_device; cudaDeviceProp device_prop; // Device ID @@ -41,7 +41,7 @@ CUDADevice::CUDADevice(int64_t gpu_id, nvinfer1::DeviceType device_type) { // NOTE: Serialization Format for Device Info: // id%major%minor%(enum)device_type%device_name -CUDADevice::CUDADevice(std::string device_info) { +RTDevice::RTDevice(std::string device_info) { LOG_DEBUG("Deserializing Device Info: " << device_info); std::vector tokens; @@ -66,7 +66,7 @@ CUDADevice::CUDADevice(std::string device_info) { LOG_DEBUG("Deserialized Device Info: " << *this); } -CUDADevice& CUDADevice::operator=(const CUDADevice& other) { +RTDevice& RTDevice::operator=(const RTDevice& other) { id = other.id; major = other.major; minor = other.minor; @@ -75,7 +75,7 @@ CUDADevice& CUDADevice::operator=(const CUDADevice& other) { return (*this); } -std::string CUDADevice::serialize() { +std::string RTDevice::serialize() { std::vector content; content.resize(DEVICE_NAME_IDX + 1); @@ -98,13 +98,13 @@ std::string CUDADevice::serialize() { return serialized_device_info; } -std::string CUDADevice::getSMCapability() const { +std::string RTDevice::getSMCapability() const { std::stringstream ss; ss << major << "." << minor; return ss.str(); } -std::ostream& operator<<(std::ostream& os, const CUDADevice& device) { +std::ostream& operator<<(std::ostream& os, const RTDevice& device) { os << "Device(ID: " << device.id << ", Name: " << device.device_name << ", SM Capability: " << device.major << '.' 
<< device.minor << ", Type: " << device.device_type << ')'; return os; diff --git a/core/runtime/CUDADevice.h b/core/runtime/RTDevice.h similarity index 54% rename from core/runtime/CUDADevice.h rename to core/runtime/RTDevice.h index 695978074c..43374e33be 100644 --- a/core/runtime/CUDADevice.h +++ b/core/runtime/RTDevice.h @@ -6,28 +6,28 @@ namespace torch_tensorrt { namespace core { namespace runtime { -struct CUDADevice { +struct RTDevice { int64_t id; // CUDA device id int64_t major; // CUDA compute major version int64_t minor; // CUDA compute minor version nvinfer1::DeviceType device_type; std::string device_name; - CUDADevice(); - CUDADevice(int64_t gpu_id, nvinfer1::DeviceType device_type); - CUDADevice(std::string serialized_device_info); - ~CUDADevice() = default; - CUDADevice(const CUDADevice& other) = default; - CUDADevice& operator=(const CUDADevice& other); + RTDevice(); + RTDevice(int64_t gpu_id, nvinfer1::DeviceType device_type); + RTDevice(std::string serialized_device_info); + ~RTDevice() = default; + RTDevice(const RTDevice& other) = default; + RTDevice& operator=(const RTDevice& other); std::string serialize(); std::string getSMCapability() const; - friend std::ostream& operator<<(std::ostream& os, const CUDADevice& device); + friend std::ostream& operator<<(std::ostream& os, const RTDevice& device); }; -void set_cuda_device(CUDADevice& cuda_device); +void set_cuda_device(RTDevice& cuda_device); // Gets the current active GPU (DLA will not show up through this) -CUDADevice get_current_device(); +RTDevice get_current_device(); -} // namespace torch_tensorrt -} // namespace core } // namespace runtime +} // namespace core +} // namespace torch_tensorrt diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp index 21627fd6fe..dbbff7d11c 100644 --- a/core/runtime/TRTEngine.cpp +++ b/core/runtime/TRTEngine.cpp @@ -3,6 +3,7 @@ #include #include "NvInfer.h" #include "torch/csrc/jit/frontend/function_schema_parser.h" +#include "torch/cuda.h" #include "core/runtime/runtime.h" #include "core/util/prelude.h" @@ -29,7 +30,7 @@ std::vector split(const std::string& str, char delim) { TRTEngine::TRTEngine( std::string serialized_engine, - CUDADevice cuda_device, + RTDevice cuda_device, const std::vector& _in_binding_names, const std::vector& _out_binding_names) { std::string _name = "deserialized_trt"; @@ -50,7 +51,7 @@ TRTEngine::TRTEngine(std::vector serialized_info) { std::vector in_bindings = split(serialized_info[INPUT_BINDING_NAMES_IDX], BINDING_DELIM); std::vector out_bindings = split(serialized_info[OUTPUT_BINDING_NAMES_IDX], BINDING_DELIM); - CUDADevice cuda_device(serialized_info[DEVICE_IDX]); + RTDevice cuda_device(serialized_info[DEVICE_IDX]); new (this) TRTEngine(_name, engine_info, cuda_device, in_bindings, out_bindings); } @@ -58,7 +59,7 @@ TRTEngine::TRTEngine(std::vector serialized_info) { TRTEngine::TRTEngine( std::string mod_name, std::string serialized_engine, - CUDADevice cuda_device, + RTDevice cuda_device, const std::vector& _in_binding_names, const std::vector& _out_binding_names) { auto most_compatible_device = get_most_compatible_device(cuda_device); @@ -178,6 +179,7 @@ void TRTEngine::enable_profiling() { } void TRTEngine::disable_profiling() { + torch::cuda::synchronize(device_info.id); profile_execution = false; exec_ctx = make_trt(cuda_engine->createExecutionContext()); TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to recreate TensorRT execution context"); diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h index 
9c85ca09eb..f5f2cf909d 100644 --- a/core/runtime/TRTEngine.h +++ b/core/runtime/TRTEngine.h @@ -23,7 +23,7 @@ struct TRTEngine : torch::CustomClassHolder { std::shared_ptr exec_ctx; std::pair num_io; std::string name; - CUDADevice device_info; + RTDevice device_info; std::string profile_path_prefix = std::experimental::filesystem::temp_directory_path(); @@ -36,14 +36,14 @@ struct TRTEngine : torch::CustomClassHolder { ~TRTEngine() = default; TRTEngine( std::string serialized_engine, - CUDADevice cuda_device, + RTDevice cuda_device, const std::vector& in_binding_names, const std::vector& out_binding_names); TRTEngine(std::vector serialized_info); TRTEngine( std::string mod_name, std::string serialized_engine, - CUDADevice cuda_device, + RTDevice cuda_device, const std::vector& in_binding_names, const std::vector& out_binding_names); TRTEngine& operator=(const TRTEngine& other); diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp index e1b35419b6..29f3093c37 100644 --- a/core/runtime/execute_engine.cpp +++ b/core/runtime/execute_engine.cpp @@ -12,7 +12,7 @@ namespace core { namespace runtime { // Checks if the context switch requred for device ID -bool is_switch_required(const CUDADevice& curr_device, const CUDADevice& engine_device) { +bool is_switch_required(const RTDevice& curr_device, const RTDevice& engine_device) { // If SM capability is not the same as configured then switch if ((curr_device.major != engine_device.major) || (curr_device.minor != engine_device.minor)) { LOG_WARNING( @@ -43,7 +43,7 @@ bool is_switch_required(const CUDADevice& curr_device, const CUDADevice& engine_ return false; } -CUDADevice select_cuda_device(const CUDADevice& engine_device) { +RTDevice select_cuda_device(const RTDevice& engine_device) { auto new_target_device_opt = get_most_compatible_device(engine_device); // REVIEW: THIS DOES NOT LIST DLA PROBABLY, WHICH WE SHOULD @@ -81,12 +81,12 @@ std::vector execute_engine(std::vector inputs, c10::intr std::make_unique(compiled_engine->device_profile_path); } - CUDADevice curr_device = get_current_device(); + RTDevice curr_device = get_current_device(); LOG_DEBUG("Current Device: " << curr_device); if (is_switch_required(curr_device, compiled_engine->device_info)) { // Scan through available CUDA devices and set the CUDA device context correctly - CUDADevice device = select_cuda_device(compiled_engine->device_info); + RTDevice device = select_cuda_device(compiled_engine->device_info); set_cuda_device(device); std::string target_device = "cuda:" + std::to_string(device.id); diff --git a/core/runtime/runtime.cpp b/core/runtime/runtime.cpp index 49406e95b2..67d0786d4a 100644 --- a/core/runtime/runtime.cpp +++ b/core/runtime/runtime.cpp @@ -7,7 +7,7 @@ namespace torch_tensorrt { namespace core { namespace runtime { -c10::optional get_most_compatible_device(const CUDADevice& target_device) { +c10::optional get_most_compatible_device(const RTDevice& target_device) { LOG_DEBUG("Target Device: " << target_device); auto device_options = find_compatible_devices(target_device); if (device_options.size() == 0) { @@ -16,7 +16,7 @@ c10::optional get_most_compatible_device(const CUDADevice& target_de return {device_options[0]}; } - CUDADevice best_match; + RTDevice best_match; std::stringstream dev_list; dev_list << "[" << std::endl; for (auto device : device_options) { @@ -41,11 +41,11 @@ c10::optional get_most_compatible_device(const CUDADevice& target_de } } -std::vector find_compatible_devices(const CUDADevice& target_device) { +std::vector 
find_compatible_devices(const RTDevice& target_device) { auto dla_supported = get_dla_supported_SMs(); auto device_list = get_available_device_list().get_devices(); - std::vector compatible_devices; + std::vector compatible_devices; for (auto device : device_list) { auto poss_dev_cc = device.second.getSMCapability(); @@ -69,13 +69,13 @@ std::vector find_compatible_devices(const CUDADevice& target_device) return compatible_devices; } -void set_cuda_device(CUDADevice& cuda_device) { +void set_cuda_device(RTDevice& cuda_device) { TORCHTRT_CHECK( (cudaSetDevice(cuda_device.id) == cudaSuccess), "Unable to set device: " << cuda_device << "as active device"); LOG_DEBUG("Setting " << cuda_device << " as active device"); } -CUDADevice get_current_device() { +RTDevice get_current_device() { int device = -1; TORCHTRT_CHECK( (cudaGetDevice(reinterpret_cast(&device)) == cudaSuccess), @@ -83,7 +83,7 @@ CUDADevice get_current_device() { int64_t device_id = static_cast(device); - return CUDADevice(device_id, nvinfer1::DeviceType::kGPU); + return RTDevice(device_id, nvinfer1::DeviceType::kGPU); } namespace { diff --git a/core/runtime/runtime.h b/core/runtime/runtime.h index 66ee3816d2..b360d451e9 100644 --- a/core/runtime/runtime.h +++ b/core/runtime/runtime.h @@ -5,7 +5,7 @@ #include #include "ATen/core/function_schema.h" #include "NvInfer.h" -#include "core/runtime/CUDADevice.h" +#include "core/runtime/RTDevice.h" #include "core/runtime/TRTEngine.h" #include "core/util/prelude.h" #include "torch/custom_class.h" @@ -26,13 +26,13 @@ typedef enum { SERIALIZATION_LEN, // NEVER USED FOR DATA, USED TO DETERMINE LENGTH OF SERIALIZED INFO } SerializedInfoIndex; -c10::optional get_most_compatible_device(const CUDADevice& target_device); -std::vector find_compatible_devices(const CUDADevice& target_device); +c10::optional get_most_compatible_device(const RTDevice& target_device); +std::vector find_compatible_devices(const RTDevice& target_device); std::vector execute_engine(std::vector inputs, c10::intrusive_ptr compiled_engine); class DeviceList { - using DeviceMap = std::unordered_map; + using DeviceMap = std::unordered_map; DeviceMap device_list; public: @@ -40,8 +40,8 @@ class DeviceList { DeviceList(); public: - void insert(int device_id, CUDADevice cuda_device); - CUDADevice find(int device_id); + void insert(int device_id, RTDevice cuda_device); + RTDevice find(int device_id); DeviceMap get_devices(); std::string dump_list(); }; diff --git a/cpp/src/compile_spec.cpp b/cpp/src/compile_spec.cpp index 8fc22b7861..676b4efd7b 100644 --- a/cpp/src/compile_spec.cpp +++ b/cpp/src/compile_spec.cpp @@ -13,7 +13,7 @@ nvinfer1::DataType toTRTDataType(DataType value); nvinfer1::TensorFormat toTRTTensorFormat(TensorFormat value); torchtrt::core::ir::Input to_internal_input(Input& i); std::vector to_vec_internal_inputs(std::vector& external); -torchtrt::core::runtime::CUDADevice to_internal_cuda_device(Device device); +torchtrt::core::runtime::RTDevice to_internal_rt_device(Device device); namespace torchscript { CompileSpec::CompileSpec(std::vector> fixed_sizes) { diff --git a/cpp/src/torch_tensorrt.cpp b/cpp/src/torch_tensorrt.cpp index 527e6c5cbf..8c54cf3a29 100644 --- a/cpp/src/torch_tensorrt.cpp +++ b/cpp/src/torch_tensorrt.cpp @@ -7,7 +7,7 @@ namespace torch_tensorrt { // Defined in types.cpp -torch_tensorrt::core::runtime::CUDADevice to_internal_cuda_device(Device device); +torch_tensorrt::core::runtime::RTDevice to_internal_rt_device(Device device); namespace torchscript { // Defined in compile_spec.cpp 
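// Editor's note (illustration only, not part of the patch): RTDevice::serialize()
// earlier in this patch packs the device fields into a single "%"-delimited record of
// the form id%major%minor%device_type%device_name. A standalone sketch of building such
// a record; the concrete values are made up and the numeric device_type encoding is an
// assumption for the example.
#include <iostream>
#include <sstream>
#include <string>

int main() {
  int64_t id = 0, sm_major = 8, sm_minor = 6;
  int device_type = 0; // assumed here to stand for a GPU target
  std::string device_name = "NVIDIA GeForce RTX 3080";

  std::stringstream ss;
  ss << id << "%" << sm_major << "%" << sm_minor << "%" << device_type << "%" << device_name;
  std::cout << ss.str() << std::endl; // e.g. 0%8%6%0%NVIDIA GeForce RTX 3080
  return 0;
}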
torch_tensorrt::core::CompileSpec to_internal_compile_spec(CompileSpec external); @@ -39,7 +39,7 @@ torch::jit::Module embed_engine_in_new_module( const std::vector& input_binding_names, const std::vector& output_binding_names) { return torch_tensorrt::core::EmbedEngineInNewModule( - engine, to_internal_cuda_device(device), input_binding_names, output_binding_names); + engine, to_internal_rt_device(device), input_binding_names, output_binding_names); } } // namespace torchscript diff --git a/cpp/src/types.cpp b/cpp/src/types.cpp index 7083461350..45ae34c3da 100644 --- a/cpp/src/types.cpp +++ b/cpp/src/types.cpp @@ -280,7 +280,7 @@ std::vector to_vec_internal_inputs(std::vector< return internal; } -torch_tensorrt::core::runtime::CUDADevice to_internal_cuda_device(Device device) { +torch_tensorrt::core::runtime::RTDevice to_internal_rt_device(Device device) { auto device_type = nvinfer1::DeviceType::kGPU; switch (device.device_type) { case Device::DeviceType::kDLA: @@ -290,6 +290,6 @@ torch_tensorrt::core::runtime::CUDADevice to_internal_cuda_device(Device device) default: device_type = nvinfer1::DeviceType::kGPU; } - return torch_tensorrt::core::runtime::CUDADevice(device.gpu_id, device_type); + return torch_tensorrt::core::runtime::RTDevice(device.gpu_id, device_type); } } // namespace torch_tensorrt diff --git a/py/torch_tensorrt/csrc/tensorrt_backend.cpp b/py/torch_tensorrt/csrc/tensorrt_backend.cpp index f06ef607b4..cdc8dd5df3 100644 --- a/py/torch_tensorrt/csrc/tensorrt_backend.cpp +++ b/py/torch_tensorrt/csrc/tensorrt_backend.cpp @@ -28,7 +28,7 @@ c10::impl::GenericDict TensorRTBackend::compile(c10::IValue mod_val, c10::impl:: auto cfg = raw_spec->toInternalCompileSpec(); auto convert_cfg = std::move(cfg.convert_info); auto device_spec = convert_cfg.engine_settings.device; - auto device = core::runtime::CUDADevice(device_spec.gpu_id, device_spec.device_type); + auto device = core::runtime::RTDevice(device_spec.gpu_id, device_spec.device_type); auto serialized_engine = core::ConvertGraphToTRTEngine(mod_, method_name, cfg); auto engine_handle = c10::make_intrusive( it->key(), serialized_engine, device, std::vector(), std::vector()); diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.cpp b/py/torch_tensorrt/csrc/tensorrt_classes.cpp index 4d505b622d..489da576e2 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.cpp +++ b/py/torch_tensorrt/csrc/tensorrt_classes.cpp @@ -40,7 +40,7 @@ nvinfer1::DataType toTRTDataType(DataType value) { } } -Device::Device(const core::runtime::CUDADevice& internal_dev) { +Device::Device(const core::runtime::RTDevice& internal_dev) { device_type = DeviceType::kGPU; gpu_id = internal_dev.id; dla_core = -1; @@ -174,12 +174,12 @@ nvinfer1::DeviceType toTRTDeviceType(DeviceType value) { } } -core::runtime::CUDADevice Device::toInternalRuntimeDevice() { - return core::runtime::CUDADevice(gpu_id, toTRTDeviceType(device_type)); +core::runtime::RTDevice Device::toInternalRTDevice() { + return core::runtime::RTDevice(gpu_id, toTRTDeviceType(device_type)); } -std::string Device::toSerializedRuntimeDevice() { - return this->toInternalRuntimeDevice().serialize(); +std::string Device::toSerializedRTDevice() { + return this->toInternalRTDevice().serialize(); } std::string Device::to_str() { diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.h b/py/torch_tensorrt/csrc/tensorrt_classes.h index 4af9145275..6762d078a1 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.h +++ b/py/torch_tensorrt/csrc/tensorrt_classes.h @@ -80,15 +80,15 @@ struct Device : 
torch::CustomClassHolder { allow_gpu_fallback(false) // allow_gpu_fallback {} - Device(const core::runtime::CUDADevice& internal_dev); + Device(const core::runtime::RTDevice& internal_dev); ADD_ENUM_GET_SET(device_type, DeviceType, static_cast(DeviceType::kDLA)); ADD_FIELD_GET_SET(gpu_id, int64_t); ADD_FIELD_GET_SET(dla_core, int64_t); ADD_FIELD_GET_SET(allow_gpu_fallback, bool); - core::runtime::CUDADevice toInternalRuntimeDevice(); - std::string toSerializedRuntimeDevice(); + core::runtime::RTDevice toInternalRTDevice(); + std::string toSerializedRTDevice(); std::string to_str(); }; diff --git a/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp b/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp index 2ed1d1e297..868dbb21fa 100644 --- a/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp +++ b/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp @@ -171,8 +171,7 @@ torch::jit::Module EmbedEngineInNewModule( Device& device, const std::vector& input_binding_names, const std::vector& output_binding_names) { - return core::EmbedEngineInNewModule( - engine, device.toInternalRuntimeDevice(), input_binding_names, output_binding_names); + return core::EmbedEngineInNewModule(engine, device.toInternalRTDevice(), input_binding_names, output_binding_names); } std::string get_build_info() { @@ -313,7 +312,7 @@ PYBIND11_MODULE(_C, m) { py::class_(m, "Device") .def(py::init<>()) .def("__str__", &torch_tensorrt::pyapi::Device::to_str) - .def("_to_serialized_runtime_device", &torch_tensorrt::pyapi::Device::toSerializedRuntimeDevice) + .def("_to_serialized_rt_device", &torch_tensorrt::pyapi::Device::toSerializedRTDevice) .def_readwrite("device_type", &Device::device_type) .def_readwrite("gpu_id", &Device::gpu_id) .def_readwrite("dla_core", &Device::dla_core) From d14c7c474f3fdaaa5da621f0e4b3ea8f5ea1fef7 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Thu, 10 Nov 2022 18:56:52 -0800 Subject: [PATCH 10/22] feat(//examples): Demo that you can compile using FX then deploy in TS!!! 
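The example added below exercises a round trip that is new with this change: lower a model through the FX frontend, wrap the lowered module with torch.jit.trace, save it as TorchScript, then reload and run it. A minimal sketch of that flow, assuming a CUDA build of Torch-TensorRT and using illustrative compile arguments (the exact keyword arguments in the shipped example may differ):

import torch
import torchvision
import torch_tensorrt

model = torchvision.models.resnet18(pretrained=True).cuda().eval()
inputs = [torch.randn(1, 3, 224, 224).cuda()]

# Lower with the FX frontend
trt_fx_module = torch_tensorrt.compile(model, ir="fx", inputs=inputs)

# Trace the lowered module so it can be serialized as TorchScript
scripted = torch.jit.trace(trt_fx_module, example_inputs=inputs)
scripted.save("/tmp/scripted_fx_module.ts")

# Reload the TorchScript artifact and run inference; the reloaded module
# only needs the Torch-TensorRT runtime, not the FX lowering toolchain
reloaded = torch.jit.load("/tmp/scripted_fx_module.ts")
result = reloaded(*inputs)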
Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- examples/fx/torch_trt_simple_example.py | 12 ++++++++++++ py/torch_tensorrt/fx/trt_module_next.py | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/examples/fx/torch_trt_simple_example.py b/examples/fx/torch_trt_simple_example.py index 400dda3360..391643ee0b 100644 --- a/examples/fx/torch_trt_simple_example.py +++ b/examples/fx/torch_trt_simple_example.py @@ -70,6 +70,18 @@ def test_torch_tensorrt(model, inputs): > 0.99 ) + scripted_fx_module = torch.jit.trace(trt_fx_module, example_inputs=inputs_fx) + scripted_fx_module.save("/tmp/scripted_fx_module.ts") + scripted_fx_module = torch.jit.load("/tmp/scripted_fx_module.ts") + + result_fp16 = scripted_fx_module(*inputs_fx) + assert ( + torch.nn.functional.cosine_similarity( + ref_fp16.flatten(), result_fp16.flatten(), dim=0 + ) + > 0.99 + ) + if __name__ == "__main__": model = torchvision.models.resnet18(pretrained=True).cuda().eval() diff --git a/py/torch_tensorrt/fx/trt_module_next.py b/py/torch_tensorrt/fx/trt_module_next.py index 35ab61406b..a50436834c 100644 --- a/py/torch_tensorrt/fx/trt_module_next.py +++ b/py/torch_tensorrt/fx/trt_module_next.py @@ -23,7 +23,7 @@ def __init__( [ torch.ops.tensorrt.ABI_VERSION(), engine_name, - device_info._to_serialized_runtime_device(), + device_info._to_serialized_rt_device(), serialized_engine, TRTModule._pack_binding_names(input_names), TRTModule._pack_binding_names(output_names), @@ -33,7 +33,7 @@ def __init__( def forward(self, *inputs): assert len(inputs) == len( self.input_names - ), f"Wrong number of inputs, expect {len(self.input_names)} get {len(inputs)}." + ), f"Wrong number of inputs, expected {len(self.input_names)} got {len(inputs)}." types = [issubclass(type(i), torch.Tensor) for i in inputs] From 7df9032611968ac00dccf3fc83c8a4c7adf4b592 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Thu, 10 Nov 2022 19:49:42 -0800 Subject: [PATCH 11/22] refactor(//py/torch_tensorrt): Updates to existing APIs for use in fx Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- py/torch_tensorrt/_Device.py | 4 +- py/torch_tensorrt/_Input.py | 72 ++++++++++++++++++++++++++++++++---- 2 files changed, 66 insertions(+), 10 deletions(-) diff --git a/py/torch_tensorrt/_Device.py b/py/torch_tensorrt/_Device.py index 70017e2c55..0662e17aa1 100644 --- a/py/torch_tensorrt/_Device.py +++ b/py/torch_tensorrt/_Device.py @@ -111,9 +111,9 @@ def _to_internal(self) -> _C.Device: internal_dev.allow_gpu_fallback = self.allow_gpu_fallback return internal_dev - def _to_internal_cuda_device_str(self) -> str: + def _to_serialized_rt_device(self) -> str: internal_dev = self._to_internal() - return internal_dev.to_str() + return internal_dev._to_serialized_rt_device() @classmethod def _from_torch_device(cls, torch_dev: torch.device): diff --git a/py/torch_tensorrt/_Input.py b/py/torch_tensorrt/_Input.py index 06728fb095..4062edb6e6 100644 --- a/py/torch_tensorrt/_Input.py +++ b/py/torch_tensorrt/_Input.py @@ -263,7 +263,16 @@ def _parse_format(format: Any) -> _enums.TensorFormat: ) @classmethod - def _from_tensor(cls, t: torch.Tensor): + def from_tensor(cls, t: torch.Tensor) -> "Input": + """ + Produce a Input which contains the information of the given PyTorch tensor. + + Args: + tensor (torch.Tensor): A PyTorch tensor. + + Returns: + A Input object. 
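A short, hedged usage sketch of the renamed classmethod (the tensor here is illustrative):

example = torch.randn(1, 3, 32, 32).half()
inp = torch_tensorrt.Input.from_tensor(example)
# inp now carries the tensor's shape (1, 3, 32, 32), dtype torch.half,
# and contiguous memory format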
+ """ if not any( [ t.is_contiguous(memory_format=torch.contiguous_format), @@ -271,7 +280,7 @@ def _from_tensor(cls, t: torch.Tensor): ] ): raise ValueError( - "Tensor does not have a supported contiguous memory format, supported formats are contiguous or channel_last" + "Tensor does not have a supported memory format, supported formats are contiguous or channel_last" ) frmt = ( torch.contiguous_format @@ -280,17 +289,64 @@ def _from_tensor(cls, t: torch.Tensor): ) return cls(shape=t.shape, dtype=t.dtype, format=frmt) - def example_tensor(self, optimization_profile_field: str = None): + @classmethod + def from_tensors(cls, ts: torch.Tensor) -> List["Input"]: + """ + Produce a list of Inputs which contain + the information of all the given PyTorch tensors. + + Args: + tensors (Iterable[torch.Tensor]): A list of PyTorch tensors. + + Returns: + A list of Inputs. + """ + + assert isinstance(ts, (list, tuple)) + return [cls.from_tensor(t) for t in ts] + + def example_tensor(self, optimization_profile_field: str = None) -> torch.Tensor: + """ + Get an example tensor of the shape specified by the Input object + + Args: + optimization_profile_field (Optional(str)): Name of the field to use for shape in the case the Input is dynamically shaped + + Returns: + A PyTorch Tensor + """ if optimization_profile_field is not None: try: - assert any([optimization_profile_field == field_name for field_name in ["min_shape", "opt_shape", "max_shape"]]) + assert any( + [ + optimization_profile_field == field_name + for field_name in ["min_shape", "opt_shape", "max_shape"] + ] + ) except: - raise ValueError("Invalid field name, expected one of min_shape, opt_shape, max_shape") + raise ValueError( + "Invalid field name, expected one of min_shape, opt_shape, max_shape" + ) + + if ( + optimization_profile_field is not None + and self.shape_mode == Input._ShapeMode.STATIC + ): + raise ValueError( + "Specified a optimization profile field but the input is static" + ) - if optimization_profile_field is not None and self.shape_mode == Input._ShapeMode.STATIC: - raise ValueError("Specified a optimization profile field but the input is static") + if ( + optimization_profile_field is None + and self.shape_mode == Input._ShapeMode.DYNAMIC + ): + raise ValueError( + "Requested an example tensor from a dynamic shaped input but did not specific which profile field to use." 
+ ) if self.shape_mode == Input._ShapeMode.STATIC: return torch.randn(self.shape).to(dtype=self.dtype) else: - return torch.randn(self.shape[optimization_profile_field]).to(dtype=self.dtype) + return torch.randn(self.shape[optimization_profile_field]).to( + dtype=self.dtype + ) From e3b01f382693e5440a60f61f0376fbc04dcd6e87 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Thu, 17 Nov 2022 19:17:53 -0800 Subject: [PATCH 12/22] feat(//core/runtime): Encode TRT engine in base64 instead of raw bytes Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/runtime/register_jit_hooks.cpp | 52 +++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp index 8c5f610ed0..93c256032b 100644 --- a/core/runtime/register_jit_hooks.cpp +++ b/core/runtime/register_jit_hooks.cpp @@ -1,3 +1,5 @@ +#include + #include "core/runtime/runtime.h" namespace torch_tensorrt { @@ -19,6 +21,49 @@ std::string serialize_bindings(const std::vector& bindings) { return serialized_binding_info; } +static const std::string sym_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; //= +std::string base64_encode(const std::string& in) { + std::string out; + int64_t val = 0, valb = -6; + for (unsigned char c : in) { + val = (val << 8) + c; + valb += 8; + while (valb >= 0) { + out.push_back(sym_table[(val >> valb) & 0x3F]); + valb -= 6; + } + } + if (valb > -6) { + out.push_back(sym_table[((val << 8) >> (valb + 8)) & 0x3F]); + }; + while (out.size() % 4) { + out.push_back('='); + } + return out; +} + +std::string base64_decode(const std::string& in) { + std::string out; + std::vector T(256, -1); + for (int i = 0; i < 64; i++) { + T[sym_table[i]] = i; + } + + int64_t val = 0, valb = -8; + for (unsigned char c : in) { + if (T[c] == -1) { + break; + } + val = (val << 6) + T[c]; + valb += 6; + if (valb >= 0) { + out.push_back(char((val >> valb) & 0xFF)); + valb -= 8; + } + } + return out; +} + // TODO: Implement a call method // c10::List TRTEngine::Run(c10::List inputs) { // auto input_vec = inputs.vec(); @@ -53,14 +98,15 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion = serialize_info[ABI_TARGET_IDX] = ABI_VERSION; serialize_info[NAME_IDX] = self->name; serialize_info[DEVICE_IDX] = self->device_info.serialize(); - serialize_info[ENGINE_IDX] = trt_engine; + serialize_info[ENGINE_IDX] = base64_encode(trt_engine); serialize_info[INPUT_BINDING_NAMES_IDX] = serialize_bindings(self->in_binding_names); serialize_info[OUTPUT_BINDING_NAMES_IDX] = serialize_bindings(self->out_binding_names); return serialize_info; }, - [](std::vector seralized_info) -> c10::intrusive_ptr { - return c10::make_intrusive(std::move(seralized_info)); + [](std::vector serialized_info) -> c10::intrusive_ptr { + serialized_info[ENGINE_IDX] = base64_decode(serialized_info[ENGINE_IDX]); + return c10::make_intrusive(std::move(serialized_info)); }); TORCH_LIBRARY(tensorrt, m) { From ea270e397194be157a6e5ee5bfee6f5c7b0b5c14 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Thu, 17 Nov 2022 19:27:21 -0800 Subject: [PATCH 13/22] feat(//py/torch_tensorrt/fx): Adding the option to use the experimental runtime Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- py/torch_tensorrt/_TRTModule.py | 147 ++++++++++++++++++++ py/torch_tensorrt/__init__.py | 1 + py/torch_tensorrt/_compile.py | 1 + py/torch_tensorrt/fx/lower.py | 40 +++++- py/torch_tensorrt/fx/lower_setting.py | 2 + py/torch_tensorrt/fx/tools/trt_minimizer.py | 45 ++++-- 
py/torch_tensorrt/fx/tools/trt_splitter.py | 29 +++- py/torch_tensorrt/fx/trt_module_next.py | 76 ---------- py/torch_tensorrt/ts/_compile_spec.py | 4 +- py/torch_tensorrt/ts/_compiler.py | 17 ++- 10 files changed, 259 insertions(+), 103 deletions(-) create mode 100644 py/torch_tensorrt/_TRTModule.py delete mode 100644 py/torch_tensorrt/fx/trt_module_next.py diff --git a/py/torch_tensorrt/_TRTModule.py b/py/torch_tensorrt/_TRTModule.py new file mode 100644 index 0000000000..9ef0e5f1d4 --- /dev/null +++ b/py/torch_tensorrt/_TRTModule.py @@ -0,0 +1,147 @@ +from operator import truediv +from typing import Any, List, Sequence, Tuple + +import torch +from torch_tensorrt import _C +from torch_tensorrt._Device import Device + + +class TRTModule(torch.nn.Module): + """TRTModule is a PyTorch module which encompasses an arbitrary TensorRT Engine. + + This module is backed by the Torch-TensorRT runtime and is fully compatible with both + FX / Python deployments (just ``import torch_tensorrt`` as part of the application) as + well as TorchScript / C++ deployments since TRTModule can be passed to ``torch.jit.trace`` + and then saved. + + The forward function is simply forward(*args: torch.Tensor) -> Tuple[torch.Tensor] where + the internal implementation is ``return Tuple(torch.ops.tensorrt.execute_engine(list(inputs), self.engine))`` + + Attributes: + engine_name (str): Name of engine (for easier debugging) + engine (torch.classes.tensorrt.Engine): Torch-TensorRT TensorRT Engine instance, manages [de]serialization, device configuration, profiling + input_binding_names (List[str]): List of input TensorRT engine binding names in the order they would be passed to the TRT modules + output_binding_names (List[str]): List of output TensorRT engine binding names in the order they should be returned + """ + + def __init__( + self, + engine_name: str, + serialized_engine: bytearray, + input_binding_names: List[str], + output_binding_names: List[str], + target_device: Device = Device._current_device(), + ): + """__init__ method for torch_tensorrt.TRTModule + + Takes a name, target device, serialized TensorRT engine, and binding names / order and constructs + a PyTorch ``torch.nn.Module`` around it. + + Args: + engine_name (str): Name for the engine + serialized_engine (bytearray): Serialized TensorRT engine in the form of a bytearray + input_binding_names (List[str]): List of input TensorRT engine binding names in the order they would be passed to the TRT modules + output_binding_names (List[str]): List of output TensorRT engine binding names in the order they should be returned + target_device (torch_tensorrt.Device): Device to instantiate TensorRT engine on. Must be a compatible device i.e.
same GPU model / compute capability as was used to build the engine + + Example: + + ..code-block::python + + with io.BytesIO() as engine_bytes: + engine_bytes.write(trt_engine.serialize()) + engine_str = engine_bytes.getvalue() + + trt_module = TRTModule( + engine_name="my_engine", + serialized_engine=engine_str, + input_names=["x"], + output_names=["output"], + ) + + """ + + super(TRTModule, self).__init__() + self.input_binding_names = input_binding_names + self.output_binding_names = output_binding_names + self.engine_name = engine_name + self.engine = torch.classes.tensorrt.Engine( + [ + torch.ops.tensorrt.ABI_VERSION(), + engine_name, + target_device._to_serialized_rt_device(), + serialized_engine, + TRTModule._pack_binding_names(self.input_binding_names), + TRTModule._pack_binding_names(self.output_binding_names), + ] + ) + + def forward(self, *inputs): + """Implementation of the forward pass for a TensorRT engine + + Args: + *inputs (torch.Tensor): Inputs to the forward function, must all be ``torch.Tensor`` + + Returns: + torch.Tensor or Tuple(torch.Tensor): Result of the engine computation + """ + + assert len(inputs) == len( + self.input_binding_names + ), f"Wrong number of inputs, expected {len(self.input_binding_names)} got {len(inputs)}." + + types = [issubclass(type(i), torch.Tensor) for i in inputs] + + try: + assert all(types) + except: + + def is_non_tensor(i: Tuple[Any, bool]) -> bool: + return not i[1] + + non_tensors = [i[0] for i in filter(zip(inputs, types), is_non_tensor)] + raise RuntimeError( + f"TRTModule expects a flattened list of tensors as input, found non tensors: {non_tensors}" + ) + + outputs = torch.ops.tensorrt.execute_engine(list(inputs), self.engine) + + if len(outputs) == 1: + return outputs[0] + + return tuple(outputs) + + def enable_profiling(self, profiling_results_dir: str = None): + """Enable the profiler to collect latency information about the execution of the engine + + Keyword Arguments: + profiling_results_dir (str): Absolute path to the directory to sort results of profiling. 
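For illustration, a hedged sketch of driving profiling from Python, assuming trt_module is a constructed TRTModule and inputs are valid engine inputs:

trt_module.enable_profiling(profiling_results_dir="/tmp/trt_profiles")
trt_module(*inputs)             # traces for this execution are written under the prefix
trt_module.disable_profiling()  # stop collecting profiling data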
+ """ + + if profiling_results_dir is not None: + self.engine.profile_path_prefix = profiling_results_dir + self.engine.enable_profiling() + + def disable_profiling(self): + """Disable the profiler""" + + self.engine.disable_profiling() + + def get_layer_info(self) -> str: + """Get a JSON string containing the layer information encoded by the TensorRT engine in this module + + Returns: + + str: A JSON string which contains the layer information of the engine incapsulated in this module + """ + + return self.engine.get_engine_layer_info() + + def dump_layer_info(self): + """Dump layer information encoded by the TensorRT engine in this module to STDOUT""" + return self.engine.dump_engine_layer_info() + + @staticmethod + def _pack_binding_names(binding_names: List[str]) -> str: + delim = torch.ops.tensorrt.SERIALIZED_ENGINE_BINDING_DELIM()[0] + return delim.join(binding_names) diff --git a/py/torch_tensorrt/__init__.py b/py/torch_tensorrt/__init__.py index 68fde67e71..5d2407adf0 100644 --- a/py/torch_tensorrt/__init__.py +++ b/py/torch_tensorrt/__init__.py @@ -90,6 +90,7 @@ def _find_lib(name, paths): from torch_tensorrt import logging from torch_tensorrt._Input import Input from torch_tensorrt._Device import Device +from torch_tensorrt._TRTModule import TRTModule from torch_tensorrt import fx diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index 18b9901c56..cbd1b87c5c 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -146,6 +146,7 @@ def compile( max_batch_size=inputs[0].size(0), explicit_batch_dimension=True, dynamic_batch=False, + **kwargs, ) else: raise RuntimeError("Module is an unknown format or the ir requested is unknown") diff --git a/py/torch_tensorrt/fx/lower.py b/py/torch_tensorrt/fx/lower.py index ad8338b104..b6ba52a8f8 100644 --- a/py/torch_tensorrt/fx/lower.py +++ b/py/torch_tensorrt/fx/lower.py @@ -1,5 +1,7 @@ import dataclasses as dc import logging +import dataclasses as dc +import logging from typing import Any, Callable, Optional, Sequence # @manual=//deeplearning/trt/python:py_tensorrt @@ -39,6 +41,7 @@ def compile( cuda_graph_batch_size=-1, dynamic_batch=True, is_aten=False, + use_experimental_fx_rt=False, ) -> nn.Module: """ Takes in original module, input and lowering setting, run lowering workflow to turn module @@ -56,6 +59,7 @@ def compile( save_timing_cache: Update timing cache with current timing cache data if set to True. cuda_graph_batch_size: Cuda graph batch size, default to be -1. dynamic_batch: batch dimension (dim=0) is dynamic. + use_experimental_fx_rt: Uses the next generation TRTModule which supports both Python and TorchScript based execution (including in C++). Returns: A torch.nn.Module lowered by TensorRT. 
""" @@ -70,6 +74,7 @@ def compile( cuda_graph_batch_size=cuda_graph_batch_size, dynamic_batch=dynamic_batch, is_aten=is_aten, + use_experimental_rt=use_experimental_fx_rt, ) lowerer = Lowerer.create(lower_setting=lower_setting) return lowerer(module, input) @@ -143,6 +148,7 @@ def default_split_function( splitter_setting = TRTSplitterSetting() splitter_setting.use_implicit_batch_dim = not lower_setting.explicit_batch_dimension splitter_setting.min_acc_module_size = lower_setting.min_acc_module_size + splitter_setting.use_experimental_rt = lower_setting.use_experimental_rt splitter = TRTSplitter(model, inputs, settings=splitter_setting) splitter.node_support_preview() return splitter.generate_split_results() @@ -164,13 +170,33 @@ def lower_pass( """ interpreter = create_trt_interpreter(lower_setting) interp_res: TRTInterpreterResult = interpreter(mod, input, module_name) - trt_module = TRTModule( - engine=interp_res.engine, - input_names=interp_res.input_names, - output_names=interp_res.output_names, - cuda_graph_batch_size=lower_setting.cuda_graph_batch_size, - ) - return trt_module + if lower_setting.use_experimental_rt: + import io + from torch_tensorrt._TRTModule import TRTModule as TRTModuleNext + from torch_tensorrt._Device import Device + + with io.BytesIO() as engine_bytes: + engine_bytes.write(interp_res.engine.serialize()) + engine_str = engine_bytes.getvalue() + + trt_module = TRTModuleNext( + engine_name=module_name + "_engine", + serialized_engine=engine_str, + input_binding_names=interp_res.input_names, + output_binding_names=interp_res.output_names, + target_device=Device(f"cuda:{torch.cuda.current_device()}"), + # cuda_graph_batch_size=lower_setting.cuda_graph_batch_size, # NOTE: Not sure what this is supposed to do + ) + return trt_module + + else: + trt_module = TRTModule( + engine=interp_res.engine, + input_names=interp_res.input_names, + output_names=interp_res.output_names, + cuda_graph_batch_size=lower_setting.cuda_graph_batch_size, + ) + return trt_module return lower_pass diff --git a/py/torch_tensorrt/fx/lower_setting.py b/py/torch_tensorrt/fx/lower_setting.py index 6e184c14ea..a47f8c77c5 100644 --- a/py/torch_tensorrt/fx/lower_setting.py +++ b/py/torch_tensorrt/fx/lower_setting.py @@ -73,6 +73,7 @@ class LowerSetting(LowerSettingBasic): meaning all possible tactic sources. correctness_atol: absolute tolerance for correctness check correctness_rtol: relative tolerance for correctness check + use_experimental_rt: Uses the next generation TRTModule which supports both Python and TorchScript based execution (including in C++). 
""" input_specs: List[InputTensorSpec] = dc.field(default_factory=list) @@ -95,3 +96,4 @@ class LowerSetting(LowerSettingBasic): tactic_sources: Optional[int] = None correctness_atol: float = 0.1 correctness_rtol: float = 0.1 + use_experimental_rt: bool = False diff --git a/py/torch_tensorrt/fx/tools/trt_minimizer.py b/py/torch_tensorrt/fx/tools/trt_minimizer.py index 308687e0c9..9cd9d749b6 100644 --- a/py/torch_tensorrt/fx/tools/trt_minimizer.py +++ b/py/torch_tensorrt/fx/tools/trt_minimizer.py @@ -11,23 +11,47 @@ def lower_mod_default( - mod: torch.fx.GraphModule, inputs: Tensors, batch_size: Any = 2048 + mod: torch.fx.GraphModule, + inputs: Tensors, + batch_size: Any = 2048, + use_experimental_rt: bool = False, ) -> TRTModule: interp = TRTInterpreter( mod, InputTensorSpec.from_tensors(inputs), explicit_batch_dimension=True ) interpreter_result = interp.run(max_batch_size=batch_size) - res_mod = TRTModule( - interpreter_result.engine, - interpreter_result.input_names, - interpreter_result.output_names, - ) + if use_experimental_rt: + import io + from torch_tensorrt._TRTModule import TRTModule as TRTModuleNext + from torch_tensorrt._Device import Device + + with io.BytesIO() as engine_bytes: + engine_bytes.write(interpreter_result.engine.serialize()) + engine_str = engine_bytes.getvalue() + + res_mod = TRTModuleNext( + engine_name=str(type(mod)) + "_engine", + serialized_engine=engine_str, + input_binding_names=interpreter_result.input_names, + output_binding_names=interpreter_result.output_names, + target_device=Device(f"cuda:{torch.cuda.current_device()}"), + # cuda_graph_batch_size=lower_setting.cuda_graph_batch_size, # NOTE: Not sure what this is supposed to do + ) + else: + res_mod = TRTModule( + interpreter_result.engine, + interpreter_result.input_names, + interpreter_result.output_names, + ) return res_mod class TensorRTMinizerSetting(net_min_base._MinimizerSettingBase): - def __init__(self, explicit_batch_dimension: Any = True): + def __init__( + self, explicit_batch_dimension: Any = True, use_experiemental_rt: bool = False + ): self.explicit_batch_dimension = explicit_batch_dimension + self.use_experimental_rt = use_experiemental_rt super(TensorRTMinizerSetting, self).__init__() @@ -40,11 +64,12 @@ def __init__( settings: TensorRTMinizerSetting = TensorRTMinizerSetting(), max_batch_size: Any = 2048, lower_fn: Callable[ - [torch.fx.GraphModule, Tensors, Any], TRTModule + [torch.fx.GraphModule, Tensors, Any, bool], TRTModule ] = lower_mod_default, ): self.lower_fn = lower_fn self.max_batch_size = max_batch_size + self.use_experiemental_rt = settings.use_experimental_rt super().__init__(module, sample_input, compare_fn, settings) def run_a(self, mod, inputs): @@ -55,7 +80,9 @@ def run_a(self, mod, inputs): def run_b(self, mod, inputs): mod.eval() try: - mod = self.lower_fn(mod, inputs, self.max_batch_size) + mod = self.lower_fn( + mod, inputs, self.max_batch_size, self.use_experiemental_rt + ) output = mod(*inputs) except RuntimeError as e: raise net_min_base.FxNetMinimizerRunFuncError( diff --git a/py/torch_tensorrt/fx/tools/trt_splitter.py b/py/torch_tensorrt/fx/tools/trt_splitter.py index 7fbca8d99a..ff75fb4b7a 100644 --- a/py/torch_tensorrt/fx/tools/trt_splitter.py +++ b/py/torch_tensorrt/fx/tools/trt_splitter.py @@ -50,6 +50,7 @@ def __init__(self): # don't support the batch dim. 
self.use_implicit_batch_dim: bool = True self.exclude_support_node_name: set = set() + self.use_experimental_rt: bool = False class TRTSplitter(splitter_base._SplitterBase): @@ -84,11 +85,29 @@ def _lower_model_to_backend( # based on feeds model's actual status interp = TRTInterpreter(mod, InputTensorSpec.from_tensors(inputs)) interpreter_result = interp.run(*inputs) - return TRTModule( - interpreter_result.engine, - interpreter_result.input_names, - interpreter_result.output_names, - ) + if self.settings.use_experimental_rt: + import io + from torch_tensorrt._TRTModule import TRTModule as TRTModuleNext + from torch_tensorrt._Device import Device + + with io.BytesIO() as engine_bytes: + engine_bytes.write(interpreter_result.engine.serialize()) + engine_str = engine_bytes.getvalue() + + return TRTModuleNext( + engine_name=str(type(mod)) + "_engine", + serialized_engine=engine_str, + input_binding_names=interpreter_result.input_names, + output_binding_names=interpreter_result.output_names, + target_device=Device(f"cuda:{torch.cuda.current_device()}"), + # cuda_graph_batch_size=lower_setting.cuda_graph_batch_size, # NOTE: Not sure what this is supposed to do + ) + else: + return TRTModule( + interpreter_result.engine, + interpreter_result.input_names, + interpreter_result.output_names, + ) def _find_culprit(self, mod: torch.fx.GraphModule, inputs: Tensors): """ diff --git a/py/torch_tensorrt/fx/trt_module_next.py b/py/torch_tensorrt/fx/trt_module_next.py deleted file mode 100644 index a50436834c..0000000000 --- a/py/torch_tensorrt/fx/trt_module_next.py +++ /dev/null @@ -1,76 +0,0 @@ -from operator import truediv -from typing import Any, List, Sequence, Tuple - -import torch -from torch_tensorrt import _C -from torch_tensorrt._Device import Device - - -class TRTModule(torch.nn.Module): - def __init__( - self, - engine_name: str, - device_info: Device, - serialized_engine: bytearray, - input_names: List[str], - output_names: List[str], - ): - super(TRTModule, self).__init__() - self.input_names = input_names - self.output_names = output_names - self.engine_name = engine_name - self.engine = torch.classes.tensorrt.Engine( - [ - torch.ops.tensorrt.ABI_VERSION(), - engine_name, - device_info._to_serialized_rt_device(), - serialized_engine, - TRTModule._pack_binding_names(input_names), - TRTModule._pack_binding_names(output_names), - ] - ) - - def forward(self, *inputs): - assert len(inputs) == len( - self.input_names - ), f"Wrong number of inputs, expected {len(self.input_names)} got {len(inputs)}." 
- - types = [issubclass(type(i), torch.Tensor) for i in inputs] - - try: - assert all(types) - except: - - def is_non_tensor(i: Tuple[Any, bool]) -> bool: - return not i[1] - - non_tensors = [i[0] for i in filter(zip(inputs, types), is_non_tensor)] - raise RuntimeError( - f"TRTModule expects a flattened list of tensors as input, found non tensors: {non_tensors}" - ) - - outputs = torch.ops.tensorrt.execute_engine(list(inputs), self.engine) - - if len(outputs) == 1: - return outputs[0] - - return tuple(outputs) - - def enable_profiling(self, profiling_results_dir: str = None): - if profiling_results_dir is not None: - self.engine.profile_path_prefix = profiling_results_dir - self.engine.enable_profiling() - - def disable_profiling(self): - self.engine.disable_profiling() - - def get_layer_info(self) -> str: - return self.engine.get_engine_layer_info() - - def dump_layer_info(self): - return self.engine.dump_engine_layer_info() - - @staticmethod - def _pack_binding_names(binding_names: List[str]) -> str: - delim = torch.ops.tensorrt.SERIALIZED_ENGINE_BINDING_DELIM()[0] - return delim.join(binding_names) diff --git a/py/torch_tensorrt/ts/_compile_spec.py b/py/torch_tensorrt/ts/_compile_spec.py index 9616111caa..5ffe0471f4 100644 --- a/py/torch_tensorrt/ts/_compile_spec.py +++ b/py/torch_tensorrt/ts/_compile_spec.py @@ -210,7 +210,7 @@ def _parse_input_signature(input_signature: Any): input_signature, torch.Tensor ): i = ( - Input._from_tensor(input_signature) + Input.from_tensor(input_signature) if isinstance(input_signature, torch.Tensor) else input_signature ) @@ -243,7 +243,7 @@ def _parse_compile_spec(compile_spec_: Dict[str, Any]) -> _ts_C.CompileSpec: ) inputs = [ - Input._from_tensor(i) if isinstance(i, torch.Tensor) else i + Input.from_tensor(i) if isinstance(i, torch.Tensor) else i for i in compile_spec["inputs"] ] info.inputs = [i._to_internal() for i in inputs] diff --git a/py/torch_tensorrt/ts/_compiler.py b/py/torch_tensorrt/ts/_compiler.py index c88651f7ba..78481c43b0 100644 --- a/py/torch_tensorrt/ts/_compiler.py +++ b/py/torch_tensorrt/ts/_compiler.py @@ -244,7 +244,10 @@ def convert_method_to_trt_engine( def embed_engine_in_new_module( - serialized_engine: bytes, device=Device._current_device() + serialized_engine: bytes, + device: Device = Device._current_device(), + input_binding_names: List[str] = [], + output_binding_names: List[str] = [], ) -> torch.jit.ScriptModule: """Takes a pre-built serialized TensorRT engine and embeds it within a TorchScript module @@ -253,7 +256,7 @@ def embed_engine_in_new_module( forward(Tensor[]) -> Tensor[] - TensorRT bindings must have names with the following format: + TensorRT bindings either be explicitly specified using ``[in/out]put_binding_names`` or have names with the following format: - [symbol].[index in input / output array] ex. - [x.0, x.1, x.2] -> [y.0] @@ -265,11 +268,17 @@ def embed_engine_in_new_module( Keyword Arguments: device (Union(torch_tensorrt.Device, torch.device, dict)): Target device to run engine on. Must be compatible with engine provided. 
Default: Current active device - + input_binding_names (List[str]): List of names of TensorRT bindings in order to be passed to the encompassing PyTorch module + output_binding_names (List[str]): List of names of TensorRT bindings in order that should be returned from the encompassing PyTorch module Returns: torch.jit.ScriptModule: New TorchScript module with engine embedded """ - cpp_mod = _C.embed_engine_in_new_module(serialized_engine, _parse_device(device)) + cpp_mod = _C.embed_engine_in_new_module( + serialized_engine, + _parse_device(device), + input_binding_names, + output_binding_names, + ) return torch.jit._recursive.wrap_cpp_module(cpp_mod) From c7e757b49de6ba3cef5e70e228808d455338fa06 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Thu, 17 Nov 2022 23:23:52 -0800 Subject: [PATCH 14/22] fix(//core/runtime): Fixing a bug where if an exception is thrown in downstream constructor, it would cause a segfault Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/runtime/TRTEngine.cpp | 52 ++++++++++++++--------------- core/runtime/TRTEngine.h | 19 ++++++----- core/runtime/register_jit_hooks.cpp | 3 +- 3 files changed, 37 insertions(+), 37 deletions(-) diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp index dbbff7d11c..4061a262a8 100644 --- a/core/runtime/TRTEngine.cpp +++ b/core/runtime/TRTEngine.cpp @@ -29,37 +29,24 @@ std::vector split(const std::string& str, char delim) { } TRTEngine::TRTEngine( - std::string serialized_engine, - RTDevice cuda_device, + const std::string& serialized_engine, + const RTDevice& cuda_device, const std::vector& _in_binding_names, - const std::vector& _out_binding_names) { - std::string _name = "deserialized_trt"; - new (this) TRTEngine(_name, serialized_engine, cuda_device, _in_binding_names, _out_binding_names); -} - -TRTEngine::TRTEngine(std::vector serialized_info) { - TORCHTRT_CHECK( - serialized_info.size() == SERIALIZATION_LEN, - "Program to be deserialized targets an incompatible Torch-TensorRT ABI"); - TORCHTRT_CHECK( - serialized_info[ABI_TARGET_IDX] == ABI_VERSION, - "Program to be deserialized targets a different Torch-TensorRT ABI Version (" - << serialized_info[ABI_TARGET_IDX] << ") than the Torch-TensorRT Runtime ABI Version (" << ABI_VERSION - << ")"); - std::string _name = serialized_info[NAME_IDX]; - std::string engine_info = serialized_info[ENGINE_IDX]; - std::vector in_bindings = split(serialized_info[INPUT_BINDING_NAMES_IDX], BINDING_DELIM); - std::vector out_bindings = split(serialized_info[OUTPUT_BINDING_NAMES_IDX], BINDING_DELIM); + const std::vector& _out_binding_names) + : TRTEngine("deserialized_trt", serialized_engine, cuda_device, _in_binding_names, _out_binding_names) {} - RTDevice cuda_device(serialized_info[DEVICE_IDX]); - - new (this) TRTEngine(_name, engine_info, cuda_device, in_bindings, out_bindings); -} +TRTEngine::TRTEngine(std::vector serialized_info) + : TRTEngine( + serialized_info[NAME_IDX], + serialized_info[ENGINE_IDX], + RTDevice(serialized_info[DEVICE_IDX]), + split(serialized_info[INPUT_BINDING_NAMES_IDX], BINDING_DELIM), + split(serialized_info[OUTPUT_BINDING_NAMES_IDX], BINDING_DELIM)) {} TRTEngine::TRTEngine( - std::string mod_name, - std::string serialized_engine, - RTDevice cuda_device, + const std::string& mod_name, + const std::string& serialized_engine, + const RTDevice& cuda_device, const std::vector& _in_binding_names, const std::vector& _out_binding_names) { auto most_compatible_device = get_most_compatible_device(cuda_device); @@ -158,6 +145,17 @@ TRTEngine::TRTEngine( 
LOG_DEBUG(*this); } +void TRTEngine::verify_serialization_fmt(const std::vector& serialized_info) { + TORCHTRT_CHECK( + serialized_info.size() == SERIALIZATION_LEN, + "Program to be deserialized targets an incompatible Torch-TensorRT ABI"); + TORCHTRT_CHECK( + serialized_info[ABI_TARGET_IDX] == ABI_VERSION, + "Program to be deserialized targets a different Torch-TensorRT ABI Version (" + << serialized_info[ABI_TARGET_IDX] << ") than the Torch-TensorRT Runtime ABI Version (" << ABI_VERSION + << ")"); +} + void TRTEngine::set_profiling_paths() { device_profile_path = std::experimental::filesystem::path{profile_path_prefix + "/" + name + "_device_config_profile.trace"}.string(); diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h index f5f2cf909d..0eada27238 100644 --- a/core/runtime/TRTEngine.h +++ b/core/runtime/TRTEngine.h @@ -27,27 +27,28 @@ struct TRTEngine : torch::CustomClassHolder { std::string profile_path_prefix = std::experimental::filesystem::temp_directory_path(); - std::unordered_map in_binding_map; // TRT IDX -> PYT IDX - std::unordered_map out_binding_map; // TRT IDX -> PYT IDX + std::unordered_map in_binding_map = {}; // TRT IDX -> PYT IDX + std::unordered_map out_binding_map = {}; // TRT IDX -> PYT IDX - std::vector in_binding_names; // ITO: PYT IDX - std::vector out_binding_names; // ITO: PYT IDX + std::vector in_binding_names = {}; // ITO: PYT IDX + std::vector out_binding_names = {}; // ITO: PYT IDX ~TRTEngine() = default; TRTEngine( - std::string serialized_engine, - RTDevice cuda_device, + const std::string& serialized_engine, + const RTDevice& cuda_device, const std::vector& in_binding_names, const std::vector& out_binding_names); TRTEngine(std::vector serialized_info); TRTEngine( - std::string mod_name, - std::string serialized_engine, - RTDevice cuda_device, + const std::string& mod_name, + const std::string& serialized_engine, + const RTDevice& cuda_device, const std::vector& in_binding_names, const std::vector& out_binding_names); TRTEngine& operator=(const TRTEngine& other); std::string to_str() const; + static void verify_serialization_fmt(const std::vector& serialized_info); void enable_profiling(); void disable_profiling(); std::string get_engine_layer_info(); diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp index 93c256032b..c5b9118fee 100644 --- a/core/runtime/register_jit_hooks.cpp +++ b/core/runtime/register_jit_hooks.cpp @@ -106,7 +106,8 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion = }, [](std::vector serialized_info) -> c10::intrusive_ptr { serialized_info[ENGINE_IDX] = base64_decode(serialized_info[ENGINE_IDX]); - return c10::make_intrusive(std::move(serialized_info)); + TRTEngine::verify_serialization_fmt(serialized_info); + return c10::make_intrusive(serialized_info); }); TORCH_LIBRARY(tensorrt, m) { From 2e299db42cd62b722bc842325ab353aadc77057a Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Fri, 18 Nov 2022 01:35:21 -0800 Subject: [PATCH 15/22] feat(//py/torch_tensorrt/TRTModule): Allow state_dict extraction Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/runtime/TRTEngine.cpp | 2 +- py/torch_tensorrt/_TRTModule.py | 79 +++++++++++--- py/torch_tensorrt/fx/lower.py | 2 +- .../fx/test/core/test_trt_module.py | 80 ++++++++++++++ py/torch_tensorrt/fx/tools/trt_minimizer.py | 2 +- py/torch_tensorrt/fx/tools/trt_splitter.py | 2 +- py/torch_tensorrt/ts/_compiler.py | 14 ++- tests/py/api/test_classes.py | 102 +++++++++++++++++- 8 files changed, 259 insertions(+), 24 deletions(-) diff 
--git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp index 4061a262a8..8c34dc6f45 100644 --- a/core/runtime/TRTEngine.cpp +++ b/core/runtime/TRTEngine.cpp @@ -198,7 +198,7 @@ void TRTEngine::dump_engine_layer_info_to_file(const std::string& path) { void TRTEngine::dump_engine_layer_info() { std::string layer_info_file = - std::experimental::filesystem::path{profile_path_prefix + "/" + name + "_engine_layer_information.json"}.string(); + std::experimental::filesystem::path{profile_path_prefix + "/" + name + "_layer_information.json"}.string(); dump_engine_layer_info_to_file(layer_info_file); return; } diff --git a/py/torch_tensorrt/_TRTModule.py b/py/torch_tensorrt/_TRTModule.py index 9ef0e5f1d4..6afb20707a 100644 --- a/py/torch_tensorrt/_TRTModule.py +++ b/py/torch_tensorrt/_TRTModule.py @@ -18,7 +18,7 @@ class TRTModule(torch.nn.Module): the internal implementation is ``return Tuple(torch.ops.tensorrt.execute_engine(list(inputs), self.engine))`` Attributes: - engine_name (str): Name of engine (for easier debugging) + name (str): Name of module (for easier debugging) engine (torch.classess.tensorrt.Engine): Torch-TensorRT TensorRT Engine instance, manages [de]serialization, device configuration, profiling input_binding_names (List[str]): List of input TensorRT engine binding names in the order they would be passed to the TRT modules output_binding_names (List[str]): List of output TensorRT engine binding names in the order they should be returned @@ -26,10 +26,10 @@ class TRTModule(torch.nn.Module): def __init__( self, - engine_name: str, - serialized_engine: bytearray, - input_binding_names: List[str], - output_binding_names: List[str], + name: str = "", + serialized_engine: bytearray = bytearray(), + input_binding_names: List[str] = [], + output_binding_names: List[str] = [], target_device: Device = Device._current_device(), ): """__init__ method for torch_tensorrt.TRTModule @@ -38,7 +38,7 @@ def __init__( a PyTorch ``torch.nn.Module`` around it. 
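Making every constructor argument optional is what allows a TRTModule to participate in state_dict round trips, as exercised by the tests added in this patch; roughly, and assuming trt_mod is an already constructed module and x a matching input:

st = trt_mod.state_dict()    # the engine is serialized into the module's extra state
new_mod = TRTModule()        # empty module; engine stays None until state is loaded
new_mod.load_state_dict(st)
out = new_mod(x.cuda())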
Args: - engine_name (str): Name for the engine + name (str): Name for module serialized_engine (bytearray): Serialized TensorRT engine in the form of a bytearray input_binding_names (List[str]): List of input TensorRT engine binding names in the order they would be passed to the TRT modules output_binding_names (List[str]): List of output TensorRT engine binding names in the order they should be returned @@ -64,18 +64,54 @@ def __init__( super(TRTModule, self).__init__() self.input_binding_names = input_binding_names self.output_binding_names = output_binding_names - self.engine_name = engine_name - self.engine = torch.classes.tensorrt.Engine( - [ - torch.ops.tensorrt.ABI_VERSION(), - engine_name, - target_device._to_serialized_rt_device(), - serialized_engine, - TRTModule._pack_binding_names(self.input_binding_names), - TRTModule._pack_binding_names(self.output_binding_names), - ] + self.name = name + + if serialized_engine != bytearray(): + self.engine = torch.classes.tensorrt.Engine( + [ + torch.ops.tensorrt.ABI_VERSION(), + self.name + "_engine" if self.name != "" else "tensorrt_engine", + target_device._to_serialized_rt_device(), + serialized_engine, + TRTModule._pack_binding_names(self.input_binding_names), + TRTModule._pack_binding_names(self.output_binding_names), + ] + ) + else: + self.engine = None + + def get_extra_state(self): + return ( + self.name, + self.engine.__getstate__() if self.engine is not None else None, + self.input_binding_names, + self.output_binding_names, ) + def set_extra_state(self, state): + self.name = state[0] + if state[1] is not None: + serialized_engine_info = state[1][0] + print(serialized_engine_info) + import base64 + + serialized_engine = base64.b64decode(serialized_engine_info[3]) + self.engine = torch.classes.tensorrt.Engine( + [ + serialized_engine_info[0], + serialized_engine_info[1], + serialized_engine_info[2], + serialized_engine, + serialized_engine_info[4], + serialized_engine_info[5], + ] + ) + else: + self.engine = None + + self.input_binding_names = state[2] + self.output_binding_names = state[3] + def forward(self, *inputs): """Implementation of the forward pass for a TensorRT engine @@ -85,6 +121,8 @@ def forward(self, *inputs): Returns: torch.Tensor or Tuple(torch.Tensor): Result of the engine computation """ + if self.engine is None: + raise RuntimeError("Engine has not been initalized yet.") assert len(inputs) == len( self.input_binding_names @@ -117,6 +155,8 @@ def enable_profiling(self, profiling_results_dir: str = None): Keyword Arguments: profiling_results_dir (str): Absolute path to the directory to sort results of profiling. 
""" + if self.engine is None: + raise RuntimeError("Engine has not been initalized yet.") if profiling_results_dir is not None: self.engine.profile_path_prefix = profiling_results_dir @@ -124,6 +164,8 @@ def enable_profiling(self, profiling_results_dir: str = None): def disable_profiling(self): """Disable the profiler""" + if self.engine is None: + raise RuntimeError("Engine has not been initalized yet.") self.engine.disable_profiling() @@ -134,11 +176,16 @@ def get_layer_info(self) -> str: str: A JSON string which contains the layer information of the engine incapsulated in this module """ + if self.engine is None: + raise RuntimeError("Engine has not been initalized yet.") return self.engine.get_engine_layer_info() def dump_layer_info(self): """Dump layer information encoded by the TensorRT engine in this module to STDOUT""" + if self.engine is None: + raise RuntimeError("Engine has not been initalized yet.") + return self.engine.dump_engine_layer_info() @staticmethod diff --git a/py/torch_tensorrt/fx/lower.py b/py/torch_tensorrt/fx/lower.py index b6ba52a8f8..cae6b24af0 100644 --- a/py/torch_tensorrt/fx/lower.py +++ b/py/torch_tensorrt/fx/lower.py @@ -180,7 +180,7 @@ def lower_pass( engine_str = engine_bytes.getvalue() trt_module = TRTModuleNext( - engine_name=module_name + "_engine", + name=module_name, serialized_engine=engine_str, input_binding_names=interp_res.input_names, output_binding_names=interp_res.output_names, diff --git a/py/torch_tensorrt/fx/test/core/test_trt_module.py b/py/torch_tensorrt/fx/test/core/test_trt_module.py index ce44be758d..30480f9b33 100644 --- a/py/torch_tensorrt/fx/test/core/test_trt_module.py +++ b/py/torch_tensorrt/fx/test/core/test_trt_module.py @@ -1,5 +1,6 @@ # Owner(s): ["oncall: gpu_enablement"] +import io import os import torch @@ -8,6 +9,8 @@ import torch_tensorrt.fx.tracer.acc_tracer.acc_tracer as acc_tracer from torch.testing._internal.common_utils import run_tests, TestCase from torch_tensorrt.fx import InputTensorSpec, TRTInterpreter, TRTModule +from torch_tensorrt import TRTModule as TRTModuleNext +from torch_tensorrt import Device from torch_tensorrt.fx.utils import LowerPrecision @@ -54,5 +57,82 @@ def forward(self, x): ) +class TestTRTModuleNext(TestCase): + def test_save_and_load_trt_module(self): + class TestModule(torch.nn.Module): + def forward(self, x): + return x + x + + inputs = [torch.randn(1, 1)] + mod = TestModule().eval() + ref_output = mod(*inputs) + + mod = acc_tracer.trace(mod, inputs) + + interp = TRTInterpreter(mod, input_specs=InputTensorSpec.from_tensors(inputs)) + interp_res = interp.run(lower_precision=LowerPrecision.FP32) + + with io.BytesIO() as engine_bytes: + engine_bytes.write(interp_res.engine.serialize()) + engine_str = engine_bytes.getvalue() + + trt_mod = TRTModuleNext( + name="TestModule", + serialized_engine=engine_str, + input_binding_names=interp_res.input_names, + output_binding_names=interp_res.output_names, + target_device=Device(f"cuda:{torch.cuda.current_device()}"), + ) + + torch.save(trt_mod, "trt.pt") + reload_trt_mod = torch.load("trt.pt") + + torch.testing.assert_allclose( + reload_trt_mod(inputs[0].cuda()).cpu().reshape_as(ref_output), + ref_output, + rtol=1e-04, + atol=1e-04, + ) + os.remove(f"{os.getcwd()}/trt.pt") + + def test_save_and_load_state_dict(self): + class TestModule(torch.nn.Module): + def forward(self, x): + return x + x + + inputs = [torch.randn(1, 1)] + mod = TestModule().eval() + ref_output = mod(*inputs) + + mod = acc_tracer.trace(mod, inputs) + interp = TRTInterpreter(mod, 
input_specs=InputTensorSpec.from_tensors(inputs)) + interp_res = interp.run(lower_precision=LowerPrecision.FP32) + + with io.BytesIO() as engine_bytes: + engine_bytes.write(interp_res.engine.serialize()) + engine_str = engine_bytes.getvalue() + + trt_mod = TRTModuleNext( + name="TestModule", + serialized_engine=engine_str, + input_binding_names=interp_res.input_names, + output_binding_names=interp_res.output_names, + target_device=Device(f"cuda:{torch.cuda.current_device()}"), + ) + + st = trt_mod.state_dict() + print(st) + + new_trt_mod = TRTModuleNext() + new_trt_mod.load_state_dict(st) + + torch.testing.assert_allclose( + new_trt_mod(inputs[0].cuda()).cpu().reshape_as(ref_output), + ref_output, + rtol=1e-04, + atol=1e-04, + ) + + if __name__ == "__main__": run_tests() diff --git a/py/torch_tensorrt/fx/tools/trt_minimizer.py b/py/torch_tensorrt/fx/tools/trt_minimizer.py index 9cd9d749b6..792fe48f74 100644 --- a/py/torch_tensorrt/fx/tools/trt_minimizer.py +++ b/py/torch_tensorrt/fx/tools/trt_minimizer.py @@ -30,7 +30,7 @@ def lower_mod_default( engine_str = engine_bytes.getvalue() res_mod = TRTModuleNext( - engine_name=str(type(mod)) + "_engine", + name=str(type(mod)), serialized_engine=engine_str, input_binding_names=interpreter_result.input_names, output_binding_names=interpreter_result.output_names, diff --git a/py/torch_tensorrt/fx/tools/trt_splitter.py b/py/torch_tensorrt/fx/tools/trt_splitter.py index ff75fb4b7a..8496f8378c 100644 --- a/py/torch_tensorrt/fx/tools/trt_splitter.py +++ b/py/torch_tensorrt/fx/tools/trt_splitter.py @@ -95,7 +95,7 @@ def _lower_model_to_backend( engine_str = engine_bytes.getvalue() return TRTModuleNext( - engine_name=str(type(mod)) + "_engine", + name=str(type(mod)), serialized_engine=engine_str, input_binding_names=interpreter_result.input_names, output_binding_names=interpreter_result.output_names, diff --git a/py/torch_tensorrt/ts/_compiler.py b/py/torch_tensorrt/ts/_compiler.py index 78481c43b0..a0da43ffb0 100644 --- a/py/torch_tensorrt/ts/_compiler.py +++ b/py/torch_tensorrt/ts/_compiler.py @@ -156,7 +156,7 @@ def convert_method_to_trt_engine( dla_global_dram_size=536870912, truncate_long_and_double=False, calibrator=None, -) -> str: +) -> bytearray: """Convert a TorchScript module method to a serialized TensorRT engine Converts a specified method of a module to a serialized TensorRT engine given a dictionary of conversion settings @@ -216,7 +216,7 @@ def convert_method_to_trt_engine( calibrator (Union(torch_tensorrt._C.IInt8Calibrator, tensorrt.IInt8Calibrator)): Calibrator object which will provide data to the PTQ system for INT8 Calibration Returns: - bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs + bytearray: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs """ if isinstance(module, torch.jit.ScriptFunction): raise TypeError( @@ -238,10 +238,18 @@ def convert_method_to_trt_engine( "truncate_long_and_double": truncate_long_and_double, } - return _C.convert_graph_to_trt_engine( + engine_str = _C.convert_graph_to_trt_engine( module._c, method_name, _parse_compile_spec(compile_spec) ) + import io + + with io.BytesIO() as engine_bytes: + engine_bytes.write(engine_str) + engine_bytearray = engine_bytes.getvalue() + + return engine_bytearray + def embed_engine_in_new_module( serialized_engine: bytes, diff --git a/tests/py/api/test_classes.py b/tests/py/api/test_classes.py index ff3c50155b..9e35172bcd 100644 --- a/tests/py/api/test_classes.py +++ 
b/tests/py/api/test_classes.py @@ -102,7 +102,7 @@ def test_infer_from_example_tensor(self): } example_tensor = torch.randn(shape).half() - i = torchtrt.Input._from_tensor(example_tensor) + i = torchtrt.Input.from_tensor(example_tensor) self.assertTrue(self._verify_correctness(i, target)) def test_static_shape(self): @@ -206,5 +206,105 @@ def test_dynamic_shape(self): self.assertTrue(self._verify_correctness(i, target)) +class TestTRTModule(unittest.TestCase): + @staticmethod + def _get_trt_mod(): + class Test(torch.nn.Module): + def __init__(self): + super(Test, self).__init__() + self.fc1 = torch.nn.Linear(10, 5) + self.fc2 = torch.nn.Linear(5, 5) + + def forward(self, x): + out = self.fc2(self.fc1(x)) + return out + + mod = torch.jit.script(Test()) + test_mod_engine_str = torchtrt.ts.convert_method_to_trt_engine( + mod, "forward", inputs=[torchtrt.Input((2, 10))] + ) + return torchtrt.TRTModule( + engine_name="test_engine", + serialized_engine=test_mod_engine_str, + input_binding_names=["input_0"], + output_binding_names=["output_0"], + ) + + def test_detect_invalid_input_binding(self): + class Test(torch.nn.Module): + def __init__(self): + super(Test, self).__init__() + self.fc1 = torch.nn.Linear(10, 5) + self.fc2 = torch.nn.Linear(5, 5) + + def forward(self, x): + out = self.fc2(self.fc1(x)) + return out + + mod = torch.jit.script(Test()) + test_mod_engine_str = torchtrt.ts.convert_method_to_trt_engine( + mod, "forward", inputs=[torchtrt.Input((2, 10))] + ) + with self.assertRaises(RuntimeError): + torchtrt.TRTModule( + engine_name="test_engine", + serialized_engine=test_mod_engine_str, + input_binding_names=["x.1"], + output_binding_names=["output_0"], + ) + + def test_detect_invalid_output_binding(self): + class Test(torch.nn.Module): + def __init__(self): + super(Test, self).__init__() + self.fc1 = torch.nn.Linear(10, 5) + self.fc2 = torch.nn.Linear(5, 5) + + def forward(self, x): + out = self.fc2(self.fc1(x)) + return out + + mod = torch.jit.script(Test()) + test_mod_engine_str = torchtrt.ts.convert_method_to_trt_engine( + mod, "forward", inputs=[torchtrt.Input((2, 10))] + ) + with self.assertRaises(RuntimeError): + torchtrt.TRTModule( + name="test_engine", + serialized_engine=test_mod_engine_str, + input_binding_names=["input_0"], + output_binding_names=["z.1"], + ) + + def test_set_get_profile_path_prefix(self): + trt_mod = TestTRTModule._get_trt_mod() + trt_mod.engine.profile_path_prefix = "/tmp/" + self.assertTrue(trt_mod.engine.profile_path_prefix == "/tmp/") + + def test_get_layer_info(self): + """ + { + "Layers": [ + "reshape_before_%26 : Tensor = aten::matmul(%x.1, %25)", + "%26 : Tensor = aten::matmul(%x.1, %25) + [Freeze Tensor %27 : Tensor = trt::const(%10) ] + (Unnamed Layer* 4) [Shuffle] + unsqueeze_node_after_[Freeze Tensor %27 : Tensor = trt::const(%10) ] + (Unnamed Layer* 4) [Shuffle]_(Unnamed Layer* 4) [Shuffle]_output + %28 : Tensor = aten::add(%27, %26, %24)", + "%31 : Tensor = aten::matmul(%28, %30) + [Freeze Tensor %32 : Tensor = trt::const(%12) ] + (Unnamed Layer* 10) [Shuffle] + unsqueeze_node_after_[Freeze Tensor %32 : Tensor = trt::const(%12) ] + (Unnamed Layer* 10) [Shuffle]_(Unnamed Layer* 10) [Shuffle]_output + %33 : Tensor = aten::add(%32, %31, %29)", + "copied_squeeze_after_%33 : Tensor = aten::add(%32, %31, %29)" + ], + "Bindings": [ + "input_0", + "output_0" + ] + } + """ + + import json + + trt_mod = TestTRTModule._get_trt_mod() + trt_json = json.loads(trt_mod.get_layer_info()) + [self.assertTrue(k in trt_json.keys()) for k in ["Layers", 
"Bindings"]] + self.assertTrue(len(trt_json["Layers"]) == 4) + self.assertTrue(len(trt_json["Bindings"]) == 2) + + if __name__ == "__main__": unittest.main() From 418bc6fdaff0eb87660f4b802d02db470aeb4a31 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Fri, 18 Nov 2022 02:04:17 -0800 Subject: [PATCH 16/22] chore: Addressing merge conflicts Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/runtime/RTDevice.h | 4 --- core/runtime/TRTEngine.cpp | 2 +- core/runtime/execute_engine.cpp | 46 +++++++++++++++++---------------- core/runtime/runtime.cpp | 2 +- core/runtime/runtime.h | 4 +++ tests/py/api/test_classes.py | 6 ++--- tests/util/run_graph_engine.cpp | 2 +- 7 files changed, 34 insertions(+), 32 deletions(-) diff --git a/core/runtime/RTDevice.h b/core/runtime/RTDevice.h index 43374e33be..bd1484d4b0 100644 --- a/core/runtime/RTDevice.h +++ b/core/runtime/RTDevice.h @@ -24,10 +24,6 @@ struct RTDevice { friend std::ostream& operator<<(std::ostream& os, const RTDevice& device); }; -void set_cuda_device(RTDevice& cuda_device); -// Gets the current active GPU (DLA will not show up through this) -RTDevice get_current_device(); - } // namespace runtime } // namespace core } // namespace torch_tensorrt diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp index 8c34dc6f45..ae249f78a9 100644 --- a/core/runtime/TRTEngine.cpp +++ b/core/runtime/TRTEngine.cpp @@ -52,7 +52,7 @@ TRTEngine::TRTEngine( auto most_compatible_device = get_most_compatible_device(cuda_device); TORCHTRT_CHECK(most_compatible_device, "No compatible device was found for instantiating TensorRT engine"); device_info = most_compatible_device.value(); - set_cuda_device(device_info); + set_rt_device(device_info); rt = make_trt(nvinfer1::createInferRuntime(util::logging::get_logger())); diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp index c5ecf8b47a..6b1e0d9e38 100644 --- a/core/runtime/execute_engine.cpp +++ b/core/runtime/execute_engine.cpp @@ -60,7 +60,6 @@ RTDevice select_rt_device(const RTDevice& engine_device) { std::vector execute_engine(std::vector inputs, c10::intrusive_ptr compiled_engine) { LOG_DEBUG("Attempting to run engine (ID: " << compiled_engine->name << ")"); - // compiled_engine->debug = false; if (compiled_engine->profile_execution) { std::stringstream ss; @@ -84,37 +83,40 @@ std::vector execute_engine(std::vector inputs, c10::intr RTDevice curr_device = get_current_device(); LOG_DEBUG("Current Device: " << curr_device); + // Generic Target Device Prefix + std::string target_device = "cuda:"; + if (is_switch_required(curr_device, compiled_engine->device_info)) { // Scan through available CUDA devices and set the CUDA device context correctly - RTDevice device = select_cuda_device(compiled_engine->device_info); + RTDevice device = select_rt_device(compiled_engine->device_info); set_rt_device(device); - std::string target_device = "cuda:" + std::to_string(device.id); + // Target device is new device + target_device += std::to_string(device.id); for (auto& in : inputs) { in = in.to(torch::Device(target_device)); } + } else { + // Target device is current device + target_device += std::to_string(curr_device.id); } - } - else { - // Target device is current device - target_device += std::to_string(curr_device.id); - } - - // For each input, ensure its current device is the desired target device - for (size_t i = 0; i < inputs.size(); i++) { - at::Tensor* in = &inputs[i]; - std::string current_tensor_device = in->device().str(); - // If current device string does 
not match target device, display warning and move tensor accordingly - if (current_tensor_device != target_device) { - LOG_WARNING( - "Input " << i << " of engine " << compiled_engine->name << " was found to be on " << current_tensor_device - << " but should be on " << target_device << ". This tensor is being moved by the runtime but " - << "for performance considerations, ensure your inputs are all on GPU " - << "and open an issue here (https://github.com/pytorch/TensorRT/issues) if this " - << "warning persists."); - *in = in->to(torch::Device(target_device)); + // For each input, ensure its current device is the desired target device + for (size_t i = 0; i < inputs.size(); i++) { + at::Tensor* in = &inputs[i]; + std::string current_tensor_device = in->device().str(); + + // If current device string does not match target device, display warning and move tensor accordingly + if (current_tensor_device != target_device) { + LOG_WARNING( + "Input " << i << " of engine " << compiled_engine->name << " was found to be on " << current_tensor_device + << " but should be on " << target_device << ". This tensor is being moved by the runtime but " + << "for performance considerations, ensure your inputs are all on GPU " + << "and open an issue here (https://github.com/pytorch/TensorRT/issues) if this " + << "warning persists."); + *in = in->to(torch::Device(target_device)); + } } } diff --git a/core/runtime/runtime.cpp b/core/runtime/runtime.cpp index 67d0786d4a..0c054d8a3c 100644 --- a/core/runtime/runtime.cpp +++ b/core/runtime/runtime.cpp @@ -69,7 +69,7 @@ std::vector find_compatible_devices(const RTDevice& target_device) { return compatible_devices; } -void set_cuda_device(RTDevice& cuda_device) { +void set_rt_device(RTDevice& cuda_device) { TORCHTRT_CHECK( (cudaSetDevice(cuda_device.id) == cudaSuccess), "Unable to set device: " << cuda_device << "as active device"); LOG_DEBUG("Setting " << cuda_device << " as active device"); diff --git a/core/runtime/runtime.h b/core/runtime/runtime.h index b360d451e9..4c7565c9fc 100644 --- a/core/runtime/runtime.h +++ b/core/runtime/runtime.h @@ -49,6 +49,10 @@ class DeviceList { DeviceList get_available_device_list(); const std::unordered_map& get_dla_supported_SMs(); +void set_rt_device(RTDevice& cuda_device); +// Gets the current active GPU (DLA will not show up through this) +RTDevice get_current_device(); + } // namespace runtime } // namespace core } // namespace torch_tensorrt diff --git a/tests/py/api/test_classes.py b/tests/py/api/test_classes.py index 9e35172bcd..6f1a4affe3 100644 --- a/tests/py/api/test_classes.py +++ b/tests/py/api/test_classes.py @@ -224,7 +224,7 @@ def forward(self, x): mod, "forward", inputs=[torchtrt.Input((2, 10))] ) return torchtrt.TRTModule( - engine_name="test_engine", + name="test", serialized_engine=test_mod_engine_str, input_binding_names=["input_0"], output_binding_names=["output_0"], @@ -247,7 +247,7 @@ def forward(self, x): ) with self.assertRaises(RuntimeError): torchtrt.TRTModule( - engine_name="test_engine", + name="test", serialized_engine=test_mod_engine_str, input_binding_names=["x.1"], output_binding_names=["output_0"], @@ -270,7 +270,7 @@ def forward(self, x): ) with self.assertRaises(RuntimeError): torchtrt.TRTModule( - name="test_engine", + name="test", serialized_engine=test_mod_engine_str, input_binding_names=["input_0"], output_binding_names=["z.1"], diff --git a/tests/util/run_graph_engine.cpp b/tests/util/run_graph_engine.cpp index 2ee966346c..ce8435a34c 100644 --- a/tests/util/run_graph_engine.cpp +++ 
b/tests/util/run_graph_engine.cpp @@ -56,7 +56,7 @@ std::vector toInputsDynamic(std::vector ten, bool d std::vector RunEngine(std::string& eng, std::vector inputs) { LOG_DEBUG("Running TRT version"); - auto cuda_device = core::runtime::CUDADevice(0, nvinfer1::DeviceType::kGPU); + auto cuda_device = core::runtime::RTDevice(0, nvinfer1::DeviceType::kGPU); auto engine_ptr = c10::make_intrusive( "test_engine", eng, cuda_device, std::vector(), std::vector()); auto outputs = torch_tensorrt::core::runtime::execute_engine(inputs, engine_ptr); From 131759aa203521b379cdf936758f79c49c249b9c Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Fri, 18 Nov 2022 02:06:21 -0800 Subject: [PATCH 17/22] chore: lint Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/conversion/converters/impl/expand.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/conversion/converters/impl/expand.cpp b/core/conversion/converters/impl/expand.cpp index e6a6c0130c..b88ebc89fa 100644 --- a/core/conversion/converters/impl/expand.cpp +++ b/core/conversion/converters/impl/expand.cpp @@ -376,7 +376,7 @@ auto expand_registrations TORCHTRT_UNUSED = std::vector collapse_shape_vec; for (int64_t k = 0; k < repeat_shape_dims.nbDims; k++) { if (k == dim) { - int64_t collapse_dim = repeat_shape_dims.d[k] * repeat_shape_dims.d[k+1]; + int64_t collapse_dim = repeat_shape_dims.d[k] * repeat_shape_dims.d[k + 1]; // Set dim size to -1 if repeat is being done on dynamic dim collapse_dim = std::max(collapse_dim, (int64_t)-1); collapse_shape_vec.push_back(collapse_dim); From 4c544a318713ba3d9208118e3131594363df5fe4 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Fri, 18 Nov 2022 02:12:37 -0800 Subject: [PATCH 18/22] chore: remove print statements Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- py/torch_tensorrt/_TRTModule.py | 1 - 1 file changed, 1 deletion(-) diff --git a/py/torch_tensorrt/_TRTModule.py b/py/torch_tensorrt/_TRTModule.py index 6afb20707a..cabc328fdb 100644 --- a/py/torch_tensorrt/_TRTModule.py +++ b/py/torch_tensorrt/_TRTModule.py @@ -92,7 +92,6 @@ def set_extra_state(self, state): self.name = state[0] if state[1] is not None: serialized_engine_info = state[1][0] - print(serialized_engine_info) import base64 serialized_engine = base64.b64decode(serialized_engine_info[3]) From bdc48d445a52e4d9815110fa286bde76a7c1de4b Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Fri, 18 Nov 2022 13:26:06 -0800 Subject: [PATCH 19/22] fix: Fix cmake build Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- CMakeLists.txt | 4 ++-- core/runtime/CMakeLists.txt | 9 +++++---- docsrc/contributors/partitioning.rst | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ed15f1c79..0b103b8d86 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,8 +2,8 @@ cmake_minimum_required(VERSION 3.17) project(Torch-TensorRT LANGUAGES CXX) -# use c++17 -set(CMAKE_CXX_STANDARD 17) +# use c++14 like PyTorch +set(CMAKE_CXX_STANDARD 14) # Build the libraries with -fPIC set(CMAKE_POSITION_INDEPENDENT_CODE ON) diff --git a/core/runtime/CMakeLists.txt b/core/runtime/CMakeLists.txt index 1c1e136c78..d21c661af1 100644 --- a/core/runtime/CMakeLists.txt +++ b/core/runtime/CMakeLists.txt @@ -2,20 +2,20 @@ set(lib_name "core_runtime") add_library(${lib_name} OBJECT) set(CXX_SRCS - "${CMAKE_CURRENT_SOURCE_DIR}/RTDevice.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/DeviceList.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/execute_engine.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/RTDevice.cpp" 
"${CMAKE_CURRENT_SOURCE_DIR}/TRTEngine.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/TRTEngineProfiler.h" + "${CMAKE_CURRENT_SOURCE_DIR}/TRTEngineProfiler.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/execute_engine.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/register_jit_hooks.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/runtime.cpp" ) set(HEADER_FILES - "${CMAKE_CURRENT_SOURCE_DIR}/runtime.h" "${CMAKE_CURRENT_SOURCE_DIR}/RTDevice.h" "${CMAKE_CURRENT_SOURCE_DIR}/TRTEngine.h" "${CMAKE_CURRENT_SOURCE_DIR}/TRTEngineProfiler.h" + "${CMAKE_CURRENT_SOURCE_DIR}/runtime.h" ) target_sources(${lib_name} @@ -33,6 +33,7 @@ target_link_libraries(${lib_name} TensorRT::nvinfer torch core_util + stdc++fs ) # Install diff --git a/docsrc/contributors/partitioning.rst b/docsrc/contributors/partitioning.rst index 7ac5a5231f..8c83ddcadc 100644 --- a/docsrc/contributors/partitioning.rst +++ b/docsrc/contributors/partitioning.rst @@ -238,4 +238,4 @@ In this example we will collect the arithmetic ops in a TensorRT segment and the In some cases this approach may create adjacent segments in the partition which have the same target. As a clean-up step we can consolidate these adjacent segments to further reduce the number of segments in the final partition. The merge segments step identifies a list of segments that are adjacent in the graph, have the same target, and are not marked as `do_not_merge`. The nodes from these segments will be combined into a single new segment that will replace the merged segments in the partition. -The `do_not_merge` marking is used to prevent merging of segments created for conditional nodes and loops that are handled as special cases in graph stitching and should not be merged with adjacent segments of the same type. +The `do_not_merge` marking is used to prevent merging of segments created for conditional nodes and loops that are handled as special cases in graph stitching and should not be merged with adjacent segments of the same type. 
From 58681f9131da66ce497e877756523efdcb02ea66 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Mon, 21 Nov 2022 11:17:03 -0800 Subject: [PATCH 20/22] refactor: Add a suffix to the TRTModuleNext class while it's experimental Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/runtime/TRTEngine.cpp | 86 +++++++++---------- core/runtime/execute_engine.cpp | 4 +- py/setup.py | 2 +- .../{_TRTModule.py => _TRTModuleNext.py} | 25 ++++-- py/torch_tensorrt/__init__.py | 2 +- py/torch_tensorrt/fx/lower.py | 7 +- .../fx/test/core/test_trt_module.py | 15 +++- py/torch_tensorrt/fx/tools/trt_minimizer.py | 2 +- py/torch_tensorrt/fx/tools/trt_splitter.py | 2 +- tests/py/api/test_classes.py | 12 +-- 10 files changed, 89 insertions(+), 68 deletions(-) rename py/torch_tensorrt/{_TRTModule.py => _TRTModuleNext.py} (88%) diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp index ae249f78a9..d524a44ba4 100644 --- a/core/runtime/TRTEngine.cpp +++ b/core/runtime/TRTEngine.cpp @@ -145,37 +145,6 @@ TRTEngine::TRTEngine( LOG_DEBUG(*this); } -void TRTEngine::verify_serialization_fmt(const std::vector& serialized_info) { - TORCHTRT_CHECK( - serialized_info.size() == SERIALIZATION_LEN, - "Program to be deserialized targets an incompatible Torch-TensorRT ABI"); - TORCHTRT_CHECK( - serialized_info[ABI_TARGET_IDX] == ABI_VERSION, - "Program to be deserialized targets a different Torch-TensorRT ABI Version (" - << serialized_info[ABI_TARGET_IDX] << ") than the Torch-TensorRT Runtime ABI Version (" << ABI_VERSION - << ")"); -} - -void TRTEngine::set_profiling_paths() { - device_profile_path = - std::experimental::filesystem::path{profile_path_prefix + "/" + name + "_device_config_profile.trace"}.string(); - input_profile_path = - std::experimental::filesystem::path{profile_path_prefix + "/" + name + "_input_profile.trace"}.string(); - output_profile_path = - std::experimental::filesystem::path{profile_path_prefix + "/" + name + "_output_profile.trace"}.string(); - enqueue_profile_path = - std::experimental::filesystem::path{profile_path_prefix + "/" + name + "_enqueue_profile.trace"}.string(); - trt_engine_profile_path = - std::experimental::filesystem::path{profile_path_prefix + "/" + name + "_engine_exectuion_profile.trace"} - .string(); -} - -void TRTEngine::enable_profiling() { - profile_execution = true; - trt_engine_profiler = std::make_unique(name); - exec_ctx->setProfiler(trt_engine_profiler.get()); -} - void TRTEngine::disable_profiling() { torch::cuda::synchronize(device_info.id); profile_execution = false; @@ -183,11 +152,6 @@ void TRTEngine::disable_profiling() { TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to recreate TensorRT execution context"); } -std::string TRTEngine::get_engine_layer_info() { - auto inspector = cuda_engine->createEngineInspector(); - return inspector->getEngineInformation(nvinfer1::LayerInformationFormat::kJSON); -} - void TRTEngine::dump_engine_layer_info_to_file(const std::string& path) { auto inspector = cuda_engine->createEngineInspector(); std::ofstream f(path); @@ -203,13 +167,29 @@ void TRTEngine::dump_engine_layer_info() { return; } -TRTEngine& TRTEngine::operator=(const TRTEngine& other) { - rt = other.rt; - cuda_engine = other.cuda_engine; - device_info = other.device_info; - exec_ctx = other.exec_ctx; - num_io = other.num_io; - return (*this); +void TRTEngine::enable_profiling() { + profile_execution = true; + trt_engine_profiler = std::make_unique(name); + exec_ctx->setProfiler(trt_engine_profiler.get()); +} + +std::string 
TRTEngine::get_engine_layer_info() { + auto inspector = cuda_engine->createEngineInspector(); + return inspector->getEngineInformation(nvinfer1::LayerInformationFormat::kJSON); +} + +void TRTEngine::set_profiling_paths() { + device_profile_path = + std::experimental::filesystem::path{profile_path_prefix + "/" + name + "_device_config_profile.trace"}.string(); + input_profile_path = + std::experimental::filesystem::path{profile_path_prefix + "/" + name + "_input_profile.trace"}.string(); + output_profile_path = + std::experimental::filesystem::path{profile_path_prefix + "/" + name + "_output_profile.trace"}.string(); + enqueue_profile_path = + std::experimental::filesystem::path{profile_path_prefix + "/" + name + "_enqueue_profile.trace"}.string(); + trt_engine_profile_path = + std::experimental::filesystem::path{profile_path_prefix + "/" + name + "_engine_exectuion_profile.trace"} + .string(); } std::string TRTEngine::to_str() const { @@ -246,6 +226,26 @@ std::ostream& operator<<(std::ostream& os, const TRTEngine& engine) { return os; } +TRTEngine& TRTEngine::operator=(const TRTEngine& other) { + rt = other.rt; + cuda_engine = other.cuda_engine; + device_info = other.device_info; + exec_ctx = other.exec_ctx; + num_io = other.num_io; + return (*this); +} + +void TRTEngine::verify_serialization_fmt(const std::vector& serialized_info) { + TORCHTRT_CHECK( + serialized_info.size() == SERIALIZATION_LEN, + "Program to be deserialized targets an incompatible Torch-TensorRT ABI"); + TORCHTRT_CHECK( + serialized_info[ABI_TARGET_IDX] == ABI_VERSION, + "Program to be deserialized targets a different Torch-TensorRT ABI Version (" + << serialized_info[ABI_TARGET_IDX] << ") than the Torch-TensorRT Runtime ABI Version (" << ABI_VERSION + << ")"); +} + } // namespace runtime } // namespace core } // namespace torch_tensorrt diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp index 6b1e0d9e38..9295b88b4d 100644 --- a/core/runtime/execute_engine.cpp +++ b/core/runtime/execute_engine.cpp @@ -70,7 +70,8 @@ std::vector execute_engine(std::vector inputs, c10::intr ss << " Output packing profile: " << compiled_engine->output_profile_path << std::endl; ss << " TRT enqueue profile: " << compiled_engine->enqueue_profile_path << std::endl; ss << " Engine execution profile: " << compiled_engine->trt_engine_profile_path << std::endl; - LOG_INFO(ss.str()); + auto log_info = ss.str(); + LOG_INFO("" << log_info); } { @@ -191,6 +192,7 @@ std::vector execute_engine(std::vector inputs, c10::intr if (compiled_engine->profile_execution) { LOG_INFO(std::endl << *trt_engine_profiler); dump_trace(compiled_engine->trt_engine_profile_path, *trt_engine_profiler); + compiled_engine->dump_engine_layer_info(); } } diff --git a/py/setup.py b/py/setup.py index 0f6868d9fa..2b031f06e5 100644 --- a/py/setup.py +++ b/py/setup.py @@ -405,7 +405,7 @@ def run(self): "Topic :: Software Development", "Topic :: Software Development :: Libraries", ], - python_requires=">=3.6", + python_requires=">=3.7", include_package_data=True, package_data={ "torch_tensorrt": [ diff --git a/py/torch_tensorrt/_TRTModule.py b/py/torch_tensorrt/_TRTModuleNext.py similarity index 88% rename from py/torch_tensorrt/_TRTModule.py rename to py/torch_tensorrt/_TRTModuleNext.py index cabc328fdb..6edc530a4f 100644 --- a/py/torch_tensorrt/_TRTModule.py +++ b/py/torch_tensorrt/_TRTModuleNext.py @@ -1,3 +1,4 @@ +import logging from operator import truediv from typing import Any, List, Sequence, Tuple @@ -5,9 +6,11 @@ from torch_tensorrt import _C 
from torch_tensorrt._Device import Device +logger = logging.getLogger(__name__) -class TRTModule(torch.nn.Module): - """TRTModule is a PyTorch module which encompasses an arbitrary TensorRT Engine. + +class TRTModuleNext(torch.nn.Module): + """TRTModuleNext is a PyTorch module which encompasses an arbitrary TensorRT Engine. This module is backed by the Torch-TensorRT runtime and is fully compatibile with both FX / Python deployments (just ``import torch_tensorrt`` as part of the application) as @@ -17,6 +20,8 @@ class TRTModule(torch.nn.Module): The forward function is simpily forward(*args: torch.Tensor) -> Tuple[torch.Tensor] where the internal implementation is ``return Tuple(torch.ops.tensorrt.execute_engine(list(inputs), self.engine))`` + > Note: TRTModuleNext only supports engines built with explict batch + Attributes: name (str): Name of module (for easier debugging) engine (torch.classess.tensorrt.Engine): Torch-TensorRT TensorRT Engine instance, manages [de]serialization, device configuration, profiling @@ -32,7 +37,7 @@ def __init__( output_binding_names: List[str] = [], target_device: Device = Device._current_device(), ): - """__init__ method for torch_tensorrt.TRTModule + """__init__ method for torch_tensorrt.TRTModuleNext Takes a name, target device, serialized TensorRT engine, and binding names / order and constructs a PyTorch ``torch.nn.Module`` around it. @@ -46,7 +51,7 @@ def __init__( Example: - ..code-block::python + ..code-block:: python with io.BytesIO() as engine_bytes: engine_bytes.write(trt_engine.serialize()) @@ -60,8 +65,10 @@ def __init__( ) """ - - super(TRTModule, self).__init__() + logger.warning( + "TRTModuleNext should be considered experimental stability, APIs are subject to change. Note: TRTModuleNext only supports engines built with explict batch" + ) + super(TRTModuleNext, self).__init__() self.input_binding_names = input_binding_names self.output_binding_names = output_binding_names self.name = name @@ -73,8 +80,8 @@ def __init__( self.name + "_engine" if self.name != "" else "tensorrt_engine", target_device._to_serialized_rt_device(), serialized_engine, - TRTModule._pack_binding_names(self.input_binding_names), - TRTModule._pack_binding_names(self.output_binding_names), + TRTModuleNext._pack_binding_names(self.input_binding_names), + TRTModuleNext._pack_binding_names(self.output_binding_names), ] ) else: @@ -138,7 +145,7 @@ def is_non_tensor(i: Tuple[Any, bool]) -> bool: non_tensors = [i[0] for i in filter(zip(inputs, types), is_non_tensor)] raise RuntimeError( - f"TRTModule expects a flattened list of tensors as input, found non tensors: {non_tensors}" + f"TRTModuleNext expects a flattened list of tensors as input, found non tensors: {non_tensors}" ) outputs = torch.ops.tensorrt.execute_engine(list(inputs), self.engine) diff --git a/py/torch_tensorrt/__init__.py b/py/torch_tensorrt/__init__.py index 5d2407adf0..3261265215 100644 --- a/py/torch_tensorrt/__init__.py +++ b/py/torch_tensorrt/__init__.py @@ -90,7 +90,7 @@ def _find_lib(name, paths): from torch_tensorrt import logging from torch_tensorrt._Input import Input from torch_tensorrt._Device import Device -from torch_tensorrt._TRTModule import TRTModule +from torch_tensorrt._TRTModuleNext import TRTModuleNext from torch_tensorrt import fx diff --git a/py/torch_tensorrt/fx/lower.py b/py/torch_tensorrt/fx/lower.py index cae6b24af0..89068cb589 100644 --- a/py/torch_tensorrt/fx/lower.py +++ b/py/torch_tensorrt/fx/lower.py @@ -63,6 +63,11 @@ def compile( Returns: A torch.nn.Module lowered by TensorRT. 
""" + if use_experimental_fx_rt and not explicit_batch_dimension: + raise ValueError( + "The experimental unifed runtime only supports explicit batch. Please make sure to set explicit_batch_dimension=True when use_experimental_fx_rt=True" + ) + lower_setting = LowerSetting( max_batch_size=max_batch_size, max_workspace_size=max_workspace_size, @@ -172,7 +177,7 @@ def lower_pass( interp_res: TRTInterpreterResult = interpreter(mod, input, module_name) if lower_setting.use_experimental_rt: import io - from torch_tensorrt._TRTModule import TRTModule as TRTModuleNext + from torch_tensorrt._TRTModuleNext import TRTModuleNext from torch_tensorrt._Device import Device with io.BytesIO() as engine_bytes: diff --git a/py/torch_tensorrt/fx/test/core/test_trt_module.py b/py/torch_tensorrt/fx/test/core/test_trt_module.py index 30480f9b33..b4fdbd4cbc 100644 --- a/py/torch_tensorrt/fx/test/core/test_trt_module.py +++ b/py/torch_tensorrt/fx/test/core/test_trt_module.py @@ -9,7 +9,7 @@ import torch_tensorrt.fx.tracer.acc_tracer.acc_tracer as acc_tracer from torch.testing._internal.common_utils import run_tests, TestCase from torch_tensorrt.fx import InputTensorSpec, TRTInterpreter, TRTModule -from torch_tensorrt import TRTModule as TRTModuleNext +from torch_tensorrt import TRTModuleNext from torch_tensorrt import Device from torch_tensorrt.fx.utils import LowerPrecision @@ -69,7 +69,11 @@ def forward(self, x): mod = acc_tracer.trace(mod, inputs) - interp = TRTInterpreter(mod, input_specs=InputTensorSpec.from_tensors(inputs)) + interp = TRTInterpreter( + mod, + input_specs=InputTensorSpec.from_tensors(inputs), + explicit_batch_dimension=True, + ) interp_res = interp.run(lower_precision=LowerPrecision.FP32) with io.BytesIO() as engine_bytes: @@ -105,7 +109,11 @@ def forward(self, x): ref_output = mod(*inputs) mod = acc_tracer.trace(mod, inputs) - interp = TRTInterpreter(mod, input_specs=InputTensorSpec.from_tensors(inputs)) + interp = TRTInterpreter( + mod, + input_specs=InputTensorSpec.from_tensors(inputs), + explicit_batch_dimension=True, + ) interp_res = interp.run(lower_precision=LowerPrecision.FP32) with io.BytesIO() as engine_bytes: @@ -121,7 +129,6 @@ def forward(self, x): ) st = trt_mod.state_dict() - print(st) new_trt_mod = TRTModuleNext() new_trt_mod.load_state_dict(st) diff --git a/py/torch_tensorrt/fx/tools/trt_minimizer.py b/py/torch_tensorrt/fx/tools/trt_minimizer.py index 792fe48f74..58da3db0e6 100644 --- a/py/torch_tensorrt/fx/tools/trt_minimizer.py +++ b/py/torch_tensorrt/fx/tools/trt_minimizer.py @@ -22,7 +22,7 @@ def lower_mod_default( interpreter_result = interp.run(max_batch_size=batch_size) if use_experimental_rt: import io - from torch_tensorrt._TRTModule import TRTModule as TRTModuleNext + from torch_tensorrt._TRTModuleNext import TRTModuleNext from torch_tensorrt._Device import Device with io.BytesIO() as engine_bytes: diff --git a/py/torch_tensorrt/fx/tools/trt_splitter.py b/py/torch_tensorrt/fx/tools/trt_splitter.py index 8496f8378c..bf331eeaca 100644 --- a/py/torch_tensorrt/fx/tools/trt_splitter.py +++ b/py/torch_tensorrt/fx/tools/trt_splitter.py @@ -87,7 +87,7 @@ def _lower_model_to_backend( interpreter_result = interp.run(*inputs) if self.settings.use_experimental_rt: import io - from torch_tensorrt._TRTModule import TRTModule as TRTModuleNext + from torch_tensorrt._TRTModuleNext import TRTModuleNext from torch_tensorrt._Device import Device with io.BytesIO() as engine_bytes: diff --git a/tests/py/api/test_classes.py b/tests/py/api/test_classes.py index 6f1a4affe3..861efd84a7 
100644 --- a/tests/py/api/test_classes.py +++ b/tests/py/api/test_classes.py @@ -206,7 +206,7 @@ def test_dynamic_shape(self): self.assertTrue(self._verify_correctness(i, target)) -class TestTRTModule(unittest.TestCase): +class TestTRTModuleNext(unittest.TestCase): @staticmethod def _get_trt_mod(): class Test(torch.nn.Module): @@ -223,7 +223,7 @@ def forward(self, x): test_mod_engine_str = torchtrt.ts.convert_method_to_trt_engine( mod, "forward", inputs=[torchtrt.Input((2, 10))] ) - return torchtrt.TRTModule( + return torchtrt.TRTModuleNext( name="test", serialized_engine=test_mod_engine_str, input_binding_names=["input_0"], @@ -246,7 +246,7 @@ def forward(self, x): mod, "forward", inputs=[torchtrt.Input((2, 10))] ) with self.assertRaises(RuntimeError): - torchtrt.TRTModule( + torchtrt.TRTModuleNext( name="test", serialized_engine=test_mod_engine_str, input_binding_names=["x.1"], @@ -269,7 +269,7 @@ def forward(self, x): mod, "forward", inputs=[torchtrt.Input((2, 10))] ) with self.assertRaises(RuntimeError): - torchtrt.TRTModule( + torchtrt.TRTModuleNext( name="test", serialized_engine=test_mod_engine_str, input_binding_names=["input_0"], @@ -277,7 +277,7 @@ def forward(self, x): ) def test_set_get_profile_path_prefix(self): - trt_mod = TestTRTModule._get_trt_mod() + trt_mod = TestTRTModuleNext._get_trt_mod() trt_mod.engine.profile_path_prefix = "/tmp/" self.assertTrue(trt_mod.engine.profile_path_prefix == "/tmp/") @@ -299,7 +299,7 @@ def test_get_layer_info(self): import json - trt_mod = TestTRTModule._get_trt_mod() + trt_mod = TestTRTModuleNext._get_trt_mod() trt_json = json.loads(trt_mod.get_layer_info()) [self.assertTrue(k in trt_json.keys()) for k in ["Layers", "Bindings"]] self.assertTrue(len(trt_json["Layers"]) == 4) From 71082d37a2800c9adbc50e4bc68960f273edbce5 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Mon, 21 Nov 2022 12:44:58 -0800 Subject: [PATCH 21/22] docs: Update docs and examples Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- docsrc/py_api/torch_tensorrt.rst | 4 + examples/fx/fx2trt_example_next.py | 196 +++++++++++++++++++ examples/fx/torch_trt_simple_example_next.py | 89 +++++++++ py/torch_tensorrt/_TRTModuleNext.py | 2 + py/torch_tensorrt/fx/tools/trt_minimizer.py | 9 +- py/torch_tensorrt/fx/tools/trt_splitter.py | 5 + 6 files changed, 303 insertions(+), 2 deletions(-) create mode 100644 examples/fx/fx2trt_example_next.py create mode 100644 examples/fx/torch_trt_simple_example_next.py diff --git a/docsrc/py_api/torch_tensorrt.rst b/docsrc/py_api/torch_tensorrt.rst index 2ccf7c60e2..98ccde8193 100644 --- a/docsrc/py_api/torch_tensorrt.rst +++ b/docsrc/py_api/torch_tensorrt.rst @@ -37,6 +37,10 @@ Classes :members: :special-members: __init__ +.. 
autoclass:: TRTModuleNext
+   :members:
+   :special-members: __init__
+
 Enums
 -------
diff --git a/examples/fx/fx2trt_example_next.py b/examples/fx/fx2trt_example_next.py
new file mode 100644
index 0000000000..f7b5ef1404
--- /dev/null
+++ b/examples/fx/fx2trt_example_next.py
@@ -0,0 +1,196 @@
+# type: ignore[]
+
+import io
+import torch
+import torch.fx
+import torch.nn as nn
+from torch_tensorrt.fx.utils import LowerPrecision
+import torch_tensorrt.fx.tracer.acc_tracer.acc_tracer as acc_tracer
+from torch_tensorrt.fx import InputTensorSpec, TRTInterpreter
+from torch_tensorrt.fx.tools.trt_splitter import TRTSplitter, TRTSplitterSetting
+from torch_tensorrt import TRTModuleNext as TRTModule, Device
+
+# The purpose of this example is to demonstrate the overall flow of lowering a PyTorch
+# model to TensorRT via FX with existing FX based tooling. The general lowering flow
+# would be like:
+#
+# 1. Use splitter to split the model if there are ops in the model that we don't want to
+# lower to TensorRT, for example because the ops are not supported in TensorRT or
+# because running them on other backends provides better performance.
+# 2. Lower the model (or part of the model if splitter is used) to TensorRT via fx2trt.
+#
+# If we know the model is fully supported by fx2trt then we can skip the splitter.
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.linear = nn.Linear(10, 10)
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        x = self.linear(x)
+        x = self.relu(x)
+        x = torch.linalg.norm(x, ord=2, dim=1)
+        x = self.relu(x)
+        return x
+
+
+inputs = [torch.randn((1, 10), device=torch.device("cuda"))]
+model = Model().cuda().eval()
+
+# acc_tracer is a custom fx tracer that maps nodes whose targets are PyTorch operators
+# to acc ops.
+traced = acc_tracer.trace(model, inputs)
+
+# Splitter will split the model into several submodules. The names of the submodules
+# will be either `run_on_acc_{}` or `run_on_gpu_{}`. Submodules named `run_on_acc_{}`
+# can be fully lowered to TensorRT via fx2trt while submodules named `run_on_gpu_{}`
+# have unsupported ops and can't be lowered by fx2trt. We can still run `run_on_gpu_{}`
+# submodules on GPU if the ops there have a CUDA implementation; the naming is a bit
+# confusing and we'll improve it.
+settings = TRTSplitterSetting()
+
+# We want to use the next gen runtime which lets us swap between nn.Module and TorchScript at will
+settings.use_experimental_rt = True
+# The next gen runtime only supports explicit batch
+settings.use_implicit_batch_dim = False
+splitter = TRTSplitter(traced, inputs, settings=settings)
+
+# Preview functionality allows us to see which ops are supported and which are not.
+# We can optionally dump the dot graph, which will color supported ops and unsupported
+# ops differently.
+splitter.node_support_preview(dump_graph=False)
+"""
+Supported node types in the model:
+acc_ops.linear: ((), {'input': torch.float32, 'weight': torch.float32, 'bias': torch.float32})
+acc_ops.relu: ((), {'input': torch.float32})
+
+Unsupported node types in the model:
+acc_ops.linalg_norm: ((), {'input': torch.float32})
+"""
+
+# Split.
+split_mod = splitter()
+
+# After the split we have three submodules: _run_on_acc_0, _run_on_gpu_1 and _run_on_acc_2.
+print(split_mod.graph)
+"""
+graph():
+    %x : [#users=1] = placeholder[target=x]
+    %_run_on_acc_0 : [#users=1] = call_module[target=_run_on_acc_0](args = (%x,), kwargs = {})
+    %_run_on_gpu_1 : [#users=1] = call_module[target=_run_on_gpu_1](args = (%_run_on_acc_0,), kwargs = {})
+    %_run_on_acc_2 : [#users=1] = call_module[target=_run_on_acc_2](args = (%_run_on_gpu_1,), kwargs = {})
+    return _run_on_acc_2
+"""
+
+# Take a look at what is inside each submodule. _run_on_acc_0 contains linear and relu while
+# _run_on_gpu_1 contains linalg_norm which currently is not supported by fx2trt. _run_on_acc_2
+# is the other supported submodule.
+print(split_mod._run_on_acc_0.graph)
+print(split_mod._run_on_gpu_1.graph)
+print(split_mod._run_on_acc_2.graph)
+"""
+graph():
+    %x : [#users=1] = placeholder[target=x]
+    %linear_weight : [#users=1] = get_attr[target=linear.weight]
+    %linear_bias : [#users=1] = get_attr[target=linear.bias]
+    %linear_1 : [#users=1] = call_function[target=torch_tensorrt.fx.tracer.acc_tracer.acc_ops.linear](args = (), ...
+    %relu_1 : [#users=1] = call_function[target=torch_tensorrt.fx.tracer.acc_tracer.acc_ops.relu](args = (), ...
+    return relu_1
+graph():
+    %relu_1 : [#users=1] = placeholder[target=relu_1]
+    %linalg_norm_1 : [#users=1] = call_function[target=torch_tensorrt.fx.tracer.acc_tracer.acc_ops.linalg_norm](args = (), ...
+    return linalg_norm_1
+graph():
+    %linalg_norm_1 : [#users=1] = placeholder[target=linalg_norm_1]
+    %relu_3 : [#users=1] = call_function[target=torch_tensorrt.fx.tracer.acc_tracer.acc_ops.relu](args = (), kwargs = {input: %linalg_norm_1, inplace: False})
+    return relu_3
+"""
+
+
+def get_submod_inputs(mod, submod, inputs):
+    acc_inputs = None
+
+    def get_input(self, inputs):
+        nonlocal acc_inputs
+        acc_inputs = inputs
+
+    handle = submod.register_forward_pre_hook(get_input)
+    mod(*inputs)
+    handle.remove()
+    return acc_inputs
+
+
+# Since the model is split into three segments, we need to lower each TRT-eligible segment.
+# If we know the model can be fully lowered, we can skip the splitter part.
+for name, _ in split_mod.named_children(): + if "_run_on_acc" in name: + submod = getattr(split_mod, name) + # Get submodule inputs for fx2trt + acc_inputs = get_submod_inputs(split_mod, submod, inputs) + + # fx2trt replacement + interp = TRTInterpreter( + submod, + InputTensorSpec.from_tensors(acc_inputs), + explicit_batch_dimension=True, + ) + r = interp.run(lower_precision=LowerPrecision.FP32) + with io.BytesIO() as engine_bytes: + engine_bytes.write(r.engine.serialize()) + engine_str = engine_bytes.getvalue() + + trt_mod = TRTModule( + name="my_module", + serialized_engine=engine_str, + input_binding_names=r.input_names, + output_binding_names=r.output_names, + target_device=Device(f"cuda:{torch.cuda.current_device()}"), + ) + + import pickle as pkl + + with open("/tmp/trt_mod.pkl", "wb") as f: + pkl.dump(trt_mod, f) + + torch.save(trt_mod, "/tmp/trt_mod.pt") + + setattr(split_mod, name, trt_mod) + + +lowered_model_output = split_mod(*inputs) + +print("Saving the partially compiled module state_dict") +# Save and load model +torch.save(split_mod, "trt.pt") +print("Loading the partially compiled module state_dict") +reload_trt_mod = torch.load("trt.pt") +reload_model_output = reload_trt_mod(*inputs) + +# Make sure the results match +regular_model_output = model(*inputs) +torch.testing.assert_close( + reload_model_output, regular_model_output, atol=3e-3, rtol=1e-2 +) + +print("torch.jit.trace FX split and lowered module") +# Convert model to torchscript +ts_trt_mod = torch.jit.trace(reload_trt_mod, inputs) +ts_model_output = ts_trt_mod(*inputs) + +# Make sure the results match +regular_model_output = model(*inputs) +torch.testing.assert_close(ts_model_output, regular_model_output, atol=3e-3, rtol=1e-2) + +print("Saving torchscript version of module") +ts_trt_mod.save("trt_ts.ts") +print("Loading torchscript version of module") +reload_ts_trt_mod = torch.jit.load("trt_ts.ts") +reload_ts_model_output = reload_ts_trt_mod(*inputs) + +# Make sure the results match +regular_model_output = model(*inputs) +torch.testing.assert_close( + reload_ts_model_output, regular_model_output, atol=3e-3, rtol=1e-2 +) diff --git a/examples/fx/torch_trt_simple_example_next.py b/examples/fx/torch_trt_simple_example_next.py new file mode 100644 index 0000000000..391643ee0b --- /dev/null +++ b/examples/fx/torch_trt_simple_example_next.py @@ -0,0 +1,89 @@ +import torch +import copy +import torchvision +import torch_tensorrt +from torch_tensorrt.fx import InputTensorSpec + + +def test_torch_tensorrt(model, inputs): + # torchscript path + model_ts = copy.deepcopy(model) + inputs_ts = copy.deepcopy(inputs) + # fp32 test + with torch.inference_mode(): + ref_fp32 = model_ts(*inputs_ts) + trt_ts_module = torch_tensorrt.compile( + model_ts, inputs=inputs_ts, enabled_precisions={torch.float32} + ) + result_fp32 = trt_ts_module(*inputs_ts) + assert ( + torch.nn.functional.cosine_similarity( + ref_fp32.flatten(), result_fp32.flatten(), dim=0 + ) + > 0.9999 + ) + # fp16 test + model_ts = model_ts.half() + inputs_ts = [i.cuda().half() for i in inputs_ts] + with torch.inference_mode(): + ref_fp16 = model_ts(*inputs_ts) + trt_ts_module = torch_tensorrt.compile( + model_ts, inputs=inputs_ts, enabled_precisions={torch.float16} + ) + result_fp16 = trt_ts_module(*inputs_ts) + assert ( + torch.nn.functional.cosine_similarity( + ref_fp16.flatten(), result_fp16.flatten(), dim=0 + ) + > 0.99 + ) + + # FX path + model_fx = copy.deepcopy(model) + inputs_fx = copy.deepcopy(inputs) + # fp32 test + with torch.inference_mode(): + ref_fp32 = 
model_fx(*inputs_fx) + trt_fx_module = torch_tensorrt.compile( + model_fx, ir="fx", inputs=inputs_fx, enabled_precisions={torch.float32} + ) + result_fp32 = trt_fx_module(*inputs_fx) + assert ( + torch.nn.functional.cosine_similarity( + ref_fp32.flatten(), result_fp32.flatten(), dim=0 + ) + > 0.9999 + ) + # fp16 test + model_fx = model_fx.cuda().half() + inputs_fx = [i.cuda().half() for i in inputs_fx] + with torch.inference_mode(): + ref_fp16 = model_fx(*inputs_fx) + trt_fx_module = torch_tensorrt.compile( + model_fx, ir="fx", inputs=inputs_fx, enabled_precisions={torch.float16} + ) + result_fp16 = trt_fx_module(*inputs_fx) + assert ( + torch.nn.functional.cosine_similarity( + ref_fp16.flatten(), result_fp16.flatten(), dim=0 + ) + > 0.99 + ) + + scripted_fx_module = torch.jit.trace(trt_fx_module, example_inputs=inputs_fx) + scripted_fx_module.save("/tmp/scripted_fx_module.ts") + scripted_fx_module = torch.jit.load("/tmp/scripted_fx_module.ts") + + result_fp16 = scripted_fx_module(*inputs_fx) + assert ( + torch.nn.functional.cosine_similarity( + ref_fp16.flatten(), result_fp16.flatten(), dim=0 + ) + > 0.99 + ) + + +if __name__ == "__main__": + model = torchvision.models.resnet18(pretrained=True).cuda().eval() + inputs = [torch.ones((32, 3, 224, 224), device=torch.device("cuda"))] # type: ignore[attr-defined] + test_torch_tensorrt(model, inputs) diff --git a/py/torch_tensorrt/_TRTModuleNext.py b/py/torch_tensorrt/_TRTModuleNext.py index 6edc530a4f..8f5aa3f329 100644 --- a/py/torch_tensorrt/_TRTModuleNext.py +++ b/py/torch_tensorrt/_TRTModuleNext.py @@ -158,6 +158,8 @@ def is_non_tensor(i: Tuple[Any, bool]) -> bool: def enable_profiling(self, profiling_results_dir: str = None): """Enable the profiler to collect latency information about the execution of the engine + Traces can be visualized using https://ui.perfetto.dev/ or compatible alternatives + Keyword Arguments: profiling_results_dir (str): Absolute path to the directory to sort results of profiling. """ diff --git a/py/torch_tensorrt/fx/tools/trt_minimizer.py b/py/torch_tensorrt/fx/tools/trt_minimizer.py index 58da3db0e6..63e0d40ff4 100644 --- a/py/torch_tensorrt/fx/tools/trt_minimizer.py +++ b/py/torch_tensorrt/fx/tools/trt_minimizer.py @@ -48,10 +48,15 @@ def lower_mod_default( class TensorRTMinizerSetting(net_min_base._MinimizerSettingBase): def __init__( - self, explicit_batch_dimension: Any = True, use_experiemental_rt: bool = False + self, explicit_batch_dimension: Any = True, use_experimental_rt: bool = False ): + if use_experimental_rt and not explicit_batch_dimension: + raise ValueError( + "The experimental unifed runtime only supports explicit batch. Please make sure to set explicit_batch_dimension=True when use_experimental_rt=True" + ) + self.explicit_batch_dimension = explicit_batch_dimension - self.use_experimental_rt = use_experiemental_rt + self.use_experimental_rt = use_experimental_rt super(TensorRTMinizerSetting, self).__init__() diff --git a/py/torch_tensorrt/fx/tools/trt_splitter.py b/py/torch_tensorrt/fx/tools/trt_splitter.py index bf331eeaca..8593411403 100644 --- a/py/torch_tensorrt/fx/tools/trt_splitter.py +++ b/py/torch_tensorrt/fx/tools/trt_splitter.py @@ -52,6 +52,11 @@ def __init__(self): self.exclude_support_node_name: set = set() self.use_experimental_rt: bool = False + if self.use_experimental_rt and self.use_implicit_batch_dim: + raise ValueError( + "The experimental unifed runtime only supports explicit batch. 
Please make sure to set use_implicit_batch_dim=False when use_experimental_rt=True" + ) + class TRTSplitter(splitter_base._SplitterBase): def __init__( From e782cc92fa5e85ca34c5d8043a71d9b7307d7e5f Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Mon, 21 Nov 2022 15:29:55 -0800 Subject: [PATCH 22/22] refactor: Reorder the API since everything but the engine is optional Also new destructor to order cleanup Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/runtime/TRTEngine.cpp | 6 ++++++ core/runtime/TRTEngine.h | 2 +- py/torch_tensorrt/_TRTModuleNext.py | 17 +++++++++++++---- py/torch_tensorrt/fx/lower.py | 2 +- py/torch_tensorrt/fx/tools/trt_minimizer.py | 2 +- py/torch_tensorrt/fx/tools/trt_splitter.py | 2 +- py/torch_tensorrt/ts/_compiler.py | 6 ++++-- 7 files changed, 27 insertions(+), 10 deletions(-) diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp index d524a44ba4..8d6b0f471f 100644 --- a/core/runtime/TRTEngine.cpp +++ b/core/runtime/TRTEngine.cpp @@ -145,6 +145,12 @@ TRTEngine::TRTEngine( LOG_DEBUG(*this); } +TRTEngine::~TRTEngine() { + exec_ctx.reset(); + cuda_engine.reset(); + rt.reset(); +} + void TRTEngine::disable_profiling() { torch::cuda::synchronize(device_info.id); profile_execution = false; diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h index 0eada27238..5615a824a2 100644 --- a/core/runtime/TRTEngine.h +++ b/core/runtime/TRTEngine.h @@ -33,7 +33,7 @@ struct TRTEngine : torch::CustomClassHolder { std::vector in_binding_names = {}; // ITO: PYT IDX std::vector out_binding_names = {}; // ITO: PYT IDX - ~TRTEngine() = default; + ~TRTEngine(); TRTEngine( const std::string& serialized_engine, const RTDevice& cuda_device, diff --git a/py/torch_tensorrt/_TRTModuleNext.py b/py/torch_tensorrt/_TRTModuleNext.py index 8f5aa3f329..bc13e0d4de 100644 --- a/py/torch_tensorrt/_TRTModuleNext.py +++ b/py/torch_tensorrt/_TRTModuleNext.py @@ -31,8 +31,8 @@ class TRTModuleNext(torch.nn.Module): def __init__( self, + serialized_engine: bytearray, name: str = "", - serialized_engine: bytearray = bytearray(), input_binding_names: List[str] = [], output_binding_names: List[str] = [], target_device: Device = Device._current_device(), @@ -42,6 +42,11 @@ def __init__( Takes a name, target device, serialized TensorRT engine, and binding names / order and constructs a PyTorch ``torch.nn.Module`` around it. + If binding names are not provided, it is assumed that the engine binding names follow the following convention: + + - [symbol].[index in input / output array] + - ex. [x.0, x.1, x.2] -> [y.0] + Args: name (str): Name for module serialized_engine (bytearray): Serialized TensorRT engine in the form of a bytearray @@ -51,15 +56,15 @@ def __init__( Example: - ..code-block:: python + ..code-block:: py with io.BytesIO() as engine_bytes: engine_bytes.write(trt_engine.serialize()) engine_str = engine_bytes.getvalue() trt_module = TRTModule( - engine_name="my_engine", - serialized_engine=engine_str, + engine_str, + engine_name="my_module", input_names=["x"], output_names=["output"], ) @@ -69,6 +74,10 @@ def __init__( "TRTModuleNext should be considered experimental stability, APIs are subject to change. 
Note: TRTModuleNext only supports engines built with explict batch" ) super(TRTModuleNext, self).__init__() + + if not isinstance(serialized_engine, bytearray): + ValueError("Expected serialized engine as bytearray") + self.input_binding_names = input_binding_names self.output_binding_names = output_binding_names self.name = name diff --git a/py/torch_tensorrt/fx/lower.py b/py/torch_tensorrt/fx/lower.py index 89068cb589..3d3eb1ae92 100644 --- a/py/torch_tensorrt/fx/lower.py +++ b/py/torch_tensorrt/fx/lower.py @@ -185,8 +185,8 @@ def lower_pass( engine_str = engine_bytes.getvalue() trt_module = TRTModuleNext( + engine_str, name=module_name, - serialized_engine=engine_str, input_binding_names=interp_res.input_names, output_binding_names=interp_res.output_names, target_device=Device(f"cuda:{torch.cuda.current_device()}"), diff --git a/py/torch_tensorrt/fx/tools/trt_minimizer.py b/py/torch_tensorrt/fx/tools/trt_minimizer.py index 63e0d40ff4..d396453e22 100644 --- a/py/torch_tensorrt/fx/tools/trt_minimizer.py +++ b/py/torch_tensorrt/fx/tools/trt_minimizer.py @@ -30,8 +30,8 @@ def lower_mod_default( engine_str = engine_bytes.getvalue() res_mod = TRTModuleNext( + engine_str, name=str(type(mod)), - serialized_engine=engine_str, input_binding_names=interpreter_result.input_names, output_binding_names=interpreter_result.output_names, target_device=Device(f"cuda:{torch.cuda.current_device()}"), diff --git a/py/torch_tensorrt/fx/tools/trt_splitter.py b/py/torch_tensorrt/fx/tools/trt_splitter.py index 8593411403..a9b692ead3 100644 --- a/py/torch_tensorrt/fx/tools/trt_splitter.py +++ b/py/torch_tensorrt/fx/tools/trt_splitter.py @@ -100,8 +100,8 @@ def _lower_model_to_backend( engine_str = engine_bytes.getvalue() return TRTModuleNext( + engine_str, name=str(type(mod)), - serialized_engine=engine_str, input_binding_names=interpreter_result.input_names, output_binding_names=interpreter_result.output_names, target_device=Device(f"cuda:{torch.cuda.current_device()}"), diff --git a/py/torch_tensorrt/ts/_compiler.py b/py/torch_tensorrt/ts/_compiler.py index a0da43ffb0..19ea4ee802 100644 --- a/py/torch_tensorrt/ts/_compiler.py +++ b/py/torch_tensorrt/ts/_compiler.py @@ -140,7 +140,7 @@ def compile( def convert_method_to_trt_engine( module: torch.jit.ScriptModule, - method_name: str, + method_name: str = "forward", inputs=[], device=Device._current_device(), disable_tf32=False, @@ -290,7 +290,9 @@ def embed_engine_in_new_module( return torch.jit._recursive.wrap_cpp_module(cpp_mod) -def check_method_op_support(module: torch.jit.ScriptModule, method_name: str) -> bool: +def check_method_op_support( + module: torch.jit.ScriptModule, method_name: str = "forward" +) -> bool: """Checks to see if a method is fully supported by torch_tensorrt Checks if a method of a TorchScript module can be compiled by torch_tensorrt, if not, a list of operators