fix(//core/partitioning): Fixing support for partially compiling
graphs with FP16 weights

Signed-off-by: Naren Dasan <[email protected]>
Signed-off-by: Naren Dasan <[email protected]>
narendasan committed Oct 19, 2021
1 parent 8927e77 commit 748ecf3
Showing 63 changed files with 791 additions and 593 deletions.
148 changes: 72 additions & 76 deletions core/compiler.cpp
@@ -128,22 +128,6 @@ bool CheckMethodOperatorSupport(const torch::jit::script::Module& mod, std::stri
return conversion::VerifyConverterSupportForBlock(g->block());
}

std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::string method_name, CompileSpec cfg) {
// Go through Lowering to simplify graph and extract weight parameters
auto graph_and_parameters = lowering::Lower(mod, method_name, cfg.lower_info);

auto convert_cfg = std::move(cfg.convert_info);
auto g = graph_and_parameters.first;

auto params = graph_and_parameters.second;
auto named_params = conversion::get_named_params(g->inputs(), params);

LOG_INFO(*g << "(CompileGraph)\n");

auto engine = conversion::ConvertBlockToEngine(g->block(), convert_cfg, named_params);
return std::move(engine);
}

void AddSegmentedBlockToGraph(
std::shared_ptr<torch::jit::Graph>& g,
partitioning::SegmentedBlock& seg,
@@ -237,15 +221,15 @@ void AddIfBlockToGraph(
GraphAndMapping ConstructFallbackGraph(
torch::jit::script::Module& new_mod,
torch::jit::Block* block,
std::unordered_map<torch::jit::Value*, torch::jit::IValue> input_ivalues_map,
std::unordered_map<const torch::jit::Value*, torch::jit::IValue> example_tensor_map,
CompileSpec cfg,
conversion::GraphParams named_params) {
ir::StaticParams static_params) {
auto convert_cfg = cfg.convert_info;
auto partition_info = cfg.partition_info;

auto new_g = std::make_shared<torch::jit::Graph>();

auto segmented_blocks = partitioning::Partition(block, input_ivalues_map, partition_info);
auto segmented_blocks = partitioning::Partition(block, example_tensor_map, partition_info);

// the mapping from lowering graph => fallback global graph
std::unordered_map<torch::jit::Value*, torch::jit::Value*> old_to_new_g;
@@ -259,13 +243,17 @@ GraphAndMapping ConstructFallbackGraph(
trt_engine_id << reinterpret_cast<const int*>(&seg_block);

if (seg_block.target() == partitioning::SegmentedBlock::kTensorRT) {
auto shapes = seg_block.in_shapes();
auto types = seg_block.in_types();
std::vector<ir::Input> inputs;
for (auto& shape : seg_block.in_shape()) {
inputs.push_back(ir::Input(shape));
for (size_t i = 0; i < shapes.size(); i++) {
auto in = ir::Input(shapes[i]);
in.dtype = util::ScalarTypeToTRTDataType(types[i]);
inputs.push_back(in);
}
// update the input ranges for each segment
convert_cfg.inputs = inputs;
auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_cfg, named_params);
convert_cfg.inputs = ir::associate_specs_with_inputs(seg_block.g(), inputs, static_params);
auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_cfg, static_params);
auto temp_g = std::make_shared<torch::jit::Graph>();
auto device_spec = convert_cfg.engine_settings.device;
auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
@@ -281,7 +269,7 @@ GraphAndMapping ConstructFallbackGraph(
std::vector<GraphAndMapping> graph_and_mappings;
for (auto cur_block : if_node->blocks()) {
graph_and_mappings.push_back(
ConstructFallbackGraph(new_mod, cur_block, input_ivalues_map, cfg, named_params));
ConstructFallbackGraph(new_mod, cur_block, example_tensor_map, cfg, static_params));
}
AddIfBlockToGraph(new_g, if_node, graph_and_mappings, old_to_new_g);

@@ -299,54 +287,28 @@ GraphAndMapping ConstructFallbackGraph(
return {new_g, old_to_new_g};
}

torch::jit::script::Module CompileGraphWithFallback(const torch::jit::script::Module& mod, CompileSpec cfg) {
// TODO: Should be doing a functional transform but need PR #31978
// [jit] More robust mangling
// torch::jit::script::Module new_mod = mod.clone();
torch::jit::script::Module new_mod(mod._ivalue()->name() + "_trt");
std::vector<std::shared_ptr<torch::jit::Graph>> graphs;
for (const torch::jit::script::Method& method : mod.get_methods()) {
// Compile only forward methods. forward method contains the entire graph.
if (method.name().compare("forward") == 0) {
auto new_g = std::make_shared<torch::jit::Graph>();
auto graph_and_parameters = lowering::Lower(mod, method.name(), cfg.lower_info);
std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::string method_name, CompileSpec cfg) {
// Go through Lowering to simplify graph and extract weight parameters
auto graph_and_parameters = lowering::Lower(mod, method_name, cfg.lower_info);

auto g = graph_and_parameters.first;
auto params = graph_and_parameters.second;
auto named_params = conversion::get_named_params(g->inputs(), params);
LOG_INFO("(LoweredGraph)\n" << *g);
auto convert_cfg = std::move(cfg.convert_info);
auto g = graph_and_parameters.first;

std::unordered_map<torch::jit::Value*, ir::Input> inputs;
for (size_t i = 0; i < g->inputs().size(); ++i) {
inputs.insert({g->inputs()[i], cfg.convert_info.inputs[i]});
}
auto input_ivalues_map = partitioning::generateRandomInputs(inputs);
auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), input_ivalues_map, cfg, named_params);
new_g = graph_and_mapping.first;
LOG_INFO("(FallbackGraph)\n" << *new_g);
auto params = graph_and_parameters.second;
auto static_params = ir::get_static_params(g->inputs(), params);

// if there is no tensorrt engine self in fallback graph, there is no conversion, we just return the initial
// module
if (new_g->inputs()[0]->type()->str().find("__torch__") == std::string::npos) {
LOG_WARNING("Didn't generate any TensorRT engines, the compiler did nothing\n");
return mod;
}
LOG_INFO(*g << "(CompileGraph)\n");

auto new_method = new_mod._ivalue()->compilation_unit()->create_function(method.name(), new_g);
auto schema = util::GenerateGraphSchema(new_method->name(), new_g);
new_mod.type()->addMethod(new_method);
new_method->setSchema(schema);
}
}
// Move the user defined inputs to the convert_cfg since some might be static;
convert_cfg.inputs = std::move(ir::associate_specs_with_inputs(g, cfg.inputs, static_params));

return new_mod;
auto engine = conversion::ConvertBlockToEngine(g->block(), convert_cfg, static_params);
return std::move(engine);
}

torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, CompileSpec cfg) {
// TODO: not sure how to deal with duplicated code here, so just cut out a branch temporally
if (cfg.partition_info.enabled) {
return CompileGraphWithFallback(mod, cfg);
}
torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) {
torch::jit::Module new_mod(mod._ivalue()->name() + "_trt");

auto device_spec = cfg.convert_info.engine_settings.device;

// GPU default WS size : 1 GB
@@ -362,25 +324,59 @@ torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, C
}
}

// TODO: Should be doing a functional transform but need PR #31978
// [jit] More robust mangling
// torch::jit::script::Module new_mod = mod.clone();
torch::jit::script::Module new_mod(mod._ivalue()->name() + "_trt");
std::vector<std::shared_ptr<torch::jit::Graph>> graphs;
for (const torch::jit::script::Method& method : mod.get_methods()) {
// Compile only forward methods. forward method contains the entire graph.
for (const torch::jit::Method& method : mod.get_methods()) {
if (method.name().compare("forward") == 0) {
auto engine = ConvertGraphToTRTEngine(mod, method.name(), cfg);
auto new_g = std::make_shared<torch::jit::Graph>();
auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
AddEngineToGraph(new_mod, new_g, engine, cuda_device);

auto graph_and_parameters = lowering::Lower(mod, method.name(), cfg.lower_info);

auto g = graph_and_parameters.first;
LOG_INFO("Lowered Graph: " << *g);
auto params = graph_and_parameters.second;
auto static_params = ir::get_static_params(g->inputs(), params);

cfg.convert_info.inputs = std::move(ir::associate_specs_with_inputs(g, cfg.inputs, static_params));

// If the user did not explicitly set the input type, then use the first
// tensor calculation to infer type.
auto first_use_types = util::get_block_first_calc_dtypes_opt(g->block());
for (auto& in : g->inputs()) {
auto est_type_opt = first_use_types[in];
ir::Input& spec = cfg.convert_info.inputs.find(in)->second;
if (est_type_opt && !spec.dtype_is_user_defined) {
spec.dtype = util::ScalarTypeToTRTDataType(est_type_opt.value());
} else if (!est_type_opt && !spec.dtype_is_user_defined) {
LOG_WARNING(
"Cannot deterime input type from calcuations in graph for input "
<< in->debugName() << ". Assuming it is Float32. If not, specify input type explicity");
spec.dtype = nvinfer1::DataType::kFLOAT;
}
}

if (cfg.partition_info.enabled) {
auto input_ivalues_map = partitioning::generateRandomInputs(cfg.convert_info.inputs, first_use_types);
auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), input_ivalues_map, cfg, static_params);
new_g = graph_and_mapping.first;
LOG_INFO("Segmented Graph: " << *new_g);

// if there is no tensorrt engine self in fallback graph, there is no conversion, we just return the initial
// module
if (new_g->inputs()[0]->type()->str().find("__torch__") == std::string::npos) {
LOG_WARNING("Didn't generate any TensorRT engines, the compiler did nothing\n");
return mod;
}
} else {
auto engine = conversion::ConvertBlockToEngine(g->block(), cfg.convert_info, static_params);
auto device_spec = cfg.convert_info.engine_settings.device;
auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
AddEngineToGraph(new_mod, new_g, engine, cuda_device);
}
auto new_method = new_mod._ivalue()->compilation_unit()->create_function(method.name(), new_g);
auto schema = util::GenerateGraphSchema(new_method->name(), new_g);
new_mod.type()->addMethod(new_method);
new_method->setSchema(schema);
}
}

return new_mod;
}

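Taken together, the compiler.cpp hunks above change partial compilation in two places: graph inputs whose dtype the user did not pin are now inferred from their first tensor use before partitioning, and each TensorRT segment is converted with per-input dtypes taken from in_types() instead of silently assuming Float32. A minimal sketch of the per-segment step in isolation (the helper name and include paths are assumptions; the body mirrors the hunk above):

#include <vector>

#include "core/ir/ir.h"
#include "core/partitioning/partitioning.h"
#include "core/util/prelude.h"

namespace trtorch {
namespace core {

// Hypothetical helper: build the ir::Input specs for one segmented block,
// carrying the dtypes recorded during partitioning (e.g. Half for an FP16
// graph) so the segment's TensorRT engine is built at that precision.
std::vector<ir::Input> SegmentInputSpecs(partitioning::SegmentedBlock& seg_block) {
  std::vector<ir::Input> inputs;
  auto shapes = seg_block.in_shapes();
  auto types = seg_block.in_types(); // parallel to shapes, one entry per input
  for (size_t i = 0; i < shapes.size(); i++) {
    ir::Input in(shapes[i]);
    in.dtype = util::ScalarTypeToTRTDataType(types[i]); // at::kHalf -> nvinfer1::DataType::kHALF
    inputs.push_back(in);
  }
  return inputs;
}

} // namespace core
} // namespace trtorch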
3 changes: 2 additions & 1 deletion core/compiler.h
@@ -13,7 +13,8 @@ namespace trtorch {
namespace core {

struct CompileSpec {
CompileSpec(std::vector<ir::Input> inputs) : convert_info(std::move(inputs)) {}
CompileSpec(std::vector<ir::Input> inputs) : inputs(inputs) {}
std::vector<ir::Input> inputs;
conversion::ConversionInfo convert_info;
lowering::LowerInfo lower_info;
partitioning::PartitionInfo partition_info;
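CompileSpec now owns the raw input specs; the compiler binds them to graph values only after lowering (via ir::associate_specs_with_inputs) and, for fallback, fills in dtypes that were left unspecified. A small usage sketch, assuming these include paths and with an illustrative shape:

#include "core/compiler.h"
#include "core/ir/ir.h"

// Sketch: build a spec whose input dtype is left to the new first-use
// inference in compiler.cpp, with partial compilation (fallback) enabled.
trtorch::core::CompileSpec MakeFallbackSpec() {
  trtorch::core::CompileSpec cfg({trtorch::core::ir::Input({1, 3, 224, 224})});
  cfg.partition_info.enabled = true; // compile what converts, fall back to Torch for the rest
  return cfg;
}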
1 change: 0 additions & 1 deletion core/conversion/BUILD
@@ -10,7 +10,6 @@ config_setting(
cc_library(
name = "conversion",
srcs = [
"InterfaceTypes.cpp",
"conversion.cpp",
"conversion_ignorelist.cpp",
],
41 changes: 23 additions & 18 deletions core/conversion/conversion.cpp
@@ -128,7 +128,10 @@ void AddLayer(ConversionCtx* ctx, const torch::jit::Node* n) {
<< "please report this error to https://www.github.com/NVIDIA/TRTorch/issues");
}

void AddInputs(ConversionCtx* ctx, at::ArrayRef<const torch::jit::Value*> inputs, std::vector<ir::Input>& input_specs) {
void AddInputs(
ConversionCtx* ctx,
c10::ArrayRef<const torch::jit::Value*> inputs,
std::unordered_map<const torch::jit::Value*, ir::Input>& input_specs) {
std::vector<const torch::jit::Value*> input_tensors;
for (auto in : inputs) {
// Disregarding inputs that are not tensors
@@ -143,24 +146,23 @@ void AddInputs(ConversionCtx* ctx, at::ArrayRef<const torch::jit::Value*> inputs
}

std::stringstream ss;
ss << "Input Dimension Specs: [\n";
ss << "Input Dimension Specs: {" << std::endl;
for (auto i : input_specs) {
ss << " " << i << ",";
ss << " " << i.first->debugName() << " : " << i.second << ",";
}
ss << ']';
LOG_DEBUG(ctx->logger, ss.str());

TRTORCH_CHECK(
input_tensors.size() == input_specs.size(),
"Expected dimension specifications for all input tensors"
<< ", but found " << input_tensors.size() << " input tensors and " << input_specs.size()
<< " dimension specs (conversion.AddInputs)");
ss << '}';
auto dbg_str = ss.str();
LOG_DEBUG(ctx->logger, dbg_str);

auto profile = ctx->builder->createOptimizationProfile();

for (size_t i = 0; i < input_tensors.size(); i++) {
auto in = input_tensors[i];
auto spec = input_specs[i];
for (auto input : input_tensors) {
const torch::jit::Value* in = input;
TRTORCH_CHECK(
input_specs.find(in) != input_specs.end(),
"Cannot find an input spec associated with input: " << in->debugName());
ir::Input& spec = input_specs.find(in)->second;

std::string name = std::string("input_") + std::to_string(ctx->num_inputs);
LOG_INFO(
ctx->logger,
Expand Down Expand Up @@ -226,7 +228,7 @@ void MarkOutputs(ConversionCtx* ctx, at::ArrayRef<const torch::jit::Value*> outp
}
}

void AddParamsToCtxValueMap(ConversionCtx* ctx, GraphParams& params) {
void AddParamsToCtxValueMap(ConversionCtx* ctx, ir::StaticParams& params) {
for (auto p : params) {
ctx->evaluated_value_map[p.first] = std::move(p.second);
}
@@ -358,8 +360,8 @@ void EvaluateLoopBlock(ConversionCtx* ctx, const torch::jit::Node* n) {
void ConvertBlockToNetDef(
ConversionCtx* ctx,
const torch::jit::Block* b,
ConversionInfo build_info,
GraphParams& static_params) {
ConversionInfo& build_info,
ir::StaticParams& static_params) {
LOG_INFO(ctx->logger, "Converting Block");

auto inputs = b->inputs();
@@ -435,7 +437,10 @@ void ConvertBlockToNetDef(
// a serialized TensorRT engine that can be deserialized and run

// Probably should consolidate these two functions
std::string ConvertBlockToEngine(const torch::jit::Block* b, ConversionInfo build_info, GraphParams& static_params) {
std::string ConvertBlockToEngine(
const torch::jit::Block* b,
ConversionInfo build_info,
ir::StaticParams& static_params) {
ConversionCtx ctx(build_info.engine_settings);
ConvertBlockToNetDef(&ctx, b, build_info, static_params);
std::string engine = ctx.SerializeEngine();
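AddInputs now receives its specs keyed by graph Value rather than by position, so inputs that are not tensors can simply be absent from the map and each lookup can fail loudly when a spec is missing. A hedged sketch of that lookup pattern as a standalone helper (the name and include paths are assumptions; the check mirrors the hunk above):

#include <unordered_map>

#include "core/ir/ir.h"
#include "core/util/prelude.h"
#include "torch/csrc/jit/ir/ir.h"

// Sketch: resolve the ir::Input spec for one graph input by Value pointer.
trtorch::core::ir::Input& SpecFor(
    const torch::jit::Value* in,
    std::unordered_map<const torch::jit::Value*, trtorch::core::ir::Input>& input_specs) {
  auto it = input_specs.find(in);
  TRTORCH_CHECK(
      it != input_specs.end(), "Cannot find an input spec associated with input: " << in->debugName());
  return it->second;
}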
14 changes: 5 additions & 9 deletions core/conversion/conversion.h
@@ -12,20 +12,16 @@ namespace core {
namespace conversion {

struct ConversionInfo {
std::vector<ir::Input> inputs;
ir::InputSpecMap inputs;
BuilderSettings engine_settings;
ConversionInfo(std::vector<ir::Input> inputs) : inputs(std::move(inputs)), engine_settings(BuilderSettings()) {}
};

// TODO: REMOVE GRAPH AND PARAMS AND MOVE FULLY TO INLINED CONSTANTS

using GraphParams = std::map<torch::jit::Value*, torch::jit::IValue>;

GraphParams get_named_params(c10::ArrayRef<torch::jit::Value*> inputs, std::vector<torch::jit::IValue> params);

// Converts a already lowered block (blocks with no sub blocks) to
// a serialized TensorRT engine that can be deserialized and run
std::string ConvertBlockToEngine(const torch::jit::Block* b, ConversionInfo build_info, GraphParams& static_params);
std::string ConvertBlockToEngine(
const torch::jit::Block* b,
ConversionInfo build_info,
ir::StaticParams& static_params);

bool OpSupported(const torch::jit::Node* n);

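With conversion::GraphParams and get_named_params gone, callers obtain an ir::StaticParams from the lowered graph and pass it to ConvertBlockToEngine, after binding user specs with ir::associate_specs_with_inputs. A sketch restating that calling convention in isolation (the function name and the hard-coded "forward" method are assumptions; the steps follow the new ConvertGraphToTRTEngine above):

#include "core/compiler.h"
#include "core/conversion/conversion.h"
#include "core/ir/ir.h"
#include "core/lowering/lowering.h"

namespace trtorch {
namespace core {

// Sketch: lower, pair weights as ir::StaticParams, bind specs, convert.
std::string BuildEngine(const torch::jit::script::Module& mod, CompileSpec cfg) {
  auto graph_and_parameters = lowering::Lower(mod, "forward", cfg.lower_info);
  auto g = graph_and_parameters.first;
  auto static_params = ir::get_static_params(g->inputs(), graph_and_parameters.second);
  cfg.convert_info.inputs = ir::associate_specs_with_inputs(g, cfg.inputs, static_params);
  return conversion::ConvertBlockToEngine(g->block(), cfg.convert_info, static_params);
}

} // namespace core
} // namespace trtorch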
4 changes: 3 additions & 1 deletion core/ir/BUILD
@@ -13,7 +13,9 @@ cc_library(
"ir.h"
],
srcs = [
"Input.cpp"
"ir.cpp",
"Input.cpp",
"StaticParams.cpp"
],
deps = [
"@tensorrt//:nvinfer",
11 changes: 9 additions & 2 deletions core/ir/Input.cpp
@@ -62,7 +62,11 @@ bool valid_input_dtype(nvinfer1::DataType dtype) {
}
}

Input::Input(std::vector<int64_t> shape, nvinfer1::DataType dtype, nvinfer1::TensorFormat format) {
Input::Input(
std::vector<int64_t> shape,
nvinfer1::DataType dtype,
nvinfer1::TensorFormat format,
bool dtype_is_user_defined) {
if (shape.size() > 5) {
LOG_WARNING("Verify that this dim size is accepted");
}
@@ -81,14 +85,16 @@ Input::Input(std::vector<int64_t> shape, nvinfer1::DataType dtype, nvinfer1::Ten
<< dtype << ", " << format
<< "), TRTorch only supports contiguous format (NCHW) except with input type Float32 where channel last (NHWC) is also supported");
this->format = format;
this->dtype_is_user_defined = dtype_is_user_defined;
}

Input::Input(
std::vector<int64_t> min_shape,
std::vector<int64_t> opt_shape,
std::vector<int64_t> max_shape,
nvinfer1::DataType dtype,
nvinfer1::TensorFormat format) {
nvinfer1::TensorFormat format,
bool dtype_is_user_defined) {
if (min_shape.size() > 5 || opt_shape.size() > 5 || max_shape.size() > 5) {
LOG_WARNING("Verify that this dim size is accepted");
}
@@ -132,6 +138,7 @@ Input::Input(
<< dtype << ", " << format
<< "), TRTorch only supports contiguous format (NCHW) except with input type Float32 where channel last (NHWC) is also supported");
this->format = format;
this->dtype_is_user_defined = dtype_is_user_defined;
}

std::ostream& operator<<(std::ostream& os, const Input& input) {
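The new constructor argument records whether the caller pinned the dtype; the first-use inference added in compiler.cpp only overwrites dtypes that are not user-defined. A small sketch of requesting an explicit FP16 input (shape, format, and include paths are illustrative assumptions):

#include "NvInfer.h"
#include "core/ir/ir.h"

// Sketch: an input pinned to Half precision. Passing true for the new
// dtype_is_user_defined flag keeps the compiler from overriding this dtype.
trtorch::core::ir::Input MakeHalfInput() {
  return trtorch::core::ir::Input(
      {1, 3, 224, 224},                // static shape
      nvinfer1::DataType::kHALF,       // requested input precision
      nvinfer1::TensorFormat::kLINEAR, // contiguous (NCHW)
      /*dtype_is_user_defined=*/true);
}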
(Diff truncated; the remaining changed files are not shown here.)
