diff --git a/core/compiler.cpp b/core/compiler.cpp index f578758a50..05bfcae99b 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -111,8 +111,8 @@ void AddEngineToGraph( g->block()->appendNode(unpack_node); // If there are multiple output tensors from TensorRT we wrap them in a tuple - // to return - if (unpack_node->outputs().size() > 1) { + // to return, convert to tuple only when we only have 1 segmented graph + if (!engine_id && unpack_node->outputs().size() > 1) { // Creates prim::TupleConstruct() using outputs of the // unpack node auto return_tuple_node = g->createTuple(unpack_node->outputs()); @@ -120,8 +120,9 @@ void AddEngineToGraph( // Set the output as the produced tuple g->registerOutput(return_tuple_node->outputs()[0]); } else { - // Set the output as the sole output tensor - g->registerOutput(unpack_node->outputs()[0]); + for (int i = 0; i < unpack_node->outputs().size(); ++i) { + g->registerOutput(unpack_node->outputs()[i]); + } } LOG_DEBUG(*g << "(AddEngineToGraph)\n"); @@ -159,32 +160,35 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std:: void AddSegmentedBlockToGraph(std::shared_ptr& g, partitioning::SegmentedBlock &seg, std::unordered_map &old_to_new_g) { - //old_to_new_g contains: original_graph value => new graph value, mini_graph value -> new graph value, new graph value -> mini_graph value + //old_to_new_g contains: original global graph value => new global graph value, + //mini_to_new_g: mini graph value -> new graph value + std::unordered_map mini_to_new_g; size_t input_idx = 0; if (seg.target() == partitioning::SegmentedBlock::kTensorRT && g->inputs().size() > 0) { if (g->inputs()[0]->type()->str().find("__torch__") == std::string::npos) { auto self = g->insertInput(0, "self_1"); self->setType(seg.inputs()[0]->type()); } - old_to_new_g[seg.inputs()[input_idx++]] = g->inputs()[0]; + mini_to_new_g[seg.inputs()[input_idx++]] = g->inputs()[0]; } + for (auto &raw_input : seg.raw_inputs()) { if 
(old_to_new_g.count(raw_input)) { - old_to_new_g[seg.inputs()[input_idx++]] = old_to_new_g[raw_input]; + mini_to_new_g[seg.inputs()[input_idx++]] = old_to_new_g[raw_input]; } } for (const auto n : seg.nodes()) { - partitioning::cloneNode(n, g, old_to_new_g); + partitioning::cloneNode(n, g, mini_to_new_g); } // original graph value => new global graph value for (size_t i = 0; i < seg.raw_outputs().size(); ++i) { - old_to_new_g[seg.raw_outputs()[i]] = old_to_new_g[seg.outputs()[i]]; + old_to_new_g[seg.raw_outputs()[i]] = mini_to_new_g[seg.outputs()[i]]; } -// LOG_INFO(*g << "(AddSegmentedBlockToGraph)\n"); + LOG_INFO(*g << "(AddSegmentedBlockToGraph)\n"); return; } @@ -199,6 +203,7 @@ torch::jit::script::Module CompileGraphWithFallback(const torch::jit::script::Mo if (method.name().rfind("_", 0)) { auto new_g = std::make_shared(); auto graph_and_parameters = lowering::Lower(mod, method.name()); + LOG_INFO(*(method.graph()) << "Original graph\n"); auto g = graph_and_parameters.first; auto params = graph_and_parameters.second; @@ -206,14 +211,13 @@ torch::jit::script::Module CompileGraphWithFallback(const torch::jit::script::Mo auto convert_cfg = std::move(cfg.convert_info); LOG_INFO(*g << "(CompileGraph)\n"); - // segment the graph and convert segmented TensorRT block auto segmented_blocks = partitioning::segment_graph(g, convert_cfg.input_ranges, convert_cfg.engine_settings.torch_fallback); if (segmented_blocks.size() == 1 && segmented_blocks[0].target() == partitioning::SegmentedBlock::kTorch) { return mod; } - int trt_engine_id = 0; + int trt_engine_id = 1; std::unordered_map old_to_new_g; for (auto &seg_block : segmented_blocks) { if (seg_block.target() == partitioning::SegmentedBlock::kTensorRT) { @@ -225,6 +229,7 @@ torch::jit::script::Module CompileGraphWithFallback(const torch::jit::script::Mo auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_cfg, named_params); auto temp_g = std::make_shared(); AddEngineToGraph(new_mod, temp_g, engine, 
trt_engine_id++); + seg_block.update_graph(temp_g); AddSegmentedBlockToGraph(new_g, seg_block, old_to_new_g); } else { diff --git a/core/lowering/passes/BUILD b/core/lowering/passes/BUILD index f213a2539a..8b00ee3e89 100644 --- a/core/lowering/passes/BUILD +++ b/core/lowering/passes/BUILD @@ -26,7 +26,8 @@ cc_library( "unpack_batch_norm.cpp", "unpack_log_softmax.cpp", "op_aliasing.cpp", - "silu_to_sigmoid_multiplication.cpp" + "silu_to_sigmoid_multiplication.cpp", + "remove_inplace_add.cpp" ], deps = [ "//core/util:prelude", diff --git a/core/lowering/passes/passes.h b/core/lowering/passes/passes.h index 770982f67f..3bcde19a83 100644 --- a/core/lowering/passes/passes.h +++ b/core/lowering/passes/passes.h @@ -21,6 +21,7 @@ void UnpackBatchNorm(std::shared_ptr& graph); void UnpackLogSoftmax(std::shared_ptr& graph); void AliasOperators(std::shared_ptr& graph); void SiluToSigmoidMultipication(std::shared_ptr& graph); +void RemoveInplaceAdd(std::shared_ptr& graph); } // namespace passes } // namespace lowering diff --git a/core/lowering/passes/remove_inplace_add.cpp b/core/lowering/passes/remove_inplace_add.cpp new file mode 100644 index 0000000000..bff37e121c --- /dev/null +++ b/core/lowering/passes/remove_inplace_add.cpp @@ -0,0 +1,30 @@ +#include + +#include "core/util/prelude.h" + +namespace trtorch { +namespace core { +namespace lowering { +namespace passes { + +void RemoveInplaceAdd(std::shared_ptr& graph) { + std::string inplace_add_pattern = R"IR( + graph(%self, %other, %1): + %out = aten::add_(%self, %other, %1) + return (%out))IR"; + std::string normal_add_pattern = R"IR( + graph(%self, %other, %1): + %out = aten::add(%self, %other, %1) + return (%out))IR"; + + torch::jit::SubgraphRewriter remove_inplace_add; + remove_inplace_add.RegisterRewritePattern(inplace_add_pattern, normal_add_pattern); + remove_inplace_add.runOnGraph(graph); + + LOG_GRAPH("Post remove inplace add: " << *graph); +} + +} // namespace passes +} // namespace lowering +} // namespace core 
+} // namespace trtorch diff --git a/core/partitioning/BUILD b/core/partitioning/BUILD index 77bc1c5419..0d8b2006a7 100644 --- a/core/partitioning/BUILD +++ b/core/partitioning/BUILD @@ -17,7 +17,8 @@ cc_library( ], deps = [ "//core/conversion", - "//core/util:prelude" + "//core/util:prelude", + "//core/lowering" ] + select({ ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"], "//conditions:default": ["@libtorch//:libtorch"], diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index 23f61d9f80..e80652c7c5 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -2,6 +2,8 @@ #include "core/util/prelude.h" #include "torch/csrc/jit/api/module.h" #include "core/util/prelude.h" +#include "core/lowering/passes/passes.h" + namespace trtorch { @@ -20,9 +22,9 @@ torch::jit::Value* getOrAddInputForValue(torch::jit::Value* old_value, std::shar } auto new_value = graph->block()->addInput(); old_to_new[old_value] = new_value; + new_value->copyMetadata(old_value); // mapping from new graph input Values to original graph values old_to_new[new_value] = old_value; - new_value->copyMetadata(old_value); return new_value; } else { return old_to_new[old_value]; @@ -40,7 +42,6 @@ torch::jit::Node* cloneNode(torch::jit::Node* node, std::shared_ptroutputs()[i]; old_to_new[oo] = no; } - return new_node; } @@ -58,10 +59,13 @@ c10::FunctionSchema getFunctionSchema(std::string method_name, std::shared_ptr &input_shape_map) { +void registerSegmentInOutIValues(SegmentedBlock &seg_block, std::unordered_map &ivalues_maps) { // create a module to run the graph auto g = seg_block.g(); auto copy_g = g->copy(); + lowering::passes::RemoveInplaceAdd(copy_g); + + // create tuple for multiple outputs if (seg_block.raw_outputs().size() > 1) { auto new_output_node = copy_g->appendNode(copy_g->createTuple(copy_g->outputs())); for (int idx = copy_g->outputs().size() - 1; idx >= 0; --idx) { @@ -84,46 +88,60 @@ void 
registerSegmentInOutShape(SegmentedBlock &seg_block, std::unordered_map shape; - nvinfer1::Dims cur_shape = input_shape_map[input]; - shape.insert(shape.begin(), std::begin(cur_shape.d), std::begin(cur_shape.d) + cur_shape.nbDims); - auto in = at::randint(5, shape, {at::kCUDA}); - jit_inputs_ivalues.push_back(in.clone()); + if (!ivalues_maps.count(input)) { + std::cerr << "could not find graph input ivalues\n"; + } + if (input->type()->isSubtypeOf(torch::jit::TensorType::get())) { + jit_inputs_ivalues.push_back(ivalues_maps[input].toTensor()); + } else if (input->type()->isSubtypeOf(torch::jit::IntType::get())) { + jit_inputs_ivalues.push_back(ivalues_maps[input].toInt()); + } } - std::vector jit_results; + std::vector jit_results; torch::jit::IValue jit_results_ivalues = cur_mod.forward(jit_inputs_ivalues); - if (jit_results_ivalues.isTensor()) { - jit_results.push_back(jit_results_ivalues.toTensor()); - } else { + if (jit_results_ivalues.isTuple()) { auto results = jit_results_ivalues.toTuple()->elements(); for (auto r : results) { - jit_results.push_back(r.toTensor()); + jit_results.push_back(r); } + } else { + jit_results.push_back(jit_results_ivalues); } size_t idx = 0; for (auto &output : seg_block.raw_outputs()) { - input_shape_map[output] = util::toDims(jit_results[idx++].sizes()); + ivalues_maps[output] = jit_results[idx++]; } + // set input shape for each segmented block so we will use it in conversion process std::vector input_shape; for (auto &i : seg_block.raw_inputs()) { - input_shape.push_back(input_shape_map[i]); + if (ivalues_maps[i].isTensor()) { + input_shape.push_back(util::toDims(ivalues_maps[i].toTensor().sizes())); + } } seg_block.register_inshape(input_shape); } -std::vector extractNvinfer1Dims(std::vector& input_ranges) { - std::vector res; + +std::vector generateRandomInputs(std::vector& input_ranges) { + std::vector random_inputs; for (auto &input_range : input_ranges) { - res.push_back(input_range.input_shape); + auto cur_shape = 
input_range.input_shape; + std::vector shape; + shape.insert(shape.begin(), std::begin(cur_shape.d), std::begin(cur_shape.d) + cur_shape.nbDims); + auto in = at::randint(5, shape, {at::kCUDA}); + random_inputs.push_back(in.clone()); + printf("is tensor: %d\n", random_inputs.back().isTensor()); } - return res; + return random_inputs; } + void registerSegmentsInputsOutputs(std::vector &segmented_blocks, std::shared_ptr g) { + // find the corresponding raw values in original global graph for this segmented block's inputs/outputs std::set input_values; for (auto &seg_block : segmented_blocks) { seg_block.registerInputs(); @@ -176,6 +194,7 @@ std::vector segment_graph(std::shared_ptr g, for (const auto n : nodes) { if (n->kind() == torch::jit::prim::Constant) continue; + std::string node_string(n->kind().toQualString()); if (conversion::OpSupported(n) && !forced_fallback_operators.count(node_string)) { @@ -186,19 +205,21 @@ std::vector segment_graph(std::shared_ptr g, } } merge_nodes(pytorch_nodes, tensorrt_nodes, segmented_blocks, min_block_size); - if (!pytorch_nodes.empty()) segmented_blocks.emplace_back(SegmentedBlock::kTorch, pytorch_nodes); + if (!pytorch_nodes.empty()) { + segmented_blocks.emplace_back(SegmentedBlock::kTorch, pytorch_nodes); + } registerSegmentsInputsOutputs(segmented_blocks, g); - std::vector graph_inputs_shape = extractNvinfer1Dims(input_ranges); - std::unordered_map input_shape_map; + std::unordered_map ivalues_maps; + std::vector random_inputs = generateRandomInputs(input_ranges); for (size_t i = 0; i < g->inputs().size(); ++i) { - input_shape_map[g->inputs()[i]] = graph_inputs_shape[i]; + ivalues_maps[g->inputs()[i]] = random_inputs[i]; } for (auto &seg_block : segmented_blocks) { - registerSegmentInOutShape(seg_block, input_shape_map); + registerSegmentInOutIValues(seg_block, ivalues_maps); } return segmented_blocks; diff --git a/core/partitioning/partitioning.h b/core/partitioning/partitioning.h index 35e298ebcd..afa03fa0d5 100644 --- 
a/core/partitioning/partitioning.h +++ b/core/partitioning/partitioning.h @@ -10,6 +10,9 @@ namespace trtorch { namespace core { namespace partitioning { +torch::jit::Value* getOrAddInputForValue(torch::jit::Value* old_value, std::shared_ptr &graph, + std::unordered_map &old_to_new); + torch::jit::Node* cloneNode(torch::jit::Node* node, std::shared_ptr &graph, std::unordered_map &old_to_new); @@ -49,7 +52,6 @@ struct SegmentedBlock { void registerOutput(torch::jit::Value* raw_input) { outputs_.push_back(raw_input); - g_->registerOutput(old_to_new_[raw_input]); } @@ -97,15 +99,16 @@ struct SegmentedBlock { return out_shape_; } - const std::shared_ptr& g() const { + std::shared_ptr& g() { return g_; } + void update_graph(std::shared_ptr new_g) { g_ = new_g; } - private: +// private: SegmentedBlockTarget target_; std::vector in_shape_; std::vector out_shape_;