diff --git a/core/compiler.cpp b/core/compiler.cpp index f578758a50..05bfcae99b 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -111,8 +111,8 @@ void AddEngineToGraph( g->block()->appendNode(unpack_node); // If there are multiple output tensors from TensorRT we wrap them in a tuple - // to return - if (unpack_node->outputs().size() > 1) { + // to return, convert to tuple only when we only have 1 segmented graph + if (!engine_id && unpack_node->outputs().size() > 1) { // Creates prim::TupleConstruct() using outputs of the // unpack node auto return_tuple_node = g->createTuple(unpack_node->outputs()); @@ -120,8 +120,9 @@ void AddEngineToGraph( // Set the output as the produced tuple g->registerOutput(return_tuple_node->outputs()[0]); } else { - // Set the output as the sole output tensor - g->registerOutput(unpack_node->outputs()[0]); + for (int i = 0; i < unpack_node->outputs().size(); ++i) { + g->registerOutput(unpack_node->outputs()[i]); + } } LOG_DEBUG(*g << "(AddEngineToGraph)\n"); @@ -159,32 +160,35 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std:: void AddSegmentedBlockToGraph(std::shared_ptr& g, partitioning::SegmentedBlock &seg, std::unordered_map &old_to_new_g) { - //old_to_new_g contains: original_graph value => new graph value, mini_graph value -> new graph value, new graph value -> mini_graph value + //old_to_new_g contains: original global graph value => new global graph value, + //mini_to_new_g: mini graph value -> new graph value + std::unordered_map mini_to_new_g; size_t input_idx = 0; if (seg.target() == partitioning::SegmentedBlock::kTensorRT && g->inputs().size() > 0) { if (g->inputs()[0]->type()->str().find("__torch__") == std::string::npos) { auto self = g->insertInput(0, "self_1"); self->setType(seg.inputs()[0]->type()); } - old_to_new_g[seg.inputs()[input_idx++]] = g->inputs()[0]; + mini_to_new_g[seg.inputs()[input_idx++]] = g->inputs()[0]; } + for (auto &raw_input : seg.raw_inputs()) { if 
(old_to_new_g.count(raw_input)) { - old_to_new_g[seg.inputs()[input_idx++]] = old_to_new_g[raw_input]; + mini_to_new_g[seg.inputs()[input_idx++]] = old_to_new_g[raw_input]; } } for (const auto n : seg.nodes()) { - partitioning::cloneNode(n, g, old_to_new_g); + partitioning::cloneNode(n, g, mini_to_new_g); } // original graph value => new global graph value for (size_t i = 0; i < seg.raw_outputs().size(); ++i) { - old_to_new_g[seg.raw_outputs()[i]] = old_to_new_g[seg.outputs()[i]]; + old_to_new_g[seg.raw_outputs()[i]] = mini_to_new_g[seg.outputs()[i]]; } -// LOG_INFO(*g << "(AddSegmentedBlockToGraph)\n"); + LOG_INFO(*g << "(AddSegmentedBlockToGraph)\n"); return; } @@ -199,6 +203,7 @@ torch::jit::script::Module CompileGraphWithFallback(const torch::jit::script::Mo if (method.name().rfind("_", 0)) { auto new_g = std::make_shared(); auto graph_and_parameters = lowering::Lower(mod, method.name()); + LOG_INFO(*(method.graph()) << "Original graph\n"); auto g = graph_and_parameters.first; auto params = graph_and_parameters.second; @@ -206,14 +211,13 @@ torch::jit::script::Module CompileGraphWithFallback(const torch::jit::script::Mo auto convert_cfg = std::move(cfg.convert_info); LOG_INFO(*g << "(CompileGraph)\n"); - // segment the graph and convert segmented TensorRT block auto segmented_blocks = partitioning::segment_graph(g, convert_cfg.input_ranges, convert_cfg.engine_settings.torch_fallback); if (segmented_blocks.size() == 1 && segmented_blocks[0].target() == partitioning::SegmentedBlock::kTorch) { return mod; } - int trt_engine_id = 0; + int trt_engine_id = 1; std::unordered_map old_to_new_g; for (auto &seg_block : segmented_blocks) { if (seg_block.target() == partitioning::SegmentedBlock::kTensorRT) { @@ -225,6 +229,7 @@ torch::jit::script::Module CompileGraphWithFallback(const torch::jit::script::Mo auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_cfg, named_params); auto temp_g = std::make_shared(); AddEngineToGraph(new_mod, temp_g, engine, 
trt_engine_id++); + seg_block.update_graph(temp_g); AddSegmentedBlockToGraph(new_g, seg_block, old_to_new_g); } else { diff --git a/core/lowering/passes/BUILD b/core/lowering/passes/BUILD index f213a2539a..8b00ee3e89 100644 --- a/core/lowering/passes/BUILD +++ b/core/lowering/passes/BUILD @@ -26,7 +26,8 @@ cc_library( "unpack_batch_norm.cpp", "unpack_log_softmax.cpp", "op_aliasing.cpp", - "silu_to_sigmoid_multiplication.cpp" + "silu_to_sigmoid_multiplication.cpp", + "remove_inplace_add.cpp" ], deps = [ "//core/util:prelude", diff --git a/core/lowering/passes/passes.h b/core/lowering/passes/passes.h index 770982f67f..3bcde19a83 100644 --- a/core/lowering/passes/passes.h +++ b/core/lowering/passes/passes.h @@ -21,6 +21,7 @@ void UnpackBatchNorm(std::shared_ptr& graph); void UnpackLogSoftmax(std::shared_ptr& graph); void AliasOperators(std::shared_ptr& graph); void SiluToSigmoidMultipication(std::shared_ptr& graph); +void RemoveInplaceAdd(std::shared_ptr& graph); } // namespace passes } // namespace lowering diff --git a/core/lowering/passes/remove_inplace_add.cpp b/core/lowering/passes/remove_inplace_add.cpp new file mode 100644 index 0000000000..bff37e121c --- /dev/null +++ b/core/lowering/passes/remove_inplace_add.cpp @@ -0,0 +1,30 @@ +#include + +#include "core/util/prelude.h" + +namespace trtorch { +namespace core { +namespace lowering { +namespace passes { + +void RemoveInplaceAdd(std::shared_ptr& graph) { + std::string inplace_add_pattern = R"IR( + graph(%self, %other, %1): + %out = aten::add_(%self, %other, %1) + return (%out))IR"; + std::string normal_add_pattern = R"IR( + graph(%self, %other, %1): + %out = aten::add(%self, %other, %1) + return (%out))IR"; + + torch::jit::SubgraphRewriter remove_inplace_add; + remove_inplace_add.RegisterRewritePattern(inplace_add_pattern, normal_add_pattern); + remove_inplace_add.runOnGraph(graph); + + LOG_GRAPH("Post remove inplace add: " << *graph); +} + +} // namespace passes +} // namespace lowering +} // namespace core 
+} // namespace trtorch diff --git a/core/partitioning/BUILD b/core/partitioning/BUILD index 77bc1c5419..0d8b2006a7 100644 --- a/core/partitioning/BUILD +++ b/core/partitioning/BUILD @@ -17,7 +17,8 @@ cc_library( ], deps = [ "//core/conversion", - "//core/util:prelude" + "//core/util:prelude", + "//core/lowering" ] + select({ ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"], "//conditions:default": ["@libtorch//:libtorch"], diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index 23f61d9f80..e80652c7c5 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -2,6 +2,8 @@ #include "core/util/prelude.h" #include "torch/csrc/jit/api/module.h" #include "core/util/prelude.h" +#include "core/lowering/passes/passes.h" + namespace trtorch { @@ -20,9 +22,9 @@ torch::jit::Value* getOrAddInputForValue(torch::jit::Value* old_value, std::shar } auto new_value = graph->block()->addInput(); old_to_new[old_value] = new_value; + new_value->copyMetadata(old_value); // mapping from new graph input Values to original graph values old_to_new[new_value] = old_value; - new_value->copyMetadata(old_value); return new_value; } else { return old_to_new[old_value]; @@ -40,7 +42,6 @@ torch::jit::Node* cloneNode(torch::jit::Node* node, std::shared_ptroutputs()[i]; old_to_new[oo] = no; } - return new_node; } @@ -58,10 +59,13 @@ c10::FunctionSchema getFunctionSchema(std::string method_name, std::shared_ptr &input_shape_map) { +void registerSegmentInOutIValues(SegmentedBlock &seg_block, std::unordered_map &ivalues_maps) { // create a module to run the graph auto g = seg_block.g(); auto copy_g = g->copy(); + lowering::passes::RemoveInplaceAdd(copy_g); + + // create tuple for multiple outputs if (seg_block.raw_outputs().size() > 1) { auto new_output_node = copy_g->appendNode(copy_g->createTuple(copy_g->outputs())); for (int idx = copy_g->outputs().size() - 1; idx >= 0; --idx) { @@ -84,46 +88,60 @@ void 
registerSegmentInOutShape(SegmentedBlock &seg_block, std::unordered_map shape; - nvinfer1::Dims cur_shape = input_shape_map[input]; - shape.insert(shape.begin(), std::begin(cur_shape.d), std::begin(cur_shape.d) + cur_shape.nbDims); - auto in = at::randint(5, shape, {at::kCUDA}); - jit_inputs_ivalues.push_back(in.clone()); + if (!ivalues_maps.count(input)) { + std::cerr << "could not find graph input ivalues\n"; + } + if (input->type()->isSubtypeOf(torch::jit::TensorType::get())) { + jit_inputs_ivalues.push_back(ivalues_maps[input].toTensor()); + } else if (input->type()->isSubtypeOf(torch::jit::IntType::get())) { + jit_inputs_ivalues.push_back(ivalues_maps[input].toInt()); + } } - std::vector jit_results; + std::vector jit_results; torch::jit::IValue jit_results_ivalues = cur_mod.forward(jit_inputs_ivalues); - if (jit_results_ivalues.isTensor()) { - jit_results.push_back(jit_results_ivalues.toTensor()); - } else { + if (jit_results_ivalues.isTuple()) { auto results = jit_results_ivalues.toTuple()->elements(); for (auto r : results) { - jit_results.push_back(r.toTensor()); + jit_results.push_back(r); } + } else { + jit_results.push_back(jit_results_ivalues); } size_t idx = 0; for (auto &output : seg_block.raw_outputs()) { - input_shape_map[output] = util::toDims(jit_results[idx++].sizes()); + ivalues_maps[output] = jit_results[idx++]; } + // set input shape for each segmented block so we will use it in conversion process std::vector input_shape; for (auto &i : seg_block.raw_inputs()) { - input_shape.push_back(input_shape_map[i]); + if (ivalues_maps[i].isTensor()) { + input_shape.push_back(util::toDims(ivalues_maps[i].toTensor().sizes())); + } } seg_block.register_inshape(input_shape); } -std::vector extractNvinfer1Dims(std::vector& input_ranges) { - std::vector res; + +std::vector generateRandomInputs(std::vector& input_ranges) { + std::vector random_inputs; for (auto &input_range : input_ranges) { - res.push_back(input_range.input_shape); + auto cur_shape = 
input_range.input_shape; + std::vector shape; + shape.insert(shape.begin(), std::begin(cur_shape.d), std::begin(cur_shape.d) + cur_shape.nbDims); + auto in = at::randint(5, shape, {at::kCUDA}); + random_inputs.push_back(in.clone()); + printf("is tensor: %d\n", random_inputs.back().isTensor()); } - return res; + return random_inputs; } + void registerSegmentsInputsOutputs(std::vector &segmented_blocks, std::shared_ptr g) { + // find the corresponding raw values in original global graph for this segmented block's inputs/outputs std::set input_values; for (auto &seg_block : segmented_blocks) { seg_block.registerInputs(); @@ -176,6 +194,7 @@ std::vector segment_graph(std::shared_ptr g, for (const auto n : nodes) { if (n->kind() == torch::jit::prim::Constant) continue; + std::string node_string(n->kind().toQualString()); if (conversion::OpSupported(n) && !forced_fallback_operators.count(node_string)) { @@ -186,19 +205,21 @@ std::vector segment_graph(std::shared_ptr g, } } merge_nodes(pytorch_nodes, tensorrt_nodes, segmented_blocks, min_block_size); - if (!pytorch_nodes.empty()) segmented_blocks.emplace_back(SegmentedBlock::kTorch, pytorch_nodes); + if (!pytorch_nodes.empty()) { + segmented_blocks.emplace_back(SegmentedBlock::kTorch, pytorch_nodes); + } registerSegmentsInputsOutputs(segmented_blocks, g); - std::vector graph_inputs_shape = extractNvinfer1Dims(input_ranges); - std::unordered_map input_shape_map; + std::unordered_map ivalues_maps; + std::vector random_inputs = generateRandomInputs(input_ranges); for (size_t i = 0; i < g->inputs().size(); ++i) { - input_shape_map[g->inputs()[i]] = graph_inputs_shape[i]; + ivalues_maps[g->inputs()[i]] = random_inputs[i]; } for (auto &seg_block : segmented_blocks) { - registerSegmentInOutShape(seg_block, input_shape_map); + registerSegmentInOutIValues(seg_block, ivalues_maps); } return segmented_blocks; diff --git a/core/partitioning/partitioning.h b/core/partitioning/partitioning.h index 35e298ebcd..afa03fa0d5 100644 --- 
a/core/partitioning/partitioning.h +++ b/core/partitioning/partitioning.h @@ -10,6 +10,9 @@ namespace trtorch { namespace core { namespace partitioning { +torch::jit::Value* getOrAddInputForValue(torch::jit::Value* old_value, std::shared_ptr &graph, + std::unordered_map &old_to_new); + torch::jit::Node* cloneNode(torch::jit::Node* node, std::shared_ptr &graph, std::unordered_map &old_to_new); @@ -49,7 +52,6 @@ struct SegmentedBlock { void registerOutput(torch::jit::Value* raw_input) { outputs_.push_back(raw_input); - g_->registerOutput(old_to_new_[raw_input]); } @@ -97,15 +99,16 @@ struct SegmentedBlock { return out_shape_; } - const std::shared_ptr& g() const { + std::shared_ptr& g() { return g_; } + void update_graph(std::shared_ptr new_g) { g_ = new_g; } - private: +// private: SegmentedBlockTarget target_; std::vector in_shape_; std::vector out_shape_;